# Pre-Onboarding Assignment

### 1) Tokenizer

In [16]:
import re

class Tokenizer():
    """Word-level tokenizer that maps each known word to an integer id.

    Text is lower-cased, every character other than ASCII letters, digits
    and the space character is removed, and the result is split on
    whitespace. Id 0 is reserved under the key ``'oov'`` and is used for
    words that were not seen during ``fit``.
    """

    def __init__(self):
        # word -> integer id; 'oov' (id 0) stands in for unknown words.
        self.word_dict = {'oov': 0}
        # Set to True once fit() has completed successfully.
        self.fit_checker = False

    def preprocessing(self, sequences):
        """Normalise and split each sentence into a list of word strings.

        Args:
            sequences: iterable of sentence strings.

        Returns:
            A list with one list of tokens per input sentence.
        """
        pattern = re.compile(r'[^a-zA-Z0-9 ]')
        return [pattern.sub('', sentence.lower()).split() for sentence in sequences]

    def fit(self, sequences):
        """Build the vocabulary from ``sequences``.

        The vocabulary is rebuilt from scratch on every call. Previously
        the id counter restarted at 1 while the old entries were kept, so
        fitting a second time on new text handed out ids that collided
        with words already in ``word_dict``.

        Args:
            sequences: iterable of sentence strings.
        """
        self.fit_checker = False
        tokens = self.preprocessing(sequences)
        vocabulary = {'oov': 0}
        for sentence in tokens:
            for word in sentence:
                if word not in vocabulary:
                    # Ids are assigned in order of first appearance: 1, 2, 3, ...
                    vocabulary[word] = len(vocabulary)
        self.word_dict = vocabulary
        self.fit_checker = True

    def transform(self, sequences):
        """Convert sentences to lists of word ids.

        Args:
            sequences: iterable of sentence strings.

        Returns:
            A list with one list of integer ids per input sentence; words
            missing from the vocabulary map to the ``'oov'`` id.

        Raises:
            Exception: if ``fit`` has not been called successfully.
        """
        if not self.fit_checker:
            raise Exception('Tokenizer instance is not fitted yet')
        oov_id = self.word_dict['oov']
        return [
            [self.word_dict.get(word, oov_id) for word in sentence]
            for sentence in self.preprocessing(sequences)
        ]

    def fit_transform(self, sequences):
        """Fit the vocabulary on ``sequences`` and return their id lists."""
        self.fit(sequences)
        return self.transform(sequences)

In [22]:
# Demo: preprocessing() needs no prior fit; it only normalises and splits the text.
test = Tokenizer()
test_input = ['I go to school.', 'I LIKE pizza!', "I don't like it."]

# Prints one list of lower-cased, punctuation-free tokens per sentence.
print(test.preprocessing(test_input))

[['i', 'go', 'to', 'school'], ['i', 'like', 'pizza'], ['i', 'dont', 'like', 'it']]


In [23]:
# Demo: fit() on a fresh tokenizer, then inspect the learned vocabulary.
test = Tokenizer()
test_input = ['I go to school.', 'I LIKE pizza!']

test.fit(test_input)
# word_dict maps each word to its id; 'oov' is pre-registered with id 0.
print(test.word_dict)

{'oov': 0, 'i': 1, 'go': 2, 'to': 3, 'school': 4, 'like': 5, 'pizza': 6}


In [19]:
# Demo: fit() followed by transform() on the same sentences.
test = Tokenizer()
test_input = ['I go to school.', 'I LIKE pizza!']

test.fit(test_input)
# Prints one list of word ids per sentence.
print(test.transform(test_input))

[[1, 2, 3, 4], [1, 5, 6]]


In [20]:
# Demo: fit_transform() performs fit() and transform() in a single call.
test = Tokenizer()
test_input = ['I go to school.', 'I LIKE pizza!']

# Bare expression: the notebook displays the returned id lists as the cell output.
test.fit_transform(test_input)

[[1, 2, 3, 4], [1, 5, 6]]

### 2) TF-IDF Vectorizer

In [63]:
import math

class TfidfVectorizer:
    """TF-IDF vectorizer built on top of a tokenizer object.

    The tokenizer must provide ``fit_transform(sequences)`` and
    ``transform(sequences)`` returning one list of word ids per sentence,
    and a ``word_dict`` mapping whose values are those word ids. Column
    ``j`` of every output row corresponds to the ``j``-th value of
    ``tokenizer.word_dict``.
    """

    def __init__(self, tokenizer):
        self.tokenizer = tokenizer
        # Set to True once fit() has completed successfully.
        self.fit_checker = False

    def fit(self, sequences):
        """Fit the tokenizer and compute one IDF weight per vocabulary entry.

        The weight of a word is ``log(N / (1 + df))`` where ``N`` is the
        number of sentences and ``df`` the number of sentences containing
        the word. The result is stored in ``self.idf_vector``.

        Args:
            sequences: iterable of sentence strings.

        Raises:
            ValueError: if ``sequences`` contains no sentences (the IDF
                formula would need ``log(0)``).
        """
        self.fit_checker = False
        tokenized = self.tokenizer.fit_transform(sequences)
        n_docs = len(tokenized)
        if n_docs == 0:
            raise ValueError('Cannot fit TfidfVectorizer on an empty corpus')
        words = list(self.tokenizer.word_dict.values())

        # Convert each sentence to a set once so membership tests are O(1).
        doc_sets = [set(sentence) for sentence in tokenized]
        self.idf_vector = [
            math.log(n_docs / (1 + sum(word in doc for doc in doc_sets)))
            for word in words
        ]
        self.fit_checker = True

    def transform(self, sequences):
        """Return the TF-IDF matrix for ``sequences``.

        Each entry is the raw count of word ``j`` in sentence ``i``
        multiplied by the IDF weight of word ``j``. The matrix is also
        stored in ``self.tfidf_matrix``.

        Args:
            sequences: iterable of sentence strings.

        Returns:
            A list of rows (one per sentence), each a list with one value
            per vocabulary entry.

        Raises:
            Exception: if ``fit`` has not been called successfully.
        """
        if not self.fit_checker:
            raise Exception('TfidfVectorizer instance is not fitted yet')

        tokenized = self.tokenizer.transform(sequences)
        words = list(self.tokenizer.word_dict.values())

        # The IDF weight is looked up by word position j. Indexing it by
        # the sentence position produced wrong weights and an IndexError
        # whenever there were more sentences than vocabulary entries.
        self.tfidf_matrix = [
            [sentence.count(word) * self.idf_vector[j] for j, word in enumerate(words)]
            for sentence in tokenized
        ]
        return self.tfidf_matrix

    def fit_transform(self, sequences):
        """Fit on ``sequences`` and return their TF-IDF matrix."""
        self.fit(sequences)
        return self.transform(sequences)

In [64]:
# Demo: wrap the `test` tokenizer created in an earlier cell in a TF-IDF vectorizer.
test_input = ['I go to school.', 'I LIKE pizza!']
tfidf_test = TfidfVectorizer(test)

# fit() populates idf_vector; the bare expression below shows it as the cell output.
tfidf_test.fit(test_input)
tfidf_test.idf_vector

[0.6931471805599453, -0.40546510810816444, 0.0, 0.0, 0.0, 0.0, 0.0]

In [65]:
# Demo: TF-IDF matrix for the sentences the vectorizer was fitted on in the previous cell.
tfidf_test.transform(test_input)

[[0.0, -0.40546510810816444, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, -0.40546510810816444, 0.0, 0.0, 0.0, 0.0, 0.0]]

In [74]:
# Demo: end-to-end run on a five-sentence corpus with a fresh tokenizer.
test_input = ['You know I want your love',
             'I like you',
             'What should I do',
             'Deep Learning from Scratch',
             "You don't know me"]

test2 = Tokenizer()
tfidf_test2 = TfidfVectorizer(test2)
from pprint import pprint
# Pretty-print the TF-IDF matrix (one row per sentence), then the vocabulary
# that maps each word to its id.
pprint(tfidf_test2.fit_transform(test_input))
pprint(tfidf_test2.tokenizer.word_dict)

[[0.0,
  0.22314355131420976,
  0.5108256237659907,
  0.22314355131420976,
  0.9162907318741551,
  0.9162907318741551,
  0.9162907318741551,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0],
 [0.0,
  0.22314355131420976,
  0.0,
  0.22314355131420976,
  0.0,
  0.0,
  0.0,
  0.9162907318741551,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0],
 [0.0,
  0.0,
  0.0,
  0.22314355131420976,
  0.0,
  0.0,
  0.0,
  0.0,
  0.9162907318741551,
  0.9162907318741551,
  0.9162907318741551,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0],
 [0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.9162907318741551,
  0.9162907318741551,
  0.9162907318741551,
  0.9162907318741551,
  0.0,
  0.0],
 [0.0,
  0.22314355131420976,
  0.5108256237659907,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.9162907318741551,
  0.9162907318741551]]
{'deep': 11,
 'do': 10,
 'dont': 15,
 'from': 13,
 'i': 3,
 'know': 2,
 'learni