# Pre-Onboarding Assignment

### 1) Tokenizer

In [1]:
import re

class Tokenizer():
    """Whitespace tokenizer that maps words to integer indices.

    The vocabulary lives in ``word_dict`` (word -> index). Index 0 is
    reserved under the key ``'oov'`` and is used for any word that was not
    seen during fitting.
    """

    def __init__(self):
        self.word_dict = {'oov' : 0}
        self.fit_checker = False

    def preprocessing(self, sequences):
        """Normalize and split each sentence into a list of words.

        Each sentence is lowercased, every character other than ASCII
        letters, digits, Hangul syllables and the space character is
        removed, and the remainder is split on whitespace.

        Args:
            sequences: Iterable of sentence strings.

        Returns:
            A list with one list of word strings per input sentence.
        """
        # Problem 1-1.
        return [
            re.sub(r'[^a-zA-Z0-9가-힣 ]', '', sentence.lower()).split()
            for sentence in sequences
        ]

    def fit(self, sequences):
        """Add every new word in ``sequences`` to the vocabulary.

        Words receive consecutive indices in order of first appearance.
        Calling ``fit`` again keeps the existing entries and continues
        numbering after them, so indices are never shared by two words.

        Args:
            sequences: Iterable of sentence strings.
        """
        self.fit_checker = False
        # Problem 1-2.
        for words in self.preprocessing(sequences):
            for word in words:
                if word not in self.word_dict:
                    # The next free index equals the current vocabulary
                    # size because indices are dense and start at 0 ('oov').
                    self.word_dict[word] = len(self.word_dict)
        self.fit_checker = True

    def transform(self, sequences):
        """Convert sentences to lists of vocabulary indices.

        Args:
            sequences: Iterable of sentence strings.

        Returns:
            A list with one list of integer indices per input sentence;
            words missing from the vocabulary map to the ``'oov'`` index.

        Raises:
            Exception: If ``fit`` has not completed on this instance.
        """
        if not self.fit_checker:
            raise Exception("Tokenizer instance is not fitted yet.")
        # Problem 1-3.
        oov_index = self.word_dict['oov']
        return [
            [self.word_dict.get(word, oov_index) for word in words]
            for words in self.preprocessing(sequences)
        ]

    def fit_transform(self, sequences):
        """Fit on ``sequences`` and return their index representation."""
        self.fit(sequences)
        result = self.transform(sequences)
        return result

In [2]:
# Demo for problem 1-1: preprocessing lowercases each sentence, strips
# punctuation and splits on whitespace. No fitting is needed for this step.
test = Tokenizer()
test_input = ['I go to school.', 'I LIKE pizza!', "I don't like it."]

print(test.preprocessing(test_input))

[['i', 'go', 'to', 'school'], ['i', 'like', 'pizza'], ['i', 'dont', 'like', 'it']]


In [3]:
# Demo for problem 1-2: fit a fresh tokenizer and print the resulting
# word -> index vocabulary ('oov' holds index 0).
test = Tokenizer()
test_input = ['I go to school.', 'I LIKE pizza!']

test.fit(test_input)
print(test.word_dict)

{'oov': 0, 'i': 1, 'go': 2, 'to': 3, 'school': 4, 'like': 5, 'pizza': 6}


In [4]:
# Demo for problem 1-3: after fitting, transform the same sentences into
# lists of vocabulary indices.
test = Tokenizer()
test_input = ['I go to school.', 'I LIKE pizza!']

test.fit(test_input)
print(test.transform(test_input))

[[1, 2, 3, 4], [1, 5, 6]]


In [5]:
# Demo: fit_transform performs fit followed by transform in one call.
test = Tokenizer()
test_input = ['I go to school.', 'I LIKE pizza!']

# Left as a bare expression so the notebook displays the returned value.
test.fit_transform(test_input)

[[1, 2, 3, 4], [1, 5, 6]]

### 2) TF-IDF Vectorizer

In [6]:
import math

class TfidfVectorizer:
    """Compute TF-IDF features on top of a tokenizer.

    The tokenizer must provide ``fit_transform(sequences)`` and
    ``transform(sequences)`` returning lists of integer indices per
    sentence, and a ``word_dict`` mapping whose first entry is the
    out-of-vocabulary slot (skipped when building the feature columns).
    """

    def __init__(self, tokenizer):
        self.tokenizer = tokenizer
        self.fit_checker = False

    def fit(self, sequences):
        """Fit the tokenizer and compute the IDF weight of every word.

        For each vocabulary entry, ``idf = log(N / (1 + df))`` where ``N``
        is the number of sentences and ``df`` is the number of sentences
        containing the word. The result is stored in ``idf_vector``.

        Args:
            sequences: Iterable of sentence strings.
        """
        tokenized = self.tokenizer.fit_transform(sequences)
        # Problem 2-1.
        # Number of sentences N.
        n_sentences = len(tokenized)
        # Indices of every vocabulary word, skipping the leading 'oov' slot.
        vocab = list(self.tokenizer.word_dict.values())[1:]
        # One set per sentence makes each membership test O(1).
        sentence_sets = [set(sentence) for sentence in tokenized]

        self.idf_vector = [
            math.log(
                n_sentences
                / (1 + sum(index in present for present in sentence_sets))
            )
            for index in vocab
        ]
        self.fit_checker = True

    def transform(self, sequences):
        """Return the TF-IDF matrix of ``sequences``.

        ``tf`` is the word count divided by the sentence length, and each
        cell is ``tf * idf``. A sentence with no tokens yields a row of
        zeros instead of dividing by zero. The matrix is also stored in
        ``tfidf_matrix``.

        Args:
            sequences: Iterable of sentence strings.

        Returns:
            A nested list with one row per sentence and one column per
            vocabulary word (excluding the 'oov' slot).

        Raises:
            Exception: If ``fit`` has not been called on this instance.
        """
        if not self.fit_checker:
            raise Exception("TfidfVectorizer instance is not fitted yet.")

        tokenized = self.tokenizer.transform(sequences)
        # Problem 2-2.
        vocab = list(self.tokenizer.word_dict.values())[1:]

        self.tfidf_matrix = []
        for sentence in tokenized:
            sent_len = len(sentence)
            if sent_len == 0:
                # Term frequency is undefined for an empty sentence;
                # treat every term as absent.
                self.tfidf_matrix.append([0.0] * len(vocab))
                continue

            # Count each index once rather than rescanning per column.
            counts = {}
            for index in sentence:
                counts[index] = counts.get(index, 0) + 1

            self.tfidf_matrix.append([
                counts.get(index, 0) / sent_len * self.idf_vector[column]
                for column, index in enumerate(vocab)
            ])

        return self.tfidf_matrix

    def fit_transform(self, sequences):
        """Fit on ``sequences`` and return their TF-IDF matrix."""
        self.fit(sequences)
        return self.transform(sequences)

In [7]:
# Demo for problem 2-1: inspect the IDF weights.
test_input = ['I go to school.', 'I LIKE pizza!']
# Reuses the `test` tokenizer from an earlier cell; fit() below re-fits it
# on test_input through tokenizer.fit_transform.
tfidf_test = TfidfVectorizer(test)

tfidf_test.fit(test_input)
# idf = log(N / (1 + df)), so a word present in every sentence ends up
# with a negative weight.
tfidf_test.idf_vector

[-0.40546510810816444, 0.0, 0.0, 0.0, 0.0, 0.0]

In [8]:
# Demo for problem 2-2: TF-IDF matrix of the sentences the vectorizer was
# fitted on (one row per sentence, one column per vocabulary word).
tfidf_test.transform(test_input)

[[-0.10136627702704111, 0.0, 0.0, 0.0, 0.0, 0.0],
 [-0.1351550360360548, 0.0, 0.0, 0.0, 0.0, 0.0]]

In [9]:
# Demo on a small Korean corpus, using a fresh tokenizer and vectorizer.
test_input = ['먹고 싶은 사과',
              '먹고 싶은 바나나',
              '길고 노란 바나나 바나나',
              '저는 과일이 좋아요']

test2 = Tokenizer()
tfidf_test2 = TfidfVectorizer(test2)

# Keep the matrix in test2_output; a later cell renders it as a DataFrame.
test2_output = tfidf_test2.fit_transform(test_input)
tfidf_test2.idf_vector

[0.28768207245178085,
 0.28768207245178085,
 0.6931471805599453,
 0.28768207245178085,
 0.6931471805599453,
 0.6931471805599453,
 0.6931471805599453,
 0.6931471805599453,
 0.6931471805599453]

In [10]:
import pandas as pd
# Label the columns with the vocabulary words in insertion order, skipping
# the leading 'oov' entry so they line up with the TF-IDF columns.
pd.DataFrame(test2_output, columns=list(test2.word_dict.keys())[1:])

Unnamed: 0,먹고,싶은,사과,바나나,길고,노란,저는,과일이,좋아요
0,0.095894,0.095894,0.231049,0.0,0.0,0.0,0.0,0.0,0.0
1,0.095894,0.095894,0.0,0.095894,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.143841,0.173287,0.173287,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.231049,0.231049,0.231049
