# Pre-Onboarding Assignment

### 1) Tokenizer

In [1]:
import re

class Tokenizer():
    """Whitespace tokenizer that maps each word to an integer index.

    The vocabulary lives in ``word_dict`` (word -> index). Index 0 is stored
    under the key ``'oov'`` and is what ``transform`` emits for any word that
    is not in the vocabulary.
    """

    def __init__(self):
        self.word_dict = {'oov' : 0}
        self.fit_checker = False

    def preprocessing(self, sequences):
        """Lowercase, strip disallowed characters and split each sentence.

        Every character other than ASCII letters, digits, Hangul syllables
        and the space character is removed before splitting.

        Args:
            sequences: iterable of sentence strings.

        Returns:
            A list with one list of word strings per input sentence.
        """
        return [
            re.sub(r'[^a-zA-Z0-9가-힣 ]', '', sentence.lower()).split()
            for sentence in sequences
        ]

    def fit(self, sequences):
        """Add every unseen word in ``sequences`` to the vocabulary.

        Words keep the index they were given first; calling ``fit`` again
        only appends new words.

        Args:
            sequences: iterable of sentence strings.
        """
        self.fit_checker = False
        tokens = self.preprocessing(sequences)
        # Continue after the highest index already in use. Restarting at 1 on
        # every call would give new words the indices of existing ones when
        # fit() is called more than once.
        idx = max(self.word_dict.values(), default=0) + 1
        for token in tokens:
            for word in token:
                if word not in self.word_dict:
                    self.word_dict[word] = idx
                    idx += 1
        self.fit_checker = True

    def transform(self, sequences):
        """Encode sentences as lists of vocabulary indices.

        Args:
            sequences: iterable of sentence strings.

        Returns:
            A list with one list of integer indices per input sentence; words
            missing from the vocabulary are encoded as the ``'oov'`` index.

        Raises:
            Exception: if ``fit`` has not completed on this instance.
        """
        if not self.fit_checker:
            raise Exception('Tokenizer instance is not fitted yet')
        oov_index = self.word_dict['oov']
        return [
            [self.word_dict.get(word, oov_index) for word in token]
            for token in self.preprocessing(sequences)
        ]

    def fit_transform(self, sequences):
        """Fit on ``sequences`` and return their encoding in one call."""
        self.fit(sequences)
        return self.transform(sequences)

In [2]:
# Demo: preprocessing() lowercases each sentence, strips punctuation and
# splits it into words. No fitting is needed for this step.
test = Tokenizer()
test_input = ['I go to school.', 'I LIKE pizza!', "I don't like it."]

print(test.preprocessing(test_input))

[['i', 'go', 'to', 'school'], ['i', 'like', 'pizza'], ['i', 'dont', 'like', 'it']]


In [3]:
# Demo: fit() builds the word -> index vocabulary; print it to inspect.
test = Tokenizer()
test_input = ['I go to school.', 'I LIKE pizza!']

test.fit(test_input)
print(test.word_dict)

{'oov': 0, 'i': 1, 'go': 2, 'to': 3, 'school': 4, 'like': 5, 'pizza': 6}


In [4]:
# Demo: after fit(), transform() encodes each sentence as a list of indices.
test = Tokenizer()
test_input = ['I go to school.', 'I LIKE pizza!']

test.fit(test_input)
print(test.transform(test_input))

[[1, 2, 3, 4], [1, 5, 6]]


In [5]:
# Demo: fit_transform() fits and encodes in a single call.
test = Tokenizer()
test_input = ['I go to school.', 'I LIKE pizza!']

test.fit_transform(test_input)

[[1, 2, 3, 4], [1, 5, 6]]

In [6]:
# Larger English corpus (one sentence per list item) used to exercise the
# tokenizer here and the TF-IDF vectorizer in later cells.
wiki_python = ["Python is an interpreted high-level general-purpose programming language.",
               "Its design philosophy emphasizes code readability with its use of significant indentation.",
               "Its language constructs as well as its object-oriented approach aim to help programmers write clear, logical code for small and large-scale projects.",
               "Python is dynamically-typed and garbage-collected.",
               "It supports multiple programming paradigms, including structured (particularly, procedural), object-oriented and functional programming.",
               "It is often described as a 'batteries included' language due to its comprehensive standard library.",
               "Guido van Rossum began working on Python in the late 1980s, as a successor to the ABC programming language, and first released it in 1991 as Python 0.9.0.",
               "Python 2.0 was released in 2000 and introduced new features, such as list comprehensions and a cycle-detecting garbage collection system (in addition to reference counting).",
               "Python 3.0 was released in 2008 and was a major revision of the language that is not completely backward-compatible.",
               "Python 2 was discontinued with version 2.7.18 in 2020.",
               "Python consistently ranks as one of the most popular programming languages."]
# Encode the whole corpus with a fresh tokenizer.
test1 = Tokenizer()
test1.fit_transform(wiki_python)

[[1, 2, 3, 4, 5, 6, 7, 8],
 [9, 10, 11, 12, 13, 14, 15, 9, 16, 17, 18, 19],
 [9,
  8,
  20,
  21,
  22,
  21,
  9,
  23,
  24,
  25,
  26,
  27,
  28,
  29,
  30,
  31,
  13,
  32,
  33,
  34,
  35,
  36],
 [1, 2, 37, 34, 38],
 [39, 40, 41, 7, 42, 43, 44, 45, 46, 23, 34, 47, 7],
 [39, 2, 48, 49, 21, 50, 51, 52, 8, 53, 26, 9, 54, 55, 56],
 [57,
  58,
  59,
  60,
  61,
  62,
  1,
  63,
  64,
  65,
  66,
  21,
  50,
  67,
  26,
  64,
  68,
  7,
  8,
  34,
  69,
  70,
  39,
  63,
  71,
  21,
  1,
  72],
 [1,
  73,
  74,
  70,
  63,
  75,
  34,
  76,
  77,
  78,
  79,
  21,
  80,
  81,
  34,
  50,
  82,
  83,
  84,
  85,
  63,
  86,
  26,
  87,
  88],
 [1, 89, 74, 70, 63, 90, 34, 74, 50, 91, 92, 17, 64, 8, 93, 2, 94, 95, 96],
 [1, 97, 74, 98, 15, 99, 100, 63, 101],
 [1, 102, 103, 21, 104, 17, 64, 105, 106, 7, 107]]

### 2) TF-IDF Vectorizer

In [14]:
import math

class TfidfVectorizer:
    """TF-IDF vectorizer built on top of a tokenizer object.

    The tokenizer must provide ``fit_transform(sequences)`` and
    ``transform(sequences)`` returning one list of integer indices per
    sentence, and a ``word_dict`` mapping whose values are those indices.

    After ``fit``, ``idf_vector[j]`` holds ``ln(N / (1 + df_j))`` for the
    j-th vocabulary entry, where ``N`` is the number of fitted sentences and
    ``df_j`` the number of sentences containing that entry. The value is
    negative for an entry that occurs in every sentence.
    """

    def __init__(self, tokenizer):
        self.tokenizer = tokenizer
        self.fit_checker = False

    def fit(self, sequences):
        """Fit the tokenizer and compute the IDF value of every vocabulary entry.

        Args:
            sequences: iterable of sentence strings.

        Raises:
            ValueError: if ``sequences`` contains no sentences.
        """
        tokenized = self.tokenizer.fit_transform(sequences)
        N = len(tokenized)  # number of sentences
        if N == 0:
            # math.log(0) would raise an opaque "math domain error" below.
            raise ValueError('Cannot fit TfidfVectorizer on an empty corpus')
        # Indices of every vocabulary entry, in vocabulary order.
        words = list(self.tokenizer.word_dict.values())

        # One set per sentence makes each membership test O(1).
        sentence_sets = [set(sentence) for sentence in tokenized]

        self.idf_vector = [
            math.log(N / (1 + sum(word in present for present in sentence_sets)))
            for word in words
        ]
        self.fit_checker = True

    def transform(self, sequences):
        """Return the TF-IDF matrix of ``sequences``.

        Term frequency is the raw occurrence count of an entry in a sentence.

        Args:
            sequences: iterable of sentence strings.

        Returns:
            A list with one row per sentence and one column per vocabulary
            entry; the same list is also stored in ``self.tfidf_matrix``.

        Raises:
            Exception: if ``fit`` has not completed on this instance.
        """
        if not self.fit_checker:
            raise Exception('TfidfVectorizer instance is not fitted yet')

        tokenized = self.tokenizer.transform(sequences)
        # Indices of every vocabulary entry, in vocabulary order.
        words = list(self.tokenizer.word_dict.values())

        # Build the (N x t) matrix row by row.
        self.tfidf_matrix = []
        for sentence in tokenized:
            # Count each index once instead of rescanning the sentence for
            # every vocabulary entry.
            counts = {}
            for index in sentence:
                counts[index] = counts.get(index, 0) + 1
            self.tfidf_matrix.append(
                [counts.get(word, 0) * self.idf_vector[j]
                 for j, word in enumerate(words)]
            )

        return self.tfidf_matrix

    def fit_transform(self, sequences):
        """Fit on ``sequences`` and return their TF-IDF matrix in one call."""
        self.fit(sequences)
        return self.transform(sequences)

In [15]:
# Demo: fit the vectorizer on two sentences and inspect the IDF vector.
# `test` is the Tokenizer instance created in an earlier cell.
test_input = ['I go to school.', 'I LIKE pizza!']
tfidf_test = TfidfVectorizer(test)

tfidf_test.fit(test_input)
# A word that appears in every sentence gets idf = ln(N / (1 + N)), which is negative.
# This should stop mattering once enough sentences are provided.
tfidf_test.idf_vector

[0.6931471805599453, -0.40546510810816444, 0.0, 0.0, 0.0, 0.0, 0.0]

In [16]:
# Demo: TF-IDF matrix of the same two sentences the vectorizer was fitted on.
tfidf_test.transform(test_input)

[[0.0, -0.40546510810816444, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, -0.40546510810816444, 0.0, 0.0, 0.0, 0.0, 0.0]]

In [28]:
# Demo: small Korean corpus in which one word repeats within a sentence and
# some words are shared across sentences.
test_input = ['먹고 싶은 사과',
              '먹고 싶은 바나나',
              '길고 노란 바나나 바나나',
              '저는 과일이 좋아요']

test2 = Tokenizer()
tfidf_test2 = TfidfVectorizer(test2)

test2_output = tfidf_test2.fit_transform(test_input)
test2_output
# Wrap the IDF vector in a NumPy array for a compact display.
import numpy as np
np.array(tfidf_test2.idf_vector)

array([1.38629436, 0.28768207, 0.28768207, 0.69314718, 0.28768207,
       0.69314718, 0.69314718, 0.69314718, 0.69314718, 0.69314718])

In [22]:
# Reference: run scikit-learn's TfidfVectorizer (imported as `sktfidf` to avoid
# clashing with the class defined above) on the same corpus and show its IDF values.
from sklearn.feature_extraction.text import TfidfVectorizer as sktfidf

skbanana = sktfidf()

sklearn_banana = skbanana.fit_transform(test_input)
import pandas as pd
pd.DataFrame(sklearn_banana.todense(), columns=skbanana.get_feature_names_out())
skbanana.idf_

array([1.91629073, 1.91629073, 1.91629073, 1.51082562, 1.51082562,
       1.91629073, 1.51082562, 1.91629073, 1.91629073])

In [19]:
# Show the custom TF-IDF matrix as a table with one column per vocabulary word.
import pandas as pd
pd.DataFrame(test2_output, columns=list(test2.word_dict.keys()))

Unnamed: 0,oov,먹고,싶은,사과,바나나,길고,노란,저는,과일이,좋아요
0,0.0,0.287682,0.287682,0.693147,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.287682,0.287682,0.0,0.287682,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.575364,0.693147,0.693147,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.693147,0.693147,0.693147


In [20]:
# Demo: TF-IDF matrix of the wiki_python corpus, using a fresh tokenizer.
test3 = Tokenizer()
tfidf_test3 = TfidfVectorizer(test3)

test3_output = tfidf_test3.fit_transform(wiki_python)
test3_output

[[0.0,
  0.3184537311185346,
  0.7884573603642703,
  1.7047480922384253,
  1.7047480922384253,
  1.7047480922384253,
  1.7047480922384253,
  0.7884573603642703,
  0.6061358035703155,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0],
 [0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  2.0232018233569597,
  1.7047480922384253,
  1.704748092238

In [None]:
# Show the wiki_python TF-IDF matrix as a table with one column per vocabulary word.
pd.DataFrame(tfidf_test3.fit_transform(wiki_python), columns=list(test3.word_dict.keys()))

In [None]:
# Reference: scikit-learn's TfidfVectorizer on the wiki_python corpus, for comparison.
from sklearn.feature_extraction.text import TfidfVectorizer as sktfidf

tfidf = sktfidf()
sklearn_output = tfidf.fit_transform(wiki_python)

pd.DataFrame(sklearn_output.todense(), columns=tfidf.get_feature_names_out())