<a href="https://colab.research.google.com/github/epochee/wanted_pre_onboarding/blob/main/%EC%9B%90%ED%8B%B0%EB%93%9C_%ED%94%84%EB%A6%AC%EC%98%A8%EB%B3%B4%EB%94%A9_%EC%BD%94%EC%8A%A4_%EC%82%AC%EC%A0%84%EA%B3%BC%EC%A0%9C.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 문제 1) Tokenizer **생성하기**

In [1]:
class Tokenizer():
  def __init__(self):
    self.word_dict = {'oov': 0}
    self.fit_checker = False

  def preprocessing(self, sequences):
    import re 

    result = []

    regex = r"[^a-zA-Z0-9 ]"
    subst = ""
    for text in sequences:
      tokens = re.sub(regex, subst, text)
      tokens = tokens.lower().split()
      result.append(tokens)

    return result
  
  def fit(self, sequences):
    self.fit_checker = False
    tokens = self.preprocessing(sequences)
    b = [item for sublist in tokens for item in sublist]
    c = list(dict.fromkeys(b))
    d = {string : i+1 for i,string in enumerate(c)}
    self.word_dict.update(d)
    self.fit_checker = True
  
  def transform(self, sequences):
    tokens = self.preprocessing(sequences)
    if self.fit_checker:
      for i,string in enumerate(tokens):
        for j in range(len(string)) :
          if tokens[i][j] in self.word_dict :
            tokens[i][j]=self.word_dict[tokens[i][j]]
          else :
            tokens[i][j] = self.word_dict['oov']
      result = tokens
      return result
    else:
      raise Exception("Tokenizer instance is not fitted yet.")
      
  def fit_transform(self, sequences):
    self.fit(sequences)
    result = self.transform(sequences)
    return result

In [2]:
tokenizer = Tokenizer()

# 1-1
input = ['I go to school.', 'I LIKE pizza!']

tokenizer.preprocessing(input)

[['i', 'go', 'to', 'school'], ['i', 'like', 'pizza']]

In [3]:
# 1-2

tokenizer.fit(input)

tokenizer.word_dict

{'go': 2, 'i': 1, 'like': 5, 'oov': 0, 'pizza': 6, 'school': 4, 'to': 3}

In [4]:
# 1-3

print(f'transform() : {tokenizer.transform(input)}')

# 조건 1: 어휘 사전(self.word_dict)에 없는 단어는 'oov'의 index로 변환합니다.
test_input = ['I go to school.', 'I LIKE pizza!', 'i am lala']
print(f'조건 1 test : {tokenizer.transform(test_input)}')

transform() : [[1, 2, 3, 4], [1, 5, 6]]
조건 1 test : [[1, 2, 3, 4], [1, 5, 6], [1, 0, 0]]


# 문제 2) TfidfVectorizer **생성하기**

In [5]:
class TfidfVectorizer:
  def __init__(self, tokenizer):
    self.tokenizer = tokenizer
    self.fit_checker = False
  
  def fit(self, sequences):
    from numpy import log as ln
    tokenized = self.tokenizer.fit_transform(sequences)
    result = []
    N = len(sequences) 
    w_d = self.tokenizer.word_dict
    flat_list = [item for sublist in tokenized for item in sublist]
    for i in range(1,len(w_d)):
      df = flat_list.count(i)
      result.append(ln(N/(df+1)))

    self.fit_checker = True
    return result
    

  def transform(self, sequences):
    if self.fit_checker:
      result = []
      df_idf = []
      tokenized = self.tokenizer.transform(sequences)
      for i in tokenized:
        result.append([])
        for j in range(1,len(self.tokenizer.word_dict)):
          a = i.count(j)
          result[-1].append(a)
      for lst in result:
        df_idf.append([x*y for x,y in zip(lst,self.fit(sequences))])
      return df_idf
    else:
      raise Exception("TfidfVectorizer instance is not fitted yet.")

  
  def fit_transform(self, sequences):
    self.fit(sequences)
    return self.transform(sequences)

In [6]:
tokenizer = Tokenizer()

tfidf_vectorizer = TfidfVectorizer(tokenizer)

# 2-1
input = ['I go to school.',
         'i go to home',
         'I LIKE pizza!',
         'I LIKE  banana banana']

idf = tfidf_vectorizer.fit(input)
idf

[-0.2231435513142097,
 0.28768207245178085,
 0.28768207245178085,
 0.6931471805599453,
 0.6931471805599453,
 0.28768207245178085,
 0.6931471805599453,
 0.28768207245178085]

In [7]:
import pandas as pd

tokenizer.fit(input)
vocab = list(tokenizer.word_dict.keys())[1:]


idf_ = pd.DataFrame(idf, index=vocab)
idf_

Unnamed: 0,0
i,-0.223144
go,0.287682
to,0.287682
school,0.693147
home,0.693147
like,0.287682
pizza,0.693147
banana,0.287682


In [8]:
# 확인

from numpy import log as ln

input = ['I go to school.',
         'i go to home',
         'I LIKE pizza!',
         'I LIKE  banana banana']
# i  3문장 모두 포함, go 2문장 포함

print(f'i 의 idf : {ln(3/(3+1))}')
print(f'go 의 idf : {ln(3/(2+1))}')

i 의 idf : -0.2876820724517809
go 의 idf : 0.0


In [9]:
tf_idf = tfidf_vectorizer.transform(input)
tf_idf

[[-0.2231435513142097,
  0.28768207245178085,
  0.28768207245178085,
  0.6931471805599453,
  0.0,
  0.0,
  0.0,
  0.0],
 [-0.2231435513142097,
  0.28768207245178085,
  0.28768207245178085,
  0.0,
  0.6931471805599453,
  0.0,
  0.0,
  0.0],
 [-0.2231435513142097,
  0.0,
  0.0,
  0.0,
  0.0,
  0.28768207245178085,
  0.6931471805599453,
  0.0],
 [-0.2231435513142097,
  0.0,
  0.0,
  0.0,
  0.0,
  0.28768207245178085,
  0.0,
  0.5753641449035617]]

In [10]:
tf_idf_ = pd.DataFrame(tf_idf, columns=vocab)
tf_idf_

Unnamed: 0,i,go,to,school,home,like,pizza,banana
0,-0.223144,0.287682,0.287682,0.693147,0.0,0.0,0.0,0.0
1,-0.223144,0.287682,0.287682,0.0,0.693147,0.0,0.0,0.0
2,-0.223144,0.0,0.0,0.0,0.0,0.287682,0.693147,0.0
3,-0.223144,0.0,0.0,0.0,0.0,0.287682,0.0,0.575364
