# TF-IDF 직접구현

In [2]:
docs= ['오늘 동물원에서 원숭이와 코끼리를 봤어', 
      '동물원에서 원숭이에게 바나나를 줬어 바나나를']

 ## 토큰생성

In [3]:
doc_ls=[]
for doc in docs:
    doc_ls.append(doc.split()) # 공백으로 토큰화 
doc_ls

[['오늘', '동물원에서', '원숭이와', '코끼리를', '봤어'],
 ['동물원에서', '원숭이에게', '바나나를', '줬어', '바나나를']]

In [4]:
from collections import defaultdict

word2id = defaultdict(lambda: len(word2id))
for doc in doc_ls:
    for token in doc:
        word2id[token]
        
print(word2id)

defaultdict(<function <lambda> at 0x000001CDCD6CF378>, {'오늘': 0, '동물원에서': 1, '원숭이와': 2, '코끼리를': 3, '봤어': 4, '원숭이에게': 5, '바나나를': 6, '줬어': 7})


## TDM 구하기

In [6]:
import numpy as np
TDM = np.zeros((len(doc_ls),len(word2id)), dtype=int)
print(TDM)

for i, doc in enumerate(doc_ls):
    for token in doc:
        TDM[i, word2id[token] ] +=1
TDM

[[0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0]]


array([[1, 1, 1, 1, 1, 0, 0, 0],
       [0, 1, 0, 0, 0, 1, 2, 1]])

In [7]:
TDM[0].sum()

5

## TF 계산

In [19]:
# TF 계산 -> 특정단어등장빈도/ 문서내 전체등장단어빈도

In [8]:
def computeTF(TDM):
    doc_len=len(TDM) # 문서개수 2개
    word_len=len(TDM[0]) # 단어개수 8개
    
    tf=np.zeros((doc_len,word_len))

    for doc_i in range(doc_len):
        for word_i in range(word_len):
            tf[doc_i,word_i]= TDM[doc_i,word_i]/TDM[doc_i].sum()
    return tf

computeTF(TDM)

array([[0.2, 0.2, 0.2, 0.2, 0.2, 0. , 0. , 0. ],
       [0. , 0.2, 0. , 0. , 0. , 0.2, 0.4, 0.2]])

## IDF 계산

In [10]:
import math
# IDF 계산 : -log(단어가 등장한 문서수/ 총 문서수 )

In [12]:
def computeIDF(TDM):
    doc_len=len(TDM)
    word_len=len(TDM[0])
    
    idf=np.zeros(word_len)
    
    for i in range(word_len):
        idf[i]= -math.log10(np.count_nonzero(TDM[:,i]/doc_len))# 로그 계산법
    return idf

computeIDF(TDM)

array([-0.     , -0.30103, -0.     , -0.     , -0.     , -0.     ,
       -0.     , -0.     ])

## TF-IDF 계산

In [13]:
# TF- IDF 곱

In [14]:
def computeTFIDF(TDM):
    tf = computeTF(TDM)
    idf= computeIDF(TDM)
    tfidf=np.zeros(tf.shape)
    for doc_i in range(tf.shape[0]):
        for word_i in range(tf.shape[1]):
            tfidf[doc_i, word_i] = tf[doc_i, word_i] * idf[word_i]
    return tfidf

computeTFIDF(TDM)

array([[-0.      , -0.060206, -0.      , -0.      , -0.      , -0.      ,
        -0.      , -0.      ],
       [-0.      , -0.060206, -0.      , -0.      , -0.      , -0.      ,
        -0.      , -0.      ]])