# 7장. 문장의 표현 (Sentence Representation)

# 1 BoW (Bag of Words)

<img src="https://image.slidesharecdn.com/vector-space-models-170118145044/95/cs571-vector-space-models-3-638.jpg?cb=1485433004" />

https://en.wikipedia.org/wiki/Bag-of-words_model
https://www.slideshare.net/jchoi7s/cs571-vector-space-models

## 1.1 직접구현

In [145]:
import pandas as pd

In [146]:
docs = ['오늘 동물원에서 원숭이를 봤어',
        '오늘 동물원에서 코끼리를 봤어 봤어',
        '동물원에서 원숭이에게 바나나를 줬어 바나나를']

### 1) 띄어쓰기 단위로 토큰화

In [147]:
doc_ls = [doc.split() for doc in docs]
doc_ls

[['오늘', '동물원에서', '원숭이를', '봤어'],
 ['오늘', '동물원에서', '코끼리를', '봤어', '봤어'],
 ['동물원에서', '원숭이에게', '바나나를', '줬어', '바나나를']]

### 2) 각 고유 토큰에 인덱스(Index)를 지정

In [None]:
from collections import defaultdict

word2id = defaultdict(lambda : len(word2id))
[word2id[token] for doc in doc_ls for token in doc ]     
word2id   

defaultdict(<function __main__.<lambda>>,
            {'동물원에서': 1,
             '바나나를': 6,
             '봤어': 3,
             '오늘': 0,
             '원숭이를': 2,
             '원숭이에게': 5,
             '줬어': 7,
             '코끼리를': 4})

### 3) BoW 생성

In [None]:
import numpy as np 

BoW_ls = []

for i, doc in enumerate(doc_ls):
  bow = np.zeros(len(word2id), dtype=int)
  for token in doc:
      bow[word2id[token]] += 1 # 해당 토큰의 위치(column)
  BoW_ls.append(bow.tolist())
BoW_ls

[[1, 1, 1, 1, 0, 0, 0, 0], [1, 1, 0, 2, 1, 0, 0, 0], [0, 1, 0, 0, 0, 1, 2, 1]]

In [None]:
from IPython.core import display as ICD

sorted_vocab = sorted((value, key) for key, value in word2id.items())
vocab = [v[1] for v in sorted_vocab]
for i in range(len(docs)) :
  print("문서{} : {}".format(i, docs[i]))
  ICD.display(pd.DataFrame([BoW_ls[i]], columns=vocab))
  print("\n\n")

문서0 : 오늘 동물원에서 원숭이를 봤어


Unnamed: 0,오늘,동물원에서,원숭이를,봤어,코끼리를,원숭이에게,바나나를,줬어
0,1,1,1,1,0,0,0,0





문서1 : 오늘 동물원에서 코끼리를 봤어 봤어


Unnamed: 0,오늘,동물원에서,원숭이를,봤어,코끼리를,원숭이에게,바나나를,줬어
0,1,1,0,2,1,0,0,0





문서2 : 동물원에서 원숭이에게 바나나를 줬어 바나나를


Unnamed: 0,오늘,동물원에서,원숭이를,봤어,코끼리를,원숭이에게,바나나를,줬어
0,0,1,0,0,0,1,2,1









---





## 1.2 단어 순서를 고려하지 않은 BoW

In [None]:
docs = ['나는 양념 치킨을 좋아해 하지만 후라이드 치킨을 싫어해',
        '나는 후라이드 치킨을 좋아해 하지만 양념 치킨을 싫어해']

### 1) 띄어쓰기 단위로 토큰화

In [None]:
doc_ls = [doc.split() for doc in docs]
doc_ls

### 2) 각 고유 토큰에 인덱스(Index)를 지정

In [None]:
from collections import defaultdict

word2id = defaultdict(lambda : len(word2id))
[word2id[token] for doc in doc_ls for token in doc ]     
word2id   

### 3) BoW 생성

In [None]:
import numpy as np 

BoW_ls = []

for i, doc in enumerate(doc_ls):
  bow = np.zeros(len(word2id), dtype=int)
  for token in doc:
      bow[word2id[token]] += 1 # 해당 토큰의 위치(column)
  BoW_ls.append(bow.tolist())
BoW_ls

In [None]:
from IPython.core import display as ICD

sorted_vocab = sorted((value, key) for key, value in word2id.items())
vocab = [v[1] for v in sorted_vocab]
for i in range(len(docs)) :
  print("문서{} : {}".format(i, docs[i]))
  ICD.display(pd.DataFrame([BoW_ls[i]], columns=vocab))
  print("\n\n")



---



https://en.wikipedia.org/wiki/Document-term_matrix

## 1.3 sklearn 활용

In [None]:
docs = ['오늘 동물원에서 원숭이를 봤어',
        '오늘 동물원에서 코끼리를 봤어 봤어',
        '동물원에서 원숭이에게 바나나를 줬어 바나나를']

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer()
BoW = count_vect.fit_transform(docs)

BoW.toarray()[0]

In [None]:
from IPython.core import display as ICD

vocab = count_vect.get_feature_names()
for i in range(len(docs)) :
  print("문서{} : {}".format(i, docs[i]))
  ICD.display(pd.DataFrame([BoW.toarray()[i]], columns=vocab))
  print("\n\n")



---


## 1.4 gensim 활용

In [None]:
docs = ['오늘 동물원에서 원숭이를 봤어',
        '오늘 동물원에서 코끼리를 봤어 봤어',
        '동물원에서 원숭이에게 바나나를 줬어 바나나를']

In [None]:
import gensim
import numpy as np
from gensim import corpora

doc_ls = [doc.split() for doc in docs]
id2word = corpora.Dictionary(doc_ls)
BoW = [id2word.doc2bow(doc) for doc in doc_ls]
BoW[0]

In [None]:
from gensim.matutils import sparse2full
from IPython.core import display as ICD

vocab = [id2word[i] for i in id2word.keys()]
for i in range(len(docs)) :
  print("문서{} : {}".format(i, docs[i]))
  ICD.display(pd.DataFrame([sparse2full(BoW[i], len(vocab))], columns=vocab))
  print("\n\n")



---



# 2 TDM(Term-Document Matrix)

## 2.1 직접구현

In [None]:
docs = ['오늘 동물원에서 원숭이를 봤어',
        '오늘 동물원에서 코끼리를 봤어 봤어',
        '동물원에서 원숭이에게 바나나를 줬어 바나나를']

### 1) 띄어쓰기 단위로 토큰화

In [None]:
doc_ls = [doc.split() for doc in docs]
doc_ls

[['오늘', '동물원에서', '원숭이를', '봤어'],
 ['오늘', '동물원에서', '코끼리를', '봤어', '봤어'],
 ['동물원에서', '원숭이에게', '바나나를', '줬어', '바나나를']]

### 2) 각 고유 토큰에 인덱스(Index)를 지정

In [None]:
from collections import defaultdict

word2id = defaultdict(lambda : len(word2id))
[word2id[token] for doc in doc_ls for token in doc ]     
word2id        

### 3) TDM 생성

In [None]:
import numpy as np 

TDM = np.zeros((len(word2id), len(doc_ls)), dtype=int)
for i, doc in enumerate(doc_ls):
  for token in doc:
      TDM[word2id[token], i] += 1 # 해당 토큰의 위치(column)
TDM

In [None]:
import pandas as pd

doc_names = ['문서'+ str(i) for i in range(len(doc_ls))]
sorted_vocab = sorted((value, key) for key, value in word2id.items())
vocab = [v[1] for v in sorted_vocab]
df_TDM = pd.DataFrame(TDM, columns=doc_names)
df_TDM['단어'] = vocab
df_TDM.set_index('단어')

## 2.2 sklearn 활용

In [None]:
docs = ['오늘 동물원에서 원숭이를 봤어',
        '오늘 동물원에서 코끼리를 봤어 봤어',
        '동물원에서 원숭이에게 바나나를 줬어 바나나를']

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer()
DTM = count_vect.fit_transform(docs)
DTM.toarray()

In [None]:
import pandas as pd

doc_names = ['문서'+ str(i) for i in range(len(doc_ls))]
vocab = count_vect.get_feature_names()
df_TDM = pd.DataFrame(DTM.toarray().T, columns=doc_names)
df_TDM['단어'] = vocab
df_TDM.set_index('단어')



---


## 2.3 gensim 활용

In [None]:
docs = ['오늘 동물원에서 원숭이를 봤어',
        '오늘 동물원에서 코끼리를 봤어 봤어',
        '동물원에서 원숭이에게 바나나를 줬어 바나나를']

In [None]:
import gensim
from gensim import corpora

doc_ls = [doc.split() for doc in docs] #공백으로 토큰화
id2word = corpora.Dictionary(doc_ls)
TDM = [id2word.doc2bow(doc) for doc in doc_ls]
TDM

In [None]:
import pandas as pd

doc_names = ['문서'+ str(i) for i in range(len(doc_ls))]
vocab = [id2word[i] for i in id2word.keys()]
DTM_matrix = [sparse2full(doc, len(vocab)).tolist() for doc in TDM]

df_TDM = pd.DataFrame(np.array(DTM_matrix, dtype=int).T, columns=doc_names)
df_TDM['단어'] = vocab
df_TDM.set_index('단어')

In [None]:
DTM_matrix

---

# 3 TF-IDF (Term Frequency-Inverse Document Frequency)

<img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/10109d0e60cc9d50a1ea2f189bac0ac29a030a00" />



*  TF(단어 빈도, Term Frequency) : 단어가 문서 내에 등장하는 빈도
*  IDF(역문서 빈도, Inverse Document Frequency) : 단어가 여러 문서에 공통적으로 등장하는 빈도
*  한 문서 내에 자주 등장하고 다른 문서에 자주 등장하지 않는 단어를 주요 단어로 판별할 수 있음


https://en.wikipedia.org/wiki/Tf%E2%80%93idf

## 3.1 직접계산하기 1

weighting schema|weight
--|--
tf (term frequency)|<img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/91699003abf4fe8bdf861bbce08e73e71acf5fd4" />
idf(inverse document frequency) |<img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/864fcfdc0c16344c11509f724f1aa7081cf9f657" />

In [2]:
docs = ['오늘 동물원에서 원숭이를 봤어',
        '오늘 동물원에서 코끼리를 봤어 봤어',
        '동물원에서 원숭이에게 바나나를 줬어 바나나를']

### 1) 띄어쓰기 단위로 토큰화

In [103]:
doc_ls=[doc.split() for doc in docs]
doc_ls

[['오늘', '동물원에서', '원숭이를', '봤어'],
 ['오늘', '동물원에서', '코끼리를', '봤어', '봤어'],
 ['동물원에서', '원숭이에게', '바나나를', '줬어', '바나나를']]

### 2) 각 고유 토큰에 인덱스(Index)를 지정

In [104]:
from collections import defaultdict

word2id = defaultdict(lambda : len(word2id))
[word2id[token] for doc in doc_ls for token in doc]
word2id        

defaultdict(<function __main__.<lambda>>,
            {'동물원에서': 1,
             '바나나를': 6,
             '봤어': 3,
             '오늘': 0,
             '원숭이를': 2,
             '원숭이에게': 5,
             '줬어': 7,
             '코끼리를': 4})

### 3) DTM 생성

In [105]:
import numpy as np 


TDM = np.zeros((len(word2id), len(doc_ls)), dtype=int)
for i, doc in enumerate(doc_ls):
  for token in doc:
      TDM[word2id[token], i] += 1 # 해당 토큰의 위치(column)
DTM=TDM.transpose()
DTM


array([[1, 1, 1, 1, 0, 0, 0, 0],
       [1, 1, 0, 2, 1, 0, 0, 0],
       [0, 1, 0, 0, 0, 1, 2, 1]])

In [106]:
doc_ls

[['오늘', '동물원에서', '원숭이를', '봤어'],
 ['오늘', '동물원에서', '코끼리를', '봤어', '봤어'],
 ['동물원에서', '원숭이에게', '바나나를', '줬어', '바나나를']]

In [107]:
word2id

defaultdict(<function __main__.<lambda>>,
            {'동물원에서': 1,
             '바나나를': 6,
             '봤어': 3,
             '오늘': 0,
             '원숭이를': 2,
             '원숭이에게': 5,
             '줬어': 7,
             '코끼리를': 4})

In [108]:
TDM=np.zeros((len(word2id), len(doc_ls)), dtype=int)
for i, doc in enumerate(doc_ls):
  for token in doc:
    print(token)
    TDM[word2id[token],i]+=1
    print(TDM)
DTM=TDM.transpose()
DTM

오늘
[[1 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]]
동물원에서
[[1 0 0]
 [1 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]]
원숭이를
[[1 0 0]
 [1 0 0]
 [1 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]]
봤어
[[1 0 0]
 [1 0 0]
 [1 0 0]
 [1 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]]
오늘
[[1 1 0]
 [1 0 0]
 [1 0 0]
 [1 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]]
동물원에서
[[1 1 0]
 [1 1 0]
 [1 0 0]
 [1 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]]
코끼리를
[[1 1 0]
 [1 1 0]
 [1 0 0]
 [1 0 0]
 [0 1 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]]
봤어
[[1 1 0]
 [1 1 0]
 [1 0 0]
 [1 1 0]
 [0 1 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]]
봤어
[[1 1 0]
 [1 1 0]
 [1 0 0]
 [1 2 0]
 [0 1 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]]
동물원에서
[[1 1 0]
 [1 1 1]
 [1 0 0]
 [1 2 0]
 [0 1 0]
 [0 0 0]
 [0 0 0]
 [0 0 0]]
원숭이에게
[[1 1 0]
 [1 1 1]
 [1 0 0]
 [1 2 0]
 [0 1 0]
 [0 0 1]
 [0 0 0]
 [0 0 0]]
바나나를
[[1 1 0]
 [1 1 1]
 [1 0 0]
 [1 2 0]
 [0 1 0]
 [0 0 1]
 [0 0 1]
 [0 0 0]]
줬어
[[1 1 0]
 [1 1 1]
 [1 0 0]
 [1 2 0]
 [0 1 0]
 [0 0 1]
 [0 0 1]
 [0 

array([[1, 1, 1, 1, 0, 0, 0, 0],
       [1, 1, 0, 2, 1, 0, 0, 0],
       [0, 1, 0, 0, 0, 1, 2, 1]])

In [109]:
TDM

array([[1, 1, 0],
       [1, 1, 1],
       [1, 0, 0],
       [1, 2, 0],
       [0, 1, 0],
       [0, 0, 1],
       [0, 0, 2],
       [0, 0, 1]])

### 4) TF 계산

<img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/91699003abf4fe8bdf861bbce08e73e71acf5fd4" />

TF = 문서내 토크빈도/ 문서내 전체토큰갯수

In [110]:
TDM

array([[1, 1, 0],
       [1, 1, 1],
       [1, 0, 0],
       [1, 2, 0],
       [0, 1, 0],
       [0, 0, 1],
       [0, 0, 2],
       [0, 0, 1]])

In [111]:
DTM

array([[1, 1, 1, 1, 0, 0, 0, 0],
       [1, 1, 0, 2, 1, 0, 0, 0],
       [0, 1, 0, 0, 0, 1, 2, 1]])

In [112]:
len(DTM)

3

In [113]:
len(TDM)

8

In [114]:
DTM

array([[1, 1, 1, 1, 0, 0, 0, 0],
       [1, 1, 0, 2, 1, 0, 0, 0],
       [0, 1, 0, 0, 0, 1, 2, 1]])

In [115]:
DTM.shape

(3, 8)

In [116]:
def computeTF(DTM):
  DTMc=DTM.copy()
  doc_len=np.zeros(shape=(len(DTM),1))
  for idx,dtm in enumerate(DTM):
    leng=0
    for jdx,dt in enumerate(dtm):
      if dt!=0:
        if dt>1:
          DTMc[idx,jdx]=1
        leng+=1
    doc_len[idx]=leng
  return DTMc/doc_len


In [117]:
computeTF(DTM)

array([[0.25, 0.25, 0.25, 0.25, 0.  , 0.  , 0.  , 0.  ],
       [0.25, 0.25, 0.  , 0.25, 0.25, 0.  , 0.  , 0.  ],
       [0.  , 0.25, 0.  , 0.  , 0.  , 0.25, 0.25, 0.25]])

In [118]:
DTM

array([[1, 1, 1, 1, 0, 0, 0, 0],
       [1, 1, 0, 2, 1, 0, 0, 0],
       [0, 1, 0, 0, 0, 1, 2, 1]])

### 5) IDF  계산

<img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/864fcfdc0c16344c11509f724f1aa7081cf9f657" />

IDF = log(총문서수/토큰이등장한문서수)

In [119]:
doc_ls

[['오늘', '동물원에서', '원숭이를', '봤어'],
 ['오늘', '동물원에서', '코끼리를', '봤어', '봤어'],
 ['동물원에서', '원숭이에게', '바나나를', '줬어', '바나나를']]

In [120]:
len(DTM)

3

In [121]:
DTM/len(DTM)

array([[0.33333333, 0.33333333, 0.33333333, 0.33333333, 0.        ,
        0.        , 0.        , 0.        ],
       [0.33333333, 0.33333333, 0.        , 0.66666667, 0.33333333,
        0.        , 0.        , 0.        ],
       [0.        , 0.33333333, 0.        , 0.        , 0.        ,
        0.33333333, 0.66666667, 0.33333333]])

In [122]:
computeTF(DTM)

array([[0.25, 0.25, 0.25, 0.25, 0.  , 0.  , 0.  , 0.  ],
       [0.25, 0.25, 0.  , 0.25, 0.25, 0.  , 0.  , 0.  ],
       [0.  , 0.25, 0.  , 0.  , 0.  , 0.25, 0.25, 0.25]])

In [123]:
DTM.shape

(3, 8)

In [124]:
len(DTM.transpose())

8

In [125]:
DTM/len(DTM)

array([[0.33333333, 0.33333333, 0.33333333, 0.33333333, 0.        ,
        0.        , 0.        , 0.        ],
       [0.33333333, 0.33333333, 0.        , 0.66666667, 0.33333333,
        0.        , 0.        , 0.        ],
       [0.        , 0.33333333, 0.        , 0.        , 0.        ,
        0.33333333, 0.66666667, 0.33333333]])

In [126]:
import math

def computeIDF(DTM):
  arr_sum=np.zeros(shape=(1,len(DTM.transpose())))
  for idx, dtm in enumerate(DTM):
    for jdx, dt in enumerate(dtm):
      if dt!=0:
        arr_sum[0,jdx]+=1

  return np.log10(len(DTM)/arr_sum)
    
computeIDF(DTM)

array([[0.17609126, 0.        , 0.47712125, 0.17609126, 0.47712125,
        0.47712125, 0.47712125, 0.47712125]])

### 6) TF-IDF 계산

In [127]:
def computeTFIDF(DTM):
  return computeTF(DTM)*computeIDF(DTM)
  
computeTFIDF(DTM)

array([[0.04402281, 0.        , 0.11928031, 0.04402281, 0.        ,
        0.        , 0.        , 0.        ],
       [0.04402281, 0.        , 0.        , 0.04402281, 0.11928031,
        0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.11928031, 0.11928031, 0.11928031]])

In [128]:
computeTF(DTM)

array([[0.25, 0.25, 0.25, 0.25, 0.  , 0.  , 0.  , 0.  ],
       [0.25, 0.25, 0.  , 0.25, 0.25, 0.  , 0.  , 0.  ],
       [0.  , 0.25, 0.  , 0.  , 0.  , 0.25, 0.25, 0.25]])

In [129]:
computeIDF(DTM)

array([[0.17609126, 0.        , 0.47712125, 0.17609126, 0.47712125,
        0.47712125, 0.47712125, 0.47712125]])

In [185]:
import pandas as pd

sorted_vocab = sorted((value, key) for key, value in word2id.items())
vocab = [v[1] for v in sorted_vocab]
tfidf= computeTFIDF(DTM)
pd.DataFrame(tfidf, columns=vocab)

Unnamed: 0,오늘,동물원에서,원숭이를,봤어,코끼리를,원숭이에게,바나나를,줬어
0,0.044023,0.0,0.11928,0.044023,0.0,0.0,0.0,0.0
1,0.044023,0.0,0.0,0.044023,0.11928,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.11928,0.11928,0.11928


In [214]:
word2id.keys()

dict_keys(['오늘', '동물원에서', '원숭이를', '봤어', '코끼리를', '원숭이에게', '바나나를', '줬어'])

## 3.2 직접계산하기2

weighting schema|weight|설명
--|--|--
tf(double normalization 0.5)|<img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/45badc1c70ec2caa00ed8c21ed75bd9f8d3e650c" />|=0.5 + 0.5(토큰빈도/문서내최빈토큰)
idf(inverse document frequency smooth)|<img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/25f4d6690acaaef1f15f308d24f6f8a439de971d" />|=log(문서갯수/(1+토큰빈도)) + 1

In [90]:
len(docs)

3

In [225]:
docs = ['오늘 동물원에서 원숭이를 봤어',
        '오늘 동물원에서 코끼리를 봤어 봤어',
        '동물원에서 원숭이에게 바나나를 줬어 바나나를']

In [226]:
docs[0]

'오늘 동물원에서 원숭이를 봤어'

In [227]:
docs[1].count('봤어')

2

In [238]:
from math import log10
import numpy as np
from collections import defaultdict

# document 내 토큰이 등장한 빈도수 계산
def f(t, d):
  return d.count(t)

# tf 계산
def tf(t, d):
  ta=0
  for tok in tokenizer(d):
    tata=tokenizer(d).count(tok)
    if tata>ta:
      ta=tata
  return 0.5+ 0.5*(tokenizer(d).count(t)/ta)

# idf 계산
def idf(t, D):
  df=0
  for d in D:
    if f(t,d) !=0:
      df+=1
  return np.log(len(D)/(df+1))+1

# tf-idf 계산
def tfidf_score(t, d, D):
  return tf(t,d)*idf(t,D)

# 공백을 기준으로 토큰과
def tokenizer(d):
  return d.split()
  


# tfidf 계산  
def tfidfScorer(D):
  token=[]
  for d in D:
    token.append(tokenizer(d))
    word2id = defaultdict(lambda : len(word2id))
    [word2id[token] for doc in token for token in doc]
  arr=np.zeros(shape=(len(D),len(word2id)))
  for idx,d in enumerate(D):
    for jdx,t in enumerate(word2id.keys()):
      arr[idx,jdx]=tfidf_score(t,d,D)
  sorted_vocab = sorted((value, key) for key, value in word2id.items())
  vocab = [v[1] for v in sorted_vocab]
  return arr, vocab

tfidfScorer(docs)

(array([[1.        , 0.71231793, 1.40546511, 1.        , 0.70273255,
         0.70273255, 0.70273255, 0.70273255],
        [0.75      , 0.53423845, 0.70273255, 1.        , 1.05409883,
         0.70273255, 0.70273255, 0.70273255],
        [0.5       , 0.53423845, 0.70273255, 0.5       , 0.70273255,
         1.05409883, 1.40546511, 1.05409883]]),
 ['오늘', '동물원에서', '원숭이를', '봤어', '코끼리를', '원숭이에게', '바나나를', '줬어'])

In [239]:
import pandas as pd
tfidf, vocab = tfidfScorer(docs)
pd.DataFrame(tfidf, columns=vocab)

Unnamed: 0,오늘,동물원에서,원숭이를,봤어,코끼리를,원숭이에게,바나나를,줬어
0,1.0,0.712318,1.405465,1.0,0.702733,0.702733,0.702733,0.702733
1,0.75,0.534238,0.702733,1.0,1.054099,0.702733,0.702733,0.702733
2,0.5,0.534238,0.702733,0.5,0.702733,1.054099,1.405465,1.054099


In [240]:
pd.DataFrame(docs).T

Unnamed: 0,0,1,2
0,오늘 동물원에서 원숭이를 봤어,오늘 동물원에서 코끼리를 봤어 봤어,동물원에서 원숭이에게 바나나를 줬어 바나나를


weighting schema|weight|설명
--|--|--
tf(double normalization 0.5)|<img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/45badc1c70ec2caa00ed8c21ed75bd9f8d3e650c" />|=0.5 + 0.5(토큰빈도/문서내최빈토큰)
idf(inverse document frequency smooth)|<img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/25f4d6690acaaef1f15f308d24f6f8a439de971d" />|=log(문서갯수/(1+토큰빈도)) + 1