# DTM과 TDM

- BoW의 방법 중 하나
- 문서에 등장하는 각 단어의 등장빈도를 행렬로 표현

In [98]:
import numpy as np
import pandas as pd

In [99]:
docs=["동물원 코끼리",
     '동물원 원숭이 바나나',
     '엄마 코끼리 아기 코끼리',
     '원숭이 바나나 코끼리 바나나']

In [100]:
doc_ls=[]
for doc in docs:
    doc_ls.append(doc.split(" "))
doc_ls

[['동물원', '코끼리'],
 ['동물원', '원숭이', '바나나'],
 ['엄마', '코끼리', '아기', '코끼리'],
 ['원숭이', '바나나', '코끼리', '바나나']]

In [101]:
from collections import defaultdict

word2id=defaultdict(lambda : len(word2id))
for doc in doc_ls:
    for token in doc:
        word2id[token]
    
word2id

defaultdict(<function __main__.<lambda>()>,
            {'동물원': 0, '코끼리': 1, '원숭이': 2, '바나나': 3, '엄마': 4, '아기': 5})

In [102]:
DTM=[]
for i,doc in enumerate(doc_ls):
    bow=np.zeros(len(word2id),dtype=int)
    for token in doc:
        bow[word2id[token]]+=1
    DTM.append(bow.tolist())
DTM

[[1, 1, 0, 0, 0, 0],
 [1, 0, 1, 1, 0, 0],
 [0, 2, 0, 0, 1, 1],
 [0, 1, 1, 2, 0, 0]]

In [103]:
DTM=pd.DataFrame(BoW_ls,index=["문서1","문서2","문서3","문서4"],columns=word2id.keys())

In [104]:
DTM

Unnamed: 0,동물원,코끼리,원숭이,바나나,엄마,아기
문서1,1,1,0,0,0,0
문서2,1,0,1,1,0,0
문서3,0,2,0,0,1,1
문서4,0,1,1,2,0,0


In [105]:
TDM=np.zeros((len(word2id),len(doc_ls)),dtype=int)
for i,doc in enumerate(doc_ls):
    for token in doc:
        TDM[word2id[token],i] +=1
TDM

array([[1, 1, 0, 0],
       [1, 0, 2, 1],
       [0, 1, 0, 1],
       [0, 1, 0, 2],
       [0, 0, 1, 0],
       [0, 0, 1, 0]])

In [106]:
TDM=pd.DataFrame(TDM,columns=["문서1","문서2","문서3","문서4"],index=word2id.keys())

In [107]:
TDM

Unnamed: 0,문서1,문서2,문서3,문서4
동물원,1,1,0,0
코끼리,1,0,2,1
원숭이,0,1,0,1
바나나,0,1,0,2
엄마,0,0,1,0
아기,0,0,1,0


## 사이킷런을 활용한 DTM , TDM

In [108]:
from sklearn.feature_extraction.text import CountVectorizer
# sklearn은 DTM으로 만들어지게 설정되어있음

count_vect=CountVectorizer()
DTM=count_vect.fit_transform(docs)
DTM.toarray()

array([[1, 0, 0, 0, 0, 1],
       [1, 1, 0, 0, 1, 0],
       [0, 0, 1, 1, 0, 2],
       [0, 2, 0, 0, 1, 1]], dtype=int64)

In [109]:
TDM=DTM.T.toarray()
TDM


array([[1, 1, 0, 0],
       [0, 1, 0, 2],
       [0, 0, 1, 0],
       [0, 0, 1, 0],
       [0, 1, 0, 1],
       [1, 0, 2, 1]], dtype=int64)

## gensim을 활용한 DTM , TDM

In [110]:
import gensim
from gensim import corpora

doc_ls=[doc.split() for doc in docs]
id2word=corpora.Dictionary(doc_ls)
TDM=[id2word.doc2bow(doc) for doc in doc_ls]
TDM


[[(0, 1), (1, 1)],
 [(0, 1), (2, 1), (3, 1)],
 [(1, 2), (4, 1), (5, 1)],
 [(1, 1), (2, 2), (3, 1)]]

In [111]:
from gensim.matutils import sparse2full
doc_names=["문서" + str(i) for i in range(len(doc_ls))]
vocab=[id2word[i] for i in id2word.keys()]
DTM_matrix=[sparse2full(doc,len(vocab)).tolist() for doc in TDM]

df_TDM=pd.DataFrame(np.array(DTM_matrix,dtype=int).T)
df_TDM["단어"]=vocab
df_TDM.set_index("단어")

Unnamed: 0_level_0,0,1,2,3
단어,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
동물원,1,1,0,0
코끼리,1,0,2,1
바나나,0,1,0,2
원숭이,0,1,0,1
아기,0,0,1,0
엄마,0,0,1,0


# TF-IDF

 - 단어의 상대적 중요도를 계산가능하다.
 - TF(t,d) = 특정단어등장빈도/문서내전체등장단어빈도 : 특정 단어가 많이 등장할 수록 TF가 커진다. (상대적 중요도가 커진다.)
 - IDF(t,d) = log (총 문서수 / (1+단어가등장한문서수) : 단어가 여러 문서에 등장할 수록 IDF는 작아진다.(상대적 중요도가 떨어진다.)

In [150]:
docs=["오늘 동물원에서 원숭이와 코끼리를 봤어",
     "동물원에서 원숭이에게 바나나를 줬어 바나나를"]

In [151]:
doc_ls=[]
for doc in docs:
    doc_ls.append(doc.split())
doc_ls

[['오늘', '동물원에서', '원숭이와', '코끼리를', '봤어'],
 ['동물원에서', '원숭이에게', '바나나를', '줬어', '바나나를']]

In [152]:
from collections import defaultdict

word2id=defaultdict(lambda : len(word2id))
for doc in doc_ls:
    for token in doc:
        word2id[token]
word2id

defaultdict(<function __main__.<lambda>()>,
            {'오늘': 0,
             '동물원에서': 1,
             '원숭이와': 2,
             '코끼리를': 3,
             '봤어': 4,
             '원숭이에게': 5,
             '바나나를': 6,
             '줬어': 7})

In [153]:
DTM=[]

for i,doc in enumerate(doc_ls):
    bow=np.zeros(len(word2id),dtype=int)
    for token in doc:
        bow[word2id[token]]+=1
    DTM.append(bow)
        
DTM

[array([1, 1, 1, 1, 1, 0, 0, 0]), array([0, 1, 0, 0, 0, 1, 2, 1])]

In [160]:
# 문서 내 특정 토큰의 등장 횟수 / 문서 내 전체 토큰의 수 

def tf(t,d,n):
    return d[n][word2id[t]]/len(d[n])

In [161]:
tf("오늘",DTM,0)

0.125

In [163]:
from math import log

In [164]:
def idf(t,d,n):
    df=0 #특정 단어 t가 등장한 문서의 수 
    
    for i in d:
        if t in i:
            df+=1
            
            
    return log(len(d)/(1+df))

In [165]:
idf("오늘",DTM,0)

  """


0.6931471805599453

In [166]:
def tf_idf(t,d,n):
    return tf(t,d,n)*idf(t,d,n)

In [169]:
tf_idf("바나나를",DTM,1)

  """


0.17328679513998632

In [223]:
tf_idf_list=[]


for n in range(len(DTM)):
    temp=[]
    for token in word2id:     
        try:
            temp.append(tf_idf(token,DTM,n))
        except:
            continue
    tf_idf_list.append(temp)

  """


In [224]:
tf_idf_list

[[0.08664339756999316,
  0.08664339756999316,
  0.08664339756999316,
  0.08664339756999316,
  0.08664339756999316,
  0.0,
  0.0,
  0.0],
 [0.0,
  0.08664339756999316,
  0.0,
  0.0,
  0.0,
  0.08664339756999316,
  0.17328679513998632,
  0.08664339756999316]]

In [203]:
docs

['오늘 동물원에서 원숭이와 코끼리를 봤어', '동물원에서 원숭이에게 바나나를 줬어 바나나를']

In [210]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidv=TfidfVectorizer()
tfidv=tfidv.fit(docs)
tfidv.transform(docs).toarray()
vocab=tfidv.get_feature_names()

In [212]:
pd.DataFrame(tfidv.transform(docs).toarray(),columns=vocab)

Unnamed: 0,동물원에서,바나나를,봤어,오늘,원숭이에게,원숭이와,줬어,코끼리를
0,0.335176,0.0,0.471078,0.471078,0.0,0.471078,0.0,0.471078
1,0.278943,0.784088,0.0,0.0,0.392044,0.0,0.392044,0.0


In [216]:
import gensim
from gensim import corpora
from gensim.models import TfidfModel

doc_ls=[doc.split() for doc in docs]
id2word=corpora.Dictionary(doc_ls)
TDM=[id2word.doc2bow(doc) for doc in doc_ls]
model=TfidfModel(TDM)
tfidf=model[TDM]
tfidf[0]

[(1, 0.5), (2, 0.5), (3, 0.5), (4, 0.5)]

In [None]:
from gensim.matutils import sparse2full

vocab=[id2word[i]]