# TF-IDF Matrix

- DTM = TF, 즉 term frequency를 해당 document의 value로 하여 벡터화하는 거라고 생각하면 될 듯

In [64]:
import pandas as pd 
from math import log

In [None]:
# 4 docs
docs = [
  'I love deep learning',
  'I love machine learning',
  'deep learning is amazing',
  'learning deep learning is love'
]

# Build vocab (tokenization based on spaces)
vocab = sorted(set(w.lower() for doc in docs for w in doc.split()))
vocab.sort()

- 간단한 문장을 space 단위로 tokenization하여 vocab 생성

In [58]:
print(vocab)

['amazing', 'deep', 'i', 'is', 'learning', 'love', 'machine']


In [67]:
N = len(docs)

print('count of total docs : ', N)

count of total docs :  4


## Functions to Calculate TF, IDF, and TF-IDF

- $tf(t,d)$ 는 document에서 term frequency를 나타냄

In [46]:
def tf(t, d):
  return d.count(t)

- $df(t)$ 는 특정 term이 등장한 document의 수
- $idf(t)$는 df(t)의 inverse로 단순 역수가 아니라 $idf(d,t) = \log(n / (1+df(t)))$ 로 나타남 <br><br>
- log를 사용하지 않으면 rare term에 매우 높은 가중치가 부여될 수 있으므로 scaling 개념으로 log를 씌워줌

In [47]:
def idf(t):
  df = 0
  for doc in docs:
    df += t in doc
  return log(N/(df+1))

- TF-IDF 는 tf(t,d)와 idf(t)의 곱으로 나타나며, 다수의 문서에서 자주 등장하는 단어는 중요도가 낮고 
- 특정 문서에서만 자주 등장하는 단어는 중요도가 높다고 판단함

In [None]:
def tfidf(t, d):
  return tf(t,d) * idf(t)

### Calculate TF, i.e. DTM / Print DTM in Dataframe

In [68]:
result = []

# each doc ~ 
for i in range(N):
  result.append([])
  d = docs[i]
  for j in range(len(vocab)):
    t = vocab[j]
    # Calculate TF
    result[-1].append(tf(t, d))

tf_ = pd.DataFrame(result, columns = vocab)

In [69]:
tf_

Unnamed: 0,amazing,deep,i,is,learning,love,machine
0,0,1,1,0,1,1,0
1,0,0,2,0,1,1,1
2,1,1,3,1,1,0,0
3,0,1,3,1,2,1,0


### Calculate IDF in each word

In [None]:
result = []

# each word ~
for j in range(len(vocab)):
    t = vocab[j]
    # Calculate IDF
    result.append(idf(t))

idf_ = pd.DataFrame(result, index=vocab, columns=["IDF"])
idf_

Unnamed: 0,IDF
amazing,0.693147
deep,0.0
i,-0.223144
is,0.287682
learning,-0.223144
love,0.0
machine,0.693147


### Print TF-IDF Matrix

In [None]:
result = []
for i in range(N):
  result.append([])
  d = docs[i]
  for j in range(len(vocab)):
    t = vocab[j]
    # Calculate TF-IDF
    result[-1].append(tfidf(t,d))

# TF-IDF Matrix
tfidf_ = pd.DataFrame(result, columns = vocab)
tfidf_

Unnamed: 0,amazing,deep,i,is,learning,love,machine
0,0.0,0.0,-0.223144,0.0,-0.223144,0.0,0.0
1,0.0,0.0,-0.446287,0.0,-0.223144,0.0,0.693147
2,0.693147,0.0,-0.669431,0.287682,-0.223144,0.0,0.0
3,0.0,0.0,-0.669431,0.287682,-0.446287,0.0,0.0


# DTM and TF-IDF Using sklearn

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

corpus = [
    'you know I want your love',
    'I like you',
    'what should I do ',
]

vector = CountVectorizer()

# print freq in each word from corpus
print(vector.fit_transform(corpus).toarray())

# print index mapping each word
print(vector.vocabulary_)

[[0 1 0 1 0 1 0 1 1]
 [0 0 1 0 0 0 0 1 0]
 [1 0 0 0 1 0 1 0 0]]
{'you': 7, 'know': 1, 'want': 5, 'your': 8, 'love': 3, 'like': 2, 'what': 6, 'should': 4, 'do': 0}


In [54]:
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = [
    'you know I want your love',
    'I like you',
    'what should I do ',
]

tfidfv = TfidfVectorizer().fit(corpus)
print(tfidfv.transform(corpus).toarray())
print(tfidfv.vocabulary_)

[[0.         0.46735098 0.         0.46735098 0.         0.46735098
  0.         0.35543247 0.46735098]
 [0.         0.         0.79596054 0.         0.         0.
  0.         0.60534851 0.        ]
 [0.57735027 0.         0.         0.         0.57735027 0.
  0.57735027 0.         0.        ]]
{'you': 7, 'know': 1, 'want': 5, 'your': 8, 'love': 3, 'like': 2, 'what': 6, 'should': 4, 'do': 0}
