# TF-IDF 
## Term Frequency - Inverse Document Frequency

## 1. TF-IDF 개념

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [17]:
# Toy corpus: three short English sentences, one document per list entry.
text = [
    "I go to my home my home is very large",
    "I went out my home I go to the market",
    "I bought a yellow lemon I go back to home",
]

In [18]:
# Fit a default TfidfVectorizer on the corpus; the fitted object exposes the
# vocabulary (`vocabulary_`) and per-term IDF weights (`idf_`) used below.
tfidf_vectorizer = TfidfVectorizer().fit(text)

In [19]:
# Show which column index was assigned to each term.
print(tfidf_vectorizer.vocabulary_)

# Same mapping as (term, index) pairs, ordered alphabetically by term
# so it is easier to read.
print(sorted(tfidf_vectorizer.vocabulary_.items(), key=lambda pair: pair[0]))

{'go': 2, 'to': 11, 'my': 8, 'home': 3, 'is': 4, 'very': 12, 'large': 5, 'went': 13, 'out': 9, 'the': 10, 'market': 7, 'bought': 1, 'yellow': 14, 'lemon': 6, 'back': 0}
[('back', 0), ('bought', 1), ('go', 2), ('home', 3), ('is', 4), ('large', 5), ('lemon', 6), ('market', 7), ('my', 8), ('out', 9), ('the', 10), ('to', 11), ('very', 12), ('went', 13), ('yellow', 14)]


## 2. TF, DF, IDF 벡터화 과정의 이해

In [20]:
# Convert each document in `text` into its TF-IDF vector: one row per
# document, one column per vocabulary term. These are normalized TF-IDF
# weights, not raw term counts.
tfidf_vectorizer.transform(text).toarray()

array([[0.        , 0.        , 0.2170186 , 0.4340372 , 0.36744443,
        0.36744443, 0.        , 0.        , 0.55890191, 0.        ,
        0.        , 0.2170186 , 0.36744443, 0.        , 0.        ],
       [0.        , 0.        , 0.24902824, 0.24902824, 0.        ,
        0.        , 0.        , 0.42164146, 0.3206692 , 0.42164146,
        0.42164146, 0.24902824, 0.        , 0.42164146, 0.        ],
       [0.44514923, 0.44514923, 0.26291231, 0.26291231, 0.        ,
        0.        , 0.44514923, 0.        , 0.        , 0.        ,
        0.        , 0.26291231, 0.        , 0.        , 0.44514923]])

## 3. IDF 벡터화 내부

In [21]:
# IDF weight learned for each vocabulary term, in vocabulary-index order.
tfidf_vectorizer.idf_

array([1.69314718, 1.69314718, 1.        , 1.        , 1.69314718,
       1.69314718, 1.69314718, 1.69314718, 1.28768207, 1.69314718,
       1.69314718, 1.        , 1.69314718, 1.69314718, 1.69314718])

In [22]:
# One IDF weight per vocabulary term (the fitted vocabulary has 15 terms).
tfidf_vectorizer.idf_.shape

(15,)

In [26]:
import numpy as np 
# Document frequency of every vocabulary term (how many of the three
# documents contain it), listed in vocabulary-index order.
DF_vec = np.array([
    1, 1, 3,  # back, bought, go
    3, 1, 1,  # home, is, large
    1, 1, 2,  # lemon, market, my
    1, 1, 3,  # out, the, to
    1, 1, 1,  # very, went, yellow
])

### 수식

$$
\mathrm{idf}(t) = \ln\left(\frac{1 + n}{1 + \mathrm{df}(t)}\right) + 1
$$

- $n$: 전체 문서의 수, $\mathrm{df}(t)$: 단어 $t$가 등장한 문서의 수


In [27]:
def idf_func(n, df):
    """Return the smoothed inverse document frequency ln((1 + n) / (1 + df)) + 1.

    Args:
        n: Total number of documents in the corpus.
        df: Document frequency of the term(s). May be a scalar or any
            array-like (list, tuple, NumPy array); the formula is applied
            element-wise.

    Returns:
        The IDF value(s): a NumPy scalar for scalar ``df``, otherwise a
        NumPy array with the same shape as ``df``.
    """
    # Convert up front so plain lists/tuples work too; `1 + [1, 2]` would
    # otherwise raise TypeError instead of broadcasting.
    df = np.asarray(df)

    return np.log((1 + n) / (1 + df)) + 1

In [29]:
# Number of documents in the corpus (`text` holds three sentences).
n = 3
# Recompute the IDF weights by hand from the document frequencies so they
# can be compared against `tfidf_vectorizer.idf_`.
idf_func(n, DF_vec)

array([1.69314718, 1.69314718, 1.        , 1.        , 1.69314718,
       1.69314718, 1.69314718, 1.69314718, 1.28768207, 1.69314718,
       1.69314718, 1.        , 1.69314718, 1.69314718, 1.69314718])

- 직접 구현한 `idf_func`의 결과가 위의 `tfidf_vectorizer.idf_` 값과 같음을 알 수 있다.

## 4. TF-IDF 벡터화 내부

In [30]:
# Reference TF-IDF matrix from scikit-learn; the manual computation below
# reproduces its first row.
tfidf_vectorizer.transform(text).toarray()

array([[0.        , 0.        , 0.2170186 , 0.4340372 , 0.36744443,
        0.36744443, 0.        , 0.        , 0.55890191, 0.        ,
        0.        , 0.2170186 , 0.36744443, 0.        , 0.        ],
       [0.        , 0.        , 0.24902824, 0.24902824, 0.        ,
        0.        , 0.        , 0.42164146, 0.3206692 , 0.42164146,
        0.42164146, 0.24902824, 0.        , 0.42164146, 0.        ],
       [0.44514923, 0.44514923, 0.26291231, 0.26291231, 0.        ,
        0.        , 0.44514923, 0.        , 0.        , 0.        ,
        0.        , 0.26291231, 0.        , 0.        , 0.44514923]])

In [31]:
# Raw term counts (TF) of the first document,
# 'I go to my home my home is very large', in vocabulary-index order.
count_vec = np.array([
    0, 0, 1,  # back, bought, go
    2, 1, 1,  # home, is, large
    0, 0, 2,  # lemon, market, my
    0, 0, 1,  # out, the, to
    1, 0, 0,  # very, went, yellow
])

In [32]:
# IDF weights again; they are multiplied element-wise with the term counts
# of the first document in the next step.
tfidf_vectorizer.idf_

array([1.69314718, 1.69314718, 1.        , 1.        , 1.69314718,
       1.69314718, 1.69314718, 1.69314718, 1.28768207, 1.69314718,
       1.69314718, 1.        , 1.69314718, 1.69314718, 1.69314718])

- 행렬 곱(matrix multiplication)이 아니라
- 같은 위치의 원소끼리 곱하는 원소별(element-wise) 곱셈을 해야 함


In [33]:
# Element-wise TF * IDF for the first document (not yet normalized).
np.multiply(count_vec, tfidf_vectorizer.idf_)

array([0.        , 0.        , 1.        , 2.        , 1.69314718,
       1.69314718, 0.        , 0.        , 2.57536414, 0.        ,
       0.        , 1.        , 1.69314718, 0.        , 0.        ])

- `tfidf_vectorizer.transform`의 결과와 같아지려면 L2 정규화가 필요

In [35]:
from sklearn import preprocessing

# Element-wise TF * IDF for the first document, shaped as a single-row 2-D
# array because `preprocessing.normalize` works on (n_samples, n_features).
tf_idf_before_L2 = (count_vec * tfidf_vectorizer.idf_)[np.newaxis, :]

# Scale the row to unit Euclidean (L2) length.
tf_idf_after_L2 = preprocessing.normalize(tf_idf_before_L2, norm='l2')

In [36]:
# Display the L2-normalized TF-IDF vector computed by hand for the first document.
tf_idf_after_L2

array([[0.        , 0.        , 0.2170186 , 0.4340372 , 0.36744443,
        0.36744443, 0.        , 0.        , 0.55890191, 0.        ,
        0.        , 0.2170186 , 0.36744443, 0.        , 0.        ]])