In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer # frequency based DTM
from sklearn.feature_extraction.text import TfidfVectorizer # tf-idf based DTM

In [3]:
# Toy corpus: four short documents, each a space-separated list of produce names.
TEXT = [
    "banana apple apple eggplant",
    "orange carrot banana eggplant",
    "apple carrot banana banana",
    "orange banana grape",
]

# CountVectorizer 이용하기

In [15]:
# Count-based document-term matrix over unigrams and bigrams (ngram_range=(1, 2)).
# NOTE(review): several outputs recorded further down (the 4x6 matrices and the
# distances derived from them) look like they come from an earlier unigram-only
# run -- restart the kernel and run all cells to refresh them.
tf_vectorizer = CountVectorizer(
    min_df=1,
    ngram_range=(1, 2),
)
tf_features = tf_vectorizer.fit_transform(TEXT)

In [5]:
# Show the repr of the sparse matrix returned by fit_transform.
tf_features

<4x6 sparse matrix of type '<class 'numpy.int64'>'
	with 13 stored elements in Compressed Sparse Row format>

In [16]:
# Convert the sparse document-term matrix to a plain NumPy array for inspection.
# `toarray()` returns an ndarray directly, whereas `todense()` builds an
# `np.matrix` that then has to be re-wrapped with `np.array`.
features = tf_features.toarray()
features

array([[2, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1],
       [1, 0, 1, 0, 2, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0]], dtype=int64)

In [17]:
# Dimensions of the dense count matrix (rows: documents, columns: vocabulary terms).
features.shape

(4, 16)

In [8]:
# Display the dense count matrix again.
features

array([[2, 1, 0, 1, 0, 0],
       [0, 1, 1, 1, 0, 1],
       [1, 2, 1, 0, 0, 0],
       [0, 1, 0, 0, 1, 1]], dtype=int64)

In [8]:
# Count vector of the first document in TEXT.
features[0]

array([2, 1, 0, 1, 0, 0], dtype=int64)

In [10]:
# Count vector of the second document in TEXT.
features[1]

array([0, 1, 1, 1, 0, 1], dtype=int64)

In [10]:
# Euclidean (L2) distance between the count vectors of documents 1 and 0.
np.linalg.norm(np.subtract(features[1], features[0]))

2.449489742783178

In [9]:
# Euclidean (L2) distance between the count vectors of documents 1 and 0.
np.linalg.norm(np.subtract(features[1], features[0]))

2.449489742783178

In [10]:
# Euclidean (L2) distance between the count vectors of documents 1 and 2.
np.linalg.norm(np.subtract(features[1], features[2]))

2.0

In [11]:
# Cosine similarity of documents 0 and 1: dot product over the product of L2 norms.
(features[0] @ features[1]) / (
    np.linalg.norm(features[0]) * np.linalg.norm(features[1])
)

0.4082482904638631

In [12]:
# Cosine similarity of documents 0 and 2: dot product over the product of L2 norms.
(features[0] @ features[2]) / (
    np.linalg.norm(features[0]) * np.linalg.norm(features[2])
)

0.6666666666666667

In [18]:
# Vocabulary terms learned by the fitted CountVectorizer, as a plain list of str.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out()` and fall back only on older releases.
if hasattr(tf_vectorizer, "get_feature_names_out"):
    # `.tolist()` turns the returned ndarray into Python strings, matching the
    # list that the legacy method produced.
    feature_names = tf_vectorizer.get_feature_names_out().tolist()
else:
    feature_names = tf_vectorizer.get_feature_names()
feature_names

['apple',
 'apple apple',
 'apple carrot',
 'apple eggplant',
 'banana',
 'banana apple',
 'banana banana',
 'banana eggplant',
 'banana grape',
 'carrot',
 'carrot banana',
 'eggplant',
 'grape',
 'orange',
 'orange banana',
 'orange carrot']

In [19]:
# Label the dense count matrix with its vocabulary terms: one row per document,
# one column per term. pandas is already imported as `pd` in the first cell, so
# the redundant mid-notebook import is dropped.
df = pd.DataFrame(data=features, columns=feature_names)
print(df)

   apple  apple apple  apple carrot  apple eggplant  banana  banana apple  \
0      2            1             0               1       1             1   
1      0            0             0               0       1             0   
2      1            0             1               0       2             0   
3      0            0             0               0       1             0   

   banana banana  banana eggplant  banana grape  carrot  carrot banana  \
0              0                0             0       0              0   
1              0                1             0       1              1   
2              1                0             0       1              1   
3              0                0             1       0              0   

   eggplant  grape  orange  orange banana  orange carrot  
0         1      0       0              0              0  
1         1      0       1              0              1  
2         0      0       0              0              0  
3         0      1       1              1              0  

# TfidfVectorizer 이용하기

In [20]:
# TF-IDF weighted document-term matrix over single words only (ngram_range=(1, 1)).
tfidf_vectorizer = TfidfVectorizer(
    min_df=1,
    ngram_range=(1, 1),
)
tfidf_features = tfidf_vectorizer.fit_transform(TEXT)

In [21]:
# Densify the TF-IDF matrix for inspection. The name is rebound from the sparse
# result to an ndarray, so the conversion is guarded: re-running this cell on
# the already-dense array would otherwise raise AttributeError (an ndarray has
# no `todense`/`toarray`). `toarray()` also avoids the `np.matrix` intermediate
# that `todense()` creates.
if hasattr(tfidf_features, "toarray"):
    tfidf_features = tfidf_features.toarray()
tfidf_features

array([[0.85764287, 0.28383251, 0.        , 0.42882143, 0.        ,
        0.        ],
       [0.        , 0.35696573, 0.53931298, 0.53931298, 0.        ,
        0.53931298],
       [0.51623315, 0.68337886, 0.51623315, 0.        , 0.        ,
        0.        ],
       [0.        , 0.37919167, 0.        , 0.        , 0.72664149,
        0.5728925 ]])

In [22]:
# Dimensions of the dense TF-IDF matrix (rows: documents, columns: vocabulary terms).
tfidf_features.shape

(4, 6)

In [8]:
# TF-IDF weight vector of the first document in TEXT.
tfidf_features[0]

array([0.85764287, 0.28383251, 0.        , 0.42882143, 0.        ,
       0.        ])