In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer # frequency based DTM
from sklearn.feature_extraction.text import TfidfVectorizer # tf-idf based DTM

In [None]:
# Toy corpus: each string is one whitespace-separated document.
TEXT = [
    'banana apple apple eggplant',
    'orange carrot banana eggplant',
    'apple carrot banana banana',
    'orange banana grape',
]

# The same corpus flattened into one space-separated string (used by the
# n-gram demo). Derived from TEXT so the two can never drift apart.
TXT = " ".join(TEXT)


# CountVectorizer 이용하기

In [None]:
# Count-based document-term matrix.
#   min_df=1           -> keep every term that occurs in at least one document
#   ngram_range=(1, 2) -> extract unigrams AND bigrams
#                         (use (1, 1) to restrict the vocabulary to single words)
tf_vectorizer = CountVectorizer(
    min_df=1,
    ngram_range=(1, 2),
)
tf_features = tf_vectorizer.fit_transform(TEXT)

In [None]:
# n 그램 이해
import nltk
from nltk import ngrams

def getNgramWord(N, txt):
    """Return every contiguous N-word window of a whitespace-separated string.

    Args:
        N: Window size in words; must be >= 1.
        txt: Text to tokenise with ``str.split()`` (any run of whitespace
            separates tokens).

    Returns:
        A list of N-item token lists, in order of appearance. The list is
        empty when ``txt`` holds fewer than N tokens.

    Raises:
        ValueError: If N is smaller than 1. Without this check N == 0 would
            yield a list of empty lists and a negative N arbitrary slices.
    """
    if N < 1:
        raise ValueError("N must be a positive integer, got %r" % (N,))
    tokens = txt.split()
    # One window per valid start index; range() is empty when N > len(tokens).
    return [tokens[start:start + N] for start in range(len(tokens) - N + 1)]

# Print the unigram (1), bigram (2) and trigram (3) windows of the flattened
# corpus; one loop replaces three copy-pasted print calls with identical output.
for gram_size in (1, 2, 3):
    print(f"{gram_size}-gram : {getNgramWord(gram_size, TXT)}")

1-gram : [['banana'], ['apple'], ['apple'], ['eggplant'], ['orange'], ['carrot'], ['banana'], ['eggplant'], ['apple'], ['carrot'], ['banana'], ['banana'], ['orange'], ['banana'], ['grape']]
2-gram : [['banana', 'apple'], ['apple', 'apple'], ['apple', 'eggplant'], ['eggplant', 'orange'], ['orange', 'carrot'], ['carrot', 'banana'], ['banana', 'eggplant'], ['eggplant', 'apple'], ['apple', 'carrot'], ['carrot', 'banana'], ['banana', 'banana'], ['banana', 'orange'], ['orange', 'banana'], ['banana', 'grape']]
3-gram : [['banana', 'apple', 'apple'], ['apple', 'apple', 'eggplant'], ['apple', 'eggplant', 'orange'], ['eggplant', 'orange', 'carrot'], ['orange', 'carrot', 'banana'], ['carrot', 'banana', 'eggplant'], ['banana', 'eggplant', 'apple'], ['eggplant', 'apple', 'carrot'], ['apple', 'carrot', 'banana'], ['carrot', 'banana', 'banana'], ['banana', 'banana', 'orange'], ['banana', 'orange', 'banana'], ['orange', 'banana', 'grape']]


In [None]:
# Inspect the matrix returned by CountVectorizer.fit_transform (documents x vocabulary terms).
tf_features

<4x16 sparse matrix of type '<class 'numpy.int64'>'
	with 24 stored elements in Compressed Sparse Row format>

In [None]:
# Convert the sparse count matrix to a plain 2-D ndarray. toarray() returns the
# ndarray directly, whereas todense() builds an intermediate np.matrix first.
features = tf_features.toarray()
features

array([[2, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1],
       [1, 0, 1, 0, 2, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0]])

In [None]:
# Display the dense count matrix again (one row per document in TEXT).
features

array([[2, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1],
       [1, 0, 1, 0, 2, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0]])

In [None]:
# Count vector of the first document, TEXT[0].
features[0]

array([2, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0])

In [None]:
# Count vector of the second document, TEXT[1].
features[1]

array([0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1])

In [None]:
# Euclidean (L2) distance between the count vectors of documents 1 and 0.
np.linalg.norm(features[1]-features[0])

3.4641016151377544

In [None]:
# Euclidean (L2) distance between the count vectors of documents 1 and 2.
np.linalg.norm(features[1]-features[2])

2.8284271247461903

In [None]:
# Vocabulary learned by the fitted CountVectorizer; entry i names column i of `features`.
feature_names = tf_vectorizer.get_feature_names_out()
feature_names

array(['apple', 'apple apple', 'apple carrot', 'apple eggplant', 'banana',
       'banana apple', 'banana banana', 'banana eggplant', 'banana grape',
       'carrot', 'carrot banana', 'eggplant', 'grape', 'orange',
       'orange banana', 'orange carrot'], dtype=object)

In [None]:
import pandas as pd  # redundant: pandas is already imported in the first cell
# Label each count column with its vocabulary term so the matrix is readable.
df = pd.DataFrame(data=features, columns=feature_names)
# End on the bare expression so Jupyter renders the frame as a rich table
# instead of the plain text that print() produces.
df

   apple  banana  carrot  eggplant  grape  orange
0      2       1       0         1      0       0
1      0       1       1         1      0       1
2      1       2       1         0      0       0
3      0       1       0         0      1       1


# TfidfVectorizer 이용하기

In [None]:
# TF-IDF weighted document-term matrix over single words.
#   min_df=1           -> keep every term that occurs in at least one document
#   ngram_range=(1, 1) -> unigrams only
tfidf_vectorizer = TfidfVectorizer(
    min_df=1,
    ngram_range=(1, 1),
)
tfidf_features = tfidf_vectorizer.fit_transform(TEXT)

In [None]:
# Densify the TF-IDF matrix in place. The guard makes the cell idempotent:
# once tfidf_features is already an ndarray (which has no toarray/todense),
# re-running the cell is a no-op instead of raising AttributeError.
if hasattr(tfidf_features, "toarray"):
    tfidf_features = tfidf_features.toarray()
tfidf_features

array([[0.85764287, 0.28383251, 0.        , 0.42882143, 0.        ,
        0.        ],
       [0.        , 0.35696573, 0.53931298, 0.53931298, 0.        ,
        0.53931298],
       [0.51623315, 0.68337886, 0.51623315, 0.        , 0.        ,
        0.        ],
       [0.        , 0.37919167, 0.        , 0.        , 0.72664149,
        0.5728925 ]])

In [None]:
# TF-IDF vector of the first document, TEXT[0].
tfidf_features[0]

array([0.85764287, 0.28383251, 0.        , 0.42882143, 0.        ,
       0.        ])