# 010. Vectorization of Sentences (문장의 벡터화)

- BOW(Bag of Words)
- TF-IDF (Term Frequency - Inverse Document Frequency)
- Word embedding - Keras word API 사용

In [7]:
import sklearn

In [8]:
import pandas as pd

# Toy corpus used throughout the notebook: five short English sentences.
sentences = [
    'I love my dog.',
    'I love my cat.',
    'I love my dog and love my cat',
    'You love my dog!',
    'Do you think my dog is amazing?',
]

# 1. Bag of Words (BOW)
- CountVectorizer
  - min_df : vocabulary에 포함되기 위한 최소 문서 빈도 (해당 단어가 등장한 문서 수 기준)
  - ngram_range : (1,1) - unigram only, (1,2) - unigram + bigram
  - max_features : 빈도 상위 max_features개의 단어만으로 vocabulary 구성

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from `sentences` and build the document-term count
# matrix in a single step (default CountVectorizer settings).
count_vectorizer = CountVectorizer()
features = count_vectorizer.fit_transform(sentences)
# Bare last expression: show the sparse-matrix summary.
features

<5x10 sparse matrix of type '<class 'numpy.int64'>'
	with 22 stored elements in Compressed Sparse Row format>

In [10]:
# (number of sentences, number of vocabulary terms)
features.shape

(5, 10)

In [11]:
# Convert the sparse count matrix to a dense array so the per-word counts
# of every sentence can be inspected directly.
vectorized_sentence = features.toarray()
vectorized_sentence

array([[0, 0, 0, 0, 1, 0, 1, 1, 0, 0],
       [0, 0, 1, 0, 0, 0, 1, 1, 0, 0],
       [0, 1, 1, 0, 1, 0, 2, 2, 0, 0],
       [0, 0, 0, 0, 1, 0, 1, 1, 0, 1],
       [1, 0, 0, 1, 1, 1, 0, 1, 1, 1]])

In [12]:
# Vocabulary terms learned by the vectorizer; used below as the column labels
# of the count matrix.
feature_names = count_vectorizer.get_feature_names_out()
feature_names

array(['amazing', 'and', 'cat', 'do', 'dog', 'is', 'love', 'my', 'think',
       'you'], dtype=object)

In [13]:
# Label the dense count matrix: one row per sentence, one column per word.
df = pd.DataFrame(vectorized_sentence,columns = feature_names)
df

Unnamed: 0,amazing,and,cat,do,dog,is,love,my,think,you
0,0,0,0,0,1,0,1,1,0,0
1,0,0,1,0,0,0,1,1,0,0
2,0,1,1,0,1,0,2,2,0,0
3,0,0,0,0,1,0,1,1,0,1
4,1,0,0,1,1,1,0,1,1,1


## 2. TF-IDF

- TF-IDF(Term Frequency - Inverse Document Frequency)

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer with default settings; the bare name displays the
# estimator's repr.
tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer

In [15]:
# Fit the vocabulary/IDF weights on `sentences` and return the TF-IDF
# weighted document-term matrix (sparse, float-valued).
tfidf_sentence = tfidf_vectorizer.fit_transform(sentences)
tfidf_sentence

<5x10 sparse matrix of type '<class 'numpy.float64'>'
	with 22 stored elements in Compressed Sparse Row format>

In [16]:
# Dense view of the TF-IDF matrix for inspection.
tfidf_vect_sentence  = tfidf_sentence.toarray()
tfidf_vect_sentence

array([[0.        , 0.        , 0.        , 0.        , 0.60685614,
        0.        , 0.60685614, 0.51327503, 0.        , 0.        ],
       [0.        , 0.        , 0.73792244, 0.        , 0.        ,
        0.        , 0.51528988, 0.43582888, 0.        , 0.        ],
       [0.        , 0.49110884, 0.39622352, 0.        , 0.27668216,
        0.        , 0.55336431, 0.46803199, 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.45805379,
        0.        , 0.45805379, 0.38741896, 0.        , 0.65595732],
       [0.43872423, 0.        , 0.        , 0.43872423, 0.24716958,
        0.43872423, 0.        , 0.20905445, 0.43872423, 0.35395995]])

In [17]:
# Vocabulary terms learned by the TF-IDF vectorizer; used below as the
# column labels of the TF-IDF matrix.
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
tfidf_feature_names

array(['amazing', 'and', 'cat', 'do', 'dog', 'is', 'love', 'my', 'think',
       'you'], dtype=object)

In [18]:
# Label the dense TF-IDF matrix: one row per sentence, one column per word.
tfidf_df = pd.DataFrame(tfidf_vect_sentence,columns = tfidf_feature_names)
tfidf_df

Unnamed: 0,amazing,and,cat,do,dog,is,love,my,think,you
0,0.0,0.0,0.0,0.0,0.606856,0.0,0.606856,0.513275,0.0,0.0
1,0.0,0.0,0.737922,0.0,0.0,0.0,0.51529,0.435829,0.0,0.0
2,0.0,0.491109,0.396224,0.0,0.276682,0.0,0.553364,0.468032,0.0,0.0
3,0.0,0.0,0.0,0.0,0.458054,0.0,0.458054,0.387419,0.0,0.655957
4,0.438724,0.0,0.0,0.438724,0.24717,0.438724,0.0,0.209054,0.438724,0.35396


# Word Embedding
- Encoding

In [19]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

# Tokenizer

In [20]:
# num_words limits encoding to the most frequent words (well above this toy
# corpus' vocabulary size); oov_token is the placeholder used for words that
# are not in the fitted vocabulary.
tokenizer = Tokenizer(num_words=100,oov_token = "<OOV>")

# Word index Vocabulary

In [21]:
# Build the vocabulary from the corpus, then expose the word -> integer id
# mapping.
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index
word_index

{'<OOV>': 1,
 'my': 2,
 'love': 3,
 'dog': 4,
 'i': 5,
 'cat': 6,
 'you': 7,
 'and': 8,
 'do': 9,
 'think': 10,
 'is': 11,
 'amazing': 12}

In [22]:
# Inverse mapping (integer id -> word), used to decode sequences back to text.
index_word = tokenizer.index_word
index_word

{1: '<OOV>',
 2: 'my',
 3: 'love',
 4: 'dog',
 5: 'i',
 6: 'cat',
 7: 'you',
 8: 'and',
 9: 'do',
 10: 'think',
 11: 'is',
 12: 'amazing'}

# text의 sequence로 변환 및 padding

In [26]:
# Encode every sentence as a list of integer word ids from the fitted vocabulary.
sequences = tokenizer.texts_to_sequences(sentences)

# Zero-pad at the end ('post') so all rows share the same length.
# No maxlen is given, so the truncating option is not expected to change anything here.
padded = pad_sequences(sequences, truncating='post', padding='post')

# Show the ragged id lists and the padded matrix, separated by a blank line.
print(sequences, padded, sep="\n\n")

[[5, 3, 2, 4], [5, 3, 2, 6], [5, 3, 2, 4, 8, 3, 2, 6], [7, 3, 2, 4], [9, 7, 10, 2, 4, 11, 12]]

[[ 5  3  2  4  0  0  0  0]
 [ 5  3  2  6  0  0  0  0]
 [ 5  3  2  4  8  3  2  6]
 [ 7  3  2  4  0  0  0  0]
 [ 9  7 10  2  4 11 12  0]]


In [27]:
# Redisplay the id -> word mapping (same dict as `index_word`) for reference
# before decoding the sequences.
tokenizer.index_word

{1: '<OOV>',
 2: 'my',
 3: 'love',
 4: 'dog',
 5: 'i',
 6: 'cat',
 7: 'you',
 8: 'and',
 9: 'do',
 10: 'think',
 11: 'is',
 12: 'amazing'}

# sequence word를 word sentence로 환원

In [33]:
# Decode each id sequence back into a space-separated sentence by looking up
# every id in the tokenizer's id -> word mapping.
for token_ids in sequences:
  print(" ".join(tokenizer.index_word[token_id] for token_id in token_ids))


i love my dog
i love my cat
i love my dog and love my cat
you love my dog
do you think my dog is amazing


In [35]:
# Display the zero-padded id matrix (one row per sentence).
padded

array([[ 5,  3,  2,  4,  0,  0,  0,  0],
       [ 5,  3,  2,  6,  0,  0,  0,  0],
       [ 5,  3,  2,  4,  8,  3,  2,  6],
       [ 7,  3,  2,  4,  0,  0,  0,  0],
       [ 9,  7, 10,  2,  4, 11, 12,  0]], dtype=int32)

In [34]:
# One-hot encode every token id in the padded batch.
# num_classes is pinned to the vocabulary size plus one (id 0 is the padding
# value) so the width of the encoding is tied to the vocabulary instead of
# being inferred from the largest id that happens to occur in `padded`.
to_categorical(padded, num_classes=len(tokenizer.word_index) + 1)

array([[[0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],

       [[0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],

    