# 1. Bag of words(Bow)

단어의 등장 순서를 고려하지 않은 빈도수 기반의 단어 표현 방법
1. 각 단어의 고유한 정수 인덱스를 부여
2. 각 인덱스 위치에 단어 토큰의 등장 횟수를 기록한 벡터를 만든다.

doc1 = 'John likes to watch movies. Mary likes movies too'  
Bow1 = {"Jonh" : 1, "likes" :2, "to" : 1, "watch" :1, "movies" : 2, "Mary" : 1, "too" : 1}

In [1]:
!pip install konlpy

Collecting konlpy
  Downloading konlpy-0.6.0-py2.py3-none-any.whl (19.4 MB)
[K     |████████████████████████████████| 19.4 MB 1.6 MB/s 
[?25hCollecting JPype1>=0.7.0
  Downloading JPype1-1.3.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (448 kB)
[K     |████████████████████████████████| 448 kB 39.7 MB/s 
Installing collected packages: JPype1, konlpy
Successfully installed JPype1-1.3.0 konlpy-0.6.0


In [2]:
from konlpy.tag import Okt
import re

In [3]:
okt = Okt()

# 정규표현식을 통해 온점을 제거하는 정제 작업
token = re.sub("(\.)", "", "소비자는 주로 소비하는 상품을 기준으로 물가상승률을 느낀다.")

token = okt.morphs(token)

In [4]:
token

['소비자', '는', '주로', '소비', '하는', '상품', '을', '기준', '으로', '물가상승률', '을', '느낀다']

In [6]:
word2index = {}
bow = []

for vocab in token:
    if vocab not in word2index.keys(): # 처음나오는 단어
        word2index[vocab] = len(word2index) # 키:단어, 값:인덱스
        bow.insert(len(word2index)-1, 1)
    else:
        index = word2index.get(vocab)
        bow[index] += 1
print(word2index)

{'소비자': 0, '는': 1, '주로': 2, '소비': 3, '하는': 4, '상품': 5, '을': 6, '기준': 7, '으로': 8, '물가상승률': 9, '느낀다': 10}


In [7]:
bow

[1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1]

## Tensorflow의 keras Tokenizer를 활용한 BOW

In [8]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [9]:
sentence = ["John likes to watch movies. \
            Mary likes movies too! \
            Mary also likes to watch football games."]

In [10]:
def print_bow(sentence):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(sentence)

    bow = dict(tokenizer.word_counts) # 각 단어와 각 단어의 빈도를 bow에 저장

    print("Bag of words :", bow)
    print("단어장(vocabulary)의 크기 :", len(tokenizer.word_counts))

In [11]:
print_bow(sentence)

Bag of words : {'john': 1, 'likes': 3, 'to': 2, 'watch': 2, 'movies': 2, 'mary': 2, 'too': 1, 'also': 1, 'football': 1, 'games': 1}
단어장(vocabulary)의 크기 : 10


## scikit-learn의 CountVectorizer을 활용한 Bow

In [12]:
from sklearn.feature_extraction.text import CountVectorizer

In [13]:
sentence = ["John likes to watch movies. \
            Mary likes movies too! \
            Mary also likes to watch football games."]

In [14]:
vector = CountVectorizer()

In [16]:
print("Bag of words :", vector.fit_transform(sentence).toarray())
print("각 단어의 인덱스 :", vector.vocabulary_)

Bag of words : [[1 1 1 1 3 2 2 2 1 2]]
각 단어의 인덱스 : {'john': 3, 'likes': 4, 'to': 7, 'watch': 9, 'movies': 6, 'mary': 5, 'too': 8, 'also': 0, 'football': 1, 'games': 2}


## 불용어를 제거한 BOW만들기

### 사용자가 직접 정의한 부용어 사용

In [17]:
from sklearn.feature_extraction.text import CountVectorizer

text = ["Family is not an important thing. It's everything"]

vect = CountVectorizer(stop_words=["the",'a','an', 'is', 'not'])

print(vect.fit_transform(text).toarray())
print(vect.vocabulary_)

[[1 1 1 1 1]]
{'family': 1, 'important': 2, 'thing': 4, 'it': 3, 'everything': 0}


### CountVectoryzer에서 제공하는 자체 불용어 사용

In [21]:
from sklearn.feature_extraction.text import CountVectorizer

text = ["Family is not an important thing. It's everything"]

vect = CountVectorizer(stop_words='english')

print(vect.fit_transform(text).toarray())
print(vect.vocabulary_)

[[1 1 1]]
{'family': 0, 'important': 1, 'thing': 2}


### NLTK에서 지원하는 불용어 사용

In [22]:
!pip install nltk



In [23]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [24]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords

text = ["Family is not an important thing. It's everything"]

sw = stopwords.words('english')

vect = CountVectorizer(stop_words=sw)

print(vect.fit_transform(text).toarray())
print(vect.vocabulary_)

[[1 1 1 1]]
{'family': 1, 'important': 2, 'thing': 3, 'everything': 0}


# 2. DTM(Document-Term Matrix)

다수의 문서에서 등장하는 각 단어들의 빈도를 행렬로 표현한 것  
다수의 문서에 대해서 bow를 하나의 행렬로 표현하고 부르는 용어

- 문서 1 : I like dog
- 문서 2 : I like cat
- 문서 3 : I like cat I like cat

In [25]:
import pandas as pd
content = [[0, 1, 1, 1], [1, 0, 1, 1], [2, 0, 2, 2]]
df = pd.DataFrame(content)
df.index = ['(문서1) I like dog', '(문서2) I like cat', '(문서3) I like cat I like cat']
df.columns = ['cat', 'dog', 'I', 'like']
df

Unnamed: 0,cat,dog,I,like
(문서1) I like dog,0,1,1,1
(문서2) I like cat,1,0,1,1
(문서3) I like cat I like cat,2,0,2,2


In [26]:
import numpy as np
from numpy import dot
from numpy.linalg import norm

In [27]:
doc1 = np.array([0,1,1,1])
doc2 = np.array([1,0,1,1])
doc3 = np.array([2,0,2,2])

In [28]:
def cos_sim(a, b):
    return dot(a,b)/(norm(a) * norm(b))

In [29]:
# 1에 가까울 수록 유사도가 높다
print(cos_sim(doc1, doc2))
print(cos_sim(doc1, doc3))
print(cos_sim(doc2, doc3))

0.6666666666666667
0.6666666666666667
1.0000000000000002


## Scikit-learn의 CountVectorizer을 활용한 DTM

In [31]:
from sklearn.feature_extraction.text import CountVectorizer

corpus = ["John likes to watch movies.",
            "Mary likes movies too!",
            "Mary also likes to watch football games."
]

vector = CountVectorizer()

print(vector.fit_transform(corpus).toarray())
print(vector.vocabulary_)

[[0 0 0 1 1 0 1 1 0 1]
 [0 0 0 0 1 1 1 0 1 0]
 [1 1 1 0 1 1 0 1 0 1]]
{'john': 3, 'likes': 4, 'to': 7, 'watch': 9, 'movies': 6, 'mary': 5, 'too': 8, 'also': 0, 'football': 1, 'games': 2}


# 3. TF-IDF (Term-Frequency Inverse Document Frequency

모든 문서에 자주 등장하는 단어는 중요도가 낮다고 판단하고, 특정 문서에서만 자주 등장하는 단어는 중요도가 높다고 판단하는 것

In [32]:
from math import log
import pandas as pd

docs = [
        'John likes to watch movies and Mary likes movies too',
        'James likes to watch TV',
        'Mary also likes to watch football games'
]

In [33]:
vocab = list(set(w for doc in docs for w in doc.split()))
vocab.sort()

In [34]:
print('단어장의 크기 :', len(vocab))
print(vocab)

단어장의 크기 : 13
['James', 'John', 'Mary', 'TV', 'also', 'and', 'football', 'games', 'likes', 'movies', 'to', 'too', 'watch']


In [35]:
N = len(docs)
N

3

1. tf(d, t) : 특정 문서 d에서의 특정 단어 t의 등장 횟수
2. df(t) : 특정 단어 t가 등장한 문서의 수
3. idf(d,t) : df(t)에 반비례하는 수

$$ idf(d, t) = log\frac{n}{1+df(t)}$$

In [37]:
def tf(t, d): # 특정 문서 d에서의 특정 단어 t의 등장 횟수
    return d.count(t)

In [44]:
def idf(t): 
    df = 0 # 특정 단어 t가 등장한 문서의 수
    for doc in docs:
        df += t in doc
    return log(N/(df+1))+1 # 특정 단어 t가 등장한 문서의 수에 반비례하는 수

In [45]:
def tfidf(t,d):
    return tf(t,d) * idf(t)

In [46]:
result = []
for i in range(N):
    result.append([])
    d = docs[i] # 문서
    for j in range(len(vocab)):
        t = vocab[j]

        result[-1].append(tf(t,d))

In [47]:
tf_ = pd.DataFrame(result, columns=vocab)
tf_

Unnamed: 0,James,John,Mary,TV,also,and,football,games,likes,movies,to,too,watch
0,0,1,1,0,0,1,0,0,2,2,2,1,1
1,1,0,0,1,0,0,0,0,1,0,1,0,1
2,0,0,1,0,1,0,1,1,1,0,1,0,1


In [48]:
result = []
for j in range(len(vocab)):
    t = vocab[j]
    result.append(idf(t))

idf_ = pd.DataFrame(result, index=vocab, columns=['IDF'])
idf_

Unnamed: 0,IDF
James,1.405465
John,1.405465
Mary,1.0
TV,1.405465
also,1.405465
and,1.405465
football,1.405465
games,1.405465
likes,0.712318
movies,1.405465


In [49]:
result = []
for i in range(N):
    result.append([])
    d = docs[i]
    for j in range(len(vocab)):
        t = vocab[j]

        result[-1].append(tfidf(t,d))

tfidf_ = pd.DataFrame(result, columns=vocab)
tfidf_

Unnamed: 0,James,John,Mary,TV,also,and,football,games,likes,movies,to,too,watch
0,0.0,1.405465,1.0,0.0,0.0,1.405465,0.0,0.0,1.424636,2.81093,1.424636,1.405465,0.712318
1,1.405465,0.0,0.0,1.405465,0.0,0.0,0.0,0.0,0.712318,0.0,0.712318,0.0,0.712318
2,0.0,0.0,1.0,0.0,1.405465,0.0,1.405465,1.405465,0.712318,0.0,0.712318,0.0,0.712318


## scikit-learn을 활용한 TF-IDF 구현

In [50]:
from sklearn.feature_extraction.text import CountVectorizer

In [51]:
corpus = [
          'you konw I want your love',
          'I like you',
          'what should I do'
]

In [52]:
vector = CountVectorizer()

In [54]:
print(vector.fit_transform(corpus).toarray())
print(vector.vocabulary_)

[[0 1 0 1 0 1 0 1 1]
 [0 0 1 0 0 0 0 1 0]
 [1 0 0 0 1 0 1 0 0]]
{'you': 7, 'konw': 1, 'want': 5, 'your': 8, 'love': 3, 'like': 2, 'what': 6, 'should': 4, 'do': 0}


In [55]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [56]:
tfidfv = TfidfVectorizer().fit(corpus)
print(tfidfv.transform(corpus).toarray())

[[0.         0.46735098 0.         0.46735098 0.         0.46735098
  0.         0.35543247 0.46735098]
 [0.         0.         0.79596054 0.         0.         0.
  0.         0.60534851 0.        ]
 [0.57735027 0.         0.         0.         0.57735027 0.
  0.57735027 0.         0.        ]]


In [57]:
print(tfidfv.vocabulary_)

{'you': 7, 'konw': 1, 'want': 5, 'your': 8, 'love': 3, 'like': 2, 'what': 6, 'should': 4, 'do': 0}
