### 출처: Won Joon Yoo, Introduction to Deep Learning for Natural Language Processing, Wikidocs<br>
### https://wikidocs.net/book/2155

### 4) TF-IDF(Term Frequency-Inverse Document Frequency)

#### tf(d,t) : 특정 문서 d에서의 특정 단어 t의 등장 횟수.

#### df(t) : 특정 단어 t가 등장한 문서의 수.

#### idf(d, t) : df(t)에 반비례하는 수.

$$\mathrm{idf}(d,t)=\log\left(\frac{n}{1+\mathrm{df}(t)}\right)$$

In [2]:
# Unused first-draft sketch, kept commented out; the working tf/idf/tfidf
# functions are defined in later cells of this notebook.
# import math
# # n: total number of documents
# # d: a specific document
# # t: a specific term

# def tf_idf(d, t, n):
#     return math.log(n / (1 + df(t))
                    
# # a specific term
# def tf(d, t):

In [3]:
import pandas as pd
from math import log

In [4]:
docs = [
  '먹고 싶은 사과',
  '먹고 싶은 바나나',
  '길고 노란 바나나 바나나',
  '저는 과일이 좋아요'
] 

In [5]:
# Collect every distinct whitespace-separated token across all documents.
vocab = list({token for sentence in docs for token in sentence.split()})

In [6]:
vocab

['저는', '과일이', '사과', '길고', '바나나', '좋아요', '먹고', '노란', '싶은']

In [7]:
vocab.sort()

In [8]:
vocab

['과일이', '길고', '노란', '먹고', '바나나', '사과', '싶은', '저는', '좋아요']

In [9]:
N = len(docs) # 총 문서의 수

In [10]:
N

4

In [11]:
# N: total number of documents
# d: a specific document
# t: a specific term
def tf(t, d):
    """Return how many times term ``t`` occurs as a whole token in document ``d``.

    Args:
        t: The term to count.
        d: The document, given as a whitespace-separated string.

    Returns:
        The number of whitespace-separated tokens of ``d`` equal to ``t``.
    """
    # Count whole tokens rather than substrings: ``d.count(t)`` would also
    # match ``t`` inside a longer word (e.g. 'a' inside 'banana').
    return d.split().count(t)

In [12]:
# df(t): number of documents in which term t appears.
# idf(t) lowers the weight of terms that occur in many documents.
def idf(t, documents=None):
    """Return the smoothed inverse document frequency of term ``t``.

    Computes ``log(n / (df + 1))`` where ``n`` is the number of documents and
    ``df`` is the number of documents that contain ``t`` as a whole token.

    Args:
        t: The term to score.
        documents: Iterable of whitespace-separated document strings. When
            omitted, the notebook-level ``docs`` list is used.

    Returns:
        The idf value as a float. It is negative when ``t`` occurs in every
        document, because of the ``+ 1`` in the denominator.
    """
    if documents is None:
        documents = docs  # fall back to the corpus defined earlier in the notebook
    documents = list(documents)
    # Membership is tested on tokens; ``t in doc`` on the raw string would also
    # count documents where ``t`` is merely part of a longer word.
    df = sum(t in doc.split() for doc in documents)
    return log(len(documents) / (df + 1))

In [13]:
def tfidf(t, d):
    """Return the TF-IDF weight of term ``t`` in document ``d``.

    The weight is the product of the term frequency ``tf(t, d)`` and the
    inverse document frequency ``idf(t)``.
    """
    term_frequency = tf(t, d)
    inverse_document_frequency = idf(t)
    return term_frequency * inverse_document_frequency

In [14]:
# Build the document-term matrix (DTM): one row per document,
# one column per vocabulary term, each cell holding the term frequency.
result = [[tf(term, document) for term in vocab] for document in docs[:N]]

In [15]:
result

[[0, 0, 0, 1, 0, 1, 1, 0, 0],
 [0, 0, 0, 1, 1, 0, 1, 0, 0],
 [0, 1, 1, 0, 2, 0, 0, 0, 0],
 [1, 0, 0, 0, 0, 0, 0, 1, 1]]

In [16]:
tf_ = pd.DataFrame(result, columns = vocab)

In [17]:
tf_

Unnamed: 0,과일이,길고,노란,먹고,바나나,사과,싶은,저는,좋아요
0,0,0,0,1,0,1,1,0,0
1,0,0,0,1,1,0,1,0,0
2,0,1,1,0,2,0,0,0,0
3,1,0,0,0,0,0,0,1,1


In [18]:
# IDF value of every vocabulary term, in vocabulary order.
result = [idf(term) for term in vocab]

In [19]:
# 각 단어별 가중치
result

[0.6931471805599453,
 0.6931471805599453,
 0.6931471805599453,
 0.28768207245178085,
 0.28768207245178085,
 0.6931471805599453,
 0.28768207245178085,
 0.6931471805599453,
 0.6931471805599453]

In [20]:
# DTM과 IDF를 행렬곱(내적)하기 위해서 행과 열을 맞춰줌
idf_ = pd.DataFrame(result, index = vocab, columns = ["IDF"])

In [21]:
idf_

Unnamed: 0,IDF
과일이,0.693147
길고,0.693147
노란,0.693147
먹고,0.287682
바나나,0.287682
사과,0.693147
싶은,0.287682
저는,0.693147
좋아요,0.693147


In [22]:
# TF-IDF matrix: rows follow the documents, columns follow the vocabulary order.
result = [[tfidf(term, document) for term in vocab] for document in docs[:N]]

In [23]:
result

[[0.0,
  0.0,
  0.0,
  0.28768207245178085,
  0.0,
  0.6931471805599453,
  0.28768207245178085,
  0.0,
  0.0],
 [0.0,
  0.0,
  0.0,
  0.28768207245178085,
  0.28768207245178085,
  0.0,
  0.28768207245178085,
  0.0,
  0.0],
 [0.0,
  0.6931471805599453,
  0.6931471805599453,
  0.0,
  0.5753641449035617,
  0.0,
  0.0,
  0.0,
  0.0],
 [0.6931471805599453,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.6931471805599453,
  0.6931471805599453]]

In [24]:
tfidf_ = pd.DataFrame(result, columns = vocab)

In [25]:
tfidf_

Unnamed: 0,과일이,길고,노란,먹고,바나나,사과,싶은,저는,좋아요
0,0.0,0.0,0.0,0.287682,0.0,0.693147,0.287682,0.0,0.0
1,0.0,0.0,0.0,0.287682,0.287682,0.0,0.287682,0.0,0.0
2,0.0,0.693147,0.693147,0.0,0.575364,0.0,0.0,0.0,0.0
3,0.693147,0.0,0.0,0.0,0.0,0.0,0.0,0.693147,0.693147


### 사이킷런을 이용한 DTM과 TF-IDF 실습

In [26]:
from sklearn.feature_extraction.text import CountVectorizer

In [27]:
corpus = [
    'you know I want your love',
    'I like you',
    'what should I do ',    
]

In [28]:
vector = CountVectorizer()

In [29]:
# DTM
vector.fit_transform(corpus).toarray()

array([[0, 1, 0, 1, 0, 1, 0, 1, 1],
       [0, 0, 1, 0, 0, 0, 0, 1, 0],
       [1, 0, 0, 0, 1, 0, 1, 0, 0]], dtype=int64)

In [30]:
# 단어별 인덱스
vector.vocabulary_

{'you': 7,
 'know': 1,
 'want': 5,
 'your': 8,
 'love': 3,
 'like': 2,
 'what': 6,
 'should': 4,
 'do': 0}

In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [32]:
corpus = [
    'you know I want your love',
    'I like you',
    'what should I do ',    
]

In [33]:
tfidfv = TfidfVectorizer().fit(corpus)

In [34]:
tfidfv.transform(corpus).toarray()

array([[0.        , 0.46735098, 0.        , 0.46735098, 0.        ,
        0.46735098, 0.        , 0.35543247, 0.46735098],
       [0.        , 0.        , 0.79596054, 0.        , 0.        ,
        0.        , 0.        , 0.60534851, 0.        ],
       [0.57735027, 0.        , 0.        , 0.        , 0.57735027,
        0.        , 0.57735027, 0.        , 0.        ]])

In [35]:
tfidfv.vocabulary_

{'you': 7,
 'know': 1,
 'want': 5,
 'your': 8,
 'love': 3,
 'like': 2,
 'what': 6,
 'should': 4,
 'do': 0}

### 코사인 유사도(Cosine Similarity)

In [36]:
from numpy import dot
from numpy.linalg import norm
import numpy as np
# norm computes the L2 (Euclidean) norm of a vector.
def cos_sim(A, B):
    """Return the cosine similarity between vectors ``A`` and ``B``.

    Args:
        A: 1-D numeric array-like.
        B: 1-D numeric array-like with the same length as ``A``.

    Returns:
        ``dot(A, B) / (norm(A) * norm(B))``. When either vector has zero
        norm the cosine is undefined; ``0.0`` is returned in that case instead
        of NaN with a divide-by-zero warning.
    """
    denominator = norm(A) * norm(B)
    if denominator == 0:
        return 0.0
    return dot(A, B) / denominator

In [37]:
doc1 = np.array([0,1,1,1])
doc2 = np.array([1,0,1,1])
doc3 = np.array([2,0,2,2])

In [38]:
cos_sim(doc1, doc2)

0.6666666666666667

In [39]:
cos_sim(doc1, doc3)

0.6666666666666667

In [40]:
# theta값이 동일하므로 1
cos_sim(doc2, doc3)

1.0000000000000002

### 유사도를 이용한 추천 시스템 구현하기(영화 줄거리 데이터)

In [41]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [42]:
data = pd.read_csv('./movies_metadata.csv', low_memory=False)
data.head(2)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0


In [43]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  45466 non-null  object 
 1   belongs_to_collection  4494 non-null   object 
 2   budget                 45466 non-null  object 
 3   genres                 45466 non-null  object 
 4   homepage               7782 non-null   object 
 5   id                     45466 non-null  object 
 6   imdb_id                45449 non-null  object 
 7   original_language      45455 non-null  object 
 8   original_title         45466 non-null  object 
 9   overview               44512 non-null  object 
 10  popularity             45461 non-null  object 
 11  poster_path            45080 non-null  object 
 12  production_companies   45463 non-null  object 
 13  production_countries   45463 non-null  object 
 14  release_date           45379 non-null  object 
 15  re

In [44]:
# Keep the first 20,000 rows. Take an explicit copy so that the later column
# assignment (filling missing overviews) writes to an independent frame
# instead of a slice of the original, which triggers SettingWithCopyWarning.
data = data.head(20000).copy()

In [45]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  20000 non-null  object 
 1   belongs_to_collection  2399 non-null   object 
 2   budget                 20000 non-null  object 
 3   genres                 20000 non-null  object 
 4   homepage               3055 non-null   object 
 5   id                     20000 non-null  object 
 6   imdb_id                19993 non-null  object 
 7   original_language      19999 non-null  object 
 8   original_title         20000 non-null  object 
 9   overview               19865 non-null  object 
 10  popularity             19998 non-null  object 
 11  poster_path            19907 non-null  object 
 12  production_companies   19999 non-null  object 
 13  production_countries   19999 non-null  object 
 14  release_date           19983 non-null  object 
 15  re

In [46]:
# 결측치 개수 확인
data['overview'].isnull().sum()

135

In [47]:
# overview 결측치를 빈문자열로 채움
data['overview'] = data['overview'].fillna('')

In [48]:
data['overview'].isnull().sum()

0

In [49]:
# 불용어 제거
tfidf = TfidfVectorizer(stop_words='english')

In [50]:
# overview에 대해서 tf-idf 수행
tfidf_matrix = tfidf.fit_transform(data['overview'])

In [51]:
tfidf_matrix.shape

(20000, 47487)

In [52]:
# 단어별 인덱스 딕셔너리
tfidf.vocabulary_

{'led': 24361,
 'woody': 46617,
 'andy': 2051,
 'toys': 43131,
 'live': 24957,
 'happily': 18727,
 'room': 36185,
 'birthday': 4736,
 'brings': 5775,
 'buzz': 6290,
 'lightyear': 24759,
 'scene': 37166,
 'afraid': 1254,
 'losing': 25229,
 'place': 32291,
 'heart': 19047,
 'plots': 32424,
 'circumstances': 7974,
 'separate': 37778,
 'owner': 30780,
 'duo': 12920,
 'eventually': 14393,
 'learns': 24325,
 'aside': 2811,
 'differences': 11645,
 'siblings': 38509,
 'judy': 22536,
 'peter': 31851,
 'discover': 11888,
 'enchanted': 13728,
 'board': 5054,
 'game': 16798,
 'opens': 30240,
 'door': 12442,
 'magical': 25725,
 'world': 46664,
 'unwittingly': 44575,
 'invite': 21663,
 'alan': 1495,
 'adult': 1135,
 'trapped': 43304,
 'inside': 21286,
 '26': 430,
 'years': 46965,
 'living': 24973,
 'hope': 19854,
 'freedom': 16355,
 'finish': 15562,
 'proves': 33496,
 'risky': 35885,
 'running': 36455,
 'giant': 17265,
 'rhinoceroses': 35691,
 'evil': 14422,
 'monkeys': 28035,
 'terrifying': 42245,


In [53]:
tfidf_matrix.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [54]:
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [55]:
cosine_sim

array([[1.        , 0.01575748, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.01575748, 1.        , 0.04907345, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.04907345, 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.08375766],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.08375766, 0.        ,
        1.        ]])

In [56]:
cosine_sim.shape

(20000, 20000)

In [57]:
# Map each movie title to its row index.
# Series.drop_duplicates() compares the *values* (the row indices, which are
# all unique) and therefore removes nothing; duplicates are dropped by title
# instead, keeping the first occurrence so a lookup returns a single index.
indices = pd.Series(data.index, index=data['title'])
indices = indices[~indices.index.duplicated(keep='first')]

In [58]:
indices.head()

title
Toy Story                      0
Jumanji                        1
Grumpier Old Men               2
Waiting to Exhale              3
Father of the Bride Part II    4
dtype: int64

In [59]:
# 특정 영화 인덱스 확인
idx = indices['Father of the Bride Part II']

In [60]:
idx

4

In [63]:
def get_reco(title, cosine_sim=cosine_sim, top_n=10):
    """Return the titles of the movies most similar to ``title``.

    Args:
        title: Title of the query movie; looked up in the notebook-level
            ``indices`` Series (raises ``KeyError`` if absent).
        cosine_sim: Square similarity matrix whose row/column ``i`` corresponds
            to row ``i`` of ``data``. Defaults to the matrix computed earlier.
        top_n: Number of recommendations to return (default 10).

    Returns:
        A pandas Series of titles taken from ``data['title']``, ordered from
        most to least similar, excluding the query movie itself.
    """
    # Resolve the title to the row index of the query movie.
    idx = indices[title]
    # A title that occurs more than once yields a Series; use its first row.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Pair every *other* movie with its similarity to the query movie.
    # The query row is excluded by index rather than by dropping the first
    # sorted entry: a movie with an empty overview has self-similarity 0 and
    # would not sort first, so slicing [1:] could drop the wrong movie.
    sim_scores = [(i, score) for i, score in enumerate(cosine_sim[idx]) if i != idx]

    # Highest similarity first; the sort is stable, so ties keep row order.
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    # Row positions of the top_n most similar movies.
    movie_indices = [i for i, _ in sim_scores[:top_n]]

    # Positional lookup: assumes matrix rows line up with the rows of `data`.
    return data['title'].iloc[movie_indices]

In [64]:
get_reco('The Dark Knight Rises')

12481                            The Dark Knight
150                               Batman Forever
1328                              Batman Returns
15511                 Batman: Under the Red Hood
585                                       Batman
9230          Batman Beyond: Return of the Joker
18035                           Batman: Year One
19792    Batman: The Dark Knight Returns, Part 1
3095                Batman: Mask of the Phantasm
10122                              Batman Begins
Name: title, dtype: object

### 유클리드 거리(Euclidean distance) 유사도

In [65]:
import numpy as np

In [66]:
def dist(x, y):
    """Return the Euclidean distance between ``x`` and ``y``.

    Both arguments must support element-wise subtraction (e.g. NumPy arrays
    of the same shape).
    """
    delta = x - y
    squared_total = np.sum(delta ** 2)
    return np.sqrt(squared_total)

In [67]:
doc1 = np.array((2,3,0,1))
doc2 = np.array((1,2,3,1))
doc3 = np.array((2,1,2,2))
docQ = np.array((1,1,0,1))

In [68]:
print(dist(doc1,docQ))
print(dist(doc2,docQ))
print(dist(doc3,docQ))

2.23606797749979
3.1622776601683795
2.449489742783178


### 자카드 유사도(Jaccard similarity)

합집합에서 교집합의 비율로 유사도를 구함
<br>서로 다른 문서 간에 유사도를 구할 수 있음

In [69]:
# 다음과 같은 두 개의 문서가 있습니다.
# 두 문서 모두에서 등장한 단어는 apple과 banana 2개.
doc1 = "apple banana everyone like likey watch card holder"
doc2 = "apple banana coupon passport love you"

In [70]:
# 토큰화
tokenized_doc1 = doc1.split()
tokenized_doc2 = doc2.split()

In [71]:
tokenized_doc1

['apple', 'banana', 'everyone', 'like', 'likey', 'watch', 'card', 'holder']

In [72]:
tokenized_doc2

['apple', 'banana', 'coupon', 'passport', 'love', 'you']

In [73]:
# 합집합
union = set(tokenized_doc1).union(set(tokenized_doc2))

In [74]:
union

{'apple',
 'banana',
 'card',
 'coupon',
 'everyone',
 'holder',
 'like',
 'likey',
 'love',
 'passport',
 'watch',
 'you'}

In [75]:
# 교집합
intersection = set(tokenized_doc1).intersection(set(tokenized_doc2))

In [76]:
intersection

{'apple', 'banana'}

In [77]:
# 자카드 유사도
len(intersection)/len(union)

0.16666666666666666

### 잠재 의미 분석(Latent Semantic Analysis, LSA)

#### SVD(Singular Value Decomposition, 특이값 분해). 행렬 연산을 이용하여 연산 속도를 높임

In [78]:
import numpy as np
A=np.array([[0,0,0,1,0,1,1,0,0],[0,0,0,1,1,0,1,0,0],[0,1,1,0,2,0,0,0,0],[1,0,0,0,0,0,0,1,1]])

In [79]:
np.shape(A)

(4, 9)

full SVD 수행

In [80]:
U, s, VT = np.linalg.svd(A, full_matrices = True)

In [81]:
U.round(2)

array([[-0.24,  0.75,  0.  , -0.62],
       [-0.51,  0.44, -0.  ,  0.74],
       [-0.83, -0.49, -0.  , -0.27],
       [-0.  , -0.  ,  1.  ,  0.  ]])

In [82]:
U.shape

(4, 4)

In [84]:
VT.shape

(9, 9)

In [87]:
# 대각행렬 주대각선에 있는 특이값만 나옴
s

array([2.68731789, 2.04508425, 1.73205081, 0.77197992])

In [86]:
s.shape

(4,)

In [90]:
s1 = np.diag(s)
s1

array([[2.68731789, 0.        , 0.        , 0.        ],
       [0.        , 2.04508425, 0.        , 0.        ],
       [0.        , 0.        , 1.73205081, 0.        ],
       [0.        , 0.        , 0.        , 0.77197992]])

In [91]:
np.diag(s1)

array([2.68731789, 2.04508425, 1.73205081, 0.77197992])

In [92]:
# Allocate the rectangular singular-value matrix with the same shape as A
# (4 x 9 here) instead of hard-coding the dimensions.
S = np.zeros(A.shape)

In [93]:
S

array([[0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [94]:
# Place the singular values on the leading diagonal; the block size follows
# the number of singular values rather than a hard-coded 4.
S[:len(s), :len(s)] = np.diag(s)

In [95]:
S.round(2)

array([[2.69, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ],
       [0.  , 2.05, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ],
       [0.  , 0.  , 1.73, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ],
       [0.  , 0.  , 0.  , 0.77, 0.  , 0.  , 0.  , 0.  , 0.  ]])

In [96]:
VT

array([[-9.05730252e-17, -3.08298331e-01, -3.08298331e-01,
        -2.77536539e-01, -8.04917216e-01, -8.92159849e-02,
        -2.77536539e-01, -4.41881458e-17, -4.41881458e-17],
       [ 3.38857308e-17, -2.38904821e-01, -2.38904821e-01,
         5.84383395e-01, -2.60689306e-01,  3.67263060e-01,
         5.84383395e-01, -3.42420311e-17, -3.42420311e-17],
       [ 5.77350269e-01, -5.50814942e-17,  5.81928632e-17,
         1.38606015e-16, -1.41746067e-16,  1.66877179e-16,
        -8.34385896e-17,  5.77350269e-01,  5.77350269e-01],
       [ 2.32493354e-16, -3.54477649e-01, -3.54477649e-01,
         1.60978141e-01,  2.48851639e-01, -7.96828796e-01,
         1.60978141e-01, -5.08069894e-17, -5.08069894e-17],
       [-9.71445147e-17, -7.79217669e-01, -1.33887309e-02,
        -1.98151600e-01,  3.96303200e-01,  3.96303200e-01,
        -1.98151600e-01,  4.26271409e-17,  4.26271409e-17],
       [-2.88675135e-01,  3.11555335e-01, -7.80527867e-01,
        -2.42243133e-01,  2.34486266e-01,  2.344862

In [97]:
VT.round(2)

array([[-0.  , -0.31, -0.31, -0.28, -0.8 , -0.09, -0.28, -0.  , -0.  ],
       [ 0.  , -0.24, -0.24,  0.58, -0.26,  0.37,  0.58, -0.  , -0.  ],
       [ 0.58, -0.  ,  0.  ,  0.  , -0.  ,  0.  , -0.  ,  0.58,  0.58],
       [ 0.  , -0.35, -0.35,  0.16,  0.25, -0.8 ,  0.16, -0.  , -0.  ],
       [-0.  , -0.78, -0.01, -0.2 ,  0.4 ,  0.4 , -0.2 ,  0.  ,  0.  ],
       [-0.29,  0.31, -0.78, -0.24,  0.23,  0.23,  0.01,  0.14,  0.14],
       [-0.29, -0.1 ,  0.26, -0.59, -0.08, -0.08,  0.66,  0.14,  0.14],
       [-0.5 , -0.06,  0.15,  0.24, -0.05, -0.05, -0.19,  0.75, -0.25],
       [-0.5 , -0.06,  0.15,  0.24, -0.05, -0.05, -0.19, -0.25,  0.75]])

In [101]:
# 기존 행렬 A와 동일한지 확인
np.allclose(A, np.dot(np.dot(U, S), VT))

True

In [102]:
S = S[:2, :2]

In [103]:
S

array([[2.68731789, 0.        ],
       [0.        , 2.04508425]])

In [104]:
U = U[:, :2]

In [105]:
U

array([[-2.39751712e-01,  7.51083898e-01],
       [-5.06077194e-01,  4.44029376e-01],
       [-8.28495619e-01, -4.88580485e-01],
       [-7.19783140e-17, -2.24042335e-17]])

In [106]:
VT = VT[:2, :]

In [107]:
VT

array([[-9.05730252e-17, -3.08298331e-01, -3.08298331e-01,
        -2.77536539e-01, -8.04917216e-01, -8.92159849e-02,
        -2.77536539e-01, -4.41881458e-17, -4.41881458e-17],
       [ 3.38857308e-17, -2.38904821e-01, -2.38904821e-01,
         5.84383395e-01, -2.60689306e-01,  3.67263060e-01,
         5.84383395e-01, -3.42420311e-17, -3.42420311e-17]])

In [108]:
VT.round(2)

array([[-0.  , -0.31, -0.31, -0.28, -0.8 , -0.09, -0.28, -0.  , -0.  ],
       [ 0.  , -0.24, -0.24,  0.58, -0.26,  0.37,  0.58, -0.  , -0.  ]])

In [110]:
A_prime = np.dot(np.dot(U,S), VT)

In [111]:
A_prime.round(2)

array([[ 0.  , -0.17, -0.17,  1.08,  0.12,  0.62,  1.08, -0.  , -0.  ],
       [ 0.  ,  0.2 ,  0.2 ,  0.91,  0.86,  0.45,  0.91,  0.  ,  0.  ],
       [ 0.  ,  0.93,  0.93,  0.03,  2.05, -0.17,  0.03,  0.  ,  0.  ],
       [ 0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ]])

In [112]:
A

array([[0, 0, 0, 1, 0, 1, 1, 0, 0],
       [0, 0, 0, 1, 1, 0, 1, 0, 0],
       [0, 1, 1, 0, 2, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 1, 1]])

#### 뉴스그룹 데이터 실습

In [113]:
import pandas as pd
from sklearn.datasets import fetch_20newsgroups

In [114]:
# random_state에 고정값을 주는 이유는 똑같은 결과를 얻기 위함
dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))

In [116]:
dataset.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [184]:
len(dataset.target_names)

20

In [117]:
# 20개의 토픽
dataset.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [118]:
dataset.target

array([17,  0, 17, ...,  9,  4,  9])

In [119]:
dataset.filenames

array(['C:\\Users\\leok8\\scikit_learn_data\\20news_home\\20news-bydate-train\\talk.politics.mideast\\76141',
       'C:\\Users\\leok8\\scikit_learn_data\\20news_home\\20news-bydate-train\\alt.atheism\\53281',
       'C:\\Users\\leok8\\scikit_learn_data\\20news_home\\20news-bydate-train\\talk.politics.mideast\\76350',
       ...,
       'C:\\Users\\leok8\\scikit_learn_data\\20news_home\\20news-bydate-train\\rec.sport.baseball\\105105',
       'C:\\Users\\leok8\\scikit_learn_data\\20news_home\\20news-bydate-train\\comp.sys.mac.hardware\\51575',
       'C:\\Users\\leok8\\scikit_learn_data\\20news_home\\20news-bydate-train\\rec.sport.baseball\\104908'],
      dtype='<U95')

In [121]:
documents = dataset.data

In [122]:
len(documents)

11314

In [124]:
documents[0]

"Well i'm not sure about the story nad it did seem biased. What\nI disagree with is your statement that the U.S. Media is out to\nruin Israels reputation. That is rediculous. The U.S. media is\nthe most pro-israeli media in the world. Having lived in Europe\nI realize that incidences such as the one described in the\nletter have occured. The U.S. media as a whole seem to try to\nignore them. The U.S. is subsidizing Israels existance and the\nEuropeans are not (at least not to the same degree). So I think\nthat might be a reason they report more clearly on the\natrocities.\n\tWhat is a shame is that in Austria, daily reports of\nthe inhuman acts commited by Israeli soldiers and the blessing\nreceived from the Government makes some of the Holocaust guilt\ngo away. After all, look how the Jews are treating other races\nwhen they got power. It is unfortunate.\n"

In [125]:
documents[1]

"\n\n\n\n\n\n\nYeah, do you expect people to read the FAQ, etc. and actually accept hard\natheism?  No, you need a little leap of faith, Jimmy.  Your logic runs out\nof steam!\n\n\n\n\n\n\n\nJim,\n\nSorry I can't pity you, Jim.  And I'm sorry that you have these feelings of\ndenial about the faith you need to get by.  Oh well, just pretend that it will\nall end happily ever after anyway.  Maybe if you start a new newsgroup,\nalt.atheist.hard, you won't be bummin' so much?\n\n\n\n\n\n\nBye-Bye, Big Jim.  Don't forget your Flintstone's Chewables!  :) \n--\nBake Timmons, III"

#### 텍스트 전처리

In [128]:
news_df = pd.DataFrame({'document':documents})

In [129]:
news_df

Unnamed: 0,document
0,Well i'm not sure about the story nad it did s...
1,"\n\n\n\n\n\n\nYeah, do you expect people to re..."
2,Although I realize that principle is not one o...
3,Notwithstanding all the legitimate fuss about ...
4,"Well, I will have to change the scoring on my ..."
...,...
11309,"Danny Rubenstein, an Israeli journalist, will ..."
11310,\n
11311,\nI agree. Home runs off Clemens are always m...
11312,I used HP DeskJet with Orange Micros Grappler ...


In [132]:
# Use the .str accessor to apply a string operation to the whole column.
# Replace every non-alphabetic character with a space.
# regex=True must be explicit: pandas 2.0 changed the default to a literal
# match, under which this character-class pattern would match nothing.
news_df['clean_doc'] = news_df['document'].str.replace('[^a-zA-Z]', ' ', regex=True)

  news_df['clean_doc'] = news_df['document'].str.replace('[^a-zA-Z]', ' ')


In [133]:
news_df

Unnamed: 0,document,clean_doc
0,Well i'm not sure about the story nad it did s...,Well i m not sure about the story nad it did s...
1,"\n\n\n\n\n\n\nYeah, do you expect people to re...",Yeah do you expect people to read the ...
2,Although I realize that principle is not one o...,Although I realize that principle is not one o...
3,Notwithstanding all the legitimate fuss about ...,Notwithstanding all the legitimate fuss about ...
4,"Well, I will have to change the scoring on my ...",Well I will have to change the scoring on my ...
...,...,...
11309,"Danny Rubenstein, an Israeli journalist, will ...",Danny Rubenstein an Israeli journalist will ...
11310,\n,
11311,\nI agree. Home runs off Clemens are always m...,I agree Home runs off Clemens are always me...
11312,I used HP DeskJet with Orange Micros Grappler ...,I used HP DeskJet with Orange Micros Grappler ...


In [134]:
# Drop short words: keep only tokens longer than 3 characters.
news_df['clean_doc'] = news_df['clean_doc'].apply(
    lambda text: ' '.join(word for word in text.split() if len(word) > 3)
)

In [135]:
news_df['clean_doc']

0        Well sure about story seem biased What disagre...
1        Yeah expect people read actually accept hard a...
2        Although realize that principle your strongest...
3        Notwithstanding legitimate fuss about this pro...
4        Well will have change scoring playoff pool Unf...
                               ...                        
11309    Danny Rubenstein Israeli journalist will speak...
11310                                                     
11311    agree Home runs Clemens always memorable Kinda...
11312    used DeskJet with Orange Micros Grappler Syste...
11313    argument with Murphy scared hell when came las...
Name: clean_doc, Length: 11314, dtype: object

In [136]:
# Convert every document to lower case.
news_df['clean_doc'] = news_df['clean_doc'].apply(str.lower)

In [139]:
news_df.clean_doc[:3]

0    well sure about story seem biased what disagre...
1    yeah expect people read actually accept hard a...
2    although realize that principle your strongest...
Name: clean_doc, dtype: object

In [152]:
from nltk.corpus import stopwords

In [153]:
stop_words = stopwords.words('english')

In [154]:
tokenized_doc = news_df['clean_doc'].apply(lambda x:x.split())

In [155]:
# Remove stop words. Membership is tested against a set so each lookup is a
# hash probe instead of a scan over the whole stop-word list for every token.
stop_word_set = set(stop_words)
tokenized_doc = tokenized_doc.apply(lambda tokens: [token for token in tokens if token not in stop_word_set])

#### TF-IDF 행렬 만들기

In [156]:
# 역 토큰화 작업
detokenized_doc = []

In [157]:
# Join each token list back into a single string.
# The list is rebuilt in one expression so that re-running this cell does not
# append a second copy of every document to detokenized_doc.
detokenized_doc = [' '.join(tokens) for tokens in tokenized_doc]

In [159]:
detokenized_doc

['well sure story seem biased disagree statement media ruin israels reputation rediculous media israeli media world lived europe realize incidences described letter occured media whole seem ignore subsidizing israels existance europeans least degree think might reason report clearly atrocities shame austria daily reports inhuman acts commited israeli soldiers blessing received government makes holocaust guilt away look jews treating races power unfortunate',
 'yeah expect people read actually accept hard atheism need little leap faith jimmy logic runs steam sorry pity sorry feelings denial faith need well pretend happily ever anyway maybe start newsgroup atheist hard bummin much forget flintstone chewables bake timmons',
 'although realize principle strongest points would still like know question sort arab countries want continue think tank charade fixation israel must stop might start asking sort questions arab countries well realize would work arab countries treatment jews last sever

In [160]:
news_df['clean_doc'] = detokenized_doc

In [161]:
news_df['clean_doc'][0]

'well sure story seem biased disagree statement media ruin israels reputation rediculous media israeli media world lived europe realize incidences described letter occured media whole seem ignore subsidizing israels existance europeans least degree think might reason report clearly atrocities shame austria daily reports inhuman acts commited israeli soldiers blessing received government makes holocaust guilt away look jews treating races power unfortunate'

In [164]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [165]:
vectorizer = TfidfVectorizer(stop_words='english',
                            max_features = 1000, # 상위 1000개 단어 보존
                            max_df = 0.5,
                            smooth_idf=True)

In [166]:
X = vectorizer.fit_transform(news_df['clean_doc'])

In [167]:
X.shape

(11314, 1000)

In [172]:
# tfidf 값
X.toarray()

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.20185845, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.23080563, 0.        ,
        0.        ]])

#### 토픽 모델링. 절단된 SVD(Truncated SVD) 사용
뉴스그룹 데이터가 20개 카테고리를 갖고 있으므로 20개 토픽으로 가정

In [173]:
from sklearn.decomposition import TruncatedSVD

In [174]:
svd_model = TruncatedSVD(n_components = 20, algorithm = 'randomized', n_iter = 100,
                        random_state = 122)

In [175]:
svd_model.fit(X)

TruncatedSVD(n_components=20, n_iter=100, random_state=122)

In [176]:
# components_는 VT에 해당
len(svd_model.components_)

20

In [177]:
np.shape(svd_model.components_)

(20, 1000)

In [178]:
# Vocabulary kept by the vectorizer (1,000 terms).
# get_feature_names() was removed in scikit-learn 1.2; prefer
# get_feature_names_out() and fall back on releases that predate it.
if hasattr(vectorizer, 'get_feature_names_out'):
    terms = list(vectorizer.get_feature_names_out())
else:
    terms = vectorizer.get_feature_names()

In [179]:
terms

['ability',
 'able',
 'accept',
 'access',
 'according',
 'account',
 'action',
 'actions',
 'actual',
 'actually',
 'added',
 'addition',
 'additional',
 'address',
 'administration',
 'advance',
 'advice',
 'agencies',
 'agree',
 'algorithm',
 'allow',
 'allowed',
 'allows',
 'amendment',
 'america',
 'american',
 'americans',
 'analysis',
 'angeles',
 'anonymous',
 'answer',
 'answers',
 'anti',
 'anybody',
 'apparently',
 'appear',
 'appears',
 'apple',
 'application',
 'applications',
 'apply',
 'appreciate',
 'appreciated',
 'approach',
 'appropriate',
 'april',
 'arab',
 'archive',
 'area',
 'areas',
 'argument',
 'arguments',
 'armenia',
 'armenian',
 'armenians',
 'arms',
 'army',
 'article',
 'articles',
 'asked',
 'asking',
 'assume',
 'assuming',
 'atheism',
 'atheists',
 'attack',
 'attempt',
 'author',
 'authority',
 'available',
 'average',
 'avoid',
 'away',
 'background',
 'base',
 'baseball',
 'based',
 'basic',
 'basically',
 'basis',
 'begin',
 'beginning',
 'belief

In [182]:
def get_topics(components, feature_names, n = 5):
    """Print the ``n`` highest-weighted terms of every topic.

    Args:
        components: 2-D array with one row of term weights per topic.
        feature_names: Sequence mapping a column position to its term.
        n: Number of terms to show per topic.
    """
    for topic_no, weights in enumerate(components, start=1):
        # argsort returns positions in ascending weight order; reverse it and
        # keep the first n to get the largest weights first.
        top_positions = weights.argsort()[::-1][:n]
        ranked_terms = []
        for position in top_positions:
            ranked_terms.append((feature_names[position], weights[position].round(5)))
        print("Topic %d:" % topic_no, ranked_terms)

In [183]:
# 각 20개의 행의 각 1,000개의 열 중 가장 값이 큰 5개의 값을 찾아서 단어로 출력
get_topics(svd_model.components_, terms)

Topic 1: [('like', 0.21386), ('know', 0.20046), ('people', 0.19293), ('think', 0.17805), ('good', 0.15128)]
Topic 2: [('thanks', 0.32888), ('windows', 0.29088), ('card', 0.18069), ('drive', 0.17455), ('mail', 0.15111)]
Topic 3: [('game', 0.37064), ('team', 0.32443), ('year', 0.28154), ('games', 0.2537), ('season', 0.18419)]
Topic 4: [('drive', 0.53324), ('scsi', 0.20165), ('hard', 0.15628), ('disk', 0.15578), ('card', 0.13994)]
Topic 5: [('windows', 0.40399), ('file', 0.25436), ('window', 0.18044), ('files', 0.16078), ('program', 0.13894)]
Topic 6: [('chip', 0.16114), ('government', 0.16009), ('mail', 0.15625), ('space', 0.1507), ('information', 0.13562)]
Topic 7: [('like', 0.67086), ('bike', 0.14236), ('chip', 0.11169), ('know', 0.11139), ('sounds', 0.10371)]
Topic 8: [('card', 0.46633), ('video', 0.22137), ('sale', 0.21266), ('monitor', 0.15463), ('offer', 0.14643)]
Topic 9: [('know', 0.46047), ('card', 0.33605), ('chip', 0.17558), ('government', 0.1522), ('video', 0.14356)]
Topic 10

### 실습1

In [186]:
tokenized_doc[:5]

0    [well, sure, story, seem, biased, disagree, st...
1    [yeah, expect, people, read, actually, accept,...
2    [although, realize, principle, strongest, poin...
3    [notwithstanding, legitimate, fuss, proposal, ...
4    [well, change, scoring, playoff, pool, unfortu...
Name: clean_doc, dtype: object

In [185]:
from gensim import corpora

In [187]:
dictionary = corpora.Dictionary(tokenized_doc)

In [188]:
dictionary

<gensim.corpora.dictionary.Dictionary at 0x1cd8cebd490>

In [189]:
corpus = [dictionary.doc2bow(text) for text in tokenized_doc]

In [193]:
# 두번째 뉴스 출력. 단어 인덱스와 개수
corpus[1]

[(52, 1),
 (55, 1),
 (56, 1),
 (57, 1),
 (58, 1),
 (59, 1),
 (60, 1),
 (61, 1),
 (62, 1),
 (63, 1),
 (64, 1),
 (65, 1),
 (66, 2),
 (67, 1),
 (68, 1),
 (69, 1),
 (70, 1),
 (71, 2),
 (72, 1),
 (73, 1),
 (74, 1),
 (75, 1),
 (76, 1),
 (77, 1),
 (78, 2),
 (79, 1),
 (80, 1),
 (81, 1),
 (82, 1),
 (83, 1),
 (84, 1),
 (85, 2),
 (86, 1),
 (87, 1),
 (88, 1),
 (89, 1)]

In [194]:
dictionary[211]

'unfortunately'

In [195]:
len(dictionary)

64281

In [208]:
import gensim

In [209]:
NUM_TOPICS = 20 # 20개의 토픽

In [210]:
# LDA starts from a random assignment and refines the probabilities over repeated passes.
# random_state pins that random start so the learned topics are reproducible across runs.
# id2word: maps token ids back to words using the dictionary.
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word = dictionary,
                                           passes = 15, random_state = 1)

In [211]:
topics = ldamodel.print_topics(num_words=4)

In [212]:
for topic in topics:
    print(topic)

(0, '0.009*"system" + 0.008*"part" + 0.007*"ripem" + 0.007*"theory"')
(1, '0.017*"gordon" + 0.017*"pitt" + 0.016*"banks" + 0.015*"soon"')
(2, '0.030*"game" + 0.023*"team" + 0.023*"games" + 0.016*"play"')
(3, '0.012*"candida" + 0.009*"terminals" + 0.008*"accelerator" + 0.007*"syndrome"')
(4, '0.025*"year" + 0.015*"players" + 0.014*"league" + 0.013*"last"')
(5, '0.020*"president" + 0.011*"going" + 0.008*"jobs" + 0.008*"think"')
(6, '0.038*"scsi" + 0.018*"remark" + 0.011*"judges" + 0.011*"byte"')
(7, '0.010*"like" + 0.009*"would" + 0.008*"time" + 0.006*"good"')
(8, '0.045*"jesus" + 0.023*"bible" + 0.022*"christian" + 0.019*"christ"')
(9, '0.033*"space" + 0.012*"nasa" + 0.008*"data" + 0.007*"entries"')
(10, '0.016*"would" + 0.014*"people" + 0.008*"think" + 0.007*"know"')
(11, '0.018*"period" + 0.013*"chicago" + 0.013*"boston" + 0.012*"power"')
(12, '0.026*"health" + 0.022*"medical" + 0.014*"disease" + 0.012*"patients"')
(13, '0.016*"armenian" + 0.014*"israel" + 0.014*"armenians" + 0.013*"j

In [213]:
import pyLDAvis.gensim
pyLDAvis.enable_notebook()

In [214]:
# NOTE(review): the saved run of this cell failed with
# "ValidationError: Not all rows (distributions) in topic_term_dists sum to 1.",
# so `vis` is never assigned for the display cell that follows.
# The cause is not established from this notebook — TODO investigate.
# Also confirm the installed pyLDAvis still exposes `pyLDAvis.gensim`;
# newer releases appear to ship it as `pyLDAvis.gensim_models`.
vis = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)

ValidationError: 
 * Not all rows (distributions) in topic_term_dists sum to 1.

In [None]:
pyLDAvis.display(vis)