<a href="https://colab.research.google.com/github/Neulvo/TIL/blob/master/12_Prac_Topic_Modeling_LSA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 토픽 모델링 (Topic Modeling)

## 0. 환경

## 1. 잠재의미 분석 (Latent Semantic Analysis, LSA)

### 1.1 직접 구현

1) 직접 구현

In [None]:
import numpy as np
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import randomized_svd

class LSA:
    """Latent Semantic Analysis over a small, whitespace-tokenised corpus.

    The constructor builds a term-document count matrix from ``doc_ls``,
    factorises it with a truncated SVD and keeps the first ``topic_num``
    singular triplets as term vectors, document vectors and a low-rank
    reconstruction of the count matrix.

    Attributes set by ``__init__``:
        doc_ls, topic_num: the constructor arguments.
        term2idx / idx2term: vocabulary <-> row index of the count matrix.
        doc2idx / idx2doc: document text <-> column index of the count matrix.
        tdm: (n_terms, n_docs) count matrix.
        U, s, VT: factors returned by ``SVD``.
        term_mat: (n_terms, k) term vectors, ``U_k * s_k``.
        doc_mat: (n_docs, k) document vectors, ``(diag(s_k) @ VT_k).T``.
        term_doc_mat: (n_terms, n_docs) rank-k reconstruction of ``tdm``.

    All matrices are plain ``numpy.ndarray`` objects (not ``numpy.matrix``),
    because recent scikit-learn releases refuse ``numpy.matrix`` input.
    """

    # Lower bound on the number of singular triplets computed, so that
    # TopicModeling() can list up to six topics even when topic_num is smaller.
    _MIN_COMPONENTS = 6

    def __init__(self, doc_ls, topic_num):
        """Fit the model.

        Args:
            doc_ls: list of documents, each a whitespace-separated string.
            topic_num: number of latent dimensions kept for the vectors.
        """
        self.doc_ls = doc_ls
        self.topic_num = topic_num

        self.term2idx, self.idx2term = self.toIdxDict(' '.join(doc_ls).split())

        # Document indices must follow list positions because the columns of
        # the count matrix do; de-duplicating here would shift every index
        # after a repeated document.
        self.doc2idx, self.idx2doc = self._doc_index(doc_ls)

        self.tdm = self.TDM(doc_ls)
        self.U, self.s, self.VT = self.SVD(self.tdm)

        self.term_mat = self.TermVectorMatrix(self.U, self.s, topic_num)
        self.doc_mat = self.DocVectorMatrix(self.s, self.VT, topic_num)

        self.term_doc_mat = self.TermDocVectorMatrix(self.U, self.s, self.VT, topic_num)

    def toIdxDict(self, ls):
        """Build value -> index and index -> value mappings for ``ls``.

        Indices are assigned in order of first appearance; repeated values
        share one index.

        Returns:
            (any2idx, idx2any) as plain dicts. Looking up a value that was
            not in ``ls`` raises ``KeyError`` instead of silently allocating
            a new index.
        """
        any2idx = {}
        for item in ls:
            any2idx.setdefault(item, len(any2idx))
        idx2any = {idx: item for item, idx in any2idx.items()}
        return any2idx, idx2any

    def _doc_index(self, doc_ls):
        """Map each document to its column position in the count matrix.

        A document that occurs more than once maps to its first position,
        while every position keeps its own entry in the reverse mapping.
        """
        doc2idx = {}
        for idx, doc in enumerate(doc_ls):
            doc2idx.setdefault(doc, idx)
        return doc2idx, dict(enumerate(doc_ls))

    def TDM(self, doc_ls):
        """Return the (n_terms, n_docs) term-document count matrix.

        Rows follow ``self.term2idx``; columns follow the order of ``doc_ls``.

        Raises:
            KeyError: if a document contains a term missing from the
                vocabulary.
        """
        tdm = np.zeros([len(self.term2idx), len(doc_ls)])
        for doc_idx, doc in enumerate(doc_ls):
            for term in doc.split():
                tdm[self.term2idx[term], doc_idx] += 1
        return tdm

    def SVD(self, tdm):
        """Truncated singular value decomposition of ``tdm``.

        Computes at least ``_MIN_COMPONENTS`` components and never fewer than
        ``self.topic_num``, so the requested number of latent dimensions is
        actually available to the vector matrices.

        Returns:
            (U, s, VT) as returned by ``randomized_svd``.
        """
        n_components = max(self.topic_num, self._MIN_COMPONENTS)
        U, s, VT = randomized_svd(tdm, n_components=n_components, n_iter=20, random_state=None)
        return U, s, VT

    def TopicModeling(self, topic_num=3, top_n=None):
        """Print and return the highest-loading terms of the leading topics.

        Args:
            topic_num: number of topics to report; clamped to the number of
                components held in ``self.U``.
            top_n: number of terms listed per topic. Defaults to
                ``topic_num``, which reproduces the earlier behaviour.

        Returns:
            One list per topic of ``(term, score)`` tuples, with the score
            rounded to three decimals.
        """
        if top_n is None:
            top_n = topic_num

        topics = []
        for i in range(min(topic_num, self.U.shape[1])):
            scores = self.U[:, i]
            order = np.argsort(-scores)[:top_n]
            # float() keeps the printed form independent of the NumPy version.
            keywords = [(self.idx2term[j], float(scores[j].round(3))) for j in order]
            print("Topic {} - {}".format(i + 1, keywords))
            topics.append(keywords)
        return topics

    def TermDocVectorMatrix(self, u, s, vt, topic_num):
        """Return the rank-``topic_num`` reconstruction ``U_k diag(s_k) VT_k``."""
        return (u[:, :topic_num] * s[:topic_num]) @ vt[:topic_num, :]

    def GetTermVector(self, term):
        """Print and return the (1, k) vector of ``term``.

        Raises:
            KeyError: if ``term`` is not in the vocabulary.
        """
        idx = self.term2idx[term]
        # Slice rather than index so the result stays two-dimensional, which
        # is the shape cosine_similarity expects.
        vec = self.term_mat[idx:idx + 1]
        print("{} = {}".format(term, vec))
        return vec

    def GetDocVector(self, doc):
        """Print and return the (1, k) vector of ``doc``.

        Raises:
            KeyError: if ``doc`` is not one of the fitted documents.
        """
        idx = self.doc2idx[doc]
        vec = self.doc_mat[idx:idx + 1]
        print("{} = {}".format(doc, vec))
        return vec

    def TermVectorMatrix(self, u, s, topic_num):
        """Return the (n_terms, k) term vectors ``U_k diag(s_k)``."""
        # Scaling column j by s[j] is the same product as U_k @ diag(s_k).
        return u[:, :topic_num] * s[:topic_num]

    def DocVectorMatrix(self, s, vt, topic_num):
        """Return the (n_docs, k) document vectors ``(diag(s_k) VT_k).T``."""
        return vt[:topic_num, :].T * s[:topic_num]

    def GetTermSimilarity(self, term1, term2):
        """Print and return the cosine similarity of two term vectors.

        Both vectors are printed as well, because they are fetched through
        ``GetTermVector``. The return value is the (1, 1) similarity array.
        """
        sim = cosine_similarity(self.GetTermVector(term1), self.GetTermVector(term2))
        print('({},{}) term similarity = {}'.format(term1, term2, sim[0][0]))
        return sim

    def GetDocSimilarity(self, doc1, doc2):
        """Print and return the cosine similarity of two document vectors.

        Both vectors are printed as well, because they are fetched through
        ``GetDocVector``. The return value is the (1, 1) similarity array.
        """
        sim = cosine_similarity(self.GetDocVector(doc1), self.GetDocVector(doc2))
        print('({},{}) doc similarity = {}'.format(doc1, doc2, sim[0][0]))
        return sim

    def Compression(self, round_num=0):
        """Print the count matrix followed by its rounded low-rank version."""
        print(self.tdm)
        print(self.term_doc_mat.round(round_num))
        
        
    

In [None]:
# Toy corpus: each document is a whitespace-separated list of food names.
doc_ls = [
    '바나나 사과 포도 포도 짜장면',
    '사과 포도',
    '포도 바나나',
    '짜장면 짬뽕 탕수육',
    '볶음밥 탕수육',
    '짜장면 짬뽕',
    '라면 스시',
    '스시 짜장면',
    '가츠동 스시 소바',
    '된장찌개 김치찌개 김치',
    '김치 된장 짜장면',
    '비빔밥 김치',
]

# Fit a three-topic model and show the raw term-document counts.
lsa = LSA(doc_ls, 3)
X = lsa.TDM(doc_ls)
print(X)

print('== 토픽 모델링 ==')
lsa.TopicModeling(3)

# Vectors of individual terms.
print('\n== 단어 벡터 ==')
for term in ('사과', '포도'):
    lsa.GetTermVector(term)

# Pairwise term similarities.
print('\n== 단어 유사도 ==')
term_pairs = (
    ('사과', '바나나'),
    ('사과', '짜장면'),
    ('포도', '짜장면'),
    ('사과', '스시'),
)
for left_term, right_term in term_pairs:
    lsa.GetTermSimilarity(left_term, right_term)

# Vectors of individual documents.
print('\n== 문서 벡터 ==')
for doc in ('사과 포도', '짜장면 짬뽕'):
    lsa.GetDocVector(doc)

# Pairwise document similarities.
print('\n== 문서 유사도 ==')
doc_pairs = (
    ('사과 포도', '포도 바나나'),
    ('사과 포도', '바나나 사과 포도 포도 짜장면'),
)
for left_doc, right_doc in doc_pairs:
    lsa.GetDocSimilarity(left_doc, right_doc)

# Count matrix next to its low-rank reconstruction, rounded to integers.
print('\n== 토픽 차원수로 압축 ==')
lsa.Compression(0)

[[1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [2. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 1. 0. 1. 0. 1. 0. 0. 1. 0.]
 [0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 1. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]]
== 토픽 모델링 ==
Topic 1 - [('포도', 0.697), ('짜장면', 0.486), ('바나나', 0.348)]
Topic 2 - [('짜장면', 0.584), ('짬뽕', 0.356), ('김치', 0.337)]
Topic 3 - [('김치', 0.611), ('된장찌개', 0.264), ('김치찌개', 0.264)]

== 단어 벡터 ==
사과 = [[ 1.1233207  -0.47387139  0.03306518]]
포도 = [[ 2.24664139 -0.94774279  0.06613036]]

== 단어 유사도 ==
사과 = [[ 1.1233207  -0.47387139  0.03306518]]
바나나 = [[ 1.1233207  -0.47