# 토픽 모델링

## 1. 데이터 불러오기 및 전처리

In [1]:
!pip install pyLDAvis
!pip install konlpy



In [2]:
from google.colab import drive
# Mount Google Drive into the Colab filesystem at /content/drive.
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
cd drive/MyDrive/토픽

/content/drive/MyDrive/토픽


In [4]:
import numpy as np
import random
import os

def seed_everything(seed: int = 42):
    """Seed the stdlib and NumPy global generators and record the seed.

    Args:
        seed: Integer seed applied to ``random`` and ``numpy.random``; its
            string form is also stored in the ``PYTHONHASHSEED`` environment
            variable.
    """
    # Seed the stdlib generator first, then NumPy's global generator.
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    # Environment values must be strings.
    os.environ["PYTHONHASHSEED"] = str(seed)


# Fix the seed used by the cells that follow.
seed_everything(2021)

In [5]:
import pandas as pd
import numpy as np

# Load the review table from a UTF-8 CSV located in the current working directory.
data = pd.read_csv('review_checked_final.csv', encoding='utf-8')

In [None]:
data

Unnamed: 0,title,review_checked
0,(무삭제) 귀멸의 칼날,사실 만화가 명작이라기보단 유포 더블이 레전드다 하는 게 더 옳음 사실 만화의 전개...
1,(무삭제) 귀멸의 칼날,이걸 굳이 잔인하게 만들어서 세 걸어 놓은 이유랑 왤캐 인기가 많은지 모르겠네 그냥...
2,(무삭제) 귀멸의 칼날,나도 귀탈 나쁘지 않게 킬링 욕으로 봤는데 귀탈이 강연 진격을 넘는다는 이상한 말만...
3,(무삭제) 귀멸의 칼날,욕먹고 망해가는 작품을 애니가 살림 연출 작화 사운드 모두 만점이지만 진부한 전개와...
4,(무삭제) 귀멸의 칼날,솔직히 애니는 진짜 잘 만들었다 작화도 좋고 액션 연출 브금 타이밍까지 잘 만든 애...
...,...,...
168424,소나기,이거 학교에서 보여줬는데 눈물은 안 났음
168425,소나기,솔직히 연출이나 이어나가는 스토리가 그렇게 자연스러운 편이 아니라고 생각함 근데 너...
168426,소나기,보고 싶다
168427,소나기,일단 개인적인 생각인데 설명이 좀 부족한 것 같음 갑작스러운 연출도 적지 않게 있기...


In [None]:
from konlpy.tag import Komoran
# Single Komoran analyzer instance shared by the tokenizer below.
komo = Komoran()

# Nouns that the tokenizer discards after noun extraction.
stopwords = [
    '애니', '감사', '이랑', '생각', '이건', '극장판', '마지막', '정도', '때문',
    '정주', '기대', '일본', '최고', '이번', '멤버', '사랑', '천관', '사복',
    '소장', '판권', '화성', '사랑', '모자이크', '사랑해', '다음', '한국', '중국',
]


# Keep only nouns that are at least two characters long.
def tokenize(doc):
    """Extract the non-stopword nouns of two or more characters from a review.

    Args:
        doc: Review text. Anything that is not a non-blank string (for
            example the float NaN pandas uses for a missing cell) is treated
            as containing no tokens.

    Returns:
        list[str]: Nouns reported by the module-level ``komo`` analyzer, in
        the order it returns them, excluding one-character nouns and any
        entry of the module-level ``stopwords``.
    """
    # Guard before calling the analyzer so that missing or blank reviews
    # yield an empty token list instead of being passed on as non-text input.
    if not isinstance(doc, str) or not doc.strip():
        return []
    return [
        word
        for word in komo.nouns(doc)
        if len(word) > 1 and word not in stopwords
    ]

In [None]:
# Tokenization: token_data[i] holds the noun list of the i-th review.
token_data = [tokenize(review) for review in data['review_checked']]

In [None]:
# Documents with fewer than this many distinct tokens are removed.
# set() is used so that a document repeating the same one or two words
# is removed as well.
MIN_UNIQUE_TOKENS = 3

# Kept under its original name so the discarded documents stay inspectable.
drop_corpus = [doc for doc in token_data if len(set(doc)) < MIN_UNIQUE_TOKENS]

# Filter in one pass instead of calling list.remove() once per dropped
# document (each call rescans the list). The result is the same: any document
# equal to a dropped one is itself below the threshold and is dropped too.
# Slice assignment keeps the same list object, as the in-place removal did.
token_data[:] = [doc for doc in token_data if len(set(doc)) >= MIN_UNIQUE_TOKENS]

## 2. LDA 파라미터 튜닝

### BoW

In [None]:
from gensim import corpora
# Build the vocabulary over all tokenized documents.
dictionary = corpora.Dictionary(token_data)
# Convert every document to its bag-of-words representation.
corpus = list(map(dictionary.doc2bow, token_data))
# Show the first document's bag-of-words as a quick check.
print(corpus[0])

[(0, 2), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 3), (7, 1), (8, 1), (9, 1), (10, 1), (11, 2), (12, 1), (13, 1), (14, 1), (15, 1)]


In [None]:
from gensim.models.coherencemodel import CoherenceModel
from gensim.models.ldamulticore import LdaMulticore

### 2-1. epoch 반복 횟수 설정

In [None]:
# Coherence and log-perplexity collected per `passes` setting, in order.
coherences = []
perplexities = []

# The number of topics is held fixed while `passes` is varied.
ntopics = 10
# passes values to try: 1, 5, 10, ..., 45
pass_counts = [1] + list(range(5, 50, 5))

for p in pass_counts:
    lda = LdaMulticore(corpus, id2word=dictionary, num_topics=ntopics, passes=p, random_state=2021, workers=3)
    print('epoch', p)

    cm = CoherenceModel(model=lda, texts=token_data, dictionary=dictionary, coherence='c_v')
    coherence = cm.get_coherence()
    print("Coherence", coherence)
    coherences.append(coherence)

    # Score the corpus once so the printed and the stored value are the same
    # number and the evaluation is not repeated.
    # NOTE(review): gensim documents log_perplexity as a per-word likelihood
    # bound rather than the perplexity itself - confirm how it should be read.
    perplexity = lda.log_perplexity(corpus)
    print('Perplexity: ', perplexity, '\n\n')
    perplexities.append(perplexity)

In [None]:
import matplotlib.pyplot as plt

# The x positions must be the passes values the models were trained with
# (1, 5, 10, ..., 45); range(0, 50, 5) would place the first point at 0.
x = [1] + list(range(5, 50, 5))

# Top panel: coherence per passes setting.
plt.subplot(211)
plt.plot(x, coherences)
plt.xlabel('number of passes')
plt.ylabel('coherence score')

# Bottom panel: log-perplexity per passes setting.
plt.subplot(212)
plt.plot(x, perplexities)
plt.xlabel('number of passes')
plt.ylabel('perplexity score')

### 2-2. 토픽 개수 설정

In [None]:
import warnings
# Install a filter that ignores DeprecationWarning messages.
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [None]:
# Coherence and log-perplexity of the bag-of-words models, one entry per
# topic count from 1 to 10.
coherencesT = []
perplexitiesT = []

for num_topics in range(1, 11):
    lda4 = LdaMulticore(corpus, id2word=dictionary, num_topics=num_topics, passes=3, random_state=2021, workers=3)
    print('ntopics', num_topics)

    cm = CoherenceModel(model=lda4, texts=token_data, dictionary=dictionary, coherence='c_v')
    coherence = cm.get_coherence()
    print("Coherence", coherence)
    coherencesT.append(coherence)

    # Score the corpus once so the printed and the stored value are the same
    # number and the evaluation is not repeated.
    perplexity = lda4.log_perplexity(corpus)
    print('Perplexity: ', perplexity, '\n\n')
    perplexitiesT.append(perplexity)

In [None]:
import matplotlib.pyplot as plt

# Topic counts evaluated: 1 through 10.
x = range(1, 11)

# One panel per metric: (subplot position, series, y-axis label).
panels = (
    (211, coherencesT, 'coherence score'),
    (212, perplexitiesT, 'perplexity score'),
)
for position, series, y_label in panels:
    plt.subplot(position)
    plt.plot(x, series)
    plt.xlabel('number of topics')
    plt.ylabel(y_label)

### TF-IDF

In [None]:
from gensim import corpora, models
# Vocabulary and bag-of-words corpus built from the tokenized documents.
dictionary = corpora.Dictionary(token_data)
corpus = list(map(dictionary.doc2bow, token_data))
# Fit a TF-IDF model on the bag-of-words corpus, then apply it to that corpus.
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

In [None]:
from gensim.models.coherencemodel import CoherenceModel
from gensim.models.ldamodel import LdaModel

In [None]:
import warnings
# Install a filter that ignores DeprecationWarning messages.
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [None]:
# Coherence and log-perplexity of the TF-IDF models, one entry per topic
# count from 1 to 10.
coherencesTF = []
perplexitiesTF = []

for num_topics in range(1, 11):
    lda4 = LdaModel(corpus_tfidf, id2word=dictionary, num_topics=num_topics, passes=30, random_state=2021)
    print('ntopics', num_topics)

    cm = CoherenceModel(model=lda4, texts=token_data, dictionary=dictionary, coherence='c_v')
    coherence = cm.get_coherence()
    print("Coherence", coherence)
    # Store in the TF-IDF lists created above; appending to the bag-of-words
    # lists would leave these empty and make the other lists twice as long.
    coherencesTF.append(coherence)

    # NOTE(review): the model is trained on corpus_tfidf but scored on the
    # bag-of-words `corpus` - confirm this is the intended evaluation set.
    perplexity = lda4.log_perplexity(corpus)
    print('Perplexity: ', perplexity, '\n\n')
    perplexitiesTF.append(perplexity)

In [None]:
import matplotlib.pyplot as plt

# Topic counts evaluated: 1 through 10.
x = range(1, 11)

# Top panel: coherence of the TF-IDF models against the bag-of-words models.
plt.subplot(211)
plt.plot(x, coherencesTF, label='TF-IDF')
plt.plot(x, coherencesT, label='BoW')
plt.xlabel('number of topics')
plt.ylabel('coherence score')
plt.legend()

# Bottom panel: the same comparison for log-perplexity. The second curve is
# the bag-of-words series, mirroring the coherence panel, rather than the
# TF-IDF series drawn twice.
plt.subplot(212)
plt.plot(x, perplexitiesTF, label='TF-IDF')
plt.plot(x, perplexitiesT, label='BoW')
plt.xlabel('number of topics')
plt.ylabel('perplexity score')
plt.legend()

## 3. 최적모델 시각화

In [None]:
!pip install pyLDAvis
!pip install konlpy

In [None]:
import pandas as pd
import numpy as np
import random
import os

def seed_everything(seed: int = 42):
    """Seed the stdlib and NumPy global generators and record the seed.

    Args:
        seed: Integer seed applied to ``random`` and ``numpy.random``; its
            string form is also stored in the ``PYTHONHASHSEED`` environment
            variable.
    """
    # Seed the stdlib generator first, then NumPy's global generator.
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    # Environment values must be strings.
    os.environ["PYTHONHASHSEED"] = str(seed)


# Fix the seed used by the cells that follow.
seed_everything(2021)

In [7]:
from gensim.models.coherencemodel import CoherenceModel
from gensim.models.ldamodel import LdaModel

In [6]:
from gensim import corpora
import pickle

In [8]:
# Load the saved gensim vocabulary from the working directory.
dictionary = corpora.Dictionary.load('dictionary.gensim')
# Open the pickled corpus in a context manager so the file handle is closed
# once loading finishes.
# NOTE: unpickling can execute arbitrary code; only load a corpus.pkl that
# was produced by a trusted source.
with open('corpus.pkl', 'rb') as corpus_file:
    corpus = pickle.load(corpus_file)

In [13]:
# Model used for the visualization: 3 topics, 30 passes, fixed random state.
model3 = LdaModel(
    corpus,
    id2word=dictionary,
    num_topics=3,
    passes=30,
    random_state=2021,
)

In [11]:
import warnings
# Install a filter that ignores DeprecationWarning messages.
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [14]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# Switch pyLDAvis to notebook output mode.
pyLDAvis.enable_notebook()
# Prepare the visualization data from the trained model, its corpus and vocabulary.
vis = gensimvis.prepare(model3, corpus, dictionary)
# Show the prepared visualization as the cell output.
pyLDAvis.display(vis)


  by='saliency', ascending=False).head(R).drop('saliency', 1)


In [15]:
# Write the prepared visualization to lda.html in the working directory.
pyLDAvis.save_html(vis, 'lda.html')

## 4. Lda2Vec

In [None]:
from lda2vec import LDA2Vec

In [None]:
# NOTE(review): LDA2Vec is constructed with no arguments and the resulting
# object is not used in the visible part of the notebook - confirm that the
# constructor accepts an argument-free call.
model_vec = LDA2Vec()