In [1]:
import pandas as pd

### LDA 실습
- 'movie_review.csv' 다운로드 및 전처리 
- LDA 모델 생성 (num of topics : 10)
- 토픽 별 가장 중요한 단어 5개 출력

### 1. 데이터 불러오기 

In [2]:
df = pd.read_csv('./movie_review.csv')

In [3]:
review = df[['review']].iloc[:20000].copy()

### 2. 전처리

In [4]:
import re

In [5]:
# 텍스트 cleaning 함수
def clean_text(text):
    
    pattern = re.compile('[\u3131-\u3163\uac00-\ud7a3]+')
    text = re.sub(pattern = pattern,repl = ' ',string = text) #한글삭제(인코딩 오류인지,,, 셿 이런 단어가 생기길래 삭제,,)
    
    pattern = '(\[a-zA-Z0-9\_.+-\]+@\[a-zA-Z0-9]+.\[a-zA-Z0-9-.\]+)' # email제거
    text = re.sub(pattern = pattern,repl = ' ',string = text)
    
    pattern = re.compile(r'([^\w]?\d+\.?\,?\)?\d*)+') # 숫자 제거
    text = re.sub(pattern = pattern,repl = ' ',string = text)
    
    pattern = '<[^>]*>' # html 태그 제거
    text = re.sub(pattern = pattern,repl = ' ',string = text)
    
    pattern = '[\r|\n]' # \r,\n 제거
    text = re.sub(pattern = pattern,repl = ' ',string = text)
    
    pattern =  '[^\w\s]' # 특수기호 제거
    text = re.sub(pattern = pattern,repl = ' ',string = text)
    
    pattern = re.compile(r'\s+')  #  이중 space 제거
    text = re.sub(pattern = pattern,repl = ' ',string = text)

    text = text.lower()
    return text

In [6]:
review = review['review'].apply(clean_text)
review

0        one of the other reviewers has mentioned that ...
1        a wonderful little production the filming tech...
2        i thought this was a wonderful way to spend ti...
3        basically there s a family where a little boy ...
4        petter mattei s love in the time of money is a...
                               ...                        
19995    ok for starters taxi driver is amazing this th...
19996    it s sort of hard for me to say it because i s...
19997    i still liked it though warren beatty is only ...
19998    we could still use black adder even today imag...
19999    this so called documentary tries to tell that ...
Name: review, Length: 20000, dtype: object

In [7]:
# Tokenizing
from nltk.tokenize import word_tokenize

In [8]:
review_token = review.apply(word_tokenize)
review_token

0        [one, of, the, other, reviewers, has, mentione...
1        [a, wonderful, little, production, the, filmin...
2        [i, thought, this, was, a, wonderful, way, to,...
3        [basically, there, s, a, family, where, a, lit...
4        [petter, mattei, s, love, in, the, time, of, m...
                               ...                        
19995    [ok, for, starters, taxi, driver, is, amazing,...
19996    [it, s, sort, of, hard, for, me, to, say, it, ...
19997    [i, still, liked, it, though, warren, beatty, ...
19998    [we, could, still, use, black, adder, even, to...
19999    [this, so, called, documentary, tries, to, tel...
Name: review, Length: 20000, dtype: object

In [None]:
[print(x) for x in review_token if ]

In [9]:
#불용어 제거
import nltk
from nltk.corpus import stopwords
import string

In [10]:
stopwords   = set(stopwords.words('english'))

In [11]:
def remove_stopwords(text):
    remove_sw = [w for w in text if w not in stopwords]
    return remove_sw

In [12]:
review_sw = review_token.apply(remove_stopwords)
review_sw

0        [one, reviewers, mentioned, watching, oz, epis...
1        [wonderful, little, production, filming, techn...
2        [thought, wonderful, way, spend, time, hot, su...
3        [basically, family, little, boy, jake, thinks,...
4        [petter, mattei, love, time, money, visually, ...
                               ...                        
19995    [ok, starters, taxi, driver, amazing, taxi, dr...
19996    [sort, hard, say, greatly, enjoyed, targets, p...
19997    [still, liked, though, warren, beatty, fair, c...
19998    [could, still, use, black, adder, even, today,...
19999    [called, documentary, tries, tell, usa, faked,...
Name: review, Length: 20000, dtype: object

In [15]:
#stemming
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()

In [16]:
def stemming(text):
    stemmed_text = [porter.stem(t) for t in text]
    return stemmed_text

In [17]:
review_stem = review_sw.apply(stemming)
review_stem

0        [one, review, mention, watch, oz, episod, hook...
1        [wonder, littl, product, film, techniqu, unass...
2        [thought, wonder, way, spend, time, hot, summe...
3        [basic, famili, littl, boy, jake, think, zombi...
4        [petter, mattei, love, time, money, visual, st...
                               ...                        
19995    [ok, starter, taxi, driver, amaz, taxi, driver...
19996    [sort, hard, say, greatli, enjoy, target, pape...
19997    [still, like, though, warren, beatti, fair, co...
19998    [could, still, use, black, adder, even, today,...
19999    [call, documentari, tri, tell, usa, fake, moon...
Name: review, Length: 20000, dtype: object

In [18]:
review_stem = review_stem.tolist()

In [19]:
# Encode tokens to integers
from gensim import corpora
dictionary = corpora.Dictionary(review_stem)
dictionary.save('en.dict')  # save dictionary to file for future use

In [20]:
#Calculate TF-IDF
from gensim import models
tf = [dictionary.doc2bow(text) for text in review_stem]
tfidf_model = models.TfidfModel(tf)
tfidf = tfidf_model[tf]
corpora.MmCorpus.serialize('en.mm', tfidf) # save corpus to file for future use

### 3. LDA

In [21]:
ntopics, nwords = 10, 15

In [22]:
import numpy as np; np.random.seed(42)  # optional
lda = models.ldamodel.LdaModel(tfidf, id2word=dictionary, num_topics=ntopics)

### 4. 토픽 별 중요한 단어 5개씩 출력

In [23]:
for i in range(ntopics):
    print(f"Topic {i+1 }:")
    words = lda.show_topic(i, topn = 5)
    [print(j[0], end = " ") for j in words]
    print('\n')

Topic 1:
movi film watch good like 

Topic 2:
robin lynch hitler york mad 

Topic 3:
vietnam scarlett murphi elvi poetri 

Topic 4:
snake rental cain muppet columbo 

Topic 5:
twin damm doo scoobi scariest 

Topic 6:
madonna hollow scarecrow cabin email 

Topic 7:
fairi keaton superman sinatra york 

Topic 8:
felix jan latin priceless kutcher 

Topic 9:
sandra tarzan bullock turkish leland 

Topic 10:
snipe nun wesley dud argento 

