# 네이버 영화 감성분석
 - Tokenizer 함수
 - TfidfVectorizer

In [1]:
!pip install konlpy

Collecting konlpy
  Downloading konlpy-0.6.0-py2.py3-none-any.whl (19.4 MB)
[K     |████████████████████████████████| 19.4 MB 1.1 MB/s 
Collecting JPype1>=0.7.0
  Downloading JPype1-1.3.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (448 kB)
[K     |████████████████████████████████| 448 kB 33.0 MB/s 
Installing collected packages: JPype1, konlpy
Successfully installed JPype1-1.3.0 konlpy-0.6.0


In [2]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [4]:
# Colab-only: prompt for local files to upload into the runtime.
from google.colab import files
# `up` is keyed by uploaded file name (its keys are listed in the next cell).
up = files.upload()

Saving naver_movie_test_전처리완료.tsv to naver_movie_test_전처리완료.tsv
Saving naver_movie_train_전처리완료.tsv to naver_movie_train_전처리완료.tsv


In [28]:
# Show the names of the uploaded files.
list(up.keys())

['naver_movie_test_전처리완료.tsv', 'naver_movie_train_전처리완료.tsv']

In [31]:
# Select each file by name rather than by position: the key order of `up`
# depends on how the files were uploaded, so fixed indices can silently swap
# the train and test sets.
train_path = next(name for name in up if 'train' in name)
test_path = next(name for name in up if 'test' in name)
train_df = pd.read_csv(train_path, sep='\t')
test_df = pd.read_csv(test_path, sep='\t')
train_df.shape, test_df.shape

((145393, 3), (48852, 3))

In [32]:
# Preview the first three training rows.
train_df.head(3)

Unnamed: 0,id,document,label
0,9976970,아 더빙 진짜 짜증나네요 목소리,0
1,3819312,흠 포스터보고 초딩영화줄 오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0


In [33]:
from konlpy.tag import Okt
# Single shared Okt instance; okt_tokenizer calls okt.morphs on it for every document.
okt = Okt()

In [34]:
# Tokens that okt_tokenizer filters out of the morpheme list.
# Each entry appears exactly once (the earlier version listed '에' twice).
stopwords = ['의', '가', '이', '은', '들', '는', '좀', '잘', '걍', '과', '도', '를', '으로',
             '자', '에', '와', '한', '하다', '을', 'ㅋㅋ', 'ㅠㅠ', 'ㅎㅎ', '에게', '구만', '구먼']

# Alternative: build a stopword list from a whitespace-separated string
# a = '은 는 이 가'.split()

In [35]:
# Tokenizer function: turns a string into a list of tokens
def okt_tokenizer(text):
    """Split `text` into stemmed Okt morphemes and drop the stopwords.

    Args:
        text: The string to tokenize.

    Returns:
        A list of the morphemes from ``okt.morphs(text, stem=True)``, in their
        original order, excluding any morpheme found in the module-level
        ``stopwords`` collection.
    """
    kept = []
    for morph in okt.morphs(text, stem=True):
        if morph in stopwords:
            continue
        kept.append(morph)
    return kept

In [36]:
# Sanity-check the tokenizer on a sample sentence.
okt_tokenizer('열심히 일한 당신 주말엔 여행을 떠나봐요.')

['열심히', '일', '당신', '주말', '엔', '여행', '떠나다', '보다', '.']

 - pipeline으로 feature 변환과 분류를 동시에 진행

In [37]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

In [38]:
pipeline = Pipeline([('TFIDF', TfidfVectorizer(tokenizer = okt_tokenizer)), 
                     ('LR', LogisticRegression(random_state=2022))])
%time pipeline.fit(train_df.document, train_df.label)

CPU times: user 5min 24s, sys: 6.35 s, total: 5min 31s
Wall time: 5min 17s


Pipeline(steps=[('TFIDF',
                 TfidfVectorizer(tokenizer=<function okt_tokenizer at 0x7ff64a6beb90>)),
                ('LR', LogisticRegression(random_state=2022))])

In [40]:
# Score the fitted pipeline on the test split.
pipeline.score(test_df.document, test_df.label)

0.8427290591992139

- 실제 데이터 적용

In [47]:
import re

reviews = ['모든 국민이 봤으면 하는 영화입니다.',
           '생각보다 지루하고 별로였네요... 보면서 좀 졸았습니다.']
# Replace every character outside the Hangul syllable range with a space.
# A list is built (rather than a lazy `map`) so `reviews` can be passed to
# pipeline.predict more than once; a map iterator is exhausted after one pass.
reviews = [re.sub('[^가-힣]', ' ', review) for review in reviews]

In [48]:
# Predict a label for each cleaned review.
pipeline.predict(reviews)

array([1, 0])

 - 최적 파라미터 찾기
  * 매 시행마다 한글 형태소 분석하느라 시간이 오래 걸린다
  * 최적 파라미터를 찾으려면 한글 형태소 분석을 먼저 한 데이터로 해야 한다

In [27]:
from sklearn.model_selection import GridSearchCV
# Grid keys have the form '<pipeline step name>__<parameter>'. The vectorizer
# step is named 'TFIDF' in the pipeline, so the earlier 'TVECT__ngram_range'
# key did not match any step; every key now refers to 'TFIDF' or 'LR'.
params = {'TFIDF__ngram_range': [(1,1),(1,2)], 'TFIDF__max_df': [0.95, 0.98], 'LR__C': [1,5]}

In [51]:
# 3-fold grid search over `params`, scored by accuracy.
grid_pipe = GridSearchCV(estimator=pipeline, param_grid=params, scoring='accuracy', cv=3)
#%time grid_pipe.fit(train_df.document, train_df.label) # takes too long to run

 - CountVectorizer 사례에서 찾은 최적 파라미터로 평가하기



In [45]:
pipeline = Pipeline([('TFIDF', TfidfVectorizer(tokenizer = okt_tokenizer, max_df=0.95, ngram_range=(1,2))), 
                     ('LR', LogisticRegression(random_state=2022))])
%time pipeline.fit(train_df.document, train_df.label)

CPU times: user 8min 20s, sys: 20.3 s, total: 8min 40s
Wall time: 8min 23s


Pipeline(steps=[('TFIDF',
                 TfidfVectorizer(max_df=0.95, ngram_range=(1, 2),
                                 tokenizer=<function okt_tokenizer at 0x7ff64a6beb90>)),
                ('LR', LogisticRegression(random_state=2022))])

In [50]:
# Score the refitted pipeline on the test split.
pipeline.score(test_df.document, test_df.label)

0.8584704822729878