In [1]:
import numpy as np
import pandas as pd
import os
import re

from tqdm import tqdm

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from tensorflow.keras.models import load_model

from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from tensorflow.python.keras.preprocessing.text import Tokenizer


from pykospacing import Spacing
# Spacing instance; Preprocessing.preprocess calls it as spacing(text) for its
# spacing-rule step.
spacing = Spacing()

from konlpy.tag import Mecab
mecab = Mecab() # instantiate the morphological analyzer (its morphs() is used by Preprocessing.preprocess)

In [2]:
# Root folder for the review CSVs, the pickled tokenizer and the saved models.
# Paths are built with plain string concatenation, so the trailing "/" is required.
BASIC_PATH = "dataset/"

In [3]:
# Read the four per-movie review CSVs in one pass; the unpacking order matches
# the order of the paths below.
# NOTE(review): the folder is spelled "moives" - presumably that is its real
# name on disk, so it is kept as-is.
cat, iron, animal, live = (
    pd.read_csv(BASIC_PATH + csv_path)
    for csv_path in (
        'moives/캣츠review.csv',
        'moives/아이언맨2review.csv',
        'moives/주토피아review.csv',
        'moives/살아있다review.csv',
    )
)

In [4]:
# NOTE(review): word_vector_dim and vocab_size are not referenced in the visible
# cells; presumably they mirror the settings used when the models were trained -
# TODO confirm.
word_vector_dim = 100
vocab_size = 20744
# Length every encoded review is padded/truncated to (passed to pad_sequences as maxlen).
max_len = 50

In [5]:
import pickle

# Restore the fitted tokenizer that later converts token lists into integer sequences.
# NOTE(review): pickle.load executes arbitrary code from the file - only load a
# pickle that comes from a trusted source.
with open(BASIC_PATH+'final/Tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

## [1] data 전처리

### [1-1] null drop

In [6]:
# Discard every row that has a missing value in any column.
cat = cat.dropna(axis='index')

In [7]:
# Discard every row that has a missing value in any column.
iron = iron.dropna(axis='index')

In [8]:
# Discard every row that has a missing value in any column.
animal = animal.dropna(axis='index')

In [9]:
# Discard every row that has a missing value in any column.
live = live.dropna(axis='index')

### [1-2] 부정, 긍정 labeling

#### 0 ~ 6 부정 / 7 ~ 10 긍정

In [10]:
# Binarize the rating: 7 or higher becomes 1 (positive), anything lower 0 (negative).
cat['Rank'] = (cat['Rank'] >= 7).astype('int64')

In [11]:
# Binarize the rating: 7 or higher becomes 1 (positive), anything lower 0 (negative).
iron['Rank'] = (iron['Rank'] >= 7).astype('int64')

In [12]:
# Binarize the rating: 7 or higher becomes 1 (positive), anything lower 0 (negative).
animal['Rank'] = (animal['Rank'] >= 7).astype('int64')

In [13]:
# Binarize the rating: 7 or higher becomes 1 (positive), anything lower 0 (negative).
live['Rank'] = (live['Rank'] >= 7).astype('int64')

In [14]:
# Preview the first five labelled rows (five is head()'s default row count).
cat.head()

Unnamed: 0,Review,Rank
0,3편나와라~~,1
1,귀여우니까 용서가되요,1
2,1편만큼 재밌는데ㅋㅋ비둘기개웃김ㅋㅋㅋㅋ 걍 가볍게보기좋음,1
3,재밌쪙 3편 빨뤼 만들어줘용,1
4,재밌어요,1


In [15]:
# Preview the first five labelled rows (five is head()'s default row count).
iron.head()

Unnamed: 0,Review,Rank
0,"1,3 에 비해 떨여졌지만 없어선 안될 내용",1
1,솔직히 다들 1은 좋은데 2는 별로였던거 공감했었겠지.악당이 너무 약하고 스케일도 ...,1
2,그거암? 토니는 날려고 영화의 반을 잡아먹었는데 저 흑형은 걍 남,1
3,아이언맨은 그냥 나와줘서 감사합니다 ㅋㅋ,1
4,이정도면 눈도 즐겁고 재밋지,1


### [1-3] 데이터 전처리

In [27]:
class Preprocessing():
    """Clean and tokenize a collection of raw Korean review strings.

    Each review goes through four steps: regex character filtering, spacing
    correction through the module-level ``spacing`` callable, morpheme
    splitting through the module-level ``mecab`` object, and stop-word removal.
    The results are stored in ``clean_train_review``, one token list per
    input review and in the same order.
    """

    def __init__(self, reviews):
        """Remember the reviews to process; no work is done here.

        Args:
            reviews: Iterable of review strings (the notebook passes a
                DataFrame ``Review`` column).
        """
        # Morphemes that are discarded after tokenization.
        self.stop_words = ['의','가','이','은','들','는','좀','잘','걍','과','도','를','으로','자','에','와','한','하다','흠','아','더']
        # Filled by clean_data_get(): one list of tokens per review.
        self.clean_train_review = []
        self.reviews = reviews

    def preprocess(self, review):
        """Convert one raw review string into a list of morpheme tokens.

        Args:
            review: The raw review text.

        Returns:
            list: The review's morphemes with stop words removed; empty when
            nothing survives the cleaning.
        """
        # 1. Remove every character that is not whitelisted, so that
        #    (mostly) only Hangul and whitespace remain.
        # NOTE(review): "ㅏ - | " looks like a typo for the jamo vowel range
        # "ㅏ-ㅣ". As written, the class keeps only the vowel ㅏ, the space and
        # the ASCII pipe "|", so other standalone vowels (e.g. ㅠ, ㅜ) are
        # stripped. It is left untouched because the pickled tokenizer was
        # presumably fitted on text cleaned with this exact pattern - confirm
        # before changing it.
        review_text = re.sub("[^가-힣ㄱ-ㅎㅏ - | \\s]", "", review)

        # 2. Apply the spacing rules.
        review_text = spacing(review_text)

        # 3. Split the sentence into morphemes with the mecab object (tokenizing).
        word_review = mecab.morphs(review_text)

        # 4. Drop the stop words.
        return [token for token in word_review if token not in self.stop_words]

    def clean_data_get(self):
        """Preprocess every stored review and publish the result.

        ``clean_train_review`` is rebuilt from scratch on every call, so
        running the notebook cell twice does not append a second copy of each
        review (which would break the one-to-one alignment with the source
        rows). Prints the total number of reviews and how many ended up with
        no tokens.

        Returns:
            None. Read the result from ``self.clean_train_review``.
        """
        cleaned = []
        for review in tqdm(self.reviews):
            tokens = self.preprocess(review)
            # Never drop an entry: an unusable review is stored as [] so that
            # positions keep matching the input order.
            cleaned.append([] if tokens is None else tokens)

        self.clean_train_review = cleaned

        print("전체 리뷰 개수 : ", len(cleaned))
        print("띄어쓰기 이상한 리뷰 개수 :", sum(1 for tokens in cleaned if not tokens))
    

In [31]:
# One Preprocessing instance per movie, built from that movie's review column.
CAT_review, iron_review, animal_review, live_review = (
    Preprocessing(frame['Review']) for frame in (cat, iron, animal, live)
)

# Run the cleaning pass movie by movie, in the order cat, iron, animal, live.
for movie_review in (CAT_review, iron_review, animal_review, live_review):
    movie_review.clean_data_get()

ASASDASD


100%|██████████| 239/239 [00:06<00:00, 37.07it/s]


전체 리뷰 개수 :  239
띄어쓰기 이상한 리뷰 개수 : 2
ASASDASD


100%|██████████| 6492/6492 [02:55<00:00, 37.10it/s]


전체 리뷰 개수 :  6492
띄어쓰기 이상한 리뷰 개수 : 37
ASASDASD


100%|██████████| 17852/17852 [08:01<00:00, 37.05it/s]


전체 리뷰 개수 :  17852
띄어쓰기 이상한 리뷰 개수 : 129
ASASDASD


100%|██████████| 17897/17897 [08:04<00:00, 36.95it/s]

전체 리뷰 개수 :  17897
띄어쓰기 이상한 리뷰 개수 : 41





In [32]:
# Spot check: show the token list produced for the fourth review in CAT_review.
CAT_review.clean_train_review[3]

['재밌', '쪙', '편', '빨', '뤼', '만들', '어', '줘', '용']

In [33]:
# Overwrite each frame's raw review text with its list-of-tokens version.
for frame, cleaner in (
    (cat, CAT_review),
    (iron, iron_review),
    (animal, animal_review),
    (live, live_review),
):
    frame['Review'] = cleaner.clean_train_review

### [1-4] 정수 인코딩

In [34]:
# For every movie: X_* is the tokenized review column, y_* the 0/1 labels as a
# NumPy array.
X_cat, y_cat = cat['Review'], np.array(cat['Rank'].tolist())
X_iron, y_iron = iron['Review'], np.array(iron['Rank'].tolist())
X_animal, y_animal = animal['Review'], np.array(animal['Rank'].tolist())
X_live, y_live = live['Review'], np.array(live['Rank'].tolist())

In [35]:
# Map each movie key to a mutable [features, labels] pair; insertion order is
# cat, iron, animal, live.
data_list = {
    movie_key: [features, labels]
    for movie_key, features, labels in (
        ('cat', X_cat, y_cat),
        ('iron', X_iron, y_iron),
        ('animal', X_animal, y_animal),
        ('live', X_live, y_live),
    )
}

In [37]:
# Turn every movie's token lists into integer sequences of length max_len.
for movie, data in data_list.items():
    encoded = tokenizer.texts_to_sequences(data[0])
    pad_new = pad_sequences(encoded, maxlen=max_len)

    # `data` is the same list object stored under data_list[movie], so this
    # replaces the reviews in place with the padded matrix.
    data[0] = pad_new

In [38]:
# Show the padded integer sequence of the fifth 'live' review. The key is
# written out explicitly: the previous form reused `movie`, which only held
# 'live' because it leaked out of the encoding loop as its final value, so the
# cell broke whenever that loop had not just been run.
data_list['live'][0][4]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,  339,   30,
        310, 5904, 1366,   17,   12,  384,   50,    2,  496,  103,   28,
        142,  123,   31,   42,   42,   85,  140, 1366,   17,   12,    3,
        909,   66,   13,    6,  207,    2,    6,   26, 2028,  519, 2807,
       2062, 1164, 1270, 2028,  689,   36], dtype=int32)

## [2] Model evaluation

In [50]:
# Load the five saved models; the unpacking order matches the path order below.
CNN, biLSTM, GRU, LSTM, CNN_LSTM = [
    load_model(BASIC_PATH + model_path)
    for model_path in (
        'final/CNN_model_hyper2.h5',
        'final/BiLSTM_model.h5',
        'final/GRU_model.h5',
        'final/LSTM_model.h5',
        'final/CNN_LSTM_model.h5',
    )
]

In [51]:
# Display name -> loaded model, in the order the models are evaluated.
models = dict(zip(
    ('cnn', 'BiLSTM', 'gru', 'lstm', 'cnn_lstm'),
    (CNN, biLSTM, GRU, LSTM, CNN_LSTM),
))

In [52]:
# Evaluate every model on every movie's padded reviews and print the accuracy.
for name, dataset in data_list.items():
    print("movie : ", name)
    features, labels = dataset
    for m_name, model in models.items():
        # Element 1 of evaluate()'s result is what gets reported as accuracy
        # (presumably element 0 is the loss - depends on how the model was compiled).
        accuracy_percent = model.evaluate(features, labels)[1]*100
        print(name, " + ", m_name, " Accuracy result  =>>> ", accuracy_percent, "%")
        print()
    print()

movie :  cat
cat  +  cnn  Accuracy result  =>>>  82.42678046226501 %

cat  +  BiLSTM  Accuracy result  =>>>  81.17154836654663 %

cat  +  gru  Accuracy result  =>>>  81.58996105194092 %

cat  +  lstm  Accuracy result  =>>>  81.58996105194092 %

cat  +  cnn_lstm  Accuracy result  =>>>  76.98744535446167 %


movie :  iron
iron  +  cnn  Accuracy result  =>>>  76.2476921081543 %

iron  +  BiLSTM  Accuracy result  =>>>  75.90880990028381 %

iron  +  gru  Accuracy result  =>>>  76.83302760124207 %

iron  +  lstm  Accuracy result  =>>>  76.87923312187195 %

iron  +  cnn_lstm  Accuracy result  =>>>  70.85644006729126 %


movie :  animal
animal  +  cnn  Accuracy result  =>>>  91.88326001167297 %

animal  +  BiLSTM  Accuracy result  =>>>  90.06834030151367 %

animal  +  gru  Accuracy result  =>>>  92.499440908432 %

animal  +  lstm  Accuracy result  =>>>  91.6592001914978 %

animal  +  cnn_lstm  Accuracy result  =>>>  86.28724813461304 %


movie :  live
live  +  cnn  Accuracy result  =>>>  82.79