In [1]:
import numpy as np
import pandas as pd
import os
import re

from tqdm import tqdm

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from tensorflow.keras.models import load_model

from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from tensorflow.python.keras.preprocessing.text import Tokenizer


from pykospacing import Spacing
spacing = Spacing()

from konlpy.tag import Mecab
mecab = Mecab() #형태소 분석기 선언

In [2]:
BASIC_PATH = "/home/ai-sjjy/workspace/juyoung/AI/project/dataset/"

In [3]:
cat = pd.read_csv(BASIC_PATH+'moives/캣츠review.csv')
iron = pd.read_csv(BASIC_PATH+'moives/아이언맨2review.csv')
animal = pd.read_csv(BASIC_PATH+'moives/주토피아review.csv')
live = pd.read_csv(BASIC_PATH+'moives/살아있다review.csv')

In [4]:
import pickle

with open(BASIC_PATH+'final/Tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

In [6]:
CNN = load_model(BASIC_PATH+'final/CNN_model_hyper2.h5')
biLSTM = load_model(BASIC_PATH+'final/BiLSTM_model.h5')
GRU = load_model(BASIC_PATH+'final/GRU_model.h5')
LSTM = load_model(BASIC_PATH+'final/LSTM_model.h5')

In [42]:
class All_process():
    def __init__(self, movie_name, movie):
        self.stop_words = ['의','가','이','은','들','는','좀','잘','걍','과','도','를','으로','자','에','와','한','하다','흠','아','더']
        self.movie_name = movie_name
        self.movie = movie
        
        self.X_train = None
        self.y_train = None
        
        self.max_len = 50
        
        self.models = {
            'cnn' : CNN,
            'BiLSTM' : biLSTM,
            'gru' : GRU,
            'lstm' : LSTM
        }
    
    def null_and_labeling(self):
        self.movie = self.movie.dropna(axis=0)
        
        self.movie['Rank'] = [1 if r >= 7 else 0 for r in self.movie['Rank']]
        
    def data_cleaning(self, review):
        # 1. 한글 및 공백을 제외한 모든 문자 제거 -> 한글만 남기겠다.
        review_text = re.sub("[^가-힣ㄱ-ㅎㅏ - | \\s]", "", review)

        # 2. 띄어쓰기 규칙
        review_text = spacing(review_text)

        # 3. mecab 객체를 활용해 형태소 단위로 문장을 나눈다(tokenzing)
        word_review = mecab.morphs(review_text)

        # 4. 불용어 제거하기
        word_review = [token for token in word_review if not token in self.stop_words]

        return word_review
    
    def clean_data_get(self):
        print("Movie Name : ", self.movie_name)
        clean_train_review = []
        for review in tqdm(self.movie['Review']):
            clean_train_review.append(self.data_cleaning(review))
            
        #print("전체 리뷰 개수 : ", len(clean_train_review))
        #print("띄어쓰기 이상한 리뷰 개수 :", len([clean for clean in clean_train_review if not clean]))
        
        ## 지우면 안됨 => [] 로 저장
        for idx, clean in enumerate(clean_train_review):
            if clean == None: clean_train_review[idx] = []
        
        self.movie['Review'] = clean_train_review
        return
    
    def encoding(self):
        self.X_train = self.movie['Review']
        self.y_train = np.array(list(self.movie['Rank'].tolist()))
        
        self.X_train = tokenizer.texts_to_sequences(self.X_train)
        
        self.X_train = pad_sequences(self.X_train, maxlen = self.max_len)
        
        print("shape : ", self.X_train.shape)
        return
        
    def sentiment_evaluate(self):
        self.null_and_labeling()
        
        self.clean_data_get()
        
        self.encoding()
        
        print("Movie Name : ", self.movie_name)
        for name, model in self.models.items():
            print("Model => ", name)
            print("모델 정확도: %.4f" % (model.evaluate(self.X_train, self.y_train)[1]))
        
        return
    
    def sentiment_predict(self):
        self.null_and_labeling()
        
        self.clean_data_get()
        
        self.encoding()
        
        result = self.models['cnn'].predict(self.X_train)
        result = [1 if r>0.5 else 0 for r in result]
        
        self.movie['P/N'] = result
        
        return
        

In [40]:
dfs = {
    'Cat' : cat,
    'Iron' : iron, 
    'Animal' : animal,
    'Live' : live
}

In [44]:
result = {
    'Cat' : None,
    'Iron' : None, 
    'Animal' : None,
    'Live' : None
}
for name, moive in dfs.items():
    process = All_process(name, moive)
    process.sentiment_predict()
    result[name] = process.movie
    print(process.movie.head(5))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Movie Name :  Cat


100%|██████████| 239/239 [00:06<00:00, 37.80it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


shape :  (239, 50)
                                              Review  Rank  P/N
0                                           [편, 나와라]     1    1
1                                [귀여우, 니까, 용서, 되, 요]     1    1
2  [편, 만큼, 재밌, 는데, ㅋㅋ, 비둘기, 개웃, 김, ㅋㅋ, ㅋㅋ, 가볍, 게,...     1    1
3                      [재밌, 쪙, 편, 빨, 뤼, 만들, 어, 줘, 용]     1    1
4                                           [재밌, 어요]     1    1
Movie Name :  Iron


100%|██████████| 6492/6492 [02:54<00:00, 37.27it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


shape :  (6492, 50)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


                                              Review  Rank  P/N
0                 [비해, 떨, 여졌, 지만, 없, 어, 선, 안, 될, 내용]     1    0
1  [솔직히, 다, 좋, 데, 별로, 였, 던, 거, 공감, 했었, 겠, 지, 악당, ...     1    0
2  [그거, 암, 토니, 날, 려고, 영화, 반, 을, 잡, 먹, 었, 는데, 저, 흑...     1    1
3                    [아이언맨, 그냥, 나와, 줘서, 감사, 합니다, ㅋㅋ]     1    1
4                           [정도, 면, 눈, 즐겁, 고, 재, 밋지]     1    1
Movie Name :  Animal


100%|██████████| 17852/17852 [08:00<00:00, 37.14it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


shape :  (17852, 50)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


                                              Review  Rank  P/N
0  [네, 닉, 한테, 반했, 어요, 아니, 무슨, 내, 영화, 나오, 캐릭터, 한테,...     1    1
1  [단언, 컨대, 년, 상반기, 영화, 최고, 버디, 플레이, 강, 동원, 황정민, ...     1    1
2                        [하, 나무, 늘, 보, 졸, 귀엽, 네, ㅋㅋ]     1    1
3  [유치, 애니메이션, 영화, 처럼, 보인다고, 묻히, 기, 엔, 너무, 아쉬운, 영...     1    1
4  [선입견, 을, 날려, 버리, 고, 불, 가능, 을, 가능, 만드, 애니메이션, 한...     1    1
Movie Name :  Live


100%|██████████| 17897/17897 [08:03<00:00, 37.02it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


shape :  (17897, 50)
                                              Review  Rank  P/N
0                            [개연, 성, 진짜, 할, 말, 없, 음]     0    0
1  [영화, 보다, 중간, 나온, 적, 처음, 저런, 좋, 배우, 가지, 고, 따구, ...     0    0
2      [유아, 인, 그, 똑똑, 배우, 왜, 이런, 영화, 택한, 건지, 의문, 에요]     0    0
3  [개연, 성, 없, 던, 영화, 안, 씻, 고, 굶, 어도, 탱탱, 피부, 탈색, ...     0    0
4  [요즘, 같, 양질, 영상, 유튜브, 썩, 어, 나, 세상, 이런, 영화, 영화관,...     0    0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
