<a href="https://colab.research.google.com/github/HYUNSOOLEE-6839/colab-Deep-Learning/blob/main/naver.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 네이버 영화평 감성분류

In [1]:
# Konlpy 설치
!pip install Konlpy

Collecting Konlpy
[?25l  Downloading https://files.pythonhosted.org/packages/85/0e/f385566fec837c0b83f216b2da65db9997b35dd675e107752005b7d392b1/konlpy-0.5.2-py2.py3-none-any.whl (19.4MB)
[K     |████████████████████████████████| 19.4MB 1.2MB/s 
Collecting colorama
  Downloading https://files.pythonhosted.org/packages/44/98/5b86278fbbf250d239ae0ecb724f8572af1c91f4a11edf4d36a206189440/colorama-0.4.4-py2.py3-none-any.whl
Collecting tweepy>=3.7.0
  Downloading https://files.pythonhosted.org/packages/67/c3/6bed87f3b1e5ed2f34bd58bf7978e308c86e255193916be76e5a5ce5dfca/tweepy-3.10.0-py2.py3-none-any.whl
Collecting JPype1>=0.7.0
[?25l  Downloading https://files.pythonhosted.org/packages/de/af/93f92b38ec1ff3091cd38982ed19cea2800fefb609b5801c41fc43c0781e/JPype1-1.2.1-cp36-cp36m-manylinux2010_x86_64.whl (457kB)
[K     |████████████████████████████████| 460kB 53.5MB/s 
[?25hCollecting beautifulsoup4==4.6.0
[?25l  Downloading https://files.pythonhosted.org/packages/9e/d4/10f46e5cfac773e2270723

In [2]:
import konlpy
konlpy.__version__

'0.5.2'

In [3]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [4]:
import numpy as np

In [5]:
seed = 2021
np.random.seed(seed)
tf.random.set_seed(seed)

### 파일 업로드

In [6]:
from google.colab import files

uploaded = files.upload()
filename = list(uploaded.keys())[0]

Saving ratings_test.txt to ratings_test.txt


In [7]:
from google.colab import files

uploaded = files.upload()
trainname = list(uploaded.keys())[0]

Saving ratings_train.txt to ratings_train.txt


In [8]:
import pandas as pd
train_df = pd.read_table(trainname)
test_df = pd.read_table(filename)

In [9]:
train_df['document'].nunique()

146182

In [10]:
# 중복샘플제거
train_df.drop_duplicates(subset=['document'], inplace=True)
train_df.shape

(146183, 3)

In [11]:
# null 값 제거
train_df = train_df.dropna(how='any')
train_df.shape

(146182, 3)

### 테스트 셋에 적용하기

In [12]:
# 중복 제거
test_df.drop_duplicates(subset=['document'], inplace=True)
test_df.shape

(49158, 3)

In [13]:
# Null값 제거
test_df = test_df.dropna(how='any')
test_df.shape

(49157, 3)

### 한글 텍스트 전처리

In [14]:
train_df['document'] = train_df['document'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]","")
train_df.head(3)

Unnamed: 0,id,document,label
0,9976970,아 더빙 진짜 짜증나네요 목소리,0
1,3819312,흠포스터보고 초딩영화줄오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0


In [15]:
train_df['document'].replace('', np.nan, inplace=True)
train_df.isnull().sum()

id            0
document    391
label         0
dtype: int64

In [16]:
train_df = train_df.dropna(how='any')
train_df.shape

(145791, 3)

### 테스트 셋에도 적용

In [17]:
test_df['document'] = test_df['document'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣]","")
test_df.head(3)

Unnamed: 0,id,document,label
0,6270596,굳ㅋ,1
1,9274899,,0
2,8544678,뭐야이평점들은나쁘진않지만점짜리는더더욱아니잖아,0


In [18]:
test_df['document'].replace('', np.nan, inplace=True)
test_df.isnull().sum()

id            0
document    305
label         0
dtype: int64

In [19]:
test_df = test_df.dropna(how='any')
test_df.shape

(48852, 3)

### 토큰화와 불용어 제거

In [20]:
from konlpy.tag import Okt
import tqdm.notebook as tn
stopwords = ['의', '가', '이', '은', '들', '는', '좀', '잘', '걍', '과', '도', '를', '으로', '자', '에', '와', '한', '하다']
okt = Okt()

In [21]:
X_train=[]
for sentence in tn.tqdm(train_df['document']):
  temp_X = []
  temp_X = okt.morphs(sentence, stem=True) # 토큰화
  temp_X = [word for word in temp_X if not word in stopwords] # 불용어 제거
  X_train.append(temp_X)

HBox(children=(FloatProgress(value=0.0, max=145791.0), HTML(value='')))




In [22]:
X_test=[]
for sentence in tn.tqdm(test_df['document']):
  temp_X = []
  temp_X = okt.morphs(sentence, stem=True) # 토큰화
  temp_X = [word for word in temp_X if not word in stopwords] # 불용어 제거
  X_test.append(temp_X)

HBox(children=(FloatProgress(value=0.0, max=48852.0), HTML(value='')))




In [23]:
# 정수 인코딩
max_words = 35000
tokenizer = Tokenizer(num_words=max_words) # 상위 35,000개의 단어만 보존
tokenizer.fit_on_texts(X_train)
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

In [24]:
# 전체 데이터의 길이 분포
print('리뷰의 최대 길이:', max(len(s) for s in X_train))
print('리뷰의 평균 길이:', sum(map(len, X_train))/len(X_train))

리뷰의 최대 길이: 69
리뷰의 평균 길이: 10.911133060339802


In [25]:
# X_train과 X_test의 모든 샘플 길이를 동일하게 30으로 셋팅
max_len = 30
# 전체 데이터의 길이는 30으로 맞춘다.
X_train = pad_sequences(X_train, maxlen=max_len)
X_test = pad_sequences(X_test, maxlen=max_len)

In [26]:
y_train = train_df['label'].values
y_test = test_df['label'].values

### 이전 실행결과
- LSTM : 0.8417
- SIMPLERNN : 0.8324
- LSTM + CNN : 0.768

### 심층 RNN모델(Deep RNN)

In [39]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding
from tensorflow.keras.layers import Conv1D, Dropout, MaxPooling1D

In [28]:
model = Sequential([
                    Embedding(max_words, 100),
                    LSTM(128),
                    Dense(1, activation='sigmoid')
])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 100)         3500000   
_________________________________________________________________
lstm (LSTM)                  (None, 128)               117248    
_________________________________________________________________
dense (Dense)                (None, 1)                 129       
Total params: 3,617,377
Trainable params: 3,617,377
Non-trainable params: 0
_________________________________________________________________


In [30]:
model.compile(loss='binary_crossentropy', 
              optimizer='adam', metrics=['accuracy'])

In [31]:
from keras.callbacks import ModelCheckpoint,EarlyStopping
earlyStopping = EarlyStopping(monitor='val_loss', verbose=0, patience=4)
modelpath = "model/naver-lstm-best-model.hdf5"
checkpointer = ModelCheckpoint(filepath=modelpath, monitor='val_loss', 
                               verbose=1, save_best_only=True)

In [33]:
history = model.fit(X_train, y_train, epochs=10, batch_size=60,
                    validation_split=0.2, verbose=1, callbacks=[checkpointer,earlyStopping])

Epoch 1/10

Epoch 00001: val_loss improved from inf to 0.35079, saving model to model/naver-lstm-best-model.hdf5
Epoch 2/10

Epoch 00002: val_loss improved from 0.35079 to 0.34285, saving model to model/naver-lstm-best-model.hdf5
Epoch 3/10

Epoch 00003: val_loss did not improve from 0.34285
Epoch 4/10

Epoch 00004: val_loss did not improve from 0.34285
Epoch 5/10

Epoch 00005: val_loss did not improve from 0.34285
Epoch 6/10

Epoch 00006: val_loss did not improve from 0.34285


In [34]:
from keras.models import load_model
model = load_model(modelpath)

In [35]:
print("\n 테스트 정확도: %.4f" % (model.evaluate(X_test, y_test, 
                                           verbose=2)[1]))

1527/1527 - 4s - loss: 0.3614 - accuracy: 0.8411

 테스트 정확도: 0.8411


In [40]:
model2 = Sequential([
                    Embedding(max_words, 100),
                    MaxPooling1D(pool_size=4),
                    Dropout(0.25),
                    Conv1D(128, 5, padding='valid', activation='relu', strides=1),
                    Dropout(0.25),
                    LSTM(128),
                    Dropout(0.25),
                    Dense(1, activation='sigmoid')
])
model2.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, None, 100)         3500000   
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, None, 100)         0         
_________________________________________________________________
dropout_6 (Dropout)          (None, None, 100)         0         
_________________________________________________________________
conv1d (Conv1D)              (None, None, 128)         64128     
_________________________________________________________________
dropout_7 (Dropout)          (None, None, 128)         0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dropout_8 (Dropout)          (None, 128)              

In [41]:
model2.compile(optimizer='adam', loss='binary_crossentropy', 
              metrics=['accuracy'])

In [42]:
from keras.callbacks import ModelCheckpoint
modelpath = "model/naver-best-model.hdf5"
earlyStopping = EarlyStopping(monitor='val_loss', verbose=0, patience=4)
checkpointer = ModelCheckpoint(filepath=modelpath, monitor='val_loss', 
                               verbose=1, save_best_only=True)

In [43]:
history2 = model2.fit(X_train, y_train, epochs=5, batch_size=160,
                    validation_split=0.2, verbose=1, callbacks=[checkpointer,earlyStopping])

Epoch 1/5

Epoch 00001: val_loss improved from inf to 0.44539, saving model to model/naver-best-model.hdf5
Epoch 2/5

Epoch 00002: val_loss did not improve from 0.44539
Epoch 3/5

Epoch 00003: val_loss did not improve from 0.44539
Epoch 4/5

Epoch 00004: val_loss did not improve from 0.44539
Epoch 5/5

Epoch 00005: val_loss did not improve from 0.44539


In [44]:
print("\n 테스트 정확도: %.4f" % (model2.evaluate(X_test, y_test, 
                                           verbose=2)[1]))

1527/1527 - 3s - loss: 0.5761 - accuracy: 0.7550

 테스트 정확도: 0.7550


In [60]:
model3 = Sequential()
model3.add(Embedding(max_words, output_dim=256))
model3.add(LSTM(128))
model3.add(Dropout(0.5))
model3.add(Dense(1, activation='sigmoid'))
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 100)         3500000   
_________________________________________________________________
lstm (LSTM)                  (None, 128)               117248    
_________________________________________________________________
dense (Dense)                (None, 1)                 129       
Total params: 3,617,377
Trainable params: 3,617,377
Non-trainable params: 0
_________________________________________________________________


In [61]:
model3.compile(optimizer='adam', loss='binary_crossentropy', 
              metrics=['accuracy'])

In [62]:
from keras.callbacks import ModelCheckpoint
modelpath = "model/naver-def-best-model.hdf5"
earlyStopping = EarlyStopping(monitor='val_loss', verbose=0, patience=4)
checkpointer = ModelCheckpoint(filepath=modelpath, monitor='val_loss', 
                               verbose=1, save_best_only=True)

In [63]:
history3 = model3.fit(X_train, y_train, epochs=5, batch_size=160,
                    validation_split=0.2, verbose=1, callbacks=[checkpointer,earlyStopping])

Epoch 1/5

Epoch 00001: val_loss improved from inf to 0.35275, saving model to model/naver-def-best-model.hdf5
Epoch 2/5

Epoch 00002: val_loss improved from 0.35275 to 0.34508, saving model to model/naver-def-best-model.hdf5
Epoch 3/5

Epoch 00003: val_loss did not improve from 0.34508
Epoch 4/5

Epoch 00004: val_loss did not improve from 0.34508
Epoch 5/5

Epoch 00005: val_loss did not improve from 0.34508


In [64]:
print("\n 테스트 정확도: %.4f" % (model3.evaluate(X_test, y_test, 
                                           verbose=2)[1]))

1527/1527 - 3s - loss: 0.5292 - accuracy: 0.8282

 테스트 정확도: 0.8282
