## 네이버 영화평 감성분석 - LSTM

In [34]:
!pip install Konlpy > /dev/null

In [35]:
import numpy as np
import pandas as pd

In [36]:
# Download the tab-separated train/test rating files from the e9t/nsmc GitHub repository.
train_url = "https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt"
test_url = "https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt"
train_df, test_df = (pd.read_csv(url, sep='\t') for url in (train_url, test_url))

In [37]:
# (rows, columns) of the train and test frames.
tuple(frame.shape for frame in (train_df, test_df))

((150000, 3), (50000, 3))

In [38]:
# Peek at the first three rows.
train_df.iloc[:3]

Unnamed: 0,id,document,label
0,9976970,아 더빙.. 진짜 짜증나네요 목소리,0
1,3819312,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0


#### 1. 데이터 전처리
- train dataset

In [39]:
# Count missing values per column.
train_df.isnull().sum()

id          0
document    5
label       0
dtype: int64

In [40]:
# Remove every row that has a missing value in any column (the frame is modified in place).
train_df.dropna(axis=0, how='any', inplace=True)
train_df.shape

(149995, 3)

In [41]:
# Number of distinct review texts; a value below the row count means duplicates exist.
train_df['document'].nunique()

146182

In [42]:
# Keep only the first occurrence of each review text (the frame is modified in place).
train_df.drop_duplicates(subset='document', keep='first', inplace=True)
train_df.shape

(146182, 3)

In [43]:
# Distribution of the labels in the train split.
train_df['label'].value_counts()

0    73342
1    72840
Name: label, dtype: int64

- test dataset

In [44]:
# Same row cleaning as the train split: drop rows with missing values, then duplicate reviews.
test_df.dropna(axis=0, how='any', inplace=True)
test_df.drop_duplicates(subset='document', keep='first', inplace=True)
test_df.shape

(49157, 3)

In [45]:
# Distribution of the labels in the test split.
test_df['label'].value_counts()

1    24711
0    24446
Name: label, dtype: int64

#### 2. 텍스트 전처리
- train dataset

In [46]:
# Replace every non-Hangul character with a space, then trim leading/trailing whitespace.
non_hangul = '[^ㄱ-ㅎㅏ-ㅣ가-힣]'
train_df['document'] = train_df['document'].str.replace(non_hangul, ' ', regex=True).str.strip()
train_df.head(3)

Unnamed: 0,id,document,label
0,9976970,아 더빙 진짜 짜증나네요 목소리,0
1,3819312,흠 포스터보고 초딩영화줄 오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0


In [47]:
# Reviews that contained no Hangul are now empty strings ('').
# Mark them as NaN so that a following dropna() removes those rows.
import numpy as np
# The result is assigned back to the column: calling replace(..., inplace=True)
# on `train_df.document` is chained in-place assignment, which raises a
# FutureWarning on pandas 2.2 and no longer updates `train_df` once
# Copy-on-Write is active (the default from pandas 3).
train_df['document'] = train_df['document'].replace('', np.nan)
train_df['document'].isna().sum()

789

In [48]:
# Drop the rows whose review was turned into NaN (the frame is modified in place).
train_df.dropna(axis=0, how='any', inplace=True)
train_df.shape

(145393, 3)

- test dataset

In [49]:
# Same text cleaning as the train split: keep Hangul only, trim whitespace.
test_df['document'] = test_df['document'].str.replace('[^ㄱ-ㅎㅏ-ㅣ가-힣]', ' ', regex=True).str.strip()
# Turn empty reviews into NaN. The result is assigned back to the column because
# replace(..., inplace=True) on `test_df.document` is chained in-place assignment:
# it warns on pandas 2.2 and stops updating `test_df` under Copy-on-Write.
test_df['document'] = test_df['document'].replace('', np.nan)
test_df.dropna(how='any', inplace=True)
test_df.shape

(48852, 3)

#### 3. 한글 형태소 분석

In [50]:
# Okt tagger from KoNLPy; its morphs() method is used below to split each review into morphemes.
from konlpy.tag import Okt
okt = Okt()

In [51]:
# Colab-only helper: opens the upload dialog so the stop-word file read in the next cell can be provided.
from google.colab import files
up = files.upload()

Saving 한글불용어100.txt to 한글불용어100 (1).txt


In [52]:
# Read the stop-word file; the word is the first tab-separated field of each line.
# The encoding is passed explicitly so the Korean text is decoded the same way on
# every platform instead of depending on the locale's default encoding.
with open('한글불용어100.txt', encoding='utf-8') as st:
  lines = st.readlines()

# strip() removes the trailing newline that would survive on a line without a tab;
# blank lines are skipped so no empty-string entry ends up in the list.
stop_words = [line.split('\t')[0].strip() for line in lines if line.strip()]
stop_words[:10]

['이', '있', '하', '것', '들', '그', '되', '수', '이', '보']

In [53]:
from tqdm import tqdm

# Build the lookup set once: membership tests on a set are O(1), whereas the
# `stop_words` list would be scanned again for every single morpheme.
stop_word_set = set(stop_words)

# For each review: tokenize into stemmed morphemes, drop stop words,
# and join the remaining morphemes with single spaces.
X_train = []
for review in tqdm(train_df.document):
    morphs = okt.morphs(review, stem=True)
    X_train.append(' '.join(morph for morph in morphs if morph not in stop_word_set))

100%|██████████| 145393/145393 [11:24<00:00, 212.47it/s]


In [54]:
%%time
X_test = []
for review in test_df.document:
    morphs = okt.morphs(review, stem=True)
    clean_morph_review = ' '.join([morph for morph in morphs if morph not in stop_words])
    X_test.append(clean_morph_review)

CPU times: user 3min 33s, sys: 920 ms, total: 3min 34s
Wall time: 3min 41s


#### 4. Keras Tokenizer

In [55]:
import tensorflow as tf

# Seed NumPy's and TensorFlow's global random generators with the same value.
seed = 2023
for seed_fn in (np.random.seed, tf.random.set_seed):
    seed_fn(seed)

In [56]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [57]:
# Fit a tokenizer without a vocabulary cap to see how many distinct tokens the train texts contain.
t = Tokenizer()
t.fit_on_texts(texts=X_train)
vocab_size = len(t.word_index)
vocab_size

43068

In [58]:
# Re-create the tokenizer with the vocabulary capped by frequency (num_words = 10,000) for encoding.
num_words = 10_000
t = Tokenizer(num_words=num_words)
t.fit_on_texts(texts=X_train)

In [59]:
# Encode the space-joined morpheme strings as lists of integer word indices (train first, then test).
X_train, X_test = (t.texts_to_sequences(texts) for texts in (X_train, X_test))

In [60]:
# Longest and mean encoded sequence length in the train split.
seq_lengths = [len(seq) for seq in X_train]
max(seq_lengths), sum(seq_lengths) / len(seq_lengths)

(67, 10.90946606782995)

In [61]:
# Fixed number of tokens per review fed to the model.
max_len = 20

# Bring every encoded review to exactly max_len tokens (train first, then test).
X_train, X_test = (pad_sequences(seqs, maxlen=max_len) for seqs in (X_train, X_test))

In [62]:
# Label arrays taken from the cleaned train/test frames.
y_train, y_test = train_df['label'].values, test_df['label'].values

#### 5. LSTM 모델 정의/설정/학습

In [63]:
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

In [64]:
# Embedding -> LSTM -> one sigmoid unit, built layer by layer.
model = Sequential()
model.add(Embedding(input_dim=num_words, output_dim=100, input_length=max_len))
model.add(LSTM(units=128))
model.add(Dense(units=1, activation='sigmoid'))
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 20, 100)           1000000   
                                                                 
 lstm (LSTM)                 (None, 128)               117248    
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 1117377 (4.26 MB)
Trainable params: 1117377 (4.26 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [67]:
# Keyword arguments are used on purpose: the positional order of compile()
# is (optimizer, loss, metrics) in Keras 2 but (optimizer, loss, loss_weights, metrics)
# in Keras 3, so a positional metrics list would be taken as loss_weights there.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model_path = 'best_naver_movie_lstm.h5'
# Write the model to model_path only when the validation loss improves.
mc = ModelCheckpoint(model_path, monitor='val_loss', save_best_only=True, verbose=1)
# Stop training once the validation loss has not improved for 5 epochs in a row.
es = EarlyStopping(monitor='val_loss', patience=5)

In [68]:
# Train for up to 30 epochs with 20% of the training data held out for validation;
# the checkpoint and early-stopping callbacks run after every epoch.
fit_options = dict(validation_split=0.2, epochs=30, batch_size=128, callbacks=[mc, es])
hist = model.fit(X_train, y_train, **fit_options)

Epoch 1/30
Epoch 1: val_loss improved from inf to 0.36589, saving model to best_naver_movie_lstm.h5
Epoch 2/30


  saving_api.save_model(


Epoch 2: val_loss improved from 0.36589 to 0.35112, saving model to best_naver_movie_lstm.h5
Epoch 3/30
Epoch 3: val_loss did not improve from 0.35112
Epoch 4/30
Epoch 4: val_loss did not improve from 0.35112
Epoch 5/30
Epoch 5: val_loss did not improve from 0.35112
Epoch 6/30
Epoch 6: val_loss did not improve from 0.35112
Epoch 7/30
Epoch 7: val_loss did not improve from 0.35112


In [70]:
# Reload the model saved at model_path and report its loss/accuracy on the test split.
best_model = load_model(filepath=model_path)
best_model.evaluate(x=X_test, y=y_test)



[0.35583266615867615, 0.8416851162910461]