In [2]:
!kaggle datasets download -d lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

Downloading imdb-dataset-of-50k-movie-reviews.zip to /home/jupyter
 35%|█████████████▎                        | 9.00M/25.7M [00:00<00:00, 85.9MB/s]
100%|███████████████████████████████████████| 25.7M/25.7M [00:00<00:00, 164MB/s]


In [16]:
path_to_zip_file = '/home/jupyter/imdb-dataset-of-50k-movie-reviews.zip'
directory_to_extract_to = '/home/jupyter/dataset'

import zipfile
with zipfile.ZipFile(path_to_zip_file, 'r') as zip_ref:
    zip_ref.extractall(directory_to_extract_to)

In [17]:
import pandas as pd

imdb_data = pd.read_csv('./dataset/IMDB Dataset.csv')

In [18]:
imdb_data

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


# 데이터 전처리

In [6]:
from tensorflow.keras import datasets
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [36]:
from tensorflow.keras.preprocessing.text import Tokenizer

maxlen = 100
max_words = 10000

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(imdb_data.review)
sequences = tokenizer.texts_to_sequences(imdb_data.review)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=maxlen)
# #Tokenization of text
# tokenizer=ToktokTokenizer()
# #Setting English stopwords
# stopword_list=nltk.corpus.stopwords.words('english')

Found 124252 unique tokens.


In [37]:
X_train = data[:40000]
y_train = imdb_data.sentiment[:40000]

X_test = data[40000:]
y_test = imdb_data.sentiment[40000:]
print(X_train.shape,y_train.shape)
print(X_test.shape,y_test.shape)

(40000, 100) (40000,)
(10000, 100) (10000,)


In [38]:
print('first train review :', X_train[0])
print('first train sentiment :', y_train[0])

first train review : [ 123  210 3241   68   14   34 1637    9   13 2239   10  413  131   10
   13 1592   15    9   18   14   10  287   51   10 1417    3 1280   15
 3184    2  189    5    1  299 2046    4 2150  570   21   39  570   18
 7658 7154 5010   26 2983   41   15    3 6904  504   20  642    2   76
  243   16    9   69 7598  651  710 6904  109  662   82 1208  693    5
   65  574    4  920 2021   38 1208  559  147 3184   22  200  426 3819
   16   48    6 3314  805 1603   43   22   67   76    8 1228   16  125
 4103  486]
first train sentiment : positive


In [39]:
import numpy as np
unique_elements, counts_elements = np.unique(y_train, return_counts=True)
print("Frequency of each label:")
print(np.asarray((unique_elements, counts_elements)))

Frequency of each label:
[['negative' 'positive']
 [20007 19993]]


In [40]:
print(y_train[:5])

0    positive
1    positive
2    positive
3    negative
4    positive
Name: sentiment, dtype: object


In [46]:
print(imdb_data.sentiment)

0        positive
1        positive
2        positive
3        negative
4        positive
           ...   
49995    positive
49996    negative
49997    negative
49998    negative
49999    negative
Name: sentiment, Length: 50000, dtype: object


In [52]:
for i in range(len(imdb_data.sentiment)):
    if imdb_data.sentiment[i] == 'positive':
        imdb_data.sentiment[i] = 0
    else:
        imdb_data.sentiment[i] = 1
print(imdb_data.sentiment)        

0        0
1        0
2        0
3        1
4        0
        ..
49995    0
49996    1
49997    1
49998    1
49999    1
Name: sentiment, Length: 50000, dtype: object


In [54]:
y_train = np.asarray(y_train).astype('float32')
y_test = np.asarray(y_test).astype('float32')
print(y_train[:5])

[0. 0. 0. 1. 0.]


## CNN 분류 모델

In [55]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dropout, Conv1D, GlobalMaxPooling1D, Dense
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.models import load_model

embedding_dim = 256 # 임베딩 벡터의 차원
dropout_ratio = 0.3 # 드롭아웃 비율
num_filters = 256 # 커널의 수
kernel_size = 3 # 커널의 크기
hidden_units = 128 # 뉴런의 수

model = Sequential()
model.add(Embedding(10000, embedding_dim))
model.add(Dropout(dropout_ratio))
model.add(Conv1D(num_filters, kernel_size, padding='valid', activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(hidden_units, activation='relu'))
model.add(Dropout(dropout_ratio))
model.add(Dense(1, activation='sigmoid'))

es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=3)
mc = ModelCheckpoint('best_model.h5', monitor='val_acc', mode='max', verbose=1, save_best_only=True)

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
history = model.fit(X_train, y_train, epochs=20, validation_data=(X_test, y_test), callbacks=[es, mc])

Epoch 1/20


2022-11-21 06:41:14.813232: I tensorflow/stream_executor/cuda/cuda_dnn.cc:368] Loaded cuDNN version 8200


Epoch 1: val_acc improved from -inf to 0.87550, saving model to best_model.h5
Epoch 2/20
Epoch 2: val_acc improved from 0.87550 to 0.87760, saving model to best_model.h5
Epoch 3/20
Epoch 3: val_acc did not improve from 0.87760
Epoch 4/20
Epoch 4: val_acc did not improve from 0.87760
Epoch 5/20
Epoch 5: val_acc did not improve from 0.87760
Epoch 5: early stopping


In [56]:
loaded_model = load_model('best_model.h5')
print("\n 테스트 정확도: %.4f" % (loaded_model.evaluate(X_test, y_test)[1]))


 테스트 정확도: 0.8776


## RNN 분류 모델

In [57]:
import re
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, GRU, Embedding
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.models import load_model

embedding_dim = 100
hidden_units = 128

model = Sequential()
model.add(Embedding(10000, embedding_dim))
model.add(GRU(hidden_units))
model.add(Dense(1, activation='sigmoid'))

es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=4)
mc = ModelCheckpoint('GRU_model.h5', monitor='val_acc', mode='max', verbose=1, save_best_only=True)

model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
history = model.fit(X_train, y_train, epochs=15, callbacks=[es, mc], batch_size=64, validation_split=0.2)

Epoch 1/15
Epoch 1: val_acc improved from -inf to 0.85688, saving model to GRU_model.h5
Epoch 2/15
Epoch 2: val_acc improved from 0.85688 to 0.86862, saving model to GRU_model.h5
Epoch 3/15
Epoch 3: val_acc improved from 0.86862 to 0.87650, saving model to GRU_model.h5
Epoch 4/15
Epoch 4: val_acc did not improve from 0.87650
Epoch 5/15
Epoch 5: val_acc did not improve from 0.87650
Epoch 6/15
Epoch 6: val_acc did not improve from 0.87650
Epoch 7/15
Epoch 7: val_acc did not improve from 0.87650
Epoch 7: early stopping


In [58]:
loaded_model = load_model('GRU_model.h5')
print("\n 테스트 정확도: %.4f" % (loaded_model.evaluate(X_test, y_test)[1]))


 테스트 정확도: 0.8729
