## 套件引入 及 資料前處理

In [1]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import matplotlib.pyplot as plt
from tensorflow.keras.layers import Activation, Dense, Embedding, LSTM, Bidirectional
from tensorflow.keras.models import Sequential
from math import log, sqrt
import pandas as pd
import numpy as np
%matplotlib inline

In [2]:
pd = pd.read_csv('./IMDB Dataset.csv/IMDB Dataset.csv', encoding='latin-1')
pd.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


### 修改欄位名稱

In [3]:
# Change the title: 'review' as 'description'; 'sentiment' as 'label'
description = [i for i in pd.review]
label = [i for i in pd.sentiment]
pd['description'] = description
pd['label'] = label
pd = pd[['description', 'label']]
pd.head()

Unnamed: 0,description,label
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


### 將 label 內 'negative' 改為 '0', 'positive' 改為 '1'

In [4]:
# Mark the 'negative' as '0', and 'positive' as '1'
pd['label'] = np.where(pd['label']=='positive', 1, 0)
pd.head()

Unnamed: 0,description,label
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [13]:
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences

In [14]:
# 分割訓練集和測試集
X_train, X_test, y_train, y_test = train_test_split(pd['description'], pd['label'], test_size=0.2)

In [15]:
# 將文本轉換為數字序列
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

In [16]:
# 填充序列,使其長度相同
maxlen = 100
X_train_pad = pad_sequences(X_train_seq, maxlen=maxlen)
X_test_pad = pad_sequences(X_test_seq, maxlen=maxlen)

## 定義模型

In [21]:
#定義模型
model = Sequential()
model.add(LSTM(64, input_shape=(maxlen, 1)))
model.add(Dense(1, activation='sigmoid'))

## 編譯模型

In [22]:
#編譯模型
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

## 訓練模型

In [23]:
#訓練模型
model.fit(X_train_pad, y_train, validation_data=(X_test_pad, y_test), epochs=5, batch_size=32)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x22a77856d00>

## 評估模型

In [24]:
#評估模型
loss, acc = model.evaluate(X_test_pad, y_test, batch_size=32)
print('Test accuracy:', acc)

Test accuracy: 0.5393000245094299


## 將測試的語句轉為索引後，預測

In [25]:
x = ['The movie is very good. I like it and watch several times!']
test_sequences = tokenizer.texts_to_sequences(x)

# Pad the testing sequences
test_padded = pad_sequences(test_sequences, maxlen=maxlen)

model.predict(test_padded)



array([[0.5828497]], dtype=float32)

In [26]:
x = ['I hate the movie! It let me think about the sad memory.']
test_sequences = tokenizer.texts_to_sequences(x)

# Pad the testing sequences
test_padded = pad_sequences(test_sequences, maxlen=maxlen)

model.predict(test_padded)



array([[0.5520717]], dtype=float32)

In [27]:
x = ['The movie is very succeddful!']
test_sequences = tokenizer.texts_to_sequences(x)

# Pad the testing sequences
test_padded = pad_sequences(test_sequences, maxlen=maxlen)

model.predict(test_padded)



array([[0.5932382]], dtype=float32)