In [1]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# from wordcloud import WordCloud

import nltk
nltk.download("punkt")
nltk.download("wordnet")
nltk.download("stopwords")

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence #unique id

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, SimpleRNN, Dropout, Embedding
import warnings
warnings.filterwarnings("ignore")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:

df = pd.read_csv("movie_review.csv")
df

Unnamed: 0,fold_id,cv_tag,html_id,sent_id,text,tag
0,0,cv000,29590,0,films adapted from comic books have had plenty...,pos
1,0,cv000,29590,1,"for starters , it was created by alan moore ( ...",pos
2,0,cv000,29590,2,to say moore and campbell thoroughly researche...,pos
3,0,cv000,29590,3,"the book ( or "" graphic novel , "" if you will ...",pos
4,0,cv000,29590,4,"in other words , don't dismiss this film becau...",pos
...,...,...,...,...,...,...
64715,9,cv999,14636,20,that lack of inspiration can be traced back to...,neg
64716,9,cv999,14636,21,like too many of the skits on the current inca...,neg
64717,9,cv999,14636,22,"after watching one of the "" roxbury "" skits on...",neg
64718,9,cv999,14636,23,"bump unsuspecting women , and . . . that's all .",neg


In [3]:
df["tag"].value_counts()

pos    32937
neg    31783
Name: tag, dtype: int64

In [4]:
df["tag"].replace({"neg":0 , "pos":1}, inplace=True)

In [5]:
df.head()

Unnamed: 0,fold_id,cv_tag,html_id,sent_id,text,tag
0,0,cv000,29590,0,films adapted from comic books have had plenty...,1
1,0,cv000,29590,1,"for starters , it was created by alan moore ( ...",1
2,0,cv000,29590,2,to say moore and campbell thoroughly researche...,1
3,0,cv000,29590,3,"the book ( or "" graphic novel , "" if you will ...",1
4,0,cv000,29590,4,"in other words , don't dismiss this film becau...",1


In [6]:
df.isnull().sum()

fold_id    0
cv_tag     0
html_id    0
sent_id    0
text       0
tag        0
dtype: int64

In [7]:

def cleantext(text):
  tokens = word_tokenize(text.lower())
  ftoken = [t for t in tokens if(t.isalpha())]
  stop = stopwords.words("english")
  ctoken = [t for t in ftoken if(t not in stop)]
  lemma = WordNetLemmatizer()
  ltoken = [lemma.lemmatize(t) for t in ctoken]
  return " ".join(ltoken)


In [8]:
df["clean_text"]=df["text"].apply(cleantext)

In [9]:
df.head()

Unnamed: 0,fold_id,cv_tag,html_id,sent_id,text,tag,clean_text
0,0,cv000,29590,0,films adapted from comic books have had plenty...,1,film adapted comic book plenty success whether...
1,0,cv000,29590,1,"for starters , it was created by alan moore ( ...",1,starter created alan moore eddie campbell brou...
2,0,cv000,29590,2,to say moore and campbell thoroughly researche...,1,say moore campbell thoroughly researched subje...
3,0,cv000,29590,3,"the book ( or "" graphic novel , "" if you will ...",1,book graphic novel page long includes nearly c...
4,0,cv000,29590,4,"in other words , don't dismiss this film becau...",1,word dismiss film source


In [10]:
x = df["clean_text"]
y = df["tag"]


In [11]:
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.3, random_state=1)

In [12]:
sentlen = []

for sent in df["clean_text"]:
  sentlen.append(len(word_tokenize(sent)))

df["SentLen"] = sentlen
df.head()

Unnamed: 0,fold_id,cv_tag,html_id,sent_id,text,tag,clean_text,SentLen
0,0,cv000,29590,0,films adapted from comic books have had plenty...,1,film adapted comic book plenty success whether...,25
1,0,cv000,29590,1,"for starters , it was created by alan moore ( ...",1,starter created alan moore eddie campbell brou...,15
2,0,cv000,29590,2,to say moore and campbell thoroughly researche...,1,say moore campbell thoroughly researched subje...,17
3,0,cv000,29590,3,"the book ( or "" graphic novel , "" if you will ...",1,book graphic novel page long includes nearly c...,10
4,0,cv000,29590,4,"in other words , don't dismiss this film becau...",1,word dismiss film source,4


In [13]:
max(sentlen)

76

In [14]:
np.quantile(sentlen, 0.95)

22.0

In [15]:
# 95% of the review in doc has len equals to 190

In [16]:
max_len = np.quantile(sentlen, 0.95)

In [17]:
tok = Tokenizer(char_level=False, split=" ")
#char_level	if True, every character will be treated as a token.

tok.fit_on_texts(xtrain)
tok.index_word

{1: 'film',
 2: 'movie',
 3: 'one',
 4: 'character',
 5: 'like',
 6: 'time',
 7: 'get',
 8: 'scene',
 9: 'make',
 10: 'even',
 11: 'good',
 12: 'story',
 13: 'would',
 14: 'also',
 15: 'much',
 16: 'way',
 17: 'see',
 18: 'two',
 19: 'life',
 20: 'go',
 21: 'first',
 22: 'well',
 23: 'thing',
 24: 'could',
 25: 'year',
 26: 'really',
 27: 'take',
 28: 'plot',
 29: 'come',
 30: 'know',
 31: 'little',
 32: 'people',
 33: 'bad',
 34: 'work',
 35: 'never',
 36: 'man',
 37: 'best',
 38: 'performance',
 39: 'new',
 40: 'many',
 41: 'end',
 42: 'look',
 43: 'actor',
 44: 'director',
 45: 'u',
 46: 'action',
 47: 'play',
 48: 'love',
 49: 'show',
 50: 'great',
 51: 'role',
 52: 'find',
 53: 'another',
 54: 'still',
 55: 'something',
 56: 'give',
 57: 'star',
 58: 'back',
 59: 'say',
 60: 'audience',
 61: 'want',
 62: 'world',
 63: 'made',
 64: 'seems',
 65: 'however',
 66: 'think',
 67: 'big',
 68: 'though',
 69: 'better',
 70: 'every',
 71: 'seen',
 72: 'enough',
 73: 'day',
 74: 'part',
 75:

In [18]:
vocab_len = len(tok.index_word)
vocab_len

29700

In [19]:
seqtrain = tok.texts_to_sequences(xtrain) #step1
seqtrain

[[],
 [4179, 3400, 3157, 3054, 908, 18877, 1440, 2944, 138],
 [1682, 97, 581, 15, 722, 635, 31, 435, 361, 43],
 [4831,
  3276,
  7,
  1035,
  1441,
  33,
  76,
  161,
  72,
  284,
  186,
  15,
  70,
  1816,
  4,
  445,
  4832,
  16],
 [239, 5, 202, 198, 3526, 1133, 4588, 66],
 [2, 3158, 29, 642, 364, 2945, 1627, 5748, 486, 372, 4004, 909, 801],
 [2401, 325, 150, 235, 2946, 2596, 70, 606, 5405, 83, 4833],
 [3401,
  994,
  4834,
  192,
  18,
  18878,
  9291,
  4835,
  2597,
  4180,
  636,
  501,
  12185,
  2598,
  2026,
  192,
  121,
  20,
  1298,
  943,
  6115,
  438,
  8365,
  5749,
  582,
  7027,
  1176,
  324,
  2402,
  14653,
  10483],
 [34, 158, 218, 319, 218, 92, 7614, 583, 14654, 192, 92, 3402, 3824],
 [516, 70, 81, 3277, 6116, 14655, 600, 5, 1683, 14656],
 [144, 95, 1, 163, 70, 1540, 1442],
 [615, 14657, 1221, 218, 653, 615, 5109, 687, 1253, 319, 4371],
 [44,
  7615,
  18879,
  852,
  6502,
  3527,
  573,
  107,
  18880,
  226,
  1988,
  8366,
  109,
  1684,
  822,
  42,
  1865,

In [20]:
seqmattrain = sequence.pad_sequences(seqtrain, maxlen= int(max_len)) #step2
seqmattrain

array([[   0,    0,    0, ...,    0,    0,    0],
       [   0,    0,    0, ..., 1440, 2944,  138],
       [   0,    0,    0, ...,  435,  361,   43],
       ...,
       [   0,    0,    0, ..., 2756,  887,  275],
       [   0,    0,    0, ...,  286, 4352,   30],
       [   0,    0,    0, ...,  245,  517,  984]], dtype=int32)

In [21]:
seqtest = tok.texts_to_sequences(xtest)
seqmattest = sequence.pad_sequences(seqtest, maxlen=int(max_len))

In [22]:
vocab_len

29700

In [23]:
rnn = Sequential()

rnn.add(Embedding(vocab_len+1,190, input_length=int(max_len), mask_zero=True))
rnn.add(SimpleRNN(units=32, activation="tanh"))
rnn.add(Dense(units=32, activation="relu"))
rnn.add(Dropout(0.2))

rnn.add(Dense(units=1, activation="sigmoid"))

rnn.compile(optimizer="adam", loss="binary_crossentropy")

rnn.fit(seqmattrain, ytrain, batch_size=50, epochs=25)

ypred = rnn.predict(seqmattest)

#set threshold
ypred = np.where(ypred<0.5,0,1)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [24]:
from sklearn.metrics import classification_report
print(classification_report(ytest, ypred))

              precision    recall  f1-score   support

           0       0.62      0.62      0.62      9431
           1       0.64      0.63      0.64      9985

    accuracy                           0.63     19416
   macro avg       0.63      0.63      0.63     19416
weighted avg       0.63      0.63      0.63     19416

