In [1]:
import pandas as pd
import numpy as np
import re
import nltk
import tensorflow as tf
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet
from tensorflow.keras.layers import LSTM, Dense, Embedding
from tensorflow.keras.layers import ReLU
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.models import Model, Sequential
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
#read data
data = pd.read_csv("/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv")

In [5]:
#size of data
data.shape

(50000, 2)

In [6]:
data.columns

Index(['review', 'sentiment'], dtype='object')

In [7]:
#check null value 
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [8]:
data.head(10)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


In [9]:
df = data[0:5000]

In [10]:
df.shape

(5000, 2)

In [11]:
#filtering the text
def full_form(text):
    text = text.lower()
    plain = re.sub(r'[<>?\.,!"(\)\/[\]]', '', text)
    plain = plain.replace("don't", "do not")
    plain = plain.replace("won't", "will not")
    plain = plain.replace("haven't", "have not")
    plain = plain.replace("can't", "cannot")
    plain = plain.replace("she's", "she is")
    plain = plain.replace("he's", "he is")
    plain = plain.replace("there're", "there are")
    plain = plain.replace("they'd", "they would")
    plain = plain.replace("\'ll", " will")
    return plain              

In [12]:
#initialize the stemmer and lemmentizer
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [13]:
#preprocessing  text 
def remove_stop_words(data):
  corpus = []
  for i in range(0, len(data)):
    #split the sentence
    plain = full_form(data['review'][i])
    sentence = plain.split()
    
    #check and stem the word
    review_processed = [stemmer.stem(word) for word in sentence if not word in stopwords.words('english')]
    #rebuild the sentence
    review_joint = ' '.join(review_processed)
    #add the sentence into list
    corpus.append(review_joint)
  return corpus

In [14]:
review_processed = remove_stop_words(df)

In [15]:
#vocabular size
voc_size = 5000

In [16]:
#convert into one hot vector
onehot_text = [one_hot(word, voc_size) for word in review_processed]

In [17]:
len(onehot_text[100])

79

In [18]:
#y train data
y = df['sentiment']
onehot_y = [one_hot(char,3) for char in y]

In [19]:
onehot_y[0:10]

[[1], [1], [1], [1], [1], [1], [1], [1], [1], [1]]

# Embedding

In [20]:
sent_length = 200

In [21]:
#embedding
embedd_docs = pad_sequences(onehot_text, padding='post', maxlen=sent_length)

In [22]:
len(review_processed[0].split()), len(onehot_text[0])

(165, 165)

In [23]:
embedd_docs[0]

array([1788,  560, 2816, 4662, 4197, 1568, 4573, 2218,  265, 1887,  439,
       4223,  581,  417, 3065, 3987, 1568,  719, 1280, 1692, 4202, 4010,
        265,  443, 2704, 1836, 1914, 2274, 3269,  101, 1914, 3162, 2479,
       2503, 4781, 1348, 4202, 4204, 3440, 1031, 1591,  581, 4853, 1568,
       2797, 2173,  670, 2611, 3398, 1595, 2270, 2009, 4605, 4951, 2254,
       1464, 3118, 1350, 3872, 1033, 3515,  433, 4476, 2842, 2895, 1138,
       4194, 2254,  903, 2183, 3909, 4027, 4390, 2955, 3190,  748, 1058,
       1218, 1489, 2987, 1154, 3349, 3735, 3043,  311, 1942, 3664,  581,
        234, 4966, 1857, 3132, 1914, 3896, 3543, 1214, 1914,   28, 2155,
       3236, 3987, 1334, 1766, 3893, 2155,  340, 2155, 2772, 3691,  476,
        417, 4573, 2282, 2384, 3987, 1326,  769, 4966, 3710, 4662, 2207,
       2189, 1568, 2379, 3754, 2895,  291, 4066, 4202, 4202, 3122, 4887,
       4607,  111, 4565,  364,  925, 2183,  653, 3488, 3692, 2338,  199,
       3820,  364, 1208, 1350, 3787, 3896, 3165, 20

In [24]:
#shape of data
len(embedd_docs), len(onehot_y)

(5000, 5000)

In [25]:
#convert into numpy array
X_data = np.array(embedd_docs)
y_data = np.array(onehot_y)
#y_data = np.squeeze(y_data)

In [26]:
X_data.shape, y_data.shape

((5000, 200), (5000, 1))

In [27]:
#split dataset for training and testing
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.3, random_state=42)

In [28]:
print(f'training: {X_train.shape} - {y_train.shape}')
print(f'testing: {X_test.shape} - {y_test.shape}')

training: (3500, 200) - (3500, 1)
testing: (1500, 200) - (1500, 1)


In [29]:
y_train

array([[1],
       [1],
       [1],
       ...,
       [1],
       [1],
       [1]])

# Model

In [30]:
#embed vector that represents each token by 40 featured vector
embed_vect = 40
model = Sequential()
model.add(Embedding(voc_size, embed_vect, input_length=sent_length))
model.add(LSTM(512, return_sequences=True, input_shape=(200,1)))
#model.add(LSTM(512, return_sequences=True))
model.add(Dense(units=16, activation='relu', input_dim=X_train.shape[1]))
model.add(Dense(units=16, activation='relu'))
model.add(Dense(units=8, activation='relu'))
model.add(Dense(units=1,activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 200, 40)           200000    
                                                                 
 lstm (LSTM)                 (None, 200, 512)          1132544   
                                                                 
 dense (Dense)               (None, 200, 16)           8208      
                                                                 
 dense_1 (Dense)             (None, 200, 16)           272       
                                                                 
 dense_2 (Dense)             (None, 200, 8)            136       
                                                                 
 dense_3 (Dense)             (None, 200, 1)            9         
                                                                 
Total params: 1,341,169
Trainable params: 1,341,169
Non-

In [31]:
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=16)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f529c102d90>

In [32]:
model.evaluate(X_test, y_test)



[2.1124612885614624e-06, 1.0]