In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
import os
import gensim
import tensorflow.keras as keras

In [2]:
# Load the labelled training and test reviews.
# The data directory defaults to the original local folder but can be
# overridden with the SENTIMENT_DATA_DIR environment variable, so the notebook
# also runs on machines where that absolute path does not exist.
DATA_DIR = os.environ.get("SENTIMENT_DATA_DIR", r'C:\Users\Hamza\Sentiment analysis')
imdb_reviews = pd.read_csv(os.path.join(DATA_DIR, 'imdb_reviews.csv'))
test_reviews = pd.read_csv(os.path.join(DATA_DIR, 'test_reviews.csv'))

In [36]:
# Preview the first rows of the training frame (review text plus sentiment label).
imdb_reviews.head()

Unnamed: 0,Reviews,Sentiment
0,<START this film was just brilliant casting lo...,positive
1,<START big hair big boobs bad music and a gian...,negative
2,<START this has to be one of the worst films o...,negative
3,<START the <UNK> <UNK> at storytelling the tra...,positive
4,<START worst mistake of my life br br i picked...,negative


In [4]:
#Data Preprocessing
# Load the word -> integer index table used to encode the reviews.
# The directory defaults to the original local folder and can be overridden
# with the SENTIMENT_DATA_DIR environment variable.
word_index = pd.read_csv(os.path.join(os.environ.get("SENTIMENT_DATA_DIR", r'C:\Users\Hamza\Sentiment analysis'), 'word_indexes.csv'))

In [5]:
# Inspect the vocabulary table: one row per word with its integer index.
word_index.head()

Unnamed: 0,Words,Indexes
0,tsukino,52009
1,nunnery,52010
2,sonja,16819
3,vani,63954
4,woods,1411


In [6]:
# Collapse the vocabulary table into a plain word -> index lookup.
# NOTE: this rebinds word_index from a DataFrame to a dict, so this cell cannot
# be re-run without first reloading the CSV.
word_index={word:idx for word,idx in zip(word_index.Words,word_index.Indexes)}

In [7]:
# Register the special tokens on indices 0-3.
# "<START" has no closing ">" - that is how the token is spelled at the
# beginning of every review in the data, so the key must match it exactly.
word_index.update({"<PAD>":0,"<START":1,"<UNK>":2,"<UNUSED>":3})

In [8]:
#Now we define a function review_encoder that encodes the reviews into integer format according to the mapping specified by word_index file.
def review_encoder(text, vocab=None):
  """Encode a tokenized review as a list of integer word indexes.

  Args:
    text: iterable of word tokens (one already-split review).
    vocab: optional word -> index mapping; defaults to the notebook-level
      ``word_index`` dict when omitted.

  Returns:
    A list with one integer per token. Tokens missing from the vocabulary are
    mapped to the ``<UNK>`` index instead of raising ``KeyError``.
  """
  if vocab is None:
    vocab=word_index
  # Fall back to 2 (the index this notebook assigns to "<UNK>") if the mapping
  # does not define the token itself.
  unknown_index=vocab.get("<UNK>",2)
  return [vocab.get(word,unknown_index) for word in text]


In [9]:
# Separate the review text from the sentiment column for both splits.
train_data=imdb_reviews['Reviews']
train_labels=imdb_reviews['Sentiment']
test_data=test_reviews['Reviews']
test_labels=test_reviews['Sentiment']

In [10]:
#Tokenize every review on whitespace before encoding it as integers,
#e.g. "The movie was wonderful" -> ["The", "movie", "was", "wonderful"].
#NOTE: the Series are overwritten in place, so re-running this cell fails (lists have no split()).

train_data=train_data.map(lambda text:text.split())
test_data=test_data.map(lambda text:text.split())

In [11]:
#With the reviews tokenized, run review_encoder over each one to turn its words into integer indexes.
train_data=train_data.map(review_encoder)
test_data=test_data.map(review_encoder)

In [12]:
# Sanity-check the encoding: each review should now be a list of integers.
train_data.head()

0    [1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, ...
1    [1, 194, 1153, 194, 8255, 78, 228, 5, 6, 1463,...
2    [1, 14, 47, 8, 30, 31, 7, 4, 249, 108, 7, 4, 5...
3    [1, 4, 2, 2, 33, 2804, 4, 2040, 432, 111, 153,...
4    [1, 249, 1323, 7, 61, 113, 10, 10, 13, 1637, 1...
Name: Reviews, dtype: object

In [13]:
#We also need to encode the sentiments and we are labeling the positive sentiment as 1 and negative sentiment as 0.
def encode_sentiments(x):
  """Return 1 when the label equals 'positive', and 0 for any other value."""
  return 1 if x=='positive' else 0


In [14]:
# Convert the textual sentiment labels to 1 / 0 with encode_sentiments.
train_labels=train_labels.map(encode_sentiments)
test_labels=test_labels.map(encode_sentiments)

In [15]:
# Bring every encoded review to one fixed length so the reviews can be batched:
# shorter reviews are padded at the end ('post') with the <PAD> index, longer
# ones are cut down to MAX_REVIEW_LENGTH tokens.
# MAX_REVIEW_LENGTH must match the input length the model is built with.
MAX_REVIEW_LENGTH=500
PAD_INDEX=word_index["<PAD>"]
train_data=keras.preprocessing.sequence.pad_sequences(train_data,value=PAD_INDEX,padding='post',maxlen=MAX_REVIEW_LENGTH)
test_data=keras.preprocessing.sequence.pad_sequences(test_data,value=PAD_INDEX,padding='post',maxlen=MAX_REVIEW_LENGTH)

In [17]:
# Averaged-embedding classifier: embed each token, average the embeddings over
# the sequence, then pass the result through a small dense head that ends in a
# single sigmoid unit.
VOCAB_SIZE=10000       # number of rows in the embedding table
EMBEDDING_DIM=16       # size of each word vector
SEQUENCE_LENGTH=500    # must equal the maxlen used when padding the reviews
# NOTE(review): this assumes every encoded index is < VOCAB_SIZE. The vocabulary
# file contains larger indexes (e.g. 52009), so confirm the reviews only use
# the most frequent words.

# Seed TensorFlow so weight initialisation is repeatable across kernel restarts.
tf.random.set_seed(42)

model=keras.Sequential([keras.layers.Embedding(VOCAB_SIZE,EMBEDDING_DIM,input_length=SEQUENCE_LENGTH),
                        keras.layers.GlobalAveragePooling1D(),
                        keras.layers.Dense(16,activation='relu'),
                        keras.layers.Dense(1,activation='sigmoid')])

In [18]:
# Binary classification set-up: cross-entropy loss on the sigmoid output,
# Adam optimizer, accuracy reported as the metric.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [19]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 500, 16)           160000    
                                                                 
 global_average_pooling1d (G  (None, 16)               0         
 lobalAveragePooling1D)                                          
                                                                 
 dense (Dense)               (None, 16)                272       
                                                                 
 dense_1 (Dense)             (None, 1)                 17        
                                                                 
Total params: 160,289
Trainable params: 160,289
Non-trainable params: 0
_________________________________________________________________


In [20]:
#Train the model for 30 epochs with mini-batches of 512 reviews.
#NOTE(review): the test split is passed as validation data here and is evaluated
#again after training, so the final test score does not come from untouched data.
history=model.fit(
    x=train_data,
    y=train_labels,
    batch_size=512,
    epochs=30,
    validation_data=(test_data,test_labels),
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [21]:
# Score the trained model on the test split. With the single 'accuracy' metric
# compiled above, evaluate() yields the loss followed by the accuracy.
loss,accuracy=model.evaluate(x=test_data,y=test_labels)



In [22]:
# Pick one test review at random and show it.
# The upper bound is taken from the frame itself instead of a hard-coded 1000,
# and the range starts at 0 so the first row can be selected too.
index=np.random.randint(len(test_reviews))
# Positional lookup, so it refers to the same row as test_data[index] (a
# positionally indexed array) regardless of the frame's index labels.
user_review=test_reviews.iloc[index]
print(user_review)


Reviews      <START i thoroughly enjoyed <UNK> <UNK> story ...
Sentiment                                             positive
Name: 361, dtype: object


In [23]:
# Classify the review selected above using its padded, integer-encoded form.
# The model expects a batch, so wrap the single review in a leading batch axis.
user_review=np.expand_dims(test_data[index],axis=0)
# The final layer is a single sigmoid unit: one probability per input row.
# Pull out the scalar rather than relying on the truthiness of a 1x1 array.
positive_probability=float(model.predict(user_review)[0][0])
if positive_probability>0.5:
  print("positive sentiment")
else:
  print("negative sentiment")


positive sentiment


In [24]:
# Predicted positive-sentiment probabilities for every review in the test split.
y_pred = model.predict(test_data)

In [33]:
# Spot-check the predicted probability for one test review (row 40, chosen arbitrarily).
print(y_pred[40])

[0.99585056]
