# Imports and Dataset Loading

In [None]:
import keras

from keras.datasets import imdb

from keras.layers import LSTM, Activation, Dropout, Dense, Input
from keras.layers.embeddings import Embedding

from keras.models import Model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np
import string
import re

In [None]:
# Colab-only step: mount Google Drive at /content/drive so that the dataset
# and GloVe files referenced by the later cells can be read from this runtime.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
# Read the review spreadsheet from the mounted Drive folder.
dataset_path = '/content/drive/My Drive/dataset/dataset.xlsx'
data = pd.read_excel(dataset_path, engine='openpyxl')

# Lowercase the review text so that the later stopword matching and
# tokenization are case-insensitive.
data = data.assign(Text=data['Text'].str.lower())

# Dataset cleaning and analysing

In [None]:
# English stopwords (articles, pronouns, auxiliaries, prepositions and their
# contractions) that are stripped from the lowercased review text.
# Kept as one whitespace-separated block and split into a list of words;
# negations such as "no" / "not" are deliberately absent from the list.
stopwords = """
a about above after again against all am an and any are as at be because
been before being below between both but by could did do does doing down during
each few for from further had has have having he he'd he'll he's her here
here's hers herself him himself his how how's i i'd i'll i'm i've if in into
is it it's its itself let's me more most my myself nor of on once only or
other ought our ours ourselves out over own same she she'd she'll she's should
so some such than that that's the their theirs them themselves then there there's
these they they'd they'll they're they've this those through to too under until up
very was we we'd we'll we're we've were what what's when when's where where's
which while who who's whom why why's with would you you'd you'll you're you've
your yours yourself yourselves
""".split()

In [None]:
def remove_stopwords(data, stopword_list=None):
  """Add a 'review without stopwords' column to ``data`` and return ``data``.

  Each value of the 'Text' column is split on whitespace, every token that
  appears in the stopword list is dropped, and the remaining tokens are
  re-joined with single spaces. Matching is exact and case-sensitive, so the
  text is expected to be lowercased already and tokens that still carry
  punctuation (e.g. "the,") are kept.

  Args:
    data: pandas DataFrame with a string column named 'Text'. It is modified
      in place (the new column is added to it).
    stopword_list: optional iterable of words to remove. When omitted, the
      notebook-level ``stopwords`` list is used.

  Returns:
    The same DataFrame object that was passed in.
  """
  # Fall back to the notebook-level list when the caller gives none.
  if stopword_list is None:
    stopword_list = stopwords
  # A set gives constant-time membership tests instead of scanning the whole
  # list once per word of every review.
  stopword_set = set(stopword_list)
  data['review without stopwords'] = data['Text'].apply(
      lambda text: ' '.join(word for word in text.split() if word not in stopword_set))
  return data

def remove_tags(string):
    """Return ``string`` with every ``<...>`` tag-like span removed.

    The match is non-greedy, so each tag is removed individually, and it does
    not cross line breaks. Text outside the tags is returned untouched.
    """
    tag_pattern = re.compile('<.*?>')
    return tag_pattern.sub('', string)
    
# Cleaning pipeline: drop stopwords -> strip HTML tags -> turn punctuation into spaces.
data_without_stopwords = remove_stopwords(data)
data_without_stopwords['clean_review'] = data_without_stopwords['review without stopwords'].apply(remove_tags)

# regex=True must be explicit: from pandas 2.0 on, Series.str.replace treats the
# pattern as a literal string by default, which would silently leave all
# punctuation in place. re.escape makes the character class well-formed for
# every punctuation character (including the backslash and the brackets).
punctuation_pattern = '[{}]'.format(re.escape(string.punctuation))
data_without_stopwords['clean_review'] = data_without_stopwords['clean_review'].str.replace(punctuation_pattern, ' ', regex=True)

In [None]:
# Preview the first rows to compare the raw 'Text' column with the
# stopword-free and fully cleaned versions.
data_without_stopwords.head()

Unnamed: 0,URL,Text,Sentiment,review without stopwords,clean_review
0,http://www.imdb.com/title/tt0210075/usercomments,girlfight follows a project dwelling new york ...,POS,girlfight follows project dwelling new york hi...,girlfight follows project dwelling new york hi...
1,http://www.imdb.com/title/tt0337640/usercomments,hollywood north is an euphemism from the movie...,POS,hollywood north euphemism movie industry went ...,hollywood north euphemism movie industry went ...
2,http://www.imdb.com/title/tt0303549/usercomments,that '70s show is definitely the funniest show...,POS,'70s show definitely funniest show currently t...,70s show definitely funniest show currently t...
3,http://www.imdb.com/title/tt0716825/usercomments,"9/10- 30 minutes of pure holiday terror. okay,...",POS,"9/10- 30 minutes pure holiday terror. okay, no...",9 10 30 minutes pure holiday terror okay no...
4,http://www.imdb.com/title/tt0182225/usercomments,"a series of random, seemingly insignificant th...",POS,"series random, seemingly insignificant thefts ...",series random seemingly insignificant thefts ...


In [None]:
# Cleaned reviews as a plain Python list, in row order. tolist() is positional,
# so it works for any DataFrame index (a label-based [i] lookup would require
# a default 0..n-1 index and fail after filtering or shuffling rows).
reviews_list = data_without_stopwords['clean_review'].tolist()

# Sentiment labels, aligned row-by-row with reviews_list.
sentiment = data_without_stopwords['Sentiment']

In [None]:
# Binary targets: 1 for reviews labelled "POS", 0 for every other label.
y = np.array([1 if label == "POS" else 0 for label in sentiment])

### Data split

In [None]:
# Hold out 20% of the reviews for testing; the fixed random_state keeps the
# split identical between runs.
X_train, X_test,Y_train, Y_test = train_test_split(reviews_list, y, test_size=0.2, random_state = 45)

### Data Tokenization

In [None]:
# Fit the tokenizer on the training reviews only, so the vocabulary is not
# influenced by the held-out test set.
# NOTE(review): num_words=5000 presumably limits texts_to_sequences to the most
# frequent words while word_index still lists every word seen - confirm
# against the Keras Tokenizer documentation.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)

# Word -> integer index mapping learned from the training reviews.
words_to_index = tokenizer.word_index
#type(words_to_index)

# Building the GloVe (Global Vectors) word-to-vector map

In [None]:
def read_glove_vector(glove_vec):
  """Load a GloVe text file into a ``{word: vector}`` dictionary.

  Each non-empty line is expected to hold a word followed by its
  whitespace-separated vector components.

  Args:
    glove_vec: path of the UTF-8 encoded GloVe text file.

  Returns:
    dict mapping each word (str) to a 1-D ``np.float64`` array built from the
    remaining fields of its line.

  Raises:
    OSError: if the file cannot be opened.
    ValueError: if a vector component cannot be parsed as a float.
  """
  word_to_vec_map = {}
  with open(glove_vec, 'r', encoding='UTF-8') as f:
    for line in f:
      fields = line.split()
      # Skip blank lines (e.g. a trailing newline) instead of failing on fields[0].
      if not fields:
        continue
      word_to_vec_map[fields[0]] = np.array(fields[1:], dtype=np.float64)
  return word_to_vec_map

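Download the pre-trained GloVe file `glove.6B.50d.txt` from one of the sources below and place it in the mounted Drive folder (`/content/drive/My Drive/dataset/`) before running the next cells: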

https://www.kaggle.com/watts2/glove6b50dtxt
 
https://nlp.stanford.edu/projects/glove/


In [None]:
# !unzip '/content/drive/My Drive/dataset/archive.zip' -d '/content/drive/My Drive/dataset/'

In [None]:
# Load the pre-trained GloVe vectors from Drive; the file must already exist
# at this path.
word_to_vec_map = read_glove_vector('/content/drive/My Drive/dataset/glove.6B.50d.txt')
# Fixed sequence length used for the padded reviews and the embedding layer.
maxLen = 150

In [None]:
# Sanity checks: embedding dimensionality and tokenizer vocabulary size.
# Only the last expression of a cell is echoed, so the first value is
# computed but not displayed.
word_to_vec_map['moon'].shape[0]
len(words_to_index)

50

In [None]:
# The Keras Tokenizer numbers words from 1 and pad_sequences fills with 0, and
# the Embedding layer looks a token up in the row equal to its id. The matrix
# therefore needs one extra row: row 0 stays all-zero for padding and row i
# holds the vector of the word whose tokenizer index is i.
vocab_len = len(words_to_index) + 1
embed_vector_len = word_to_vec_map['moon'].shape[0]

emb_matrix = np.zeros((vocab_len, embed_vector_len))

for word, index in words_to_index.items():
  embedding_vector = word_to_vec_map.get(word)
  # Words that GloVe does not know keep their all-zero row.
  if embedding_vector is not None:
    emb_matrix[index, :] = embedding_vector

# Frozen (trainable=False) embedding layer initialised with the GloVe matrix.
embedding_layer = Embedding(input_dim=vocab_len, output_dim=embed_vector_len, input_length=maxLen, weights = [emb_matrix], trainable=False)

### Defining Model

In [None]:
def movie_rating(input_shape):
  """Build the (uncompiled) sentiment classification model.

  The token ids go through the notebook-level ``embedding_layer``, then
  through three stacked 128-unit LSTM layers (with dropout of 0.6 after the
  first two) and a single sigmoid unit.

  Args:
    input_shape: shape passed straight to ``Input`` for the token-id sequence.

  Returns:
    A Keras ``Model`` mapping token-id sequences to one sigmoid output.
  """
  token_ids = Input(input_shape)
  features = embedding_layer(token_ids)

  # The first two LSTMs return full sequences so the next LSTM can consume
  # them; the last one returns only its final output.
  features = LSTM(128, return_sequences=True)(features)
  features = Dropout(0.6)(features)
  features = LSTM(128, return_sequences=True)(features)
  features = Dropout(0.6)(features)
  features = LSTM(128)(features)
  probability = Dense(1, activation='sigmoid')(features)

  return Model(inputs=token_ids, outputs=probability)

In [None]:
# Encode the training reviews as integer sequences and bring them all to
# length maxLen, padding at the end (padding='post').
X_train_indices = pad_sequences(
    tokenizer.texts_to_sequences(X_train), maxlen=maxLen, padding='post')

In [None]:
# Encode the test reviews with the tokenizer fitted on the training set and
# bring them all to length maxLen, padding at the end (padding='post').
X_test_indices = pad_sequences(
    tokenizer.texts_to_sequences(X_test), maxlen=maxLen, padding='post')

In [None]:
# X_train_indices.shape

(1599, 150)

In [None]:
# Build the network for inputs whose length equals the padded sequence length.
model = movie_rating(X_train_indices.shape[1])

In [None]:
# Train with Adam and binary cross-entropy, which matches the single sigmoid
# output; accuracy is tracked as the reporting metric.
adam = keras.optimizers.Adam(learning_rate = 0.0001)
model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])
# No validation data is passed, so only training metrics are reported per epoch.
model.fit(X_train_indices, Y_train, batch_size=64, epochs=15)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x7f62a5684e50>

In [None]:
# Score the trained model on the held-out test set; the result lists the loss
# followed by the metrics given to compile() (here: accuracy).
model.evaluate(X_test_indices, Y_test)



[0.6922664642333984, 0.5274999737739563]

### Testing

In [None]:
# Predict sigmoid scores for the whole test set, then spot-check one randomly
# chosen review against its true label.
preds = model.predict(X_test_indices)
n = np.random.randint(0,len(X_test))
# X_test[n]

# Scores above 0.5 are read as positive, everything else as negative.
predicted_label = 'positive' if preds[n] > 0.5 else 'negative'
correct_label = 'positive' if Y_test[n] == 1 else 'negative'

print('predicted sentiment : ' + predicted_label)
print('correct sentiment : ' + correct_label)

precicted sentiment : negative
correct sentiment : negative


In [None]:
# Shape of the true test labels.
Y_test.shape

In [None]:
# Shape of the model's predictions, for comparison with the label shape.
preds.shape

In [None]:
# First five true labels.
Y_test[:5]

In [None]:
# First five raw sigmoid scores produced by the model.
preds[:5]

In [None]:
def get_pred(preds):
  """Turn one sigmoid score into a class label.

  Returns 1 when the score is strictly greater than 0.5, otherwise 0.
  """
  return 1 if preds > 0.5 else 0

In [None]:
# Threshold every predicted score into a 0/1 class label.
predictions = list(map(get_pred, preds))

In [None]:
# Per-class precision, recall and F1 of the thresholded predictions against
# the true test labels.
from sklearn.metrics import classification_report
print(classification_report(Y_test, predictions))

              precision    recall  f1-score   support

           0       0.49      0.35      0.41       187
           1       0.54      0.69      0.61       213

    accuracy                           0.53       400
   macro avg       0.52      0.52      0.51       400
weighted avg       0.52      0.53      0.51       400

