# Movie Reviews Sentiment Analysis
This project predicts the sentiment of movie reviews using a deep-learning model, a Convolutional Neural Network (CNN). We first train the model together with its embedding layer, and then we load a separately pre-trained embedding layer into the model.

In [None]:
# unzip the dataset
!tar -xvf 'review_polarity.tar.gz'

# Libraries

In [None]:
from os import listdir
import nltk
from nltk.corpus import stopwords
import string
from collections import Counter
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Flatten, Embedding, Conv1D, MaxPool1D
from tensorflow.keras.models import Sequential
from sklearn.model_selection import train_test_split 

In [None]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# English stop words as a set; clean_data uses it to drop common words.
stopingwords = set(stopwords.words('english'))
# Translation table that deletes every character listed in string.punctuation.
table = str.maketrans('','',string.punctuation)


# Dataset & pre-processing
You can download the dataset from [here](https://raw.githubusercontent.com/jbrownlee/Datasets/master/review_polarity.tar.gz). 

After unzipping the dataset, we can see that it has two folders, one for positive reviews and one for negative reviews.

In [None]:
# func to load the dataset
def load_data(path):
  """Return the whole text content of the file at `path`.

  The file is opened in text mode with the default encoding. The `with`
  block closes the handle even when reading raises.

  Args:
    path: path of the file to read.

  Returns:
    The file content as a single string.
  """
  with open(path) as file:
    return file.read()

In [None]:
# clean the data from stopping words and special characters
def clean_data(data):
  """Turn raw review text into a list of cleaned word tokens.

  The text is split on whitespace. A token is dropped when it is a stop
  word (checked before punctuation is stripped). The remaining tokens have
  punctuation removed and are kept only if they are longer than one
  character and purely alphabetic.
  """
  cleaned = []
  for raw_word in data.split():
    if raw_word in stopingwords:
      continue
    word = raw_word.translate(table)
    if len(word) > 1 and word.isalpha():
      cleaned.append(word)
  return cleaned

In [None]:
# to calculate the number of words
def add_to_vocab(data, vocab=None):
  """Add the occurrences of every token in `data` to a vocabulary counter.

  Args:
    data: iterable of tokens to count.
    vocab: Counter to update. When omitted, the notebook-level `counter`
      is used, so that variable must already exist at call time.
  """
  # Fall back to the global counter so existing one-argument calls keep working.
  target = counter if vocab is None else vocab
  target.update(data)

In [None]:
# this function wrap up loading the data, clean it, calcualte the words numbers
def process_data(path):
  """Count the words of every file found directly inside directory `path`.

  Each file is read with load_data, tokenised with clean_data, and the
  resulting tokens are handed to add_to_vocab.
  """
  for entry in listdir(path):
    review_text = load_data(path + '/' + entry)
    add_to_vocab(clean_data(review_text))

In [62]:
# find out how many distinct words we have in this dataset
# `counter` is the notebook-level Counter that add_to_vocab updates;
# re-creating it here starts the count from zero
counter = Counter()

neg_path = 'txt_sentoken/neg'
pos_path = 'txt_sentoken/pos'
process_data(neg_path)
process_data(pos_path)
# uncomment to see the vocabulary with its counts
# print(counter)
print(len(counter))

46624


In [None]:
# filter the vocabulary based on the number of occurrences
# we keep the words that occur at least min_occur (5) times
print(len(counter))
min_occur = 5
vocabs = [k for k,c in counter.items() if c >= min_occur]
print(len(vocabs))

46624
14807


In [None]:
# to save the new filtered vocabularies
def save_list(lines, fileName):
  """Write the strings in `lines` to `fileName`, one per line.

  The items are joined with newline characters (no trailing newline) and
  any existing file is overwritten. The `with` block closes the handle even
  when writing raises.

  Args:
    lines: iterable of strings to save.
    fileName: path of the file to write.
  """
  with open(fileName, 'w') as file:
    file.write('\n'.join(lines))

save_list(vocabs, 'Vocabs.txt')

In [None]:
# load the saved vocabulary (one word per line) back as a single string;
# the with-block closes the file even if reading fails
with open('Vocabs.txt') as file:
  vocabs = file.read()
print(len(vocabs.split('\n')))

14807


In [None]:
# split the loaded vocabulary on new lines and keep it as a set:
# filter_review tests `w in loaded_vocabs` once per token, which is a hash
# lookup on a set but a linear scan on a list
loaded_vocabs = set(vocabs.split('\n'))
len(loaded_vocabs)

14807

In [None]:
# func to filter one review based on the new vocabularies
def filter_review(tokens):
  """Return the tokens found in `loaded_vocabs`, joined by single spaces.

  Token order is preserved; tokens outside the vocabulary are dropped.
  """
  kept = []
  for token in tokens:
    if token in loaded_vocabs:
      kept.append(token)
  return ' '.join(kept)

In [None]:
# func for filter all the reviews
def filter_reviews(path):
  """Return one vocabulary-filtered review string per file in `path`.

  Every file is read with load_data, tokenised with clean_data, and
  reduced to in-vocabulary words with filter_review. The results follow
  the order returned by listdir.
  """
  return [
    filter_review(clean_data(load_data(path + '/' + entry)))
    for entry in listdir(path)
  ]

In [None]:
# filter the negative and the positive reviews so they only contain words from the saved vocabulary
neg_path = 'txt_sentoken/neg'
pos_path = 'txt_sentoken/pos'
neg_reviews = filter_reviews(neg_path)
print(len(neg_reviews))
print(neg_reviews[:2])
save_list(neg_reviews, 'neg_reviews.txt')
# positives
pos_reviews = filter_reviews(pos_path)
print(len(pos_reviews))
print(pos_reviews[:2])
save_list(pos_reviews, 'pos_reviews.txt')


1000
['whether would considered probably depends would ask fan original series recently retired followup well even numbered entries film series however never one folks store away trek get relative merits vs data somewhere along line star trek film series began seem directed latter category star trek generations may natural conclusion direction production values may high writing frequently appalling instead script collection references injokes characters generations opens late century members original enterprise crew including captain james kirk william shatner present latest ship bear name sooner maiden voyage distress signal brings mysterious energy among rescued alien dr malcolm mcdowell back real world years later still trying get back encounters enterprise one led captain jeanluc picard patrick stewart plan involves destroying star inhabited planet hope saving million people historic meeting two enterprise think insult fans star trek suggest certain extent particulars plot really i

In [None]:
# combine the negative and positive reviews together as the training data
# (negatives first, then positives)
train_data = neg_reviews + pos_reviews
print(len(train_data))

2000


In [None]:
# define the labels for the reviews: zeros for the negative reviews followed by ones for the positive reviews
# 0: negative review
# 1: positive review
neg_labels = np.zeros((1,len(neg_reviews)))
pos_labels = np.ones((1,len(pos_reviews)))
# join the two rows along axis 1, then reshape into a single column of labels
labels = np.concatenate((neg_labels,pos_labels),axis=1).reshape(-1,1)
labels.shape

(2000, 1)

In [None]:
# fit the tokenizer on all the reviews in train_data
# NOTE(review): this runs before the train/test split, so the tokenizer also sees the reviews that end up in the test set
tokeniser = Tokenizer()
tokeniser.fit_on_texts(train_data)

In [None]:
# uncomment this to see each word with its counts
# tokeniser.word_counts

In [None]:
# make sure the output of the tokenzier has the same saved vocab counts 
len(tokeniser.word_counts)

14807

In [None]:
# convert the words into numbers
encoded_docs = tokeniser.texts_to_sequences(train_data)

In [None]:
# pad the reviews at the end (padding='post') up to the maximum review length in the dataset
max_length = np.array([len(review) for review in encoded_docs]).max()
padded_reviews = pad_sequences(encoded_docs,maxlen=max_length,padding='post')

In [None]:
len(padded_reviews[0])

1238

In [None]:
padded_reviews.shape

(2000, 1238)

In [None]:
# split the data into train + test sets: 30% is held out for testing, with a fixed random_state so the split is repeatable
x_train, x_test, y_train, y_test = train_test_split(padded_reviews, labels, test_size=0.3,random_state=7)

In [None]:
# Number of rows in the Embedding layer: one per word known to the tokenizer,
# plus one because the Tokenizer never assigns index 0 to a word (index 0 is
# what the padded positions contain). Taking the size from the tokenizer keeps
# it in step with the integer codes actually fed to the model.
vocabs_num = len(tokeniser.word_index) + 1
vocabs_dim = 100

In [None]:
# this is the model we use for Sentiment Analysis; its Embedding layer is
# trained together with the rest of the network
model = Sequential()
model.add(Embedding(vocabs_num, vocabs_dim, input_length=max_length))
model.add(Conv1D(64,8,activation='relu'))
model.add(MaxPool1D(2))
model.add(Flatten())
model.add(Dense(10, activation='relu'))
# single sigmoid unit: the output is the predicted probability of label 1
model.add(Dense(1, activation='sigmoid'))
# summary() prints the table itself and returns None, so wrapping it in
# print() would only add a stray "None" line
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1238, 100)         1480800   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 1231, 64)          51264     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 615, 64)           0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 39360)             0         
_________________________________________________________________
dense_2 (Dense)              (None, 10)                393610    
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 11        
Total params: 1,925,685
Trainable params: 1,925,685
Non-trainable params: 0
____________________________________________

In [None]:
# binary cross-entropy loss for the single sigmoid output; accuracy is reported as 'acc'
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['acc'])
# train on the training split only; no validation data is passed to fit
model.fit(x_train,y_train,batch_size=32,epochs=20,verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7f79b51285f8>

In [None]:
# evaluate the trained model on the held-out test split; we get 86% accuracy (Great!)
loss, acc = model.evaluate(x_test, y_test)
print('Test Accuracy: %f' % (acc*100))

Test Accuracy: 86.000001


# Train stand-alone word2vec
In this part, we train a stand-alone embedding (word2vec) and then load it into the model.

The word2vec algorithm processes documents sentence by sentence.

This means we should preserve the sentence-based structure during cleaning.

In other words, we need a list of tokens per sentence. In the code below, each review is cleaned into a single token list and passed to word2vec as one "sentence".

In [None]:
from gensim.models import Word2Vec

In [None]:
# func to process the data
def data_word2vec(path):
  """Return one list of cleaned tokens per file in directory `path`.

  Every file is read with load_data and tokenised with clean_data. The
  token lists follow the order returned by listdir.
  """
  return [
    clean_data(load_data(path + '/' + entry))
    for entry in listdir(path)
  ]

In [None]:
# process the data: one token list per review, negatives first, then positives
neg_path = 'txt_sentoken/neg'
pos_path = 'txt_sentoken/pos'
new_data_neg = data_word2vec(neg_path)
new_data_pos = data_word2vec(pos_path)
# NOTE(review): this rebinds `train_data`, which earlier held the filtered review
# strings used to fit the tokenizer; from here on it holds lists of tokens
train_data = new_data_neg + new_data_pos
print(len(train_data))


2000


In [None]:
# train the stand-alone word2vec embedding on the token lists:
# 100-dimensional vectors, context window of 5, and min_count=1 so that
# no word is dropped for being rare
# NOTE(review): `size=` looks like the pre-4.0 gensim keyword - confirm against the installed version
standAlone_model = Word2Vec(train_data,size=100,window=5,min_count=1,workers=8)
print(standAlone_model)

Word2Vec(vocab=46624, size=100, alpha=0.025)


In [None]:
# standAlone_model.wv.vocab

In [None]:
# see the learned vector of the word "example"
# (looked up through .wv; indexing the Word2Vec object directly is deprecated)
standAlone_model.wv['example']

  


array([ 0.4803404 , -0.7001595 ,  0.31508818, -0.03183432,  0.0448408 ,
       -0.22982943, -0.01224964,  0.22862013,  0.502656  , -0.7864699 ,
       -0.12819374, -0.47206038, -0.04850436,  0.24738428,  0.96677756,
       -0.1405719 , -0.9670399 ,  0.11147805,  0.60952795,  1.1552802 ,
        0.5080436 ,  0.6740551 ,  0.14654763, -0.6544406 ,  0.15083729,
        0.2545559 , -0.20509699,  0.91583425,  0.10597903, -0.10086969,
        0.1461346 , -0.17658238, -0.16476314,  0.3305362 ,  0.04102265,
        0.35190314, -0.63464266,  0.5597246 ,  0.20803364, -0.37408945,
       -0.27982354, -1.049409  , -0.3020725 ,  0.65368974, -0.25004718,
        0.3431087 ,  0.55082375, -0.06871083, -0.30621013, -0.83140004,
        0.6259254 , -0.872284  ,  0.33605662, -0.79621726, -0.6564979 ,
       -0.51993096,  0.09395964,  0.95523053,  0.6880927 , -0.7408734 ,
        0.900521  ,  0.00936002,  0.5925486 ,  0.05372783,  0.45151564,
        0.3633122 ,  0.69971997,  0.32359302,  0.70402   , -1.04

In [None]:
# convert all the vocabulary words into vectors using the stand-alone embedding
# (looked up through .wv; indexing the Word2Vec object directly is deprecated)
all_vectors = standAlone_model.wv[standAlone_model.wv.vocab]

  


In [None]:
# save the learned word vectors in the word2vec text format (binary=False)
standAlone_model.wv.save_word2vec_format('word2vec.txt',binary=False)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


# Use Pre-trained Embedding


In [None]:
# load embedding as a dict
def load_embedding(filename):
  """Load a text-format embedding file into a dict of word -> vector.

  The first line of the file is treated as a header and skipped. Every
  other non-blank line is split on whitespace: the first field is the word
  and the remaining fields are its vector components.

  Args:
    filename: path of the embedding text file.

  Returns:
    dict mapping each word (str) to a float32 numpy array.
  """
  embedding = dict()
  # Stream the file line by line; the with-block closes it even on errors.
  with open(filename, 'r') as file:
    next(file, None)  # skip the header line (no error on an empty file)
    for line in file:
      parts = line.split()
      if not parts:
        # a blank line carries no word, so there is nothing to store
        continue
      # key is string word, value is numpy array for vector
      embedding[parts[0]] = np.asarray(parts[1:], dtype='float32')
  return embedding

In [None]:
# load the saved weights of stand-alone embeding layer
raw_embed = load_embedding('word2vec.txt')

In [None]:
# create a weight matrix for the Embedding layer from a loaded embedding
def get_weight_matrix(embedding, vocab):
  """Build the Embedding-layer weight matrix for a word -> index mapping.

  Args:
    embedding: dict mapping words to their vectors.
    vocab: dict mapping words to integer row indices (for example the
      Tokenizer's word_index).

  Returns:
    numpy array with len(vocab) + 1 rows. Row i holds the vector of the word
    whose index is i. Rows for words missing from `embedding`, and any row
    that no word maps to, stay all zeros.
  """
  # one extra row beyond the number of words in the mapping
  vocab_size = len(vocab) + 1
  # take the vector width from the embedding itself; 100 is kept as the
  # fallback for an empty embedding
  vector_dim = len(next(iter(embedding.values()))) if embedding else 100
  weight_matrix = np.zeros((vocab_size, vector_dim))
  # step vocab, store vectors using the given integer mapping
  for word, i in vocab.items():
    vector = embedding.get(word)
    if vector is not None:
      # words without a vector keep their zero row instead of receiving None
      weight_matrix[i] = vector
  return weight_matrix

embedding_vectors = get_weight_matrix(raw_embed, tokeniser.word_index)

In [None]:
# define the embedding layer initialised with the loaded word2vec weights;
# trainable=False marks these weights as non-trainable
embedding_layer = Embedding(vocabs_num, vocabs_dim, weights=[embedding_vectors], input_length=max_length, trainable=False)

In [None]:
# the model we use with the stand-alone embedding layer; the layers after
# the embedding are the same as in the first model
model = Sequential()
model.add(embedding_layer)
model.add(Conv1D(64,8,activation='relu'))
model.add(MaxPool1D(2))
model.add(Flatten())
model.add(Dense(10, activation='relu'))
# single sigmoid unit: the output is the predicted probability of label 1
model.add(Dense(1, activation='sigmoid'))
# summary() prints the table itself and returns None, so wrapping it in
# print() would only add a stray "None" line
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 1238, 100)         1480800   
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 1231, 64)          51264     
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, 615, 64)           0         
_________________________________________________________________
flatten_3 (Flatten)          (None, 39360)             0         
_________________________________________________________________
dense_6 (Dense)              (None, 10)                393610    
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 11        
Total params: 1,925,685
Trainable params: 444,885
Non-trainable params: 1,480,800
______________________________________

In [None]:
# same optimizer, loss and metric as the first model
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['acc'])
# train on the training split only; no validation data is passed to fit
model.fit(x_train,y_train,batch_size=32,epochs=20,verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7f79ac3087b8>

In [None]:
# evaluate the model that uses the loaded word2vec embedding on the test split
loss, acc = model.evaluate(x_test, y_test)
print('Test Accuracy: %f' % (acc*100))

Test Accuracy: 55.666667


# Results
We get worse results with the stand-alone embedding layer (about 56% test accuracy) than when training the embedding layer together with the whole model (about 86%). It might need hyper-parameter tuning, or a better pre-trained embedding model.

# Reference
https://machinelearningmastery.com/develop-word-embedding-model-predicting-movie-review-sentiment/
