In [1]:
from os import listdir
import numpy as np
import spacy
nlp = spacy.load("en_core_web_sm")
import re
import random
import pandas as pd

In [2]:
import keras
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences

from keras.models import Model
from keras import layers
from keras.layers import Embedding , Activation, Dropout,  GlobalMaxPooling1D  
from keras.layers import Conv1D , Input , Flatten, Dense 

## Stanford Large Movie Review dataset contains two files.
* One for the training set and the other for the test set.
* The training file contains two files one for the positive reviews which contains 12500 reviews and the other for the negative reviews which contains 12500 reviews and the same for test file.
* Each review was built as a text file, so I created the function "readData" which reads text files and returns a list.

In [28]:
def readData(directory):
    lst = []
    # walk through all files in the folder
    for filename in listdir(directory):
        # create the full path of the file to open
        path = directory + '/' + filename
        # open the file as read only
        file = open(path,'r',encoding="utf8")
        # read all text
        text = file.read()
        lst.append(text)
        # close the file
        file.close()
    return lst

In [29]:
directoryOfTrainPos = r'E:\2020-21-2\NLP\FourthProject\aclImdb\train\pos'
directoryOfTrainNeg = r'E:\2020-21-2\NLP\FourthProject\aclImdb\train\neg'
directoryOfTestPos = r'E:\2020-21-2\NLP\FourthProject\aclImdb\test\pos'
directoryOfTestNeg = r'E:\2020-21-2\NLP\FourthProject\aclImdb\test\neg'

TrainPos = readData(directoryOfTrainPos)
TrainNeg = readData(directoryOfTrainNeg)
TestPos = readData(directoryOfTestPos)
TestNeg = readData(directoryOfTestNeg)

In [30]:
len(TrainPos) , len(TrainNeg) , len(TestPos) , len(TestNeg)

(12500, 12500, 12500, 12500)

## After reading the data, I merged the positive and negative training data and the same for test data.

In [31]:
trainData = TrainPos + TrainNeg
testData = TestPos + TestNeg

## Here I created the output of training and test data where the number one means positive review and zero means negative review.

In [32]:
Y_train = [1]*len(TrainPos) + [0]*len(TrainNeg) 
Y_test =  [1]*len(TestPos) + [0]*len(TestNeg)

## I created the function "PreprocessingData" to processing the data, it substitutes any strange string with ',' then I used Spacy to tokenize, lemmatize and lower the words of text.

In [33]:
all_stopwords = nlp.Defaults.stop_words

In [34]:
## Dataset Preprocessing
def PreprocessingData(text):
    review = re.sub('[^a-zA-Z]', ', ' , text)
    doc = nlp(review)
    review = [word.lemma_ for word in doc]
    review = [word.lower() for word in review if not (word in all_stopwords) and len(word)>2]
    review = ' '.join(review)
    return review

In [35]:
trainData = [PreprocessingData(text) for text in trainData]
testData = [PreprocessingData(text) for text in testData]

## I shuffled the data

In [36]:
train = list(zip(trainData, Y_train))
test = list(zip(testData, Y_test))

random.shuffle(train)
random.shuffle(test)


trainData, Y_train = zip(*train)
testData, Y_test = zip(*test)

trainData = list(trainData)
testData = list(testData)
Y_train = list(Y_train)
Y_test = list(Y_test)

## I made the size of training data as 49000 and 1000 as test data.

In [37]:
trainData = trainData + testData[:24000]
Y_train = Y_train + Y_test[:24000]

testData = testData[24000:]
Y_test = Y_test[24000:]

## I saved the preprocessing training data as a CSV file and the same for test data.

In [38]:

data_Train_Of_Review = {'trainData': trainData , 'Y_train': Y_train }
df_train = pd.DataFrame(data_Train_Of_Review, columns = ['trainData', 'Y_train'])
df_train.to_csv('data_Train_Of_Review.csv', index=False)

data_Test_Of_Review = {'testData':testData , 'Y_test': Y_test }
df_test = pd.DataFrame(data_Test_Of_Review, columns = ['testData', 'Y_test'])
df_test.to_csv('data_Test_Of_Review.csv', index=False)

In [39]:
df_train = pd.read_csv("data_Train_Of_Review.csv")
df_test = pd.read_csv("data_Test_Of_Review.csv")

In [40]:
print(df_train)

                                               trainData  Y_train
0      jim henson muppet favorite childhood film feel...        1
1      fun theater guilty pleasure corner movie taste...        1
2      good way baseketball waste film single way off...        0
3      amazing story film direct larry clark story sc...        1
4      thomas ian griffith doesn polish big buck acto...        0
...                                                  ...      ...
48995  great british director christopher nolan momen...        1
48996  new york attorney plot rid senile mother meeti...        0
48997  film crap probably bad film advice don watch f...        0
48998  ken russell direct weird erotic thriller hadn ...        0
48999  wrap people enjoy film watch early teen time l...        1

[49000 rows x 2 columns]


In [41]:
texts_train = [sentence for sentence  in df_train.loc[:, 'trainData']]
texts_test = [sentence for sentence  in df_test.loc[:, 'testData']]

In [42]:
Y_train = [i for i  in df_train.loc[:, 'Y_train']]
Y_test =  [i for i  in df_test.loc[:, 'Y_test']]

## Here I made the classes as follow: 
* [ 1 , 0 ] for positive review. 
* [ 0 , 1 ] for negative review.

In [43]:
Y_train = np.array([np.array([1.,0.]) if i==1 else np.array([0.,1.]) for i in Y_train])
Y_test = np.array([np.array([1.,0.]) if i==1 else np.array([0.,1.]) for i in Y_test])

## The input of the first model will be based on one hot representation.

In [44]:
max_features=10000

X_train_oneHot = [one_hot(sentence,max_features) for sentence in texts_train]
X_test_oneHot  = [one_hot(sentence,max_features) for sentence in texts_test]

## I decided to set 256 as the maximum length sentence and 50 as the embedding dimension.

In [45]:
max_sent_length=256
embedding_dim = 50

## pad_sequences function is used to ensure that all sequences in a list have the same length by padding 0 in the beginning of each sequence until each sequence has the same length as the longest sequence.

In [46]:
X_train_pad = pad_sequences(X_train_oneHot , padding='pre', maxlen=max_sent_length)
X_test_pad = pad_sequences(X_test_oneHot , padding='pre', maxlen=max_sent_length)

In [47]:
X_train_pad[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   

## First Model:
* Embedding layer
* Two Pooling layers with GlobalMaxPool1D
* Dense

In [48]:
# A integer input for vocab indices.
inputs = keras.Input(shape=(None,), dtype="int64")

x = layers.Embedding(max_features, embedding_dim)(inputs)
x = layers.Dropout(0.5)(x)

# Conv1D + global max pooling
x = layers.Conv1D(128, 7, padding="valid", activation="relu", strides=3)(x)
x = layers.Conv1D(128, 7, padding="valid", activation="relu", strides=3)(x)
x = layers.GlobalMaxPooling1D()(x)

# I added a vanilla hidden layer:
x = layers.Dense(128, activation="relu")(x)
x = layers.Dropout(0.5)(x)

predictions = layers.Dense(2, activation="sigmoid", name="predictions")(x)

model_1 = keras.Model(inputs, predictions)

# Compile the model with binary crossentropy loss and an adam optimizer.
model_1.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [49]:
# Fit the model using the train and test datasets.
model_1.fit(X_train_pad, Y_train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x29ba05bfb50>

In [50]:
print(model_1.summary())

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, None)]            0         
_________________________________________________________________
embedding (Embedding)        (None, None, 50)          500000    
_________________________________________________________________
dropout_1 (Dropout)          (None, None, 50)          0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, None, 128)         44928     
_________________________________________________________________
conv1d_3 (Conv1D)            (None, None, 128)         114816    
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 128)               1651

In [51]:
result_1 = model_1.predict(X_test_pad)

In [54]:
result_1[1:5] , Y_test[1:5]

(array([[9.9998105e-01, 1.9062576e-05],
        [4.5256755e-05, 9.9996829e-01],
        [9.9959350e-01, 4.4307113e-04],
        [2.8655586e-05, 9.9998349e-01]], dtype=float32),
 array([[1., 0.],
        [0., 1.],
        [1., 0.],
        [0., 1.]]))

In [55]:
model_1.evaluate(X_test_pad, Y_test)



[0.4821602702140808, 0.8500000238418579]

## The input of the second model will be based on BPEmbed pre-trained model.

In [56]:
from bpemb import BPEmb
bpemb_en = BPEmb(lang='en' , dim = embedding_dim)

In [57]:
TrainData_bpemb = [bpemb_en.embed(text) for text in texts_train]
TestData_bpemb = [bpemb_en.embed(text) for text in texts_test]

## The function "sameDim" is used to add zeros arrays if the list has less than max_sent_length arrays.

In [58]:
def sameDim(matrix):
    result = matrix[:max_sent_length]
    l = len(result)
    if l<max_sent_length:
        zeroArr = np.zeros((max_sent_length-l,50)).astype(np.float32)
        result = np.concatenate((result, zeroArr), axis=0) 
    return result

In [59]:
TrainData_bpemb = np.array([sameDim(matrix) for matrix in TrainData_bpemb])
TestData_bpemb =  np.array([sameDim(matrix) for matrix in TestData_bpemb])

In [60]:
TrainData_bpemb[0]

array([[ 1.075245,  0.166705,  0.11528 , ...,  0.931   ,  0.79428 ,
        -0.435135],
       [ 0.835609,  0.218231,  0.148973, ...,  0.308579,  0.521739,
        -0.584431],
       [ 0.966539, -0.155275, -0.047727, ...,  0.452914,  0.28516 ,
        -0.485385],
       ...,
       [ 0.      ,  0.      ,  0.      , ...,  0.      ,  0.      ,
         0.      ],
       [ 0.      ,  0.      ,  0.      , ...,  0.      ,  0.      ,
         0.      ],
       [ 0.      ,  0.      ,  0.      , ...,  0.      ,  0.      ,
         0.      ]], dtype=float32)

## The second model:
* Two Pooling layers with GlobalMaxPool1D
* Dense

In [61]:
# A integer input for vocab indices.
inputs = keras.Input(shape=(None,embedding_dim))

# Conv1D + global max pooling
x = layers.Conv1D(128, 7, padding="valid", activation="relu", strides=3)(inputs)
x = layers.Conv1D(128, 7, padding="valid", activation="relu", strides=3)(x)
x = layers.GlobalMaxPooling1D()(x)

# I add a vanilla hidden layer:
x = layers.Dense(128, activation="relu")(x)
x = layers.Dropout(0.5)(x)

predictions = layers.Dense(2, activation="sigmoid", name="predictions")(x)

model_2 = keras.Model(inputs, predictions)

# Compile the model with binary crossentropy loss and an adam optimizer.
model_2.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [62]:
# Fit the model using the train and test datasets.
model_2.fit(TrainData_bpemb, Y_train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x29bb54b5a90>

In [63]:
print(model_2.summary())

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         [(None, None, 50)]        0         
_________________________________________________________________
conv1d_4 (Conv1D)            (None, None, 128)         44928     
_________________________________________________________________
conv1d_5 (Conv1D)            (None, None, 128)         114816    
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 128)               16512     
_________________________________________________________________
dropout_3 (Dropout)          (None, 128)               0         
_________________________________________________________________
predictions (Dense)          (None, 2)                 258 

In [64]:
result_2 = model_2.predict(TestData_bpemb)

In [65]:
result_2[1:5] , Y_test[1:5]

(array([[5.2296519e-03, 9.9487925e-01],
        [1.4853650e-01, 8.3828413e-01],
        [1.8583834e-03, 9.9821222e-01],
        [3.0855579e-10, 1.0000000e+00]], dtype=float32),
 array([[1., 0.],
        [0., 1.],
        [1., 0.],
        [0., 1.]]))

In [66]:
model_2.evaluate(TestData_bpemb, Y_test)



[1.0082924365997314, 0.7649999856948853]

## Here I created a list that contains the unique words from training and test data to use it later.

In [None]:
dic_review_data = []

for lst in texts_train:
    doc = nlp(lst)
    lst_split = [word.text for word in doc]
    for word in lst_split:
        if not word in dic_review_data:
            dic_review_data.append(word)

for lst in texts_test:
    doc = nlp(lst)
    lst_split = [word.text for word in doc]
    for word in lst_split:
        if not word in dic_review_data:
            dic_review_data.append(word)

## The input of the third model will be based on pre-trained fastText embeddings.
* I downloaded the model "wiki-news-300d-1M.vec" from this page https://fasttext.cc/docs/en/english-vectors.html

In [None]:
import io

In [None]:
model_fasttext = io.open('wiki-news-300d-1M.vec', 'r', encoding='utf-8', newline='\n', errors='ignore')

In [None]:
dataOf_fastText = model_fasttext.readlines()

## Here I created a dictionary from the pre-trained model, this dictionary contains the same words which exist in our data (dic_review_data) as indexes where the values are vectors.

In [None]:
dic_fastText = {}

for item in dataOf_fastText:
    doc = nlp(item)
    item_split = [word.text for word in doc]
    if item_split[0] in dic_review_data:
        vector = [float(num) for num in item_split[1:embedding_dim+1]]
        dic_fastText[item_split[0]] = vector

## The size of fastText model is very big so I saved the last dictionary as CSV file to avoid loading the model each time.

In [None]:
words = []
vectors = []
for word,vec in dic_fastText.items():
    words.append(word)
    vectors.append(vec)

In [59]:
data_fastText = {'words': words , 'vectors': vectors }
df_data_fastText = pd.DataFrame(data_fastText, columns = ['words', 'vectors'])
df_data_fastText.to_csv('df_data_fastText.csv', index=False)

In [67]:
df_data_fastText = pd.read_csv('df_data_fastText.csv')

In [68]:
df_data_fastText.head()

Unnamed: 0,words,vectors
0,the,"[0.0897, 0.016, -0.0571, 0.0405, -0.0696, -0.1..."
1,that,"[0.0806, -0.0063, 0.0875, 0.0152, -0.068, -0.0..."
2,with,"[0.0177, -0.0273, -0.0135, 0.0351, -0.0135, -0..."
3,was,"[0.0986, 0.0069, -0.0897, -0.0036, 0.0114, -0...."
4,this,"[-0.038, -0.0383, -0.0304, -0.0533, -0.0059, -..."


In [69]:
words = [word for word  in df_data_fastText.loc[:, 'words']]
vectors = [vector for vector  in df_data_fastText.loc[:, 'vectors']]

In [70]:
dic_fastText ={}
for i in range(len(words)):
    vec = vectors[i].split(',')
    firstNum = [float(vec[0][1:])]
    lastNum = [float(vec[-1][:-1])]
    vec_numbers = firstNum + [float(i) for i in vec[1:-1]] + lastNum
    dic_fastText[words[i]] = vec_numbers

In [71]:
dic_fastText['the']

[0.0897,
 0.016,
 -0.0571,
 0.0405,
 -0.0696,
 -0.1237,
 0.0301,
 0.0248,
 -0.0303,
 0.0174,
 0.0063,
 0.0184,
 0.0217,
 -0.0257,
 0.035,
 -0.0242,
 0.0029,
 0.0188,
 -0.057,
 0.0252,
 -0.021,
 -0.0008,
 0.036,
 -0.0729,
 -0.0665,
 0.0989,
 0.0676,
 0.0852,
 -0.0089,
 0.0313,
 -0.0069,
 -0.0032,
 -0.0462,
 0.0497,
 0.0261,
 0.0268,
 -0.031,
 -0.1361,
 -0.0062,
 0.0375,
 -0.032,
 -0.0106,
 0.0534,
 -0.0187,
 0.0638,
 0.0094,
 0.0047,
 -0.053,
 0.0093,
 -0.0087]

## The function "getVectors":
* Input: a sentence as string.
* Output: for each word, it gets the vector from the pre-trained fastText model (from the dictionary dic_fastText) and returns a list of vectors.

In [72]:
def getVectors(sentence):
    doc = nlp(sentence)
    sentence_split = [word.text for word in doc]
    vectors = []
    for word in sentence_split:
        if word in dic_fastText:
            vectors.append(np.array(dic_fastText[word]))
        else:
            vectors.append(np.array([0.]*embedding_dim))
    return np.array(vectors).astype(np.float32)

In [73]:
TrainData_fastText = [getVectors(sentence) for sentence in texts_train]
TestData_fastText =  [getVectors(sentence) for sentence in texts_test]

In [74]:
TrainData_fastText = np.array([sameDim(matrix) for matrix in TrainData_fastText]).astype(np.float32)
TestData_fastText =  np.array([sameDim(matrix) for matrix in TestData_fastText]).astype(np.float32)

In [75]:
TrainData_fastText[0]

array([[-0.0054,  0.1111, -0.1341, ..., -0.0126, -0.0774, -0.1184],
       [-0.0114,  0.1345, -0.024 , ..., -0.0205, -0.0457, -0.0546],
       [-0.0183,  0.256 , -0.1479, ..., -0.0689,  0.0284, -0.1965],
       ...,
       [ 0.    ,  0.    ,  0.    , ...,  0.    ,  0.    ,  0.    ],
       [ 0.    ,  0.    ,  0.    , ...,  0.    ,  0.    ,  0.    ],
       [ 0.    ,  0.    ,  0.    , ...,  0.    ,  0.    ,  0.    ]],
      dtype=float32)

## The third model:
* Two Pooling layers with GlobalMaxPool1D
* Dense

In [76]:
# A integer input for vocab indices.
inputs = keras.Input(shape=(None,embedding_dim))

# Conv1D + global max pooling
x = layers.Conv1D(128, 7, padding="valid", activation="relu", strides=3)(inputs)
x = layers.Conv1D(128, 7, padding="valid", activation="relu", strides=3)(x)
x = layers.GlobalMaxPooling1D()(x)

# I added a vanilla hidden layer:
x = layers.Dense(128, activation="relu")(x)
x = layers.Dropout(0.5)(x)

predictions = layers.Dense(2, activation="sigmoid", name="predictions")(x)

model_3 = keras.Model(inputs, predictions)

# Compile the model with binary crossentropy loss and an adam optimizer.
model_3.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [77]:
# Fit the model using the train and test datasets.
model_3.fit(TrainData_fastText, Y_train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x29b8f3258e0>

In [78]:
print(model_3.summary())

Model: "model_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         [(None, None, 50)]        0         
_________________________________________________________________
conv1d_6 (Conv1D)            (None, None, 128)         44928     
_________________________________________________________________
conv1d_7 (Conv1D)            (None, None, 128)         114816    
_________________________________________________________________
global_max_pooling1d_3 (Glob (None, 128)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 128)               16512     
_________________________________________________________________
dropout_4 (Dropout)          (None, 128)               0         
_________________________________________________________________
predictions (Dense)          (None, 2)                 258 

In [79]:
result_3 = model_3.predict(TestData_fastText)

In [80]:
result_3[1:5] , Y_test[1:5]

(array([[1.0000000e+00, 8.4444531e-09],
        [4.5901537e-04, 9.9954885e-01],
        [1.6003827e-07, 1.0000000e+00],
        [2.5254488e-04, 9.9975002e-01]], dtype=float32),
 array([[1., 0.],
        [0., 1.],
        [1., 0.],
        [0., 1.]]))

In [81]:
model_3.evaluate(TestData_fastText, Y_test)



[0.9825292229652405, 0.8019999861717224]

## Comparison:
I got the following results:
* First model:-----------------------------accuracy 0.9743 , evaluation of test data [0.4821, 0.8500]
* Second model(BPEmbed):---------accuracy 0.9794 , evaluation of test data [1.0082, 0.7649]
* Third model(fastText):----------------accuracy 0.9851 , evaluation of test data [0.9825, 0.8019]