# Importing The Libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing The Dataset

In [2]:
dataset = pd.read_csv("IMDB Dataset.csv")

In [3]:
print(dataset.shape)

(50000, 2)


# Data Preprocessing

In [4]:
def preprocess_text(sen):
    # Removing html tags
    sentence = remove_tags(sen)

    # Remove punctuations and numbers
    sentence = re.sub('[^a-zA-Z]', ' ', sentence)

    # Single character removal
    sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)

    # Removing multiple spaces
    sentence = re.sub(r'\s+', ' ', sentence)

    return sentence

In [5]:
import re
TAG_RE = re.compile(r'<[^>]+>')

def remove_tags(text):
    return TAG_RE.sub('', text)

In [6]:
X = []
sentences = list(dataset['review'])
for sen in sentences:
    X.append(preprocess_text(sen))

In [7]:
y = dataset['sentiment']

y = np.array(list(map(lambda x: 1 if x=="positive" else 0, y)))

# Splitting The Dataset

In [8]:
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test = train_test_split(X,y,test_size=0.2,random_state=0)

# Preparing the Embedding Layer


In [9]:
from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(num_words=20000)
tokenizer.fit_on_texts(X_train)

X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

In [10]:
# Adding 1 because of reserved 0 index
from keras.preprocessing.sequence import pad_sequences
vocab_size = len(tokenizer.word_index) + 1

maxlen = 256

X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

In [11]:
print('Found %s unique tokens.' % vocab_size)

Found 92285 unique tokens.


In [12]:
# importing the dictionary of words 'glove.6B.200d'
from numpy import asarray
embeddings_dictionary = dict()
glove_file = open('./glove.6B.200d.txt', encoding="utf8")

for line in glove_file:
    records = line.split(' ')
    word = records[0]
    vector_dimensions = asarray(records[1:], dtype='float32')
    embeddings_dictionary [word] = vector_dimensions
glove_file.close()

In [13]:
from numpy import zeros
embedding_matrix = zeros((vocab_size, 200))
for word, index in tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

# Building The Model

In [14]:
from keras.models import Sequential
from keras.layers.embeddings import Embedding
from keras.layers import GlobalMaxPooling1D
from keras.layers import Conv1D
from keras.layers.core import Activation, Dropout, Dense

model = Sequential()

embedding_layer = Embedding(vocab_size, 200, weights=[embedding_matrix], input_length=maxlen , trainable=False)
model.add(embedding_layer)

model.add(Conv1D(256, 5, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

In [15]:
print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 256, 200)          18457000  
_________________________________________________________________
conv1d (Conv1D)              (None, 252, 256)          256256    
_________________________________________________________________
global_max_pooling1d (Global (None, 256)               0         
_________________________________________________________________
dense (Dense)                (None, 1)                 257       
Total params: 18,713,513
Trainable params: 256,513
Non-trainable params: 18,457,000
_________________________________________________________________
None


In [16]:
history = model.fit(X_train, Y_train, batch_size=128, epochs=11, verbose=1)

Epoch 1/11
Epoch 2/11
Epoch 3/11
Epoch 4/11
Epoch 5/11
Epoch 6/11
Epoch 7/11
Epoch 8/11
Epoch 9/11
Epoch 10/11
Epoch 11/11


# Confusion Matrix

In [17]:
from sklearn.metrics import confusion_matrix,classification_report
y_pred = model.predict(X_test)
y_pred = (y_pred > 0.5)
cm =confusion_matrix(y_pred, Y_test)
cm
print(classification_report(y_pred,Y_test))

              precision    recall  f1-score   support

       False       0.90      0.90      0.90      5050
        True       0.89      0.90      0.90      4950

    accuracy                           0.90     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.90      0.90      0.90     10000



In [18]:
results = model.evaluate(X_test,Y_test,batch_size=128)



In [19]:
from sklearn.metrics import precision_score , recall_score, f1_score
print(f1_score(Y_test,y_pred))
print('f1------')
print(recall_score(Y_test,y_pred))
print('RECAL------')
print(precision_score(Y_test,y_pred))
print('PRECISION------')

0.8962178517397882
f1------
0.8948640483383686
RECAL------
0.8975757575757576
PRECISION------
