Imports

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import io
import re
import regex
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from keras.models import Sequential
from keras.layers import Dense, Embedding, Bidirectional, LSTM, GlobalMaxPool1D, Dropout, SpatialDropout1D
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize

read the csv file

In [None]:
data = pd.read_csv('dataset/imdb.csv', engine='python')
print(data.head)

In [None]:
data = data.sample(frac=1., random_state=14).reset_index(drop=True)
data.head()

In [None]:
plt.hist(data[data.sentiment == 1].sentiment,
         bins=2, color='green', label='Positive')
plt.hist(data[data.sentiment == 0].sentiment,
         bins=2, color='blue', label='Negative')
plt.title('Classes distribution in the train data', fontsize=12)
plt.xticks([])
plt.xlim(-0.5, 2)
plt.legend()
plt.show()

In [43]:
def clean_text(text):
    # Remove HTML tags
    text = regex.sub(r"<[^<]+?>", "", text)

    # Remove Special chars
    text = regex.sub(r'[^a-zA-Z0-9\s]', "", text)

    # Convet to LowerCase
    text = text.lower()

    return text

data["review"] = data["review"].apply(clean_text)
data.head()

Unnamed: 0,review,sentiment
0,hammer house of horror witching time is set in...,1
1,no matter what country your in you have to buy...,1
2,i thought this was a really cute movie inspir...,1
3,woosh man what can i saythe openingscene maybe...,0
4,this movie is nothing more than christian prop...,0


In [44]:
X = data['review']
y = data['sentiment']
X_main, X_test, y_main, y_test = train_test_split(X, y, test_size=0.1, random_state=42, stratify=y, shuffle=True)

X_train, X_val, y_train, y_val = train_test_split(X_main, y_main, test_size=0.11111111, random_state=42, stratify=y_main,
                                                  shuffle=True)

print("X_train shape: {}".format(X_train.shape))
print("y_train shape: {}".format(y_train.shape))
print("X_val shape: {}".format(X_val.shape))
print("y_val shape: {}".format(y_val.shape))
print("X_test: {}".format(X_test.shape))
print("y_test: {}".format(y_test.shape))

X_train shape: (40000,)
y_train shape: (40000,)
X_val shape: (5000,)
y_val shape: (5000,)
X_test: (5000,)
y_test: (5000,)


In [45]:
max_length = 200
vocab_size = 140631
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<unk>")
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index
print(len(tokenizer.word_index))

192660


In [46]:
train_seqs = tokenizer.texts_to_sequences(X_train)
val_seqs = tokenizer.texts_to_sequences(X_val)
test_seqs = tokenizer.texts_to_sequences(X_test)

train_seqs = pad_sequences(train_seqs, padding='post', maxlen=max_length, truncating='post')
val_seqs = pad_sequences(val_seqs, padding='post', maxlen=max_length, truncating='post')
test_seqs = pad_sequences(test_seqs, padding='post', maxlen=max_length, truncating='post')


In [47]:
model = Sequential([
    Embedding(vocab_size, 128, name="embedding"),
    Bidirectional(LSTM(64, return_sequences=True, dropout=0.2)),
    GlobalMaxPool1D(),
    Dense(32, activation="relu"),
    Dropout(0.05),
    Dense(1, activation="sigmoid")
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 128)         18000768  
                                                                 
 bidirectional_3 (Bidirectio  (None, None, 128)        98816     
 nal)                                                            
                                                                 
 global_max_pooling1d_3 (Glo  (None, 128)              0         
 balMaxPooling1D)                                                
                                                                 
 dense_7 (Dense)             (None, 32)                4128      
                                                                 
 dropout_3 (Dropout)         (None, 32)                0         
                                                                 
 dense_8 (Dense)             (None, 1)                

In [48]:
history = model.fit(train_seqs, y_train, epochs=2, validation_data=(val_seqs, y_val), verbose=1)

Epoch 1/2
Epoch 2/2


In [49]:
# Testing the model
predict_p = model.predict(test_seqs)
predict_p = predict_p.flatten()
print(predict_p.round(2))

# Result
pred = np.where(predict_p > 0.5, 1, 0)
print(pred)

classi = classification_report(y_test, pred)
confu = confusion_matrix(y_test, pred)
accu = accuracy_score(y_test, pred)

# Display the outcome of classification
print('Classification Report: \n', classi)
print('Confusion Matrix: \n', confu)
print('Accuracy Score: \n', accu)

[0.98 0.   0.01 ... 0.93 0.   0.13]
[1 0 0 ... 1 0 0]
Classification Report: 
               precision    recall  f1-score   support

           0       0.89      0.88      0.89      2500
           1       0.89      0.89      0.89      2500

    accuracy                           0.89      5000
   macro avg       0.89      0.89      0.89      5000
weighted avg       0.89      0.89      0.89      5000

Confusion Matrix: 
 [[2212  288]
 [ 271 2229]]
Accuracy Score: 
 0.8882


In [50]:
model.save('88.h5')
model.save('model/')



INFO:tensorflow:Assets written to: model/assets


INFO:tensorflow:Assets written to: model/assets


In [51]:
weights = model.get_layer('embedding').get_weights()[0]
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])

out_v = io.open('vectors.tsv', 'w', encoding='utf-8')
out_m = io.open('metadata.tsv', 'w', encoding='utf-8')

for word_num in range(1, vocab_size - 1):
    word = reverse_word_index[word_num]
    embeddings = weights[word_num]
    out_m.write(word + "\n")
    out_v.write('\t'.join([str(x) for x in embeddings]) + "\n")
out_v.close()
out_m.close()