In [None]:
!pip install shap

Collecting shap
  Downloading shap-0.39.0.tar.gz (356 kB)
[?25l[K     |█                               | 10 kB 23.6 MB/s eta 0:00:01[K     |█▉                              | 20 kB 28.8 MB/s eta 0:00:01[K     |██▊                             | 30 kB 35.3 MB/s eta 0:00:01[K     |███▊                            | 40 kB 39.8 MB/s eta 0:00:01[K     |████▋                           | 51 kB 28.4 MB/s eta 0:00:01[K     |█████▌                          | 61 kB 31.2 MB/s eta 0:00:01[K     |██████▍                         | 71 kB 26.8 MB/s eta 0:00:01[K     |███████▍                        | 81 kB 26.2 MB/s eta 0:00:01[K     |████████▎                       | 92 kB 28.1 MB/s eta 0:00:01[K     |█████████▏                      | 102 kB 30.3 MB/s eta 0:00:01[K     |██████████▏                     | 112 kB 30.3 MB/s eta 0:00:01[K     |███████████                     | 122 kB 30.3 MB/s eta 0:00:01[K     |████████████                    | 133 kB 30.3 MB/s eta 0:00:01[K     

In [None]:
#Make the necessary imports

from tensorflow.keras.layers import Dense, Input, GlobalMaxPooling1D, Dropout
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.initializers import Constant
from tensorflow.keras.utils import to_categorical
from keras import regularizers
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from zipfile import ZipFile
import pandas as pd
import numpy as np
import nltk
import re
import warnings
import os

warnings.filterwarnings("ignore") 

In [None]:
# Fetch the NLTK resources this notebook relies on:
#   - "stopwords": the English stop-word list used when cleaning text
#     before prediction.
#   - "wordnet" / "punkt": data behind the imported WordNetLemmatizer and
#     word_tokenize / sent_tokenize helpers.
# The value shown under the cell is the return value of the last call.
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("punkt")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Length (in tokens) every review is padded or truncated to before it is
# fed to the network.
MAX_SEQUENCE_LENGTH = 1000
# Vocabulary cap passed to the Tokenizer and used as the Embedding input size.
MAX_NUM_WORDS = 20000 
# NOTE(review): not referenced by the cells visible here — the model's
# Embedding layer hard-codes 128 as its output size. Confirm which is intended.
EMBEDDING_DIM = 100 
# Fraction of the training pool set aside for validation.
VALIDATION_SPLIT = 0.2

In [None]:
# Read the pre-cleaned reviews and discard the "Unnamed: 0" column
# (presumably an index that was written out with the CSV — verify).
data = pd.read_csv("cleaned_data.csv").drop(columns=["Unnamed: 0"])
data.head()

Unnamed: 0,review,sentiment
0,one review mention watch oz episod hook right ...,1
1,wonder littl product film techniqu unassum old...,1
2,thought wonder way spend time hot summer weeke...,1
3,basic famili littl boy jake think zombi closet...,0
4,petter mattei love time money visual stun film...,1


In [None]:
# Quick sanity check: (number of rows, number of columns) of the loaded frame.
data.shape

(50000, 2)

In [None]:
# Separate the target column from the remaining feature column(s).
y = data["sentiment"]
X = data.drop(columns=["sentiment"])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set. The fixed random_state keeps
# the shuffled split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
    shuffle=True,
)

In [None]:
# Flatten each split into plain Python lists: the review texts for the
# tokenizer and the labels for the one-hot encoding applied further down.
# Note that the names are rebound from pandas objects to lists, so this
# cell cannot be re-run without first re-running the split above.
X_train, X_test = X_train["review"].tolist(), X_test["review"].tolist()
y_train, y_test = y_train.tolist(), y_test.tolist()

In [None]:
# Learn the vocabulary from the training texts only; num_words caps how
# many of the most frequent words are used when encoding.
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index

# Encode every review of both splits as a list of integer word ids so
# they can be padded and passed to the Embedding layer.
train_sequences, test_sequences = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

print("Found {} unique tokens...".format(len(word_index)))

Found 62842 unique tokens...


In [None]:
# Bring every encoded review to exactly MAX_SEQUENCE_LENGTH ids so the
# batches have a fixed width. Zeros are prepended to short reviews and
# long reviews lose their leading tokens; these are the Keras defaults,
# spelled out here so the choice is visible.
trainvalid_data = pad_sequences(
    train_sequences,
    maxlen=MAX_SEQUENCE_LENGTH,
    padding="pre",
    truncating="pre",
)
test_data = pad_sequences(
    test_sequences,
    maxlen=MAX_SEQUENCE_LENGTH,
    padding="pre",
    truncating="pre",
)

# One-hot encode the labels to match the two-unit softmax output and the
# categorical cross-entropy loss used by the model.
trainvalid_labels = to_categorical(np.asarray(y_train))
test_labels = to_categorical(np.asarray(y_test))

In [None]:
# Shuffle the training pool reproducibly and carve the last
# VALIDATION_SPLIT fraction of it off as the validation set.
#
# The permutation comes from a seeded generator so the split is the same
# on every Restart & Run All, and it is applied to fresh names instead of
# overwriting trainvalid_data / trainvalid_labels, so re-running this cell
# always yields the same result.
SPLIT_SEED = 1  # same value as the random_state used for the train/test split
indeces = np.random.default_rng(SPLIT_SEED).permutation(trainvalid_data.shape[0])
shuffled_data = trainvalid_data[indeces]
shuffled_labels = trainvalid_labels[indeces]

num_validation_steps = int(VALIDATION_SPLIT * shuffled_data.shape[0])
# Slice from an explicit boundary rather than with negative indices:
# when the validation count rounds down to 0, `[:-0]` would leave the
# training set empty and `[-0:]` would put every row into validation.
split_at = shuffled_data.shape[0] - num_validation_steps
X_train, X_val = shuffled_data[:split_at], shuffled_data[split_at:]
y_train, y_val = shuffled_labels[:split_at], shuffled_labels[split_at:]

print('Splitting the train data into train and valid is done')

Splitting the train data into train and valid is done


In [None]:
# 1D-CNN sentiment classifier:
# embedding -> convolution -> pooling -> L2-regularised dense -> 2-way softmax.
cnnmodel = Sequential([
    Embedding(MAX_NUM_WORDS, 128),
    Conv1D(64, 5, activation='relu'),
    MaxPooling1D(5),
    GlobalMaxPooling1D(),
    Dense(64, activation='relu', kernel_regularizer=regularizers.l2(0.1)),
    Dropout(0.5),
    Dense(2, activation='softmax'),
])

cnnmodel.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=["acc"],
)

# Fit on the training split; the validation split is evaluated after
# every epoch and is what the hyper-parameters are tuned against.
cnnmodel.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=3,
    batch_size=16,
)

# Final check on the held-out test set. evaluate() returns the loss
# first (bound to `score`), followed by the compiled accuracy metric.
score, acc = cnnmodel.evaluate(test_data, test_labels)
print('Test accuracy with CNN:', acc)

Epoch 1/3
Epoch 2/3
Epoch 3/3
Test accuracy with CNN: 0.8550999760627747


In [None]:
# Persist the trained CNN to disk as a single .h5 file.
# NOTE(review): assumes the "model/" directory exists (or is created by the
# installed Keras version) and is writable — confirm before a fresh run.
cnnmodel.save("model/cnn-model.h5")

In [None]:
# Run one hand-written review through the cleaning steps below and the
# trained CNN, then print the two softmax outputs.
# NOTE(review): presumably these steps mirror how cleaned_data.csv was
# produced — confirm against the cleaning code.
text = "it is a bad movie overall."
ps = PorterStemmer()
CLEAN = re.compile("<.*?>")   # to remove everything between "<>"
# Load the stop-word list once into a set so it is not re-read from the
# corpus (and linearly searched) for every single word.
stop_words = set(stopwords.words("english"))

result = CLEAN.sub(" ", text)
result = re.sub("[^a-zA-Z]", " ", result)   # keep letters only
result = result.lower()
result = result.split()   # to break sentences into words
# The loop variable is deliberately not called `word`, so it no longer
# shadows the list being built.
word = [ps.stem(token) for token in result if token not in stop_words]
result = " ".join(word)

# Encode and pad exactly like the training data.
tokens = tokenizer.texts_to_sequences([result])
sent = pad_sequences(tokens, maxlen=MAX_SEQUENCE_LENGTH)
# pad_sequences already returns a NumPy array, so it is passed as-is.
pred = cnnmodel.predict(sent)
# One row per input text; the columns follow the to_categorical label
# order, i.e. index 0 is sentiment 0 and index 1 is sentiment 1.
print(pred)

[[0.8123285  0.18767147]]
