In [1]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
import keras
from keras import Sequential
from keras import layers
import os
import sklearn as sk
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.ensemble import RandomForestClassifier
# Hide every GPU from TensorFlow so that all later ops run on the CPU.
# Keep this in the first cell: the visible-device list has to be set before
# TensorFlow initialises its devices.
# Reason: later cells compute layer activations for the whole dataset in a
# single call (the embedding output alone is n_reviews x 1000 x 50 floats),
# which presumably does not fit in GPU memory -- TODO confirm on the target machine.
tf.config.set_visible_devices([], 'GPU')


Importing the data and the trained CNN

In [2]:
# Trained CNN whose intermediate layers are inspected in the cells below.
CNN = keras.models.load_model("../CNN_Non_Dense")

# IMDB reviews together with the predictions the CNN made for them.
raw_df = pd.read_csv("../IMDB_with_predictions.csv")

# Model input: the raw review text.
reviews = raw_df["review"]
# Target for every surrogate model: the CNN's own prediction, not the true label.
sentiment = raw_df["CNN_Predictions"]

Tokenizing the inputs so that we can vectorise them in the embedding layer

In [3]:
# Layer reused by the later cells to collapse activations to one row per review.
flatten_layer = keras.layers.Flatten()

# Build the word index from the review text, capped at num_words=37500.
# (37500 x 50 = 1,875,000, the parameter count of the CNN's embedding layer.)
# NOTE(review): assumes the CNN was trained with a tokenizer fitted the same
# way on the same text -- confirm, otherwise the word ids will not line up.
tokenizer = Tokenizer(num_words=37500)
tokenizer.fit_on_texts(reviews)

# Integer-encode every review, then force each sequence to exactly 1000 ids
# (the CNN's input length) by appending zeros to the short ones.
tokenized_reviews = tokenizer.texts_to_sequences(reviews)
padded_reviews = tf.keras.utils.pad_sequences(
    tokenized_reviews,
    maxlen=1000,
    padding="post",
)
print(padded_reviews)

[[  27    4    1 ...    0    0    0]
 [   3  393  120 ...    0    0    0]
 [  10  190   11 ...    0    0    0]
 ...
 [  10  235    3 ...    0    0    0]
 [ 145  166    5 ...    0    0    0]
 [  54   27 5892 ...    0    0    0]]


Setting up the models

In [7]:
# Show the architecture of the loaded CNN. summary() prints the table itself
# and returns None, so it is called directly; wrapping it in print() would
# append a stray "None" line to the output.
CNN.summary()

# Sub-model mapping the CNN's input to the output of its "embedding" layer,
# reusing the trained embedding weights.
Embedding_layer = keras.Model(inputs=CNN.inputs, outputs=CNN.get_layer(name="embedding").output)

# Embed every review. The full array is passed (no hard-coded row count), as in
# the other feature cells, so there is always exactly one feature row per entry
# of `sentiment` regardless of how many rows the CSV holds.
embedding_raw = Embedding_layer(padded_reviews)

# Collapse (n_reviews, 1000, 50) to (n_reviews, 50000): the forest needs 2-D input.
embedding_final = np.array(flatten_layer(embedding_raw))

# 70/30 train/test split; the seed is the same one used by the other layers'
# splits so all surrogate models see the same partition of reviews.
train_embedding_x, test_embedding_x, train_embedding_y, test_embedding_y = train_test_split(
    embedding_final,
    sentiment,
    test_size=0.3,
    shuffle=True,
    random_state=1000,
)



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 1000, 50)          1875000   
                                                                 
 conv1d (Conv1D)             (None, 998, 32)           4832      
                                                                 
 global_max_pooling1d (Globa  (None, 32)               0         
 lMaxPooling1D)                                                  
                                                                 
 dense (Dense)               (None, 10)                330       
                                                                 
 dense_1 (Dense)             (None, 1)                 11        
                                                                 
Total params: 1,880,173
Trainable params: 1,880,173
Non-trainable params: 0
______________________________________________

Training the embedding tree

In [8]:
# Surrogate model: a random forest trained to reproduce the target labels from
# the flattened embedding-layer activations.
# random_state is pinned so the fitted forest, and therefore the accuracies
# printed below, are reproducible between runs (the split is already seeded).
# Accuracies saved from earlier unseeded runs may differ slightly.
embedding_tree = RandomForestClassifier(criterion="entropy", random_state=1000)
embedding_tree.fit(train_embedding_x, train_embedding_y)

# Predictions on the data the forest was fitted on and on the held-out 30%.
training_embedding_prediction = embedding_tree.predict(train_embedding_x)
test_embedding_prediction = embedding_tree.predict(test_embedding_x)

# Fraction of correctly classified samples (accuracy_score's default behaviour).
training_embedding_prediction_accuracy = sk.metrics.accuracy_score(train_embedding_y, training_embedding_prediction)
test_embedding_prediction_accuracy = sk.metrics.accuracy_score(test_embedding_y, test_embedding_prediction)

# A large gap between the two numbers means the forest memorised the training set.
print("Embedding layer\nTraining accuracy: {} vs Testing Accuracy {}".format(training_embedding_prediction_accuracy, test_embedding_prediction_accuracy))


Embedding layer
Training accuracy: 1.0 vs Testing Accuracy 0.5574666666666667


Building convolutional model and getting output

In [5]:
# Truncated copy of the CNN that stops after the convolution. It is assembled
# from the CNN's own "embedding" and "conv1d" layer objects, so the trained
# weights are reused rather than re-initialised.
convolutional_model = keras.Sequential(
    [CNN.get_layer(name=layer_name) for layer_name in ("embedding", "conv1d")]
)

# Push every padded review through the truncated network, then flatten each
# review's (998, 32) activation map into a single feature row.
convolutional_raw = convolutional_model(padded_reviews)
convolutional_final = np.array(flatten_layer(convolutional_raw))

# 70/30 train/test split with a fixed seed.
(train_conv_x, test_conv_x, train_conv_y, test_conv_y) = train_test_split(
    convolutional_final,
    sentiment,
    test_size=0.3,
    shuffle=True,
    random_state=1000,
)


Training the convolutional tree

In [6]:
# Surrogate model: a random forest trained to reproduce the target labels from
# the flattened convolution-layer activations.
# random_state is pinned so the fitted forest, and therefore the accuracies
# printed below, are reproducible between runs (the split is already seeded).
# Accuracies saved from earlier unseeded runs may differ slightly.
conv_tree = RandomForestClassifier(criterion="entropy", random_state=1000)
conv_tree.fit(train_conv_x, train_conv_y)

# Predictions on the data the forest was fitted on and on the held-out 30%.
training_conv_prediction = conv_tree.predict(train_conv_x)
test_conv_prediction = conv_tree.predict(test_conv_x)

# Fraction of correctly classified samples (accuracy_score's default behaviour).
training_conv_prediction_accuracy = sk.metrics.accuracy_score(train_conv_y, training_conv_prediction)
test_conv_prediction_accuracy = sk.metrics.accuracy_score(test_conv_y, test_conv_prediction)

# A large gap between the two numbers means the forest memorised the training set.
print("Convolution layer\nTraining accuracy: {} vs Testing Accuracy {}".format(training_conv_prediction_accuracy, test_conv_prediction_accuracy))


Convolution layer
Training accuracy: 1.0 vs Testing Accuracy 0.5474666666666667


Building the pooling model and getting its output

In [4]:
# Truncated copy of the CNN that stops after global max pooling, assembled
# from the CNN's own trained layer objects in their original order.
pooling_layer_names = ("embedding", "conv1d", "global_max_pooling1d")
pooling_model = keras.Sequential(
    [CNN.get_layer(name=layer_name) for layer_name in pooling_layer_names]
)

# One 32-value pooled activation vector per padded review. The pooled output
# is already 2-D; the flatten call is kept so every layer goes through the
# same conversion to a NumPy array.
pooling_raw = pooling_model(padded_reviews)
pooling_final = np.array(flatten_layer(pooling_raw))

# 70/30 train/test split with a fixed seed.
(train_pooling_x, test_pooling_x, train_pooling_y, test_pooling_y) = train_test_split(
    pooling_final,
    sentiment,
    test_size=0.3,
    shuffle=True,
    random_state=1000,
)


Training and evaluating the pooling random forest

In [5]:
# Surrogate model: a random forest trained to reproduce the target labels from
# the global-max-pooling activations.
# random_state is pinned so the fitted forest, and therefore the accuracies
# printed below, are reproducible between runs (the split is already seeded).
# Accuracies saved from earlier unseeded runs may differ slightly.
pooling_tree = RandomForestClassifier(criterion="entropy", random_state=1000)
pooling_tree.fit(train_pooling_x, train_pooling_y)

# Predictions on the data the forest was fitted on and on the held-out 30%.
training_pooling_prediction = pooling_tree.predict(train_pooling_x)
test_pooling_prediction = pooling_tree.predict(test_pooling_x)

# Fraction of correctly classified samples (accuracy_score's default behaviour).
training_pooling_prediction_accuracy = sk.metrics.accuracy_score(train_pooling_y, training_pooling_prediction)
test_pooling_prediction_accuracy = sk.metrics.accuracy_score(test_pooling_y, test_pooling_prediction)

# A large gap between the two numbers means the forest memorised the training set.
print("Pooling layer\nTraining accuracy: {} vs Testing Accuracy {}".format(training_pooling_prediction_accuracy, test_pooling_prediction_accuracy))


Pooling layer
Training accuracy: 1.0 vs Testing Accuracy 0.6106666666666667


Building first dense model

In [4]:
# Truncated copy of the CNN that stops after the first dense layer, assembled
# from the CNN's own trained layer objects in their original order.
dense_layer_names = ("embedding", "conv1d", "global_max_pooling1d", "dense")
dense_model = keras.Sequential(
    [CNN.get_layer(name=layer_name) for layer_name in dense_layer_names]
)

# One 10-value dense activation vector per padded review. The dense output is
# already 2-D; the flatten call is kept so every layer goes through the same
# conversion to a NumPy array.
dense_raw = dense_model(padded_reviews)
dense_final = np.array(flatten_layer(dense_raw))

# 70/30 train/test split with a fixed seed.
(train_dense_x, test_dense_x, train_dense_y, test_dense_y) = train_test_split(
    dense_final,
    sentiment,
    test_size=0.3,
    shuffle=True,
    random_state=1000,
)


Training and evaluating the dense random forest

In [5]:
# Surrogate model: a random forest trained to reproduce the target labels from
# the first dense layer's activations.
# random_state is pinned so the fitted forest, and therefore the accuracies
# printed below, are reproducible between runs (the split is already seeded).
# Accuracies saved from earlier unseeded runs may differ slightly.
dense_tree = RandomForestClassifier(criterion="entropy", random_state=1000)
dense_tree.fit(train_dense_x, train_dense_y)

# Predictions on the data the forest was fitted on and on the held-out 30%.
training_dense_prediction = dense_tree.predict(train_dense_x)
test_dense_prediction = dense_tree.predict(test_dense_x)

# Fraction of correctly classified samples (accuracy_score's default behaviour).
training_dense_prediction_accuracy = sk.metrics.accuracy_score(train_dense_y, training_dense_prediction)
test_dense_prediction_accuracy = sk.metrics.accuracy_score(test_dense_y, test_dense_prediction)

# A large gap between the two numbers means the forest memorised the training set.
print("Dense layer\nTraining accuracy: {} vs Testing Accuracy {}".format(training_dense_prediction_accuracy, test_dense_prediction_accuracy))


Dense layer
Training accuracy: 1.0 vs Testing Accuracy 0.5748666666666666
