In [1]:
import pandas as pd

raw_data = pd.read_csv("IMDB_with_predictions.csv", low_memory=False)

# printing statements to check the variables
raw_x = raw_data.review; # print(raw_x)
raw_y = raw_data.CNN_Predictions; # print(raw_y)

number_of_words_in_dic = 37500; 

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
tf.config.set_visible_devices([], 'GPU')


tokenizer = Tokenizer(num_words=number_of_words_in_dic) # to tokenize the words for learning
tokenizer.fit_on_texts(raw_x)
tokenized_sentiments = tokenizer.texts_to_sequences(raw_x) # converting the words to number arrays

vocab_size = len(tokenizer.word_index) + 1

padded_tokenized_sentiments= tf.keras.utils.pad_sequences(tokenized_sentiments, padding="post", maxlen=1000)

In [2]:
import keras
from keras import Sequential # we will be using this for the CNN
from keras import layers
import tensorflow as tf
import os

The code beneath changes every time you run it and hence shows a bad approach to extracting the embeddings

In [3]:
# importing the trained keras cnn
CNN = tf.keras.models.load_model("CNN_Non_Dense")
CNN.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 1000, 50)          1875000   
                                                                 
 conv1d (Conv1D)             (None, 998, 32)           4832      
                                                                 
 global_max_pooling1d (Globa  (None, 32)               0         
 lMaxPooling1D)                                                  
                                                                 
 dense (Dense)               (None, 10)                330       
                                                                 
 dense_1 (Dense)             (None, 1)                 11        
                                                                 
Total params: 1,880,173
Trainable params: 1,880,173
Non-trainable params: 0
______________________________________________

Getting the node embedding outputs from the STABLE CNN model

In [4]:
# getting the outputs from the embedding layer of the CNN
embeddings = keras.Model(inputs=CNN.inputs, outputs=CNN.get_layer(name="embedding").output) 

embedding_raw_outputs = embeddings(padded_tokenized_sentiments[0:50000:1])
flatten_layer = keras.layers.Flatten()
embedding_outputs = flatten_layer(embedding_raw_outputs)
print(embedding_outputs)


tf.Tensor(
[[-0.04275493 -0.08479704  0.0292923  ...  0.08790416  0.07556851
  -0.06879054]
 [ 0.00583083 -0.01126311  0.00234818 ...  0.08790416  0.07556851
  -0.06879054]
 [ 0.01972886  0.07676113 -0.08007895 ...  0.08790416  0.07556851
  -0.06879054]
 ...
 [ 0.01972886  0.07676113 -0.08007895 ...  0.08790416  0.07556851
  -0.06879054]
 [ 0.08189929  0.08689225  0.11288299 ...  0.08790416  0.07556851
  -0.06879054]
 [-0.09420051  0.00115355 -0.06794097 ...  0.08790416  0.07556851
  -0.06879054]], shape=(50000, 50000), dtype=float32)


Splitting this big tensor up into train test and validation segments

In [5]:
import numpy as np
from sklearn.model_selection import train_test_split
embedding_outputs = np.array(embedding_outputs)
train_x, test_x, train_y, test_y = train_test_split(embedding_outputs, raw_y, random_state=1000, shuffle=True, test_size=0.3)


Training a DT on embedding_outputs (Don't need validation)

In [6]:
import sklearn as sk
from sklearn.tree import DecisionTreeClassifier

best_test_accuracy = 0
best_i = 0
# evaluating the models
tree = DecisionTreeClassifier(max_depth=5).fit(train_x, train_y)
training_prediction = tree.predict(train_x)
test_prediction = tree.predict(test_x)

training_prediction_accuracy = sk.metrics.accuracy_score(train_y, training_prediction, normalize=True)
test_prediction_accuracy = sk.metrics.accuracy_score(test_y, test_prediction, normalize=True)


print("Training accuracy: {} vs Testing Accuracy {}".format(training_prediction_accuracy, test_prediction_accuracy))


Training accuracy: 0.565 vs Testing Accuracy 0.5488666666666666


Getting convolutional layer outputs 

In [7]:
conv = keras.Model(inputs=CNN.inputs, outputs=CNN.get_layer(name="conv1d").output) 

conv_raw_out = conv(padded_tokenized_sentiments[0:50000:1])
flatten_layer = keras.layers.Flatten()
conv_outputs = flatten_layer(conv_raw_out)
print(conv_outputs)


tf.Tensor(
[[0.         0.01070739 0.         ... 0.         0.07876873 0.        ]
 [0.24322078 0.         0.97995    ... 0.         0.07876873 0.        ]
 [0.6055584  0.         0.04483942 ... 0.         0.07876873 0.        ]
 ...
 [0.05258211 0.         0.7495864  ... 0.         0.07876873 0.        ]
 [0.         0.         0.         ... 0.         0.07876873 0.        ]
 [0.04430513 0.         0.         ... 0.         0.07876873 0.        ]], shape=(50000, 31936), dtype=float32)


Splitting output

In [8]:
import numpy as np
from sklearn.model_selection import train_test_split
conv_output = np.array(conv_outputs)
train_x, test_x, train_y, test_y = train_test_split(conv_output, raw_y, random_state=1000, shuffle=True, test_size=0.3)

Training DT on conv layer outputs

In [9]:
import sklearn as sk
from sklearn.tree import DecisionTreeClassifier
best_test_accuracy = 0
best_i = 0
# evaluating the models
tree = DecisionTreeClassifier(max_depth=5).fit(train_x, train_y)
training_prediction = tree.predict(train_x)
test_prediction = tree.predict(test_x)

training_prediction_accuracy = sk.metrics.accuracy_score(train_y, training_prediction, normalize=True)
test_prediction_accuracy = sk.metrics.accuracy_score(test_y, test_prediction, normalize=True)


print("Training accuracy: {} vs Testing Accuracy {}".format(training_prediction_accuracy, test_prediction_accuracy))

Training accuracy: 0.5502285714285714 vs Testing Accuracy 0.5571333333333334


Getting output of pooling layer

In [10]:
pooling = keras.Model(inputs=CNN.inputs, outputs=CNN.get_layer(name="global_max_pooling1d").output) 

pooling_raw_out = pooling(padded_tokenized_sentiments[0:50000:1])
flatten_layer = keras.layers.Flatten()
pooling_outputs = flatten_layer(pooling_raw_out)
print(pooling_outputs)

: 

: 

Splitting pooling output

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
pooling_output = np.array(pooling_outputs)
train_x, test_x, train_y, test_y = train_test_split(pooling_output, raw_y, random_state=1000, shuffle=True, test_size=0.3)

Training Dt on pooling

In [None]:
import sklearn as sk
from sklearn.tree import DecisionTreeClassifier

best_test_accuracy = 0
best_i = 0
# evaluating the models
tree = DecisionTreeClassifier(max_depth=5).fit(train_x, train_y)
training_prediction = tree.predict(train_x)
test_prediction = tree.predict(test_x)

training_prediction_accuracy = sk.metrics.accuracy_score(train_y, training_prediction, normalize=True)
test_prediction_accuracy = sk.metrics.accuracy_score(test_y, test_prediction, normalize=True)


print("Training accuracy: {} vs Testing Accuracy {}".format(training_prediction_accuracy, test_prediction_accuracy))

Training accuracy: 0.5735142857142858 vs Testing Accuracy 0.5683333333333334


Getting output from first dense layer

In [None]:
dense = keras.Model(inputs=CNN.inputs, outputs=CNN.get_layer(name="dense").output) 

dense_raw_out = dense(padded_tokenized_sentiments[0:50000:1])
flatten_layer = keras.layers.Flatten()
dense_outputs = flatten_layer(dense_raw_out)
print(dense_outputs)

tf.Tensor(
[[0.9367351  2.4739137  2.6744     ... 1.157961   1.2296641  1.903941  ]
 [0.6682062  2.8604558  1.5090369  ... 1.0655558  1.1144934  1.9300716 ]
 [1.6244669  1.4200797  1.4253105  ... 1.2042524  1.1858108  1.2665263 ]
 ...
 [1.5316938  1.5835792  2.2751236  ... 1.3601041  1.4951031  1.1588593 ]
 [1.6358929  0.74724245 2.4778965  ... 1.2150865  0.9762061  1.0232066 ]
 [2.0390623  1.1072648  2.3255944  ... 1.816544   1.8689241  0.88813496]], shape=(50000, 10), dtype=float32)


Splitting dense into train and test

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
dense_output = np.array(dense_outputs)
train_x, test_x, train_y, test_y = train_test_split(dense_output, raw_y, random_state=1000, shuffle=True, test_size=0.3)

Training DT on dense layer

In [None]:
import sklearn as sk
from sklearn.tree import DecisionTreeClassifier

best_test_accuracy = 0
best_i = 0
# evaluating the models
tree = DecisionTreeClassifier(max_depth=5).fit(train_x, train_y)
training_prediction = tree.predict(train_x)
test_prediction = tree.predict(test_x)

training_prediction_accuracy = sk.metrics.accuracy_score(train_y, training_prediction, normalize=True)
test_prediction_accuracy = sk.metrics.accuracy_score(test_y, test_prediction, normalize=True)


print("Training accuracy: {} vs Testing Accuracy {}".format(training_prediction_accuracy, test_prediction_accuracy))

Training accuracy: 0.5800285714285714 vs Testing Accuracy 0.5675333333333333


Extracting output from last dense layer

In [None]:
dense = keras.Model(inputs=CNN.inputs, outputs=CNN.get_layer(name="dense_1").output) 

dense_raw_out = dense(padded_tokenized_sentiments[0:50000:1])
flatten_layer = keras.layers.Flatten()
dense_outputs = flatten_layer(dense_raw_out)
print(dense_outputs)

tf.Tensor(
[[0.61962384]
 [0.95021033]
 [0.41221657]
 ...
 [0.13051863]
 [0.03999363]
 [0.00474292]], shape=(50000, 1), dtype=float32)


In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
dense_output = np.array(dense_outputs)
train_x, test_x, train_y, test_y = train_test_split(dense_output, raw_y, random_state=1000, shuffle=True, test_size=0.3)

In [None]:
import sklearn as sk
from sklearn.tree import DecisionTreeClassifier

best_test_accuracy = 0
best_i = 0
# evaluating the models
tree = DecisionTreeClassifier(max_depth=5).fit(train_x, train_y)
training_prediction = tree.predict(train_x)
test_prediction = tree.predict(test_x)

training_prediction_accuracy = sk.metrics.accuracy_score(train_y, training_prediction, normalize=True)
test_prediction_accuracy = sk.metrics.accuracy_score(test_y, test_prediction, normalize=True)


print("Training accuracy: {} vs Testing Accuracy {}".format(training_prediction_accuracy, test_prediction_accuracy))

Training accuracy: 0.5702 vs Testing Accuracy 0.5608
