In [None]:
!pip install numpy==1.16.1

In [None]:
# MLP for the IMDB problem
import numpy as np
import keras
import pickle 
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Average
from keras.layers.embeddings import Embedding
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.preprocessing import sequence

import matplotlib.pyplot as plt
%matplotlib inline
import warnings,logging, os
warnings.filterwarnings('ignore')
logging.disable(logging.WARNING)
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
# fix random seed for reproducibility
seed = 7
np.random.seed(seed)

Using TensorFlow backend.


In [None]:
# Colab only: attach your Google Drive so the workshop files become readable.
from google.colab import drive

drive.mount('/content/drive')

# Point this at the shared workshop folder inside your own Drive.
folder_path = '/content/drive/My Drive/University/FYP/Conferences/MerCon/word embedding workshop/copy_WordEmbed workshop/'

In [None]:
import sys

# Make the shared workshop folder importable. The membership check keeps this
# cell idempotent: re-running it no longer pushes duplicate entries onto
# sys.path.
if folder_path not in sys.path:
    sys.path.insert(0, folder_path)

# NOTE(review): wildcard import kept for compatibility with the rest of the
# notebook. A later cell calls `load_obj`, which is presumably provided by this
# module — prefer explicit imports once the full list of used names is confirmed.
from utils import *

In [None]:
# Vocabulary cap passed to the loader as `num_words`: only the top_words most
# frequent words are kept.
top_words = 15000

# Integer-encoded reviews with their labels, split into train and test sets.
(X_train, y_train), (X_test, y_test) = imdb.load_data(
    num_words=top_words,
)

# Word -> index lookup table shipped with the dataset.
imdb_words = imdb.get_word_index()

Downloading data from https://s3.amazonaws.com/text-datasets/imdb.npz
Downloading data from https://s3.amazonaws.com/text-datasets/imdb_word_index.json


In [None]:
# Shift every raw word index by INDEX_FROM; ids 0-2 are assigned to the
# special tokens registered right below.
INDEX_FROM = 3
word_to_id = {word: rank + INDEX_FROM for word, rank in imdb_words.items()}
word_to_id.update({"<PAD>": 0, "<START>": 1, "<UNK>": 2})

# Sizes shared by the embedding layers further down.
MAX_NB_WORDS = 15000
embed_dim = 300
max_words = 500
nb_words = min(MAX_NB_WORDS, len(word_to_id))

# Reverse lookup (id -> word) for decoding encoded reviews.
id_to_word = dict(zip(word_to_id.values(), word_to_id.keys()))

In [None]:
# Decode the first training review from word ids back into tokens.
list(map(id_to_word.__getitem__, X_train[0]))

['<START>',
 'this',
 'film',
 'was',
 'just',
 'brilliant',
 'casting',
 'location',
 'scenery',
 'story',
 'direction',
 "everyone's",
 'really',
 'suited',
 'the',
 'part',
 'they',
 'played',
 'and',
 'you',
 'could',
 'just',
 'imagine',
 'being',
 'there',
 'robert',
 '<UNK>',
 'is',
 'an',
 'amazing',
 'actor',
 'and',
 'now',
 'the',
 'same',
 'being',
 'director',
 '<UNK>',
 'father',
 'came',
 'from',
 'the',
 'same',
 'scottish',
 'island',
 'as',
 'myself',
 'so',
 'i',
 'loved',
 'the',
 'fact',
 'there',
 'was',
 'a',
 'real',
 'connection',
 'with',
 'this',
 'film',
 'the',
 'witty',
 'remarks',
 'throughout',
 'the',
 'film',
 'were',
 'great',
 'it',
 'was',
 'just',
 'brilliant',
 'so',
 'much',
 'that',
 'i',
 'bought',
 'the',
 'film',
 'as',
 'soon',
 'as',
 'it',
 'was',
 'released',
 'for',
 '<UNK>',
 'and',
 'would',
 'recommend',
 'it',
 'to',
 'everyone',
 'to',
 'watch',
 'and',
 'the',
 'fly',
 'fishing',
 'was',
 'amazing',
 'really',
 'cried',
 'at',
 'th

In [None]:
# Every review is forced to exactly max_words tokens: longer reviews are cut
# at the end, shorter ones are filled with 0 at the end.
max_words = 500
pad_options = dict(maxlen=max_words, padding='post', truncating='post', value=0.0)

X_train = sequence.pad_sequences(X_train, **pad_options)
X_test = sequence.pad_sequences(X_test, **pad_options)

In [None]:
# Baseline: embeddings are randomly initialised and kept frozen
# (trainable=False), so only the dense layers learn.
model = Sequential()
model.add(Embedding(top_words, embed_dim, input_length=max_words, trainable=False))
# Average the word vectors over the sequence axis -> one vector per review.
# NOTE: padded positions are part of this mean as well.
model.add(keras.layers.Lambda(lambda x: keras.backend.mean(x, axis=1)))
model.add(Dense(250, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None; wrapping it in print()
# used to emit a stray "None" line after the table.
model.summary()

# Fit the model
# NOTE(review): the test split doubles as validation data, so the val_*
# numbers reported during training are measured on the same data as the final
# evaluation below.
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=2, batch_size=128, verbose=2)
# Final evaluation of the model
scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 500, 300)          4500000   
_________________________________________________________________
lambda_4 (Lambda)            (None, 300)               0         
_________________________________________________________________
dense_7 (Dense)              (None, 250)               75250     
_________________________________________________________________
dense_8 (Dense)              (None, 1)                 251       
Total params: 4,575,501
Trainable params: 75,501
Non-trainable params: 4,500,000
_________________________________________________________________
None
Train on 25000 samples, validate on 25000 samples
Epoch 1/2
 - 12s - loss: 0.6931 - accuracy: 0.5031 - val_loss: 0.6917 - val_accuracy: 0.5342
Epoch 2/2
 - 12s - loss: 0.6911 - accuracy: 0.5202 - val_loss: 0.6900 - val_accuracy: 0.5

The snippet below loads the model. It may take a couple of minutes.

In [None]:
# NOTE(review): disabled reference code — presumably how the embedding matrix
# was originally built from a pretrained word-vector lookup. `embeddings_index`
# is not defined in the visible cells of this notebook, so this cannot run
# as-is. If `embeddings_index` raises on unknown words, the lookup below would
# need a guarded access — TODO confirm.
# #embedding matrix
# MAX_NB_WORDS = 15000
# embed_dim = 300
# max_words = 500
# words_not_found = []
# nb_words = min(MAX_NB_WORDS, len(word_to_id))
# embedding_matrix = np.zeros((nb_words, embed_dim))
# for word, i in word_to_id.items():
#     if i >= nb_words:
#         continue
#     embedding_vector = embeddings_index[word]
#     if (embedding_vector is not None) and len(embedding_vector) > 0:
#         # words not found in embedding index will be all-zeros.
#         embedding_matrix[i] = embedding_vector
#     else:
#         words_not_found.append(word)

In [None]:
# Load the pre-computed embedding matrix from the shared workshop folder.
# NOTE(review): `load_obj` comes from the wildcard `utils` import; it
# presumably unpickles the file, so only load files you trust — TODO confirm.
embedding_matrix = load_obj(folder_path + 'sentiment_analysis/embedding_matrix')

# Fail fast with a readable message if the matrix does not match the
# dimensions the Embedding layers below are built with.
assert np.shape(embedding_matrix) == (nb_words, embed_dim), (
    "embedding_matrix has shape %s, expected %s"
    % (np.shape(embedding_matrix), (nb_words, embed_dim))
)

In [None]:
# This model uses pretrained embeddings, but does not train the embeddings while the model is training.
model2 = Sequential()
model2.add(Embedding(nb_words, embed_dim, weights=[embedding_matrix], input_length=max_words, trainable=False))
# Average the word vectors over the sequence axis -> one vector per review.
# NOTE: padded positions are part of this mean as well.
model2.add(keras.layers.Lambda(lambda x: keras.backend.mean(x, axis=1)))
model2.add(Dense(250, activation='relu'))
model2.add(Dense(1, activation='sigmoid'))
model2.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None; wrapping it in print()
# used to emit a stray "None" line after the table.
model2.summary()

# Fit the model
# NOTE(review): the test split doubles as validation data, so the val_*
# numbers reported during training are measured on the same data as the final
# evaluation below.
model2.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=2, batch_size=128, verbose=2)
# Final evaluation of the model
scores2 = model2.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores2[1]*100))

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 500, 300)          4500000   
_________________________________________________________________
lambda_2 (Lambda)            (None, 300)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 250)               75250     
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 251       
Total params: 4,575,501
Trainable params: 75,501
Non-trainable params: 4,500,000
_________________________________________________________________
None
Train on 25000 samples, validate on 25000 samples
Epoch 1/2
 - 12s - loss: 0.6735 - accuracy: 0.5908 - val_loss: 0.6486 - val_accuracy: 0.6491
Epoch 2/2
 - 12s - loss: 0.6171 - accuracy: 0.6773 - val_loss: 0.5989 - val_accuracy: 0.7

In [None]:
# This model uses trainable pretrained word embeddings: the embedding layer is
# initialised from embedding_matrix and then updated together with the dense
# layers (trainable=True).
model3 = Sequential()
model3.add(Embedding(nb_words, embed_dim, weights=[embedding_matrix], input_length=max_words, trainable=True))
# Average the word vectors over the sequence axis -> one vector per review.
# NOTE: padded positions are part of this mean as well.
model3.add(keras.layers.Lambda(lambda x: keras.backend.mean(x, axis=1)))
model3.add(Dense(250, activation='relu'))
model3.add(Dense(1, activation='sigmoid'))
model3.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None; wrapping it in print()
# used to emit a stray "None" line after the table.
model3.summary()
# Fit the model
# NOTE(review): the test split doubles as validation data, so the val_*
# numbers reported during training are measured on the same data as the final
# evaluation below.
model3.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=2, batch_size=128, verbose=2)
# Final evaluation of the model
scores3 = model3.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores3[1]*100))

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 500, 300)          4500000   
_________________________________________________________________
lambda_3 (Lambda)            (None, 300)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 250)               75250     
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 251       
Total params: 4,575,501
Trainable params: 4,575,501
Non-trainable params: 0
_________________________________________________________________
None
Train on 25000 samples, validate on 25000 samples
Epoch 1/2
 - 27s - loss: 0.5197 - accuracy: 0.7410 - val_loss: 0.3350 - val_accuracy: 0.8684
Epoch 2/2
 - 27s - loss: 0.2528 - accuracy: 0.9022 - val_loss: 0.2802 - val_accuracy: 0.8868
A