In [None]:
import os
import pickle
import shutil
import sys
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import metrics as mt
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.utils import shuffle
from statsmodels.stats.contingency_tables import mcnemar

# When True, later cells read the cached reviews / tokenized reviews from
# ../Data/Pickle instead of rebuilding them from the CSV.
LOAD_FROM_PICKLE = False

import os

# These variables are set before TensorFlow is imported in a later cell.
# NOTE(review): '3' is presumably the quietest TensorFlow C++ log level - confirm.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # alternatives: '0', '1' or '2'
os.environ['AUTOGRAPH_VERBOSITY'] = '0'

import warnings

# Silence FutureWarning and RuntimeWarning for the whole notebook.
warnings.simplefilter('ignore', FutureWarning)
warnings.simplefilter('ignore', RuntimeWarning)

# Read by the next cell to choose the value of CUDA_VISIBLE_DEVICES.
USE_GPU = True

In [None]:
# Changing USE_GPU only takes effect after the kernel is restarted.
# "0" exposes the first CUDA device; "-1" hides every CUDA device.

os.environ["CUDA_VISIBLE_DEVICES"] = "0" if USE_GPU else "-1"

In [None]:
# Tensorflow import needs to be after setting the CUDA_VISIBLE_DEVICES

import tensorflow as tf
from keras.models import Model, Sequential, load_model
from keras.utils import to_categorical
from keras.utils.vis_utils import plot_model
from tensorflow import keras
from tensorflow.keras import backend as K
from tensorflow.keras.layers import (GRU, LSTM, Activation, Conv1D, Dense,
                                     Dropout, Embedding, Flatten, Input,
                                     InputLayer, MaxPooling1D, concatenate)
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# Print TensorFlow / Keras versions and the devices TensorFlow can see.
# The device lists are queried once and reused, so the "Using ..." branch is
# guarded by the very list it indexes into.
gpu_devices = tf.config.list_physical_devices('GPU')
cpu_devices = tf.config.list_physical_devices('CPU')

print("Num GPUs Available: ", len(gpu_devices))
print("Num CPUs Available: ", len(cpu_devices))
print(f"Tensorflow version: {tf.__version__}")
print(f"Keras version: {keras.__version__}")

if gpu_devices:
    # Only the first visible GPU is reported.
    details = tf.config.experimental.get_device_details(gpu_devices[0])
    name = details.get('device_name', 'Unknown GPU')

    print(f"Using {name}")
else:
    print("No GPU found")

In [None]:
# Data is stored in the following way:
# "rating", "title", "review", all as strings
# "1" is negative, "2" is positive

if LOAD_FROM_PICKLE:
    # NOTE: pickle.load can execute arbitrary code; only load a cache file that
    # this notebook wrote itself.
    with open('../Data/Pickle/reviews.pickle', 'rb') as handle:
        data = pickle.load(handle)

    print('Loaded data from pickle')
else:
    data = pd.read_csv('../Data/Amazon_Reviews/reviews.csv', names=['rating', 'title', 'review'], low_memory=False, header=0)

    # Combine title and review. Missing values are treated as empty text so a
    # missing title does not turn the whole review into NaN (which would later
    # become the literal string "nan").
    data['review'] = data['title'].fillna('').astype(str) + ' ' + data['review'].fillna('').astype(str)

    # remove title
    data = data.drop(columns='title')

    # Collapse runs of spaces into a single space, then trim both ends.
    # regex=True is explicit because the pandas default for `regex` differs
    # between versions; without it ' +' can be matched as a literal string.
    data['review'] = data['review'].str.replace(' +', ' ', regex=True).str.strip()

    # Convert rating to 0 and 1 where 0 is positive ("2") and 1 is negative,
    # stored as uint8.
    data['rating'] = (data['rating'].astype('uint8') != 2).astype('uint8')

    # save as pickle (create the cache directory on first run)
    os.makedirs('../Data/Pickle', exist_ok=True)
    with open('../Data/Pickle/reviews.pickle', 'wb') as handle:
        pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)

    print('Loaded data from csv')

In [None]:
# Print the first 5 rows (or fewer if the frame is shorter).
# head() is positional, so this also works when the index is not 0..n-1.

for rating, review in zip(data['rating'].head(5), data['review'].head(5)):
    print(f"Rating: {rating}")
    print(f"Review: {review}")
    print()

In [None]:
# print some statistics
# Ratings are encoded as 0 = positive and 1 = negative by the loading cell,
# so the counts below select on those values.
print(f"Number of reviews: {len(data):,}")
print(f"Number of positive reviews: {int((data['rating'] == 0).sum()):,}")
print(f"Number of negative reviews: {int((data['rating'] == 1).sum()):,}")

# length of every review, measured in whitespace-separated words
review_lengths = [len(review.split()) for review in data['review'].astype(str)]

# print the longest review in words
print(f"Longest review in words: {max(review_lengths)}")

# print the 99th percentile of review lengths
print(f"99th percentile of review lengths: {np.percentile(review_lengths, 99)}")

# graph the number of words in each review
sns.set_style('darkgrid')
sns.set_context('talk')
sns.set_palette('colorblind')

plt.figure(figsize=(18, 6))

sns.histplot(review_lengths, bins=100, kde=True)

plt.title('Review length distribution')
plt.xlabel('Number of words')
plt.ylabel('Number of reviews')
# NOTE(review): 257 is a hard-coded x-axis cut-off - confirm it still suits the data.
plt.xlim(0, 257)

plt.show()

In [None]:
# Raw review text (coerced to str) and the matching rating labels as arrays.
X, y = data['review'].astype(str).values, data['rating'].values

In [None]:
NUM_TOP_WORDS = None # use entire vocabulary!
MAX_ART_LEN = 200 # every review is padded / truncated to this many tokens
NUM_CLASSES = 2

if LOAD_FROM_PICKLE:
    # NOTE: pickle.load can execute arbitrary code; only load a cache file that
    # this notebook wrote itself.
    with open('../Data/Pickle/reviews_tokenized.pickle', 'rb') as handle:
        X, word_index = pickle.load(handle)

    print('Loaded tokenized X from pickle')
else:
    # Tokenize from the DataFrame rather than from X: X is overwritten with the
    # padded id matrix below, so reading it here would break a re-run of this cell.
    review_texts = data['review'].astype(str).values

    tokenizer = Tokenizer(num_words=NUM_TOP_WORDS)
    tokenizer.fit_on_texts(review_texts)

    sequences = tokenizer.texts_to_sequences(review_texts)

    word_index = tokenizer.word_index

    X = pad_sequences(sequences, maxlen=MAX_ART_LEN)

    # save as pickle
    with open('../Data/Pickle/reviews_tokenized.pickle', 'wb') as handle:
        pickle.dump((X, word_index), handle, protocol=pickle.HIGHEST_PROTOCOL)

# Resolve "use everything" (None) to the actual vocabulary size; shared by both branches.
NUM_TOP_WORDS = len(word_index) if NUM_TOP_WORDS is None else NUM_TOP_WORDS
top_words = min(len(word_index), NUM_TOP_WORDS)

# one-hot encode the labels
y_ohe = keras.utils.to_categorical(y, num_classes=NUM_CLASSES)

print(f"Found {len(word_index):,} unique tokens. Distilled to {top_words:,} top words.")

In [None]:
EMBED_SIZE = 100
# the embed size should match the file you load glove from
embeddings_index = {}
# Save key/array pairs of the embeddings: the key of the dictionary is the
# word, the array is the embedding. The encoding is given explicitly so the
# read does not depend on the platform default, and the context manager closes
# the file even if parsing fails.
with open('../Data/GloVe/glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # skip blank lines instead of failing on values[0]
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print(f"Found {len(embeddings_index):,} word vectors.\n")

# now fill in the matrix, using the ordering from the
#  keras word tokenizer from before
found_words = 0
# one extra row because word_index values are used directly as row numbers
embedding_matrix = np.zeros((len(word_index) + 1, EMBED_SIZE))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be ALL-ZEROS
        embedding_matrix[i] = embedding_vector
        found_words += 1

print(f"Embedding Shape: {embedding_matrix.shape}")
print(f"Total words found: {found_words:,}")
# Coverage is measured against the vocabulary size, not the matrix row count
# (which has one extra row).
print(f"Percentage: {round(100 * found_words / max(len(word_index), 1), 2)}")

In [None]:
# A fixed seed makes the split reproducible, so weights reloaded from disk in a
# later session are evaluated on the same held-out rows they were trained against.
SPLIT_SEED = 42

X_train, X_test, y_train_ohe, y_test_ohe = train_test_split(
    X, y_ohe, test_size=0.2, stratify=y, random_state=SPLIT_SEED
)

print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train_ohe shape: {y_train_ohe.shape}")
print(f"y_test_ohe shape: {y_test_ohe.shape}")

In [None]:
# these three functions are from https://datascience.stackexchange.com/questions/45165/how-to-get-accuracy-f1-precision-and-recall-for-a-keras-model
def recall_m(y_true, y_pred):
    """Recall metric for Keras: true positives / (actual positives + epsilon).

    Both counts are taken after clipping to [0, 1] and rounding, and are summed
    over every element of the tensors passed in.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    actual = K.round(K.clip(y_true, 0, 1))
    # epsilon keeps the division defined when there are no positives
    return K.sum(hits) / (K.sum(actual) + K.epsilon())

def precision_m(y_true, y_pred):
    """Precision metric for Keras: true positives / (predicted positives + epsilon).

    Both counts are taken after clipping to [0, 1] and rounding, and are summed
    over every element of the tensors passed in.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    flagged = K.round(K.clip(y_pred, 0, 1))
    # epsilon keeps the division defined when nothing is predicted positive
    return K.sum(hits) / (K.sum(flagged) + K.epsilon())

@tf.autograph.experimental.do_not_convert
def f1_m(y_true, y_pred):
    """F1 metric for Keras, built from precision_m and recall_m."""
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    # epsilon keeps the ratio defined when precision and recall are both zero
    ratio = (p * r) / (p + r + K.epsilon())
    return 2 * ratio

In [None]:
# Embedding layer initialised from embedding_matrix and kept frozen
# (trainable=False). It has one more row than word_index has entries.
embedding_layer = Embedding(
    input_dim=len(word_index) + 1,
    output_dim=EMBED_SIZE,
    weights=[embedding_matrix],  # initial weights come from the pre-built matrix
    input_length=MAX_ART_LEN,
    trainable=False,
)

def create_RNN(name, RNN_layer_type, num_units):
    """Build and compile a small recurrent classifier.

    The model is: the module-level ``embedding_layer``, one recurrent layer
    with ``num_units`` units and dropout 0.2, then a sigmoid Dense layer with
    ``NUM_CLASSES`` outputs. It is compiled with binary cross-entropy, Adam,
    and the ``accuracy`` and ``recall_m`` metrics.

    Args:
        name: Name given to the Sequential model.
        RNN_layer_type: Either 'LSTM' or 'GRU'.
        num_units: Number of units in the recurrent layer.

    Returns:
        The compiled Sequential model.

    Raises:
        ValueError: If ``RNN_layer_type`` is neither 'LSTM' nor 'GRU'.
    """
    # reject unknown layer types before anything is built
    if RNN_layer_type != 'LSTM' and RNN_layer_type != 'GRU':
        raise ValueError("RNN_layer_type must be one of 'LSTM' or 'GRU'")

    recurrent_cls = LSTM if RNN_layer_type == 'LSTM' else GRU

    network = Sequential(name=name)
    for layer in (
        embedding_layer,
        recurrent_cls(num_units, dropout=0.2),
        Dense(NUM_CLASSES, activation='sigmoid'),
    ):
        network.add(layer)

    network.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy', recall_m],
    )
    return network

# Four variants: LSTM and GRU, each with 25 and 50 recurrent units.
rnn1 = create_RNN('RNN-LSTM-25', 'LSTM', 25)
rnn2 = create_RNN('RNN-GRU-25', 'GRU', 25)
rnn3 = create_RNN('RNN-LSTM-50', 'LSTM', 50)
rnn4 = create_RNN('RNN-GRU-50', 'GRU', 50)

models = [rnn1, rnn2, rnn3, rnn4]

for model in models:
    # summary() prints the table itself and returns None, so it is not wrapped
    # in print() (which would add a stray "None" line after each table).
    model.summary()

In [None]:
# When True, reuse weights and training histories saved by an earlier run
# instead of training again.
LOAD_STORED_MODELS = False

if LOAD_STORED_MODELS:
    for model in models:
        # load model
        model.load_weights(f"Models/Lab7/{model.name}.h5")
        # Load history. The extension must be ".pkl" to match the file written
        # by the training branch below.
        # NOTE: pickle.load can execute arbitrary code; only load files that
        # this notebook wrote itself.
        with open(f"Models/Lab7/{model.name}.pkl", 'rb') as file_pi:
            # NOTE(review): the pickle holds the plain `history.history` dict,
            # not the object returned by fit() - confirm downstream code
            # handles both forms.
            model.history = pickle.load(file_pi)
else:
    # Make sure the output directory exists before the (long) training starts,
    # so a missing folder cannot cause the save step to fail afterwards.
    os.makedirs("Models/Lab7", exist_ok=True)

    for model in models:
        start = time.time()

        history = model.fit(X_train, y_train_ohe, validation_data=(X_test, y_test_ohe), epochs=25, batch_size=2048, verbose=1)

        print(f"Model {model.name} took {round(time.time() - start, 2)} seconds to train")

        # Save model
        model.save_weights(f"Models/Lab7/{model.name}.h5")
        # save history
        with open(f"Models/Lab7/{model.name}.pkl", 'wb') as f:
            pickle.dump(history.history, f)

        print(f"Model {model.name} saved")

In [None]:
def history_plot(histories):
    """Plot accuracy and loss curves side by side.

    Only ``histories[0]`` is read. It must be an iterable of objects that each
    expose a ``.history`` mapping containing the keys listed in ``tracked``;
    the per-object series are concatenated in order. The recall series are
    collected as well but are not drawn by the code below.
    """
    tracked = ('accuracy', 'val_accuracy', 'loss', 'val_loss', 'recall_m', 'val_recall_m')
    combined = {
        key: np.hstack([run.history[key] for run in histories[0]])
        for key in tracked
    }

    # (train key, validation key, panel title, y-axis label), left to right
    panels = (
        ('accuracy', 'val_accuracy', 'model accuracy', 'accuracy'),
        ('loss', 'val_loss', 'model loss', 'loss'),
    )

    plt.figure(figsize=(15,5))
    for slot, (train_key, val_key, heading, y_label) in enumerate(panels, start=1):
        plt.subplot(1, 2, slot)
        plt.plot(combined[train_key])
        plt.plot(combined[val_key])
        plt.title(heading)
        plt.ylabel(y_label)
        plt.xlabel('epoch')
        plt.legend(['train', 'test'], loc='upper left')
    plt.show()