In [3]:
# Packages
import tensorflow as tf
import pandas as pd
import nltk, re, time
import collections
from nltk.corpus import stopwords
import chars2vec
import sklearn.decomposition
import matplotlib.pyplot as plt
import itertools
from sklearn.model_selection import train_test_split, StratifiedKFold
from keras.models import model_from_json
from keras.models import load_model
import gensim
import os
import numpy as np
from sklearn.utils import resample
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from nltk.tokenize import word_tokenize
from sklearn.utils import class_weight
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, GRU, GlobalMaxPool1D
from keras.layers.embeddings import Embedding
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras import optimizers
from sklearn.metrics import roc_auc_score
from keras.initializers import Constant


In [4]:
# Load data
data = pd.read_csv('/Users/admin/Documents/Queens_Masters_Courses/Deep_Learning/course_project/best/pre_processed_steam_reviews_final2.csv')

# Shuffle data and remove test set
X_data, X_test, y_data, y_test = train_test_split(
    data['content'].values, 
    data['recommend'].values, 
    test_size = 0.2)

In [5]:
X = X_data
y = y_data

EMBEDDING_DIM = 128
MAX_LENGTH = max([len(s.split()) for s in data['content'].values.tolist()])
word2vec_path = 'test/reviews_embedding_word2vec_recent.txt'


# Create a 2D vector of tokenized words
data_lines = [word_tokenize(line) for line in data['content'].values.tolist()]

print(MAX_LENGTH)

770


In [6]:
tokenizer_obj = Tokenizer()
tokenizer_obj.fit_on_texts(data_lines)
sequences = tokenizer_obj.texts_to_sequences(data_lines)
data_word_index = tokenizer_obj.word_index

In [25]:
## Create Word2Vec model
word2vec_model = create_word2vec_model(data_lines, EMBEDDING_DIM)

# Get embeddings index
embeddings_index = get_embeddings_index(word2vec_model, word2vec_path)

# Create embedding matrix
num_words, embedding_matrix = create_embedding_matrix(EMBEDDING_DIM, data_word_index, embeddings_index)


Vocab size:  49136


In [26]:
# Test word2vec
word2vec_model.wv.most_similar('excellent')

[('incredible', 0.9616700410842896),
 ('fantastic', 0.9353382587432861),
 ('brilliant', 0.9319596290588379),
 ('stunning', 0.9298729300498962),
 ('outstanding', 0.929619312286377),
 ('wonderful', 0.92575603723526),
 ('gorgeous', 0.9215208292007446),
 ('immersive', 0.9208390116691589),
 ('stellar', 0.9030176401138306),
 ('rich', 0.8990558981895447)]

In [27]:
## Main program

# 5-stratified fold, so classes are balanced
skf = StratifiedShuffleSplit(n_splits = 5)
num_folds = skf.get_n_splits(X)


for index, (train_index, val_index) in enumerate(skf.split(X, y)):
    print("Training on fold " + str(index + 1) + "/" + str(num_folds) + "...")
    X_train, X_val = X[train_index], X[val_index]
    y_train, y_val = y[train_index], y[val_index]
#     print(collections.Counter(y_train))

    # Create a 2D vector of tokenized words
    train_review_lines = [word_tokenize(line) for line in X_train.tolist()]
    val_review_lines = [word_tokenize(line) for line in X_val.tolist()]
    
    # Pad training and validation sequences 
    X_train_pad, X_val_pad = pad_data(train_review_lines, val_review_lines, MAX_LENGTH)

    print('Shape of X_train_pad tensor:', X_train_pad.shape)
    print('Shape of y_train tensor:', y_train.shape)

    print('Shape of X_val_pad tensor:', X_val_pad.shape)
    print('Shape of y_val tensor:', y_val.shape)
    print('====================')

    # Build network with GRU
    model = None
    model = create_model(num_words, EMBEDDING_DIM, embedding_matrix, MAX_LENGTH)
    
    # Train model
    history = train_model(model, X_train_pad, y_train, X_val_pad, y_val)

    accuracy_history = history.history['acc']
    val_accuracy_history = history.history['val_acc']
    print("Last training accuracy:" + str(accuracy_history[-1]) + ", validation accuracy: " + str(val_accuracy_history[-1]))

    


Training on fold 1/5...
Found 38906 unique tokens.
Found 19116 unique tokens.
Shape of X_train_pad tensor: (29547, 770)
Shape of y_train tensor: (29547,)
Shape of X_val_pad tensor: (7388, 770)
Shape of y_val tensor: (7388,)
Building model...
{0: 1.9342105263157894, 1: 0.6743119266055045}
Training model...
Train on 29547 samples, validate on 7388 samples
Epoch 1/30
 - 358s - loss: 0.6446 - acc: 0.6015 - val_loss: 0.6072 - val_acc: 0.6631
Epoch 2/30
 - 348s - loss: 0.5705 - acc: 0.6821 - val_loss: 0.7459 - val_acc: 0.5421
Epoch 3/30
 - 344s - loss: 0.5440 - acc: 0.7020 - val_loss: 0.7475 - val_acc: 0.5704
Epoch 4/30
 - 341s - loss: 0.5294 - acc: 0.7176 - val_loss: 0.7658 - val_acc: 0.5562
Epoch 5/30
 - 343s - loss: 0.5160 - acc: 0.7261 - val_loss: 0.7098 - val_acc: 0.6034
Epoch 6/30
 - 343s - loss: 0.5089 - acc: 0.7349 - val_loss: 0.7574 - val_acc: 0.5726
Epoch 7/30
 - 342s - loss: 0.4998 - acc: 0.7393 - val_loss: 0.8129 - val_acc: 0.5575
Epoch 8/30
 - 342s - loss: 0.4959 - acc: 0.7461 -

In [24]:
def get_embeddings_index(word2vec_model, word2vec_path):
    # Save word2vec model
    word2vec_model.wv.save_word2vec_format(word2vec_path, binary = False)
    
    embeddings_index = {}
    # Load word2vec model to get embeddings index
    with open(word2vec_path, encoding = 'utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coeffs = np.asarray(values[1:])
            embeddings_index[word] = coeffs
    
    return embeddings_index;
    
    

In [23]:
def create_embedding_matrix(EMBEDDING_DIM, train_word_index, embeddings_index):
    num_words = len(train_word_index) + 1
    embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
        
    for word, i in train_word_index.items():
        if i > num_words:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    
    return num_words, embedding_matrix;
        

In [22]:
def pad_data(train_review_lines, val_review_lines, MAX_LENGTH):
    # Pad training set
    train_tokenizer_obj = Tokenizer()
    train_tokenizer_obj.fit_on_texts(train_review_lines)
    train_sequences = train_tokenizer_obj.texts_to_sequences(train_review_lines)
    train_word_index = train_tokenizer_obj.word_index
    print('Found %s unique tokens.' % len(train_word_index))

    X_train_pad = pad_sequences(train_sequences, maxlen = MAX_LENGTH)
    
    # Pad validation set
    val_tokenizer_obj = Tokenizer()
    val_tokenizer_obj.fit_on_texts(val_review_lines)
    val_sequences = val_tokenizer_obj.texts_to_sequences(val_review_lines)
    val_word_index = val_tokenizer_obj.word_index
    print('Found %s unique tokens.' % len(val_word_index))
    
    X_val_pad = pad_sequences(val_sequences, maxlen = MAX_LENGTH)
    
    return X_train_pad, X_val_pad;

    

In [21]:
def pad_test_sequences(test_review_lines, MAX_LENGTH):
    test_tokenizer_obj = Tokenizer()
    test_tokenizer_obj.fit_on_texts(test_review_lines)
    test_sequences = test_tokenizer_obj.texts_to_sequences(test_review_lines)
    
    X_test_pad = pad_sequences(test_sequences, maxlen = MAX_LENGTH)
    
    return X_test_pad;
    

In [20]:
def create_word2vec_model(data_lines, EMBEDDING_DIM):
    model = gensim.models.Word2Vec(sentences = data_lines,
                              size = EMBEDDING_DIM,
                              #negative = 5,
                              window = 5,
                              workers = 6,
                              min_count = 1)
    
    words = list(model.wv.vocab)
    print('Vocab size: ', len(words))
    return model;

In [18]:
def create_model(num_words, EMBEDDING_DIM, embedding_matrix, MAX_LENGTH):
    print("Building model...")
    model = Sequential()

    embedding_layer = Embedding(num_words,
                               EMBEDDING_DIM,
                               embeddings_initializer = Constant(embedding_matrix),
                               input_length = MAX_LENGTH,
                               trainable = False)

    model.add(embedding_layer)
    
    # Currently 64, 0.4, 0.4
    model.add(GRU(units = 64, dropout = 0.4, recurrent_dropout = 0.4))
    model.add(Dense(1, activation = 'sigmoid'))

    adam = optimizers.Adam(lr = 0.001)
    model.compile(loss = 'binary_crossentropy', optimizer = adam, metrics = ['accuracy'])

#     print('Summary of the built model...')
#     print(model.summary())
    return model;

In [19]:
def train_model(model, X_train_pad, y_train, X_val_pad, y_val):
    # Get class weights and early stopping obj
    class_weights = class_weight.compute_class_weight('balanced',
                                                np.unique(y_train),
                                                y_train)
    
    class_weight_dict = dict(enumerate(class_weights))
    print(class_weight_dict)
    
    callbacks = [EarlyStopping(monitor = 'val_loss', patience = 10),
            ModelCheckpoint(filepath = 'cv_model_checkpoint.h5', monitor = 'val_loss', save_best_only = True)]
    
    print("Training model...")
    result = model.fit(X_train_pad, 
              y_train, 
              batch_size = 64,
              epochs = 30, 
              validation_data = (X_val_pad, y_val), 
              verbose = 2, 
              callbacks = callbacks, 
              class_weight = class_weight_dict, 
              shuffle = True)
    
    return result;


In [34]:
# Save trained model
model_file_path = "best/best_model_2019-03-26.h5"
model.save(model_file_path )

NameError: name 'model' is not defined

In [None]:
# Get accuracy and AUC of model on unseen test set
del model
model = load_model(model_file_path )

test_review_lines = [word_tokenize(line) for line in X_test.tolist()]
X_test_pad = pad_test_sequences(test_review_lines, MAX_LENGTH)

# Loss score and accuracy
score, acc = model.evaluate(X_test_pad, y_test, batch_size = 128)
print('Test score:', score)
print('Test accuracy:', acc)
print("Accuracy: {0:.2%}".format(acc))

# AUC
y_pred = model.predict(x = X_test_pad)
auc = roc_auc_score(y_test, y_pred)
print('Test AUC:', auc)

