## Review Classification using LSTM

- Baseline model inspired by [this GitHub notebook](https://github.com/nsinha280/lstm-on-Yelp-review-data/blob/master/lstm-final.ipynb)

In [None]:
import keras, os, pickle, re, sklearn, string, tensorflow
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
from keras.preprocessing import text, sequence
from keras.models import load_model
from keras.layers import Embedding, Dense, LSTM
from keras.metrics import Recall, AUC, FalseNegatives
from keras.optimizers import Adadelta

from sklearn.model_selection import train_test_split
from gensim import models
from nltk.corpus import stopwords

import funcs

# Report the library versions this notebook is running against.
print(
    'Keras version: \t\t%s' % keras.__version__,
    'Scikit version: \t%s' % sklearn.__version__,
    'TensorFlow version: \t%s' % tensorflow.__version__,
    sep='\n',
)

# Parameters shared across the data-management cells.
# Vocabulary cap: handed to the tokenizer and used as the embedding row count.
MAX_NUM_WORDS = 5000
# Every encoded review is padded/truncated to this many tokens.
MAX_SEQ_LENGTH = 200

In [None]:
# EMBEDDING
# Width of each word vector. The value is also interpolated into the
# pretrained-vector file names opened by create_pretrained_embeddings.
# GoogleNews only comes in 300-dimension form, so keep 300 when using it.
EMBEDDING_DIM  = 300

def create_pretrained_embeddings(preset='None'):
    """Build a Keras ``Embedding`` layer initialised from pretrained vectors.

    The vector file is plain text with one entry per line: a token followed
    by ``EMBEDDING_DIM`` whitespace-separated coefficients. Vectors of the
    words known to the module-level ``tokenizer`` (index below
    ``MAX_NUM_WORDS``) are copied into the weight matrix; words without a
    pretrained vector keep an all-zero row.

    Args:
        preset: ``'glove'``, ``'google'`` or ``'custom'`` to select the
            vector file; ``'None'`` (default) to skip pretrained vectors.

    Returns:
        A trainable ``Embedding`` layer, or ``None`` when ``preset`` is
        ``'None'`` or not recognised (a message is printed in that case).

    Raises:
        OSError: if the selected vector file cannot be opened.
    """
    # preset -> (progress message, file-name template taking EMBEDDING_DIM)
    sources = {
        'glove': ('Pretrained embeddings GloVe is loading...',
                  '../dat/w2v/glove.6B.%id.txt'),
        'google': ('Pretrained embeddings GoogleNews is loading...',
                   '../dat/w2v/GoogleNews-vectors-negative%i.txt'),
        'custom': ('Custom Word2Vec Embedding (based on training data) is loading...',
                   '../dat/w2v/CustomW2V_%i.txt'),
    }

    if preset not in sources:
        if preset != 'None':
            print('invalid pretrained preset')
        return None

    message, path_template = sources[preset]
    print(message)

    embeddings_index = {}
    skipped = 0
    # The context manager closes the file even if parsing a line raises.
    with open(path_template % EMBEDDING_DIM, encoding='utf8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Blank line (e.g. trailing newline at end of file).
                continue
            if len(values) != EMBEDDING_DIM + 1:
                # Not "<token> <EMBEDDING_DIM floats>": e.g. the
                # "<vocab size> <dim>" header of word2vec text exports.
                # Storing it would put a wrong-width vector in the index.
                skipped += 1
                continue
            embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

    if skipped:
        print('Skipped %i line(s) that did not hold a %i-dimensional vector'
              % (skipped, EMBEDDING_DIM))
    print('Found %s word vectors in'% len(embeddings_index), preset, 'embedding' )

    embedding_matrix = np.zeros((MAX_NUM_WORDS, EMBEDDING_DIM))

    for word, i in tokenizer.word_index.items():
        # The tokenizer only emits indices below MAX_NUM_WORDS, so rarer
        # words never reach the embedding layer.
        if i >= MAX_NUM_WORDS:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    return Embedding(input_dim=MAX_NUM_WORDS, output_dim=EMBEDDING_DIM,
                     input_length=MAX_SEQ_LENGTH,
                     weights=[embedding_matrix],
                     trainable=True
                    )

In [None]:

# Load the labelled train and dev splits.
df_train = pd.read_csv('../dat/train.csv')
df_dev = pd.read_csv('../dat/dev.csv')
# df_test = pd.read_csv('../dat/test.csv')

# Split the review texts of each frame by label value (0 -> *_r, 1 -> *_f).
trn_r = list(df_train.loc[df_train['label'] == 0, 'review'])
trn_f = list(df_train.loc[df_train['label'] == 1, 'review'])
dev_r = list(df_dev.loc[df_dev['label'] == 0, 'review'])
dev_f = list(df_dev.loc[df_dev['label'] == 1, 'review'])

# Resample both splits the same way: keep the first quarter of the label-0
# reviews and include every label-1 review twice.
negative_docs = trn_r[:round(len(trn_r) / 4)]
positive_docs = trn_f * 2
negative_docs_test = dev_r[:round(len(dev_r) / 4)]
positive_docs_test = dev_f * 2

# Documents are ordered label-0 first, then label-1; labels follow that order.
docs = negative_docs + positive_docs
docs_t = negative_docs_test + positive_docs_test
labels_train = [0] * len(negative_docs) + [1] * len(positive_docs)
labels_test = [0] * len(negative_docs_test) + [1] * len(positive_docs_test)

print('Training samples: %i' % len(docs))
print('Testing samples: %i' % len(docs_t))


In [None]:
# Fit the vocabulary on the training documents only, then encode and pad them.
tokenizer = text.Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(docs)
sequences = tokenizer.texts_to_sequences(docs)
word_index = tokenizer.word_index
result = [len(x.split()) for x in docs]  # whitespace token count per review
train   = sequence.pad_sequences(sequences, maxlen=MAX_SEQ_LENGTH, padding='post')
print('Text informations Training:')
print('max length: %i / min length: %i / mean length: %i / limit length: %i' % (
    np.max(result), np.min(result), np.mean(result), MAX_SEQ_LENGTH))
print('vocabulary size: %i / limit: %i' % (len(word_index), MAX_NUM_WORDS))


# A tokenizer fitted on the evaluation documents is kept only to report their
# vocabulary size (and so the names tokenizer_test / word_index_test stay
# available to later cells).
tokenizer_test = text.Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer_test.fit_on_texts(docs_t)
word_index_test = tokenizer_test.word_index

# The evaluation documents MUST be encoded with the training tokenizer: word
# ids are assigned per fitted tokenizer, so ids from tokenizer_test would not
# match the ids the embedding matrix and the model were built on.
sequences_test = tokenizer.texts_to_sequences(docs_t)
result_test = [len(x.split()) for x in docs_t]
test   = sequence.pad_sequences(sequences_test, maxlen=MAX_SEQ_LENGTH, padding='post')
print('Text informations Test:')
print('max length: %i / min length: %i / mean length: %i / limit length: %i' % (
    np.max(result_test), np.min(result_test), np.mean(result_test), MAX_SEQ_LENGTH))
print('vocabulary size: %i / limit: %i' % (len(word_index_test), MAX_NUM_WORDS))

In [None]:
# Embedding layer initialised from the custom Word2Vec vectors. Must run after
# the tokenization cell: the function reads the module-level `tokenizer`.
EL = create_pretrained_embeddings(preset='custom')

In [None]:
# LSTM w/ custom Embedding
model = keras.Sequential()
model.add(EL)
model.add(LSTM(50, dropout=0.2, recurrent_dropout=0.2))
# Single-unit binary output: sigmoid gives a probability in (0, 1). Softmax
# over one unit normalises to a constant 1.0, so the model could never
# predict class 0 and binary cross-entropy would have no useful gradient.
model.add(Dense(1, activation='sigmoid'))
# Metric order here fixes the order of the values returned by evaluate():
# [loss, accuracy, recall, auc, false_negatives].
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', Recall(), AUC(), FalseNegatives()])
# summary() prints the table itself and returns None, so it is not wrapped in
# print() (that would emit a stray "None" line).
model.summary()

In [None]:
# Hold out 25% of the (resampled) training data for validation. Labels are
# converted to a NumPy array first so that y_train / y_val are arrays, which
# Model.fit accepts across Keras versions (plain Python lists are not
# reliably supported as targets).
# NOTE(review): the label-1 reviews were duplicated before this split, so a
# copy of a training review can land in the validation set; validation
# metrics are therefore likely optimistic.
X_train, X_val, y_train, y_val = train_test_split(
    train, np.asarray(labels_train), test_size=.25, random_state=42)
model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=1, batch_size=100)

In [None]:
# Final evaluation of the model on the padded dev sequences (`test`, built in
# the tokenization cell) and their labels.
scores = model.evaluate(test, np.asarray(labels_test), verbose=0)
# evaluate() returns [loss, accuracy, recall, auc, false_negatives] in the
# order given to compile(), so recall is at index 2 (index 1 is accuracy).
print("Recall: %.2f%%" % (scores[2]*100))