## Review Classification using Convolutional Neural Network

- Original code from [Christopher Masch's GitHub repository](https://github.com/cmasch/cnn-text-classification), which was inspired by [Britz's blog post and GitHub code](http://www.wildml.com/2015/12/implementing-a-cnn-for-text-classification-in-tensorflow/).

---

In [None]:
import keras, os, pickle, re, sklearn, string, tensorflow
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import funcs

from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
from keras.preprocessing import text, sequence
from keras.models import load_model
from keras.layers import Embedding, Dense, LSTM
from keras.metrics import Recall, AUC, FalseNegatives
from keras.optimizers import Adadelta

from sklearn.model_selection import train_test_split
from gensim import models
from nltk.corpus import stopwords

# Report the versions of the main libraries used by this notebook.
for label, module in (
    ('Keras version: \t\t%s', keras),
    ('Scikit version: \t%s', sklearn),
    ('TensorFlow version: \t%s', tensorflow),
):
    print(label % module.__version__)

# Parameters shared across the data-management steps:
#   MAX_NUM_WORDS  - vocabulary cap given to the tokenizers and the row count
#                    of the embedding matrix
#   MAX_SEQ_LENGTH - length every token sequence is padded/truncated to
MAX_NUM_WORDS, MAX_SEQ_LENGTH = 5000, 200

## Word Embedding

### Added GoogleNews and custom Word2Vec embeddings for the model

- The Word2Vec embeddings were built externally and exported as text files; a loading mechanism for them was added here.

In [None]:
# EMBEDDING
# GoogleNews only comes in 300-dimension form
EMBEDDING_DIM  = 300

def create_pretrained_embeddings(preset='None'):
    """Build a Keras ``Embedding`` layer initialised from a pretrained vector file.

    The vector file is a text file with one entry per line: a word followed by
    ``EMBEDDING_DIM`` whitespace-separated floats.

    Reads the module-level ``tokenizer`` (which must already be fitted),
    ``MAX_NUM_WORDS``, ``MAX_SEQ_LENGTH`` and ``EMBEDDING_DIM``.

    Args:
        preset: ``'glove'``, ``'google'`` or ``'custom'`` selects the vector
            file under ``../dat/w2v/``. ``'None'`` (the default) or any other
            value selects nothing.

    Returns:
        A trainable ``Embedding`` layer whose weights are the pretrained
        vectors for the tokenizer's words with index below ``MAX_NUM_WORDS``
        (words without a pretrained vector keep an all-zero row), or ``None``
        when no valid preset was requested.

    Raises:
        OSError: if the selected vector file cannot be opened.
    """
    if preset == 'glove':
        print('Pretrained embeddings GloVe is loading...')
        path = '../dat/w2v/glove.6B.%id.txt' % EMBEDDING_DIM
    elif preset == 'google':
        print('Pretrained embeddings GoogleNews is loading...')
        path = '../dat/w2v/GoogleNews-vectors-negative%i.txt' % EMBEDDING_DIM
    elif preset == 'custom':
        print('Custom Word2Vec Embedding (based on training data) is loading...')
        path = '../dat/w2v/CustomW2V_%i.txt' % EMBEDDING_DIM
    else:
        if preset != 'None':
            print('invalid pretrained preset')
        return None

    embeddings_index = {}
    skipped = 0
    # The context manager closes the file even if a line fails to parse.
    with open(path, encoding='utf8') as f:
        for line in f:
            values = line.split()
            # Skip anything that is not "<word> <EMBEDDING_DIM floats>": the
            # "<vocab size> <dim>" header of word2vec text exports, blank
            # lines, and entries with the wrong number of fields.
            if len(values) != EMBEDDING_DIM + 1:
                skipped += 1
                continue
            embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

    print('Found %s word vectors in'% len(embeddings_index), preset, 'embedding' )
    if skipped:
        print('Skipped %i line(s) without exactly %i vector values' % (skipped, EMBEDDING_DIM))

    embedding_matrix = np.zeros((MAX_NUM_WORDS, EMBEDDING_DIM))

    for word, i in tokenizer.word_index.items():
        # Indices at or above the vocabulary cap have no row in the matrix.
        if i >= MAX_NUM_WORDS:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    return Embedding(input_dim=MAX_NUM_WORDS, output_dim=EMBEDDING_DIM,
                     input_length=MAX_SEQ_LENGTH,
                     weights=[embedding_matrix],
                     trainable=True
                    )

## Yelp data management

In [None]:

# Load the labelled train/dev splits; both CSVs are read for their `label`
# and `review` columns. (test.csv is not loaded in this cell.)
df_train = pd.read_csv('../dat/train.csv')
df_dev = pd.read_csv('../dat/dev.csv')

# Split the reviews of each frame by label: 0 -> *_r, 1 -> *_f.
trn_r = list(df_train[df_train['label']==0]['review'])
trn_f = list(df_train[df_train['label']==1]['review'])
dev_r = list(df_dev[df_dev['label']==0]['review'])
dev_f = list(df_dev[df_dev['label']==1]['review'])

# Resample both splits the same way: keep the first quarter of the label-0
# reviews and include every label-1 review twice.
negative_docs = trn_r[:round(len(trn_r)/4)]
positive_docs = trn_f * 2
negative_docs_test = dev_r[:round(len(dev_r)/4)]
positive_docs_test = dev_f * 2

# Documents are ordered label-0 first, then label-1; the label lists mirror that.
docs   = negative_docs + positive_docs
docs_t = negative_docs_test + positive_docs_test
labels_train = [0] * len(negative_docs) + [1] * len(positive_docs)
labels_test = [0] * len(negative_docs_test) + [1] * len(positive_docs_test)

print('Training samples: %i' % len(docs))
print('Testing samples: %i' % len(docs_t))


## Tokenizer

In [None]:
# Fit the vocabulary on the training documents only and turn every document
# into a fixed-length sequence of word indices.
tokenizer = text.Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(docs)
sequences = tokenizer.texts_to_sequences(docs)
word_index = tokenizer.word_index
result = [len(x.split()) for x in docs]
train   = sequence.pad_sequences(sequences, maxlen=MAX_SEQ_LENGTH, padding='post')
print('Text informations Training:')
print('max length: %i / min length: %i / mean length: %i / limit length: %i' % (np.max(result),
                                                                                np.min(result),
                                                                                np.mean(result),
                                                                                MAX_SEQ_LENGTH))
print('vocabulary size: %i / limit: %i' % (len(word_index), MAX_NUM_WORDS))


# tokenizer_test is fitted on the evaluation documents only to report their
# vocabulary size below; it is NOT used to encode them.
tokenizer_test = text.Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer_test.fit_on_texts(docs_t)
word_index_test = tokenizer_test.word_index
# Encode the evaluation documents with the *training* tokenizer. A tokenizer
# fitted separately on docs_t assigns its own word -> index mapping, so the
# same word would get a different id than the one the model (and the
# pretrained embedding matrix, built from `tokenizer.word_index`) was trained on.
sequences_test = tokenizer.texts_to_sequences(docs_t)
result_test = [len(x.split()) for x in docs_t]
test   = sequence.pad_sequences(sequences_test, maxlen=MAX_SEQ_LENGTH, padding='post')
print('Text informations Test:')
print('max length: %i / min length: %i / mean length: %i / limit length: %i' % (np.max(result_test),
                                                                                np.min(result_test),
                                                                                np.mean(result_test),
                                                                                MAX_SEQ_LENGTH))
print('vocabulary size: %i / limit: %i' % (len(word_index_test), MAX_NUM_WORDS))

# CNN

### Parameters

In [None]:
# Pretrained embedding selection. With both flags False the training cell
# loads the custom Word2Vec vectors.
USE_GLOVE = USE_GOOGLE = False

# MODEL
FILTER_SIZES = [3, 4, 5]
FEATURE_MAPS = [10] * 3
DROPOUT_RATE = 0.4

# LEARNING
BATCH_SIZE, NB_EPOCHS, RUNS = 100, 10, 1
VAL_SIZE = 0.25

### Training

In [None]:
import cnn_model

# One Keras history dict per run, in run order.
histories = []

# The padded training sequences (`train`) and their labels (`labels_train`)
# come from the data-management / tokenizer cells; labels are converted to an
# array so they can be split and fed to Keras alongside `train`.
train_labels = np.asarray(labels_train)

# Select which pretrained vectors to load; the custom Word2Vec file is the
# default when neither flag is set.
if USE_GLOVE:
    embedding_preset = 'glove'
elif USE_GOOGLE:
    embedding_preset = 'google'
else:
    embedding_preset = 'custom'

for i in range(RUNS):
    print('Running iteration %i/%i' % (i+1, RUNS))

    # random_state is fixed, so every run trains and validates on the same split.
    X_train, X_val, y_train, y_val = train_test_split(train, train_labels, test_size=VAL_SIZE, random_state=42)

    # A fresh embedding layer is built for each run's model.
    emb_layer = create_pretrained_embeddings(preset=embedding_preset)

    model = cnn_model.build_cnn(
        embedding_layer=emb_layer,
        num_words=MAX_NUM_WORDS,
        embedding_dim=EMBEDDING_DIM,
        filter_sizes=FILTER_SIZES,
        feature_maps=FEATURE_MAPS,
        max_seq_length=MAX_SEQ_LENGTH,
        dropout_rate=DROPOUT_RATE
    )

    model.compile(
        loss='binary_crossentropy',
        optimizer=Adadelta(clipvalue=3),
        metrics=['accuracy', Recall()]
    )

    # The checkpoint keeps only the weights of the epoch with the lowest
    # validation loss, saved as model-<run>.h5.
    # NOTE(review): ReduceLROnPlateau's min_lr=0.01 only has an effect if the
    # optimizer starts above 0.01 - confirm against the Adadelta default
    # learning rate of the installed Keras version.
    history = model.fit(
        X_train, y_train,
        epochs=NB_EPOCHS,
        batch_size=BATCH_SIZE,
        verbose=1,
        validation_data=(X_val, y_val),
        callbacks=[ModelCheckpoint('model-%i.h5'%(i+1), monitor='val_loss',
                                   verbose=1, save_best_only=True, mode='min'),
                   ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=4, min_lr=0.01)
                  ]
    )
    print()
    histories.append(history.history)

In [None]:
# Persist the per-run training histories to disk as a pickle file.
with open('history.pkl', 'wb') as history_file:
    pickle.dump(histories, history_file)

### CNN Evaluation

In [None]:
# Reload the training histories written by this notebook. The context manager
# closes the file handle, which a bare open() inside pickle.load() never does.
# NOTE: pickle.load executes arbitrary code from the file - only load a
# history.pkl that this notebook produced.
with open('history.pkl', 'rb') as f:
    histories = pickle.load(f)

In [None]:
# corrected `acc` to `accuracy` & `val_acc` to `val_accuracy` in testing

def get_avg(histories, his_key):
    """Average one metric over all runs, taken at each run's best epoch.

    Args:
        histories: iterable of history dicts mapping a metric name to its
            per-epoch values; each dict must contain ``'val_loss'``.
        his_key: name of the metric to average.

    Returns:
        Mean of ``history[his_key]`` at the epoch where that run's
        ``'val_loss'`` is lowest.
    """
    best_epoch_values = [
        run[his_key][np.argmin(run['val_loss'])] for run in histories
    ]
    return np.mean(best_epoch_values)
    
# Summarise, averaged over all runs, the metrics at each run's lowest-val_loss epoch.
avg_train_loss = get_avg(histories, 'loss')
avg_train_acc = get_avg(histories, 'accuracy')
print('Training: \t%0.4f loss / %0.4f acc' % (avg_train_loss, avg_train_acc))

avg_val_loss = get_avg(histories, 'val_loss')
avg_val_acc = get_avg(histories, 'val_accuracy')
print('Validation: \t%0.4f loss / %0.4f acc' % (avg_val_loss, avg_val_acc))

In [None]:
# Plot accuracy and loss curves, first for the training metrics and then for
# the validation metrics.
# NOTE(review): plot_acc_loss is not defined or imported in the visible
# notebook - presumably it is meant to come from `funcs`; confirm.
for split_name, acc_key, loss_key in (
    ('training', 'accuracy', 'loss'),
    ('validation', 'val_accuracy', 'val_loss'),
):
    plot_acc_loss(split_name, histories, acc_key, loss_key)

##### Final test (IMDB / Yelp)

In [None]:
# Evaluate each run's best checkpoint on the held-out documents.
test_loss = []
test_accs = []

# The padded evaluation sequences (`test`) and their labels (`labels_test`)
# come from the data-management / tokenizer cells; labels are converted to an
# array for Keras.
eval_labels = np.asarray(labels_test)

for i in range(RUNS):
    cnn_ = load_model("model-%i.h5" % (i+1))

    # evaluate() returns the loss followed by the compiled metrics, so
    # score[0] is the loss and score[1] the first metric ('accuracy').
    score = cnn_.evaluate(test, eval_labels, verbose=1)
    test_loss.append(score[0])
    test_accs.append(score[1])

    print('Running test with model %i: %0.4f loss / %0.4f acc' % (i+1, score[0], score[1]))

print('\nAverage loss / accuracy on testset: %0.4f loss / %0.4f acc' % (np.mean(test_loss),
                                                                        np.mean(test_accs)))
print('Standard deviation: (+-%0.4f) loss / (+-%0.4f) acc' % (np.std(test_loss), np.std(test_accs)))