# Formalize and Save Data

Read the essays and the depression and anxiety labels from separate files and combine them into a single dictionary containing everything needed for further exploration.

To run this Jupyter notebook, first download the Docker image with the following command:
```bash
sudo docker pull pupster90/cse255-18
```

In [None]:
!pip install gensim scikit-plot nltk keras tqdm
#############################################
%matplotlib inline
import pandas as pd
import numpy as np
import os
import pickle
import scikitplot
from functools import reduce

from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import scale
from sklearn.metrics.pairwise import cosine_similarity

import gensim
import re, collections
import string
import scikitplot.plotters as skplt

import nltk
from nltk.corpus import stopwords

from xgboost import XGBClassifier
import xgboost as xgb

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM
from keras.utils.np_utils import to_categorical
from keras.callbacks import ModelCheckpoint
from keras.models import load_model
from keras.optimizers import Adam
from tqdm import tqdm

In [None]:
# Fetch the NLTK resources used below: the stop-word lists read by
# stopwords.words() and the "punkt" models used by the NLTK tokenizers.
nltk.download('stopwords')
nltk.download("punkt")

In [None]:
def words(text):
    """Return every maximal run of the letters a-z in ``text``, lower-cased."""
    lowered = text.lower()
    return [match.group(0) for match in re.finditer('[a-z]+', lowered)]
def train(features):
    """Count the items of ``features`` with add-one smoothing.

    Returns a ``defaultdict`` whose value for an item is its number of
    occurrences plus one; looking up an unseen item with ``[]`` yields 1.
    """
    occurrences = collections.Counter(features)
    model = collections.defaultdict(lambda: 1)
    model.update((token, seen + 1) for token, seen in occurrences.items())
    return model

# Word-frequency model built from the reference corpus 'big.txt', which must be
# present in the working directory. The corpus is read inside a `with` block so
# the file handle is closed as soon as the text has been loaded.
with open('big.txt') as corpus_file:
    NWORDS = train(words(corpus_file.read()))
# Letters tried by edits1() when generating replacements and insertions.
alphabet = 'abcdefghijklmnopqrstuvwxyz'

def edits1(word):
    """Return the set of strings exactly one edit away from ``word``.

    An edit is deleting one character, swapping two adjacent characters,
    replacing one character with a letter of ``alphabet``, or inserting a
    letter of ``alphabet`` at any position.
    """
    deletions, swaps, substitutions, insertions = [], [], [], []
    for cut in range(len(word) + 1):
        head, tail = word[:cut], word[cut:]
        # An insertion is possible at every cut point, including the end.
        insertions.extend(head + letter + tail for letter in alphabet)
        if not tail:
            continue
        deletions.append(head + tail[1:])
        substitutions.extend(head + letter + tail[1:] for letter in alphabet)
        if len(tail) > 1:
            swaps.append(head + tail[1] + tail[0] + tail[2:])
    return set(deletions + swaps + substitutions + insertions)

def known_edits2(word):
    """Return the words in NWORDS that are two edits away from ``word``."""
    matches = set()
    for first_edit in edits1(word):
        for second_edit in edits1(first_edit):
            if second_edit in NWORDS:
                matches.add(second_edit)
    return matches

def known(words):
    """Return the subset of ``words`` that appears in NWORDS, as a set."""
    return {candidate for candidate in words if candidate in NWORDS}

def correct(word):
    """Return the most frequent known word closest to ``word``.

    Candidates are tried in order of edit distance: the word itself, known
    words one edit away, then known words two edits away. If none is known,
    ``word`` is returned unchanged.
    """
    candidates = known([word])
    if not candidates:
        candidates = known(edits1(word))
    if not candidates:
        candidates = known_edits2(word)
    if not candidates:
        candidates = [word]
    return max(candidates, key=NWORDS.get)

In [None]:
def modifyText(text):
    """Clean an essay, spell-correct each word and drop English stop words.

    Bracketed letter-only annotations and CRLF sequences are removed first,
    then all punctuation. The text is split on single spaces; each piece is
    lower-cased and passed through correct(). Corrected words that are stop
    words are discarded.

    Returns:
        A tuple ``(modified_text, n_corrected, n_words)``: the kept words,
        each followed by a space; how many kept words were changed by the
        corrector; and how many words were kept.
    """
    stop_words = set(stopwords.words('english'))
    text = re.sub(r'\[[a-zA-Z ]+\]|\r\n', '', text)
    punctuation = re.compile('[%s]' % re.escape(string.punctuation))
    text = punctuation.sub('', text)

    kept = []
    n_corrected = 0
    for token in text.split(' '):
        lowered = token.lower()
        fixed = correct(lowered)
        if fixed in stop_words:
            continue
        kept.append(fixed + ' ')
        if fixed != lowered:
            n_corrected += 1
    return ''.join(kept), n_corrected, len(kept)

In [None]:
def processFile(filename):
    """Read one essay file and store its text in the global ``data`` dict.

    The file is read as bytes and decoded line by line, silently dropping
    undecodable bytes. The first two lines are discarded and the remaining
    lines are concatenated into the essay text (an empty string when the file
    has two lines or fewer). The entry is stored as ``data[key] = {"essay":
    text}``, where ``key`` is the part of ``filename`` after the last "-" and
    before the next ".", upper-cased.

    Args:
        filename: Path of the essay file to read.
    """
    global data
    # Exactly one handle is opened, and the `with` block closes it.
    with open(filename, 'rb') as essay_file:
        lines = [raw.decode(errors='ignore') for raw in essay_file]
    # str.join copes with an empty remainder, unlike reduce() without a seed.
    text = ''.join(lines[2:])
    key = filename.split("-")[-1].split(".")[0].upper()
    data[key] = {"essay": text}

In [None]:
def getData(data_filename, preprocess=True):
    """Load the essay/label dictionary from a pickle, or build it from raw files.

    If ``data_filename`` exists it is unpickled and returned as is. Otherwise
    every ``.rtf`` file in ``./essays/rtf/`` is read with processFile(), labels
    are merged in from ``./277labels/labels.csv``, digit-led tokens and the
    text "words " are stripped from each essay, the essays are optionally
    spell-corrected with modifyText(), and the result is pickled to
    ``data_filename``.

    Only entries that have an essay and both labels are kept; label rows whose
    depression or anxiety is "$null$" or -1 are dropped.

    Args:
        data_filename: Path of the pickle cache to read or create.
        preprocess: When True, run modifyText() on every kept essay and append
            its correction and word counts to the global ``counters`` and
            ``words_counts`` lists.

    Returns:
        ``(data, counters, words_counts)``, all of which are module globals.
    """
    global data
    global counters
    global words_counts
    if os.path.exists(data_filename):
        # NOTE: unpickling can execute arbitrary code; only load cache files
        # that were produced by this notebook.
        with open(data_filename, 'rb') as fp:
            data = pickle.load(fp)
        return data, counters, words_counts

    directory = "./essays/rtf/"
    essay_count = 0
    for filename in os.listdir(directory):
        if filename.endswith(".rtf"):
            processFile(directory + filename)
            essay_count += 1
    print("There are {0} essays.".format(essay_count))

    complete_entries_list = []
    # The `with` block guarantees the labels file is closed after parsing.
    with open("./277labels/labels.csv") as labels_file:
        for line in labels_file:
            entries = line.split(",")
            if len(entries) < 3:
                # Blank or truncated line: nothing to parse.
                continue
            key = entries[0]
            # Strip so that a trailing newline on the last column does not
            # hide a "$null$" marker.
            depression = entries[1].strip()
            anxiety = entries[2].strip()
            if key == "ncdsid" or depression == "$null$" or anxiety == "$null$":
                continue
            depression = float(depression)
            anxiety = float(anxiety)
            if key not in data or depression == -1.0 or anxiety == -1.0:
                # Unusable label: discard the essay too, if there is one.
                data.pop(key, None)
                continue
            data[key]["anxiety"] = anxiety
            data[key]["depression"] = depression
            complete_entries_list.append(key)
    print("There are {0} entries that have complete info.".format(len(complete_entries_list)))

    selected = {}
    for key in complete_entries_list:
        selected[key] = data[key]
        selected[key]["essay"] = re.sub(r'[0-9]+[a-zA-Zx*]*\s|words ', '', selected[key]["essay"])
    if preprocess:
        for key in tqdm(complete_entries_list):
            selected[key]['essay'], corrections, words_count = modifyText(selected[key]['essay'])
            counters.append(corrections)
            words_counts.append(words_count)
    with open(data_filename, 'wb') as fp:
        pickle.dump(selected, fp, protocol=2)
    data = selected
    return data, counters, words_counts

__Load processed data from disk or process raw data here__

One option is to load vector form of data.

Here, `getData(filename)` first checks whether the file exists. If it does, the pickled data is loaded from it; if not, the raw data is collected, preprocessed, and saved to that file. Spelling correction is controlled by `preprocess` (`True` by default; pass `preprocess=False` to skip it).

clarifications about files:
* data_embedding_google.p: processed essay Word2Vec embeddings (vectors)
* data_mod_contain_corrections.p: processed essay Word2Vec embeddings with correction ratio and total number of words (vectors)
* data_modified.p: processed essay (text)

Another choice is to use text files. 
Here we use "data_modified.p" to set `data`.
The data are stored in a dictionary named `data`. Each key is an essay's ID (the `ncdsid` column of the labels file), and each value is a dictionary holding that entry's __essay__, __depression__, and __anxiety__.

In [None]:
# Start from empty module-level containers: getData() reads and appends to
# these globals.
data = {}
counters, words_counts = [], []
data, counters, words_counts = getData('data_modified.p')
# One row per essay key; rows with any missing field are dropped.
df = pd.DataFrame.from_dict(data, orient='index').dropna()
depression = df['depression']

In [None]:
# Labels are binarised: 1 when the depression score exceeds `threshold`, else 0.
threshold = 0
# Split the essay column of `df` rather than the raw `data` dict: a dict cannot
# be indexed by position, and after dropna() it may hold more entries than
# `depression`. The Word2Vec and Keras cells also expect X_train/X_test to be
# Series of essay text (they call `.values` on them).
X_train, X_test, y_train, y_test = train_test_split(
    df['essay'], depression, test_size=0.2, random_state=49)
y_test = [1 if score > threshold else 0 for score in y_test.values.tolist()]
y_train = [1 if score > threshold else 0 for score in y_train.values.tolist()]

In [None]:
# Reset the module-level containers, then load the variant of the data stored
# in 'data_mod_contain_corrections.p'.
data = {}
counters, words_counts = [], []
data, counters, words_counts = getData('data_mod_contain_corrections.p')

# Natural Language Processing

## Bag of Words

In [None]:
# Bag-of-words vectorizer: tokens come from nltk.word_tokenize, English stop
# words are removed, and the vocabulary size is not capped (max_features=None).
count_vectorizer = CountVectorizer(
    analyzer="word", tokenizer=nltk.word_tokenize,
    preprocessor=None, stop_words='english', max_features=None)    

In [None]:
# Learn the vocabulary from the essays and build the document-term matrix.
bag_of_words = count_vectorizer.fit_transform(df['essay'])
# vocabulary_ maps each term to its column, so its length is the number of
# distinct terms. Unlike get_feature_names(), it is available in every
# scikit-learn release.
print("There are {0} unique words in corpus.".format(len(count_vectorizer.vocabulary_)))

In [None]:
# Reduce the bag-of-words matrix to 25 components with truncated SVD
# (random_state is fixed so the result is repeatable).
svd = TruncatedSVD(n_components=25, n_iter=25, random_state=12)
truncated_bag_of_words = svd.fit_transform(bag_of_words)

## Word2Vec

### Word2Vec implementation

In [None]:
class MySentences(object):
    """Re-iterable stream of tokenized sentences over several document arrays.

    Every document is split into sentences with nltk.sent_tokenize and every
    sentence into tokens with nltk.word_tokenize; iterating the object yields
    one token list per sentence.

    Args:
        arrays: Any number of iterables, each yielding document strings.
    """
    def __init__(self, *arrays):
        self.arrays = arrays

    def __iter__(self):
        for documents in self.arrays:
            for text in documents:
                sentences = nltk.sent_tokenize(text)
                for tokens in map(nltk.word_tokenize, sentences):
                    yield tokens

def get_word2vec(sentences, location):
    """Return a Word2Vec model, loading it from disk when possible.

    If ``location`` exists the model saved there is loaded. Otherwise a new
    model is trained on ``sentences`` (500 dimensions, window 5, min_count 5,
    4 workers) and saved to ``location``.

    Args:
        sentences: Iterable of tokenized sentences.
        location (str): Path used to load or save the model.
    """
    if not os.path.exists(location):
        print('{} not found. training model'.format(location))
        trained = gensim.models.Word2Vec(sentences, size=500, window=5, min_count=5, workers=4)
        print('Model done training. Saving to disk')
        trained.save(location)
        return trained

    print('Found {}'.format(location))
    return gensim.models.Word2Vec.load(location)

Train Word2Vec model and save to local disk.

filename clarifications:

* w2vmodel_modified: trained word2vec model using processed data (containing numbers), embedding size 100
* w2vmodel_modified_1000: trained word2vec model using processed data (containing numbers), embedding size 1000
* w2vmodel_modified_delete_numbers: trained word2vec model using processed data (without numbers), embedding size 100
* w2vmodel_modified_delete_numbers_500: trained word2vec model using processed data (without numbers), embedding size 500

In [None]:
# Load the Word2Vec model stored at 'w2vmodel_modified_delete_numbers_500', or
# train it on the training and test essays and save it there if it is missing.
# NOTE(review): relies on X_train/X_test exposing `.values` (e.g. pandas Series
# of essay text) — confirm the split cell produced that.
w2vec = get_word2vec(MySentences(X_train.values, X_test.values),
                     'w2vmodel_modified_delete_numbers_500')

Another option is to load Google's pre-trained Word2Vec model.
Before running the following block, download the Google model https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit

In [None]:
# Load Google's pre-trained vectors from the gzipped binary word2vec file.
# This rebinds `w2vec`, replacing any model assigned by an earlier cell.
w2vec = gensim.models.KeyedVectors.load_word2vec_format('./model/GoogleNews-vectors-negative300.bin.gz', binary=True)

With a word2vec model in hand, we use it to turn each document into a feature vector. To convert a multi-word document into a single vector, we look up the word2vec vector of every word in the document (words missing from the model are skipped) and take the mean of those vectors.

In [None]:
class MyTokenizer:
    """Stateless transformer that turns documents into arrays of tokens.

    Each document is split into sentences with nltk.sent_tokenize and each
    sentence into tokens with nltk.word_tokenize; the tokens of all sentences
    of a document are concatenated in order.
    """
    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, X):
        """Tokenize every document in ``X``.

        Args:
            X: Iterable of document strings.

        Returns:
            A 1-D object array with one NumPy array of tokens per document.
        """
        tokenized_docs = []
        for document in X:
            tokens = []
            for sent in nltk.sent_tokenize(document):
                tokens += nltk.word_tokenize(sent)
            tokenized_docs.append(np.array(tokens))
        # Documents differ in length, so np.array(tokenized_docs) would have to
        # build a ragged array, which recent NumPy releases reject. Filling an
        # object array element by element works on every version.
        result = np.empty(len(tokenized_docs), dtype=object)
        for position, tokens in enumerate(tokenized_docs):
            result[position] = tokens
        return result

    def fit_transform(self, X, y=None):
        """Same as transform(); ``y`` is ignored."""
        return self.transform(X)

class MeanEmbeddingVectorizer(object):
    """Represent each document by the mean of its word vectors.

    Args:
        word2vec: Model exposing its word vectors through ``.wv``.
    """
    def __init__(self, word2vec):
        self.word2vec = word2vec
        # Vector width, needed to build the all-zero vector that stands in
        # for documents without any known word.
        self.dim = len(word2vec.wv.syn0[0])

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def _mean_vector(self, tokens):
        """Average the vectors of the tokens known to the model."""
        vectors = [self.word2vec.wv[token] for token in tokens
                   if token in self.word2vec.wv]
        if not vectors:
            vectors = [np.zeros(self.dim)]
        return np.mean(vectors, axis=0)

    def transform(self, X):
        """Tokenize the documents in ``X`` and return one mean vector per document."""
        tokenized = MyTokenizer().fit_transform(X)
        return np.array([self._mean_vector(tokens) for tokens in tokenized])

    def fit_transform(self, X, y=None):
        """Same as transform(); ``y`` is ignored."""
        return self.transform(X)

Transform training data to `mean_embedded`.

In [None]:
# Average the word vectors of every document in X_train into one vector each.
mean_embedding_vectorizer = MeanEmbeddingVectorizer(w2vec)
mean_embedded = mean_embedding_vectorizer.fit_transform(X_train)

In [None]:
# Apply the same mean-embedding transformation to X_test.
test_embedded = mean_embedding_vectorizer.fit_transform(X_test)

Calculate the cosine similarity

To run the code below, make sure `data` holds the essay embeddings you want to compare. Uncomment the first line to use the vectors generated by Google's word2vec model.

In [None]:
# data, _, _ = getData('data_mod_contain_corrections.p')
origin = data
y_train = df['depression'].values.tolist()
cos = cosine_similarity(origin)
# counter[a][b] counts near-duplicate pairs whose labels are a (the larger
# one) and b (the smaller one).
counter = np.zeros((20, 20), dtype=int)
index_to_delete = set()
for i in range(len(cos)):
    # Start at i + 1: each unordered pair is visited once and never (i, i).
    for j in range(i + 1, len(cos[0])):
        if cos[i][j] > 0.95 and y_train[i] != y_train[j]:
            higher = max(y_train[i], y_train[j])
            lower = min(y_train[i], y_train[j])
            counter[int(higher)][int(lower)] += 1
            index_to_delete.update((i, j))

In [None]:
# Print the first ten columns of every row of `counter`, each value
# right-aligned in five characters and separated by tabs.
for row in counter:
    print("\t".join("{0:5d}".format(value) for value in row[:10]))

In [None]:
# pyplot is imported here because this cell runs before the notebook's own
# `from matplotlib import pyplot as plt` cell.
from matplotlib import pyplot as plt

# Heat map of the top-left 12x12 corner of the pair-count matrix.
fig = plt.figure()
ax = fig.add_subplot(111)
ax.set_title('# of pairs')
plt.imshow(counter[:12,:12], cmap="GnBu")
ax.set_aspect('equal')

# Invisible, semi-transparent overlay axes that host the colour bar.
cax = fig.add_axes([0.12, 0.1, 0.78, 0.8])
cax.get_xaxis().set_visible(False)
cax.get_yaxis().set_visible(False)
cax.patch.set_alpha(0.3)
cax.set_frame_on(False)
plt.colorbar(orientation='vertical')
plt.show()

change selected index's labels to -1

In [None]:
# Keep every row, but relabel the flagged near-duplicates as -1.
ori_label = df['depression'].values.tolist()
filtered_label = [-1 if position in index_to_delete else label
                  for position, label in enumerate(ori_label)]

delete selected indexes

In [None]:
# Drop the rows of `data` whose position was flagged in index_to_delete.
filtered_data = np.array([data[position] for position in range(len(data))
                          if position not in index_to_delete])
filtered_data.shape

In [None]:
# Drop the labels at the flagged positions so they line up with filtered_data.
ori_label = df['depression'].values.tolist()
filtered_label = [label for position, label in enumerate(ori_label)
                  if position not in index_to_delete]

In [None]:
# Labels above `threshold` become 1; every other label (0 or -1) is kept as is.
threshold = 0
# NOTE(review): this splits `data`, not `filtered_data`. The lengths only match
# `filtered_label` when it came from the "change labels to -1" cell, which keeps
# one label per row of `data` — confirm which variant is intended.
X_train, X_test, y_train, y_test = train_test_split(data, filtered_label, test_size = 0.2, random_state=49)
y_test = [1 if x > threshold else x for x in y_test]
y_train = [1 if x > threshold else x for x in y_train]

### Training models

#### Random Forest

In [None]:
# Random forest: 1000 trees, depth capped at 15, balanced class weights,
# progress messages enabled (verbose=1).
clf = RandomForestClassifier(n_estimators=1000, max_depth=15, verbose=1, class_weight='balanced')

In [None]:
# Fit the forest on the training split.
clf = clf.fit(X_train, y_train)

In [None]:
# Predict a label for every row of the test split.
pred = clf.predict(X_test)

In [None]:
# Per-sample flags: True where the predicted label equals the true label.
a = [ x==y for (x,y) in zip(pred.tolist(), y_test)]

In [None]:
# Test accuracy: the fraction of True flags in `a`.
sum(a)/len(a)

In [None]:
# Score on the training split, for comparison with the test accuracy.
clf.score(X_train, y_train)

In [None]:
# predict_proba returns one column per entry of clf.classes_, in that order, so
# the arg-max column must be mapped back through clf.classes_. A hard-coded
# [-1, 0, 1] only matches when all three labels occur in the training data.
probas = clf.predict_proba(X_test)
pred_indices = np.argmax(probas, axis=1)
classes = clf.classes_
preds = classes[pred_indices]
skplt.plot_confusion_matrix(y_test, preds)

In [None]:
# Precision-recall curves from the true test labels and the probabilities in `probas`.
scikitplot.metrics.plot_precision_recall(y_test, probas)

In [None]:
# ROC curves from the true test labels and the probabilities in `probas`.
scikitplot.metrics.plot_roc(y_test, probas)

#### XGBoost

In [None]:
def xgboost_plst():
    """Return the XGBoost training parameters as a view of (name, value) pairs."""
    params = {
        'max_depth': 2,                   # depth of each tree
        'eta': 0.3,                       # shrinkage parameter
        'silent': 1,                      # 1 = suppress running messages
        'objective': 'binary:logistic',
        'nthread': 7,                     # number of threads used
        'eval_metric': 'logloss',
    }
    return params.items()

In [None]:
# Reuse the current split under the names used by the embedding cells.
mean_embedded = X_train
test_embedded = X_test
# Wrap features and labels in XGBoost's DMatrix containers.
dtrain = xgb.DMatrix(mean_embedded, label=np.array(y_train))
dtest = xgb.DMatrix(test_embedded, label=np.array(y_test))
# Train for 400 boosting rounds with the parameters from xgboost_plst().
booster = xgb.train(xgboost_plst(), dtrain, 400)
# output_margin=False asks for transformed predictions rather than raw margins.
pred = booster.predict(dtest, output_margin=False, ntree_limit=booster.best_ntree_limit)

In [None]:
# Plot the feature importances of the trained booster.
xgb.plot_importance(booster)

In [None]:
# True where prediction and label lie on the same side of 0.5.
a = [(x - 0.5)*(y-0.5) > 0 for (x, y) in zip(pred.tolist(), y_test)]

In [None]:
# Fraction of test samples whose flag in `a` is True.
sum(a)/len(a)

In [None]:
# `pred` was requested with output_margin=False under 'binary:logistic', so it
# holds probabilities in [0, 1]. Threshold at 0.5, the same cut-off the accuracy
# flags use; the previous `x < 0` test was never true and labelled every sample 0.
pred_indices = [1 if x > 0.5 else 0 for x in pred]
skplt.plot_confusion_matrix(y_test, pred_indices)

#### Adaboost

In [None]:
# Reuse the current split under the names used by the embedding cells.
mean_embedded = X_train
test_embedded = X_test
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
# AdaBoost over depth-2 decision trees with balanced class weights.
clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2, class_weight='balanced'), algorithm="SAMME.R", n_estimators=1000)
clf = clf.fit(mean_embedded, y_train)
pred = clf.predict(test_embedded)
a = [x == y for (x, y) in zip(pred.tolist(), y_test)]
print(sum(a)/len(a))
probas = clf.predict_proba(test_embedded).tolist()
pred_indices = np.argmax(probas, axis=1)
# The columns of predict_proba follow clf.classes_, so map the arg-max column
# through it instead of assuming the labels are exactly [-1, 0, 1].
classes = clf.classes_
preds = classes[pred_indices]
# Passing `labels` tells log_loss which class each probability column belongs
# to, even when a class is absent from y_test.
print('Log loss: {}'.format(log_loss(y_test, probas, labels=classes)))
print('Accuracy: {}'.format(accuracy_score(y_test, preds)))
skplt.plot_confusion_matrix(y_test, preds)

In [None]:
# Score on the training split, for comparison with the test accuracy.
clf.score(mean_embedded, y_train)

In [None]:
# Normalised confusion matrix. y_test already holds the class labels, so it is
# passed directly; indexing `classes` with label values would map -1 to the
# last class and fail outright for non-integer labels.
skplt.plot_confusion_matrix(y_test, preds, normalize=True)

In [None]:
# Precision-recall and ROC curves from the true test labels and `probas`.
scikitplot.metrics.plot_precision_recall(y_test, probas)
scikitplot.metrics.plot_roc(y_test, probas)

#### Neural Network

In [None]:
# Show the contents of nn_config.yaml (IPython shell escape).
!cat nn_config.yaml 

In [None]:
# Silence all Python warnings from here on.
import warnings
warnings.filterwarnings('ignore')

In [None]:
import tensorflow as tf
from matplotlib import pyplot as plt

## Pretty Print
import pprint as pp

In [None]:
import yaml
def import_config():
    """Load the network configuration from ``nn_config.yaml``.

    Returns:
        The parsed YAML document.

    Raises:
        yaml.YAMLError: If the file is not valid YAML. The error is printed
            and then re-raised, instead of falling through to an unrelated
            UnboundLocalError on the return statement.
    """
    with open("nn_config.yaml", 'r') as ymlfile:
        try:
            # safe_load builds plain Python objects only and needs no Loader
            # argument, which newer PyYAML releases require for yaml.load().
            return yaml.safe_load(ymlfile)
        except yaml.YAMLError as err:
            print(err)
            raise

In [None]:
# Parse nn_config.yaml once; the cells below read their settings from `cfg`.
cfg = import_config()

## Is it loaded correctly?
pp.pprint(cfg)

In [None]:
def standardize(data):
    """Scale ``data`` to zero mean and unit standard deviation along axis 0.

    Columns with zero standard deviation (constant columns) are only centred,
    which yields zeros instead of the NaNs produced by dividing by zero.

    Args:
        data: Array of samples; statistics are computed along the first axis.

    Returns:
        Array of the same shape as ``data``.
    """
    mu = np.mean(data, axis=0)
    sigma = np.std(data, axis=0)
    # Replace zero deviations by 1 so that constant columns divide cleanly.
    safe_sigma = np.where(sigma == 0, 1, sigma)
    return (data - mu) / safe_sigma

In [None]:
# Feature matrix whose row count sets the train/validation sizes below.
training_data = mean_embedded

In [None]:
# Settings read from nn_config.yaml; the configured epoch count and display
# step are both multiplied by 10.
learning_rate = cfg['learning_rate']
training_epochs = cfg['epochs'] * 10
train_valid_split = cfg['training_to_validation_ratio']
num_batches = cfg['num_mini_batches']
display_step = cfg['display_step'] * 10

num_examples= training_data.shape[0]

# The first `num_train_examples` should be used for training, the rest for validation.
num_train_examples = int(num_examples * train_valid_split)

# True division: batch_size may be fractional. The training loop truncates the
# slice bounds derived from it with int().
batch_size = num_train_examples/num_batches


# Network Parameters
n_hidden_1 = cfg['hidden_layer_sizes']['first_layer']*3 # 1st layer number of features
n_hidden_2 = cfg['hidden_layer_sizes']['second_layer']*3 # 2nd layer number of features
n_input = 302 # change input size here
n_classes = 3 # change classes numbers here


print("Total Training examples: %d, Number of Batches: %d, Batch Size: %d" %(num_train_examples,num_batches,batch_size))

In [None]:
# TF Graph input
## Use the below placeholders appropriately inside the train_nn() function

# Feature rows of width n_input, and one-hot label rows of width n_classes.
# The label width follows n_classes so it stays in step with the output layer.
x = tf.placeholder("float", [None, n_input])
y = tf.placeholder("float", [None, n_classes])
n_hidden_3 = 5

# Store layers weight & bias
weights = {
    'h1': tf.Variable(tf.random_normal([n_input, n_hidden_1])),
    'h2': tf.Variable(tf.random_normal([n_hidden_1, n_hidden_2])),
    'h3': tf.Variable(tf.random_normal([n_hidden_2, n_hidden_3])),
    'out': tf.Variable(tf.random_normal([n_hidden_3, n_classes]))
}

biases = {
    'b1': tf.Variable(tf.random_normal([n_hidden_1])),
    'b2': tf.Variable(tf.random_normal([n_hidden_2])),
    'b3': tf.Variable(tf.random_normal([n_hidden_3])),
    'out': tf.Variable(tf.random_normal([n_classes]))
}

In [None]:
def create_feedforward_nn_model(x, weights, biases, keep_prob=0.5):
    """Build a feed-forward network with three sigmoid hidden layers.

    Dropout is applied after the first and second hidden layers only. The
    output layer is linear, so the function returns logits.

    Args:
        x: Input tensor.
        weights: Dict with the weight variables 'h1', 'h2', 'h3' and 'out'.
        biases: Dict with the bias variables 'b1', 'b2', 'b3' and 'out'.
        keep_prob: Keep probability passed to tf.nn.dropout. Defaults to 0.5,
            the value that used to be hard-coded; pass 1.0 (or a placeholder)
            to disable dropout, e.g. when predicting.

    Returns:
        The logits tensor of the output layer.
    """
    layer_plan = (
        ('h1', 'b1', True),
        ('h2', 'b2', True),
        ('h3', 'b3', False),   # no dropout after the last hidden layer
    )
    hidden = x
    for weight_key, bias_key, use_dropout in layer_plan:
        hidden = tf.add(tf.matmul(hidden, weights[weight_key]), biases[bias_key])
        hidden = tf.nn.sigmoid(hidden)
        if use_dropout:
            hidden = tf.nn.dropout(hidden, keep_prob)
    return tf.add(tf.matmul(hidden, weights['out']), biases['out'])

In [None]:
# Construct model
def stitch_network(x, y, weights, biases, learning_rate):
    """Attach predictions, loss and an optimiser step to the network.

    Returns:
        ``(pred_raw, pred, loss, train_op, probas)``: the logits, the
        element-wise sigmoid outputs rounded to 0/1, the mean sigmoid
        cross-entropy against ``y``, an Adam minimisation op for that loss,
        and the element-wise sigmoid outputs.
    """
    logits = create_feedforward_nn_model(x, weights, biases)
    rounded = tf.round(tf.nn.sigmoid(logits))
    sigmoid_out = tf.nn.sigmoid(logits)
    cross_entropy = tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=y)
    mean_loss = tf.reduce_mean(cross_entropy)
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
    step = optimizer.minimize(mean_loss)
    return logits, rounded, mean_loss, step, sigmoid_out


pred_raw, pred, loss, train_op, probas = stitch_network(x, y, weights, biases, learning_rate)

In [None]:
# Op that initialises all graph variables; train_nn() runs it when its session starts.
init = tf.global_variables_initializer()

In [None]:
def _one_hot_labels(values):
    """Encode labels as rows: -1 -> [0, 0, 1], 0 -> [0, 1, 0], anything else -> [1, 0, 0]."""
    flat = np.asarray(values).ravel()
    return np.array([[0, 0, 1] if v == -1 else ([0, 1, 0] if v == 0 else [1, 0, 0])
                     for v in flat])


def train_nn():
    """Train the network on the global ``features``/``labels`` and predict ``test_features``.

    The first ``num_train_examples`` rows are used for training in
    ``num_batches`` mini-batches per epoch; the remaining rows form the
    validation set, which is only evaluated and never trained on.

    Reads the globals ``features``, ``labels``, ``test_features``, ``init``,
    ``x``, ``y``, ``loss``, ``pred``, ``probas``, ``train_op``,
    ``training_epochs``, ``num_batches``, ``batch_size`` and
    ``num_train_examples``.

    Returns:
        ``(training_losses, validation_losses, training_accs, validation_accs,
        test_predictions, proba)``. The two loss lists hold one value per
        epoch; the two accuracy lists are returned empty because accuracy is
        not computed; ``test_predictions`` is the first column of the rounded
        test output; ``proba`` holds the sigmoid outputs for the test set.
    """
    with tf.Session() as sess:

        sess.run(init)

        ## this is needed to print debug statements during training.
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(coord=coord)

        x_train, x_valid = features[:num_train_examples], features[num_train_examples:]
        y_train, y_valid = labels[:num_train_examples], labels[num_train_examples:]
        y_valid = _one_hot_labels(y_valid)

        training_losses = []
        training_accs = []

        validation_losses = []
        validation_accs = []

        for epoch in range(training_epochs):
            loss_counter = 0
            for num_batch in range(num_batches):
                # batch_size may be fractional, so truncate the slice bounds.
                start = int(num_batch * batch_size)
                stop = int(min(len(x_train), (num_batch + 1) * batch_size))
                cur_features = x_train[start:stop]
                cur_labels = _one_hot_labels(y_train[start:stop])
                _, cur_loss = sess.run([train_op, loss], feed_dict={x: cur_features, y: cur_labels})
                # Weight each batch loss by its size to get a per-sample mean.
                loss_counter += cur_loss * len(cur_labels)
            training_losses.append(np.float64(loss_counter / max(len(x_train), 1)))

            # Evaluate only: train_op must not run here, or the model would
            # also be fitted to the validation data.
            val_loss = sess.run(loss, feed_dict={x: x_valid, y: y_valid})
            validation_losses.append(np.float64(val_loss))

        # NOTE(review): the graph's dropout layers are still active for this
        # prediction run — confirm whether that is intended.
        test_pred, proba = sess.run([pred, probas], feed_dict={x: test_features})
        test_predictions = np.float64(test_pred.T[0])

        ## this is needed to print debug statements during training.
        coord.request_stop()
        coord.join(threads)

    # The `with` block above has already closed the session.
    return training_losses, validation_losses, training_accs, validation_accs, test_predictions, proba

In [None]:
# train_nn() reads `features`, `labels` and `test_features` as globals, so they
# are bound here before the call.
# features = mean_embedded
# labels = np.array(y_train)
features = X_train
labels = np.array(y_train)
test_features = X_test
# test_features = test_embedded
# Note: this rebinds `probas` from the graph tensor to the test-set output.
training_losses, validation_losses, training_accs, validation_accs, test_predictions, probas = train_nn()

In [None]:
# Output column order mirrors the one-hot encoding used in train_nn():
# column 0 -> label 1, column 1 -> label 0, column 2 -> label -1.
classes = np.array([1,0,-1])
test_predictions = classes[np.argmax(probas, axis = 1)]
skplt.plot_confusion_matrix(y_test, test_predictions)

In [None]:
# Per-sample flags: True where the predicted label equals the true label.
a = [x == y for (x,y) in zip(y_test, test_predictions)]

In [None]:
# Test accuracy: the fraction of True flags in `a`.
sum(a)/len(a)

In [None]:
# Peek at the first ten predicted labels.
test_predictions[:10]

In [None]:
# Rebuild `probas` as two-column rows [1 - p, p], where p is the first entry of
# each original row.
# NOTE(review): only the first output column survives this step, which looks
# like a leftover from a two-class setup — confirm before using it with three classes.
probas = [[x, 1-x] for x in probas]                 # pair each row with its complement
probas = [[x[0][0], x[1][0]] for x in probas]       # keep the first entry of each: [p, 1 - p]
probas = np.array(probas)
probas = probas.tolist()
probas = [[x[1], x[0]] for x in probas]             # swap to [1 - p, p]

In [None]:
def plot_loss_vs_epochs(training_losses, validation_losses):
    """Plot training and validation loss curves on the current axes."""
    plt.title("loss vs epochs")
    for curve in (training_losses, validation_losses):
        plt.plot(curve)
    plt.legend(["training", "validation"])

In [None]:
def plot_acc_vs_epochs(training_acc, validation_acc):
    """Plot training and validation accuracy curves on the current axes."""
    plt.title("accuracy vs epochs")
    for curve in (training_acc, validation_acc):
        plt.plot(curve)
    plt.legend(["training", "validation"])

In [None]:
def plots(training_losses, validation_losses, training_accs, validation_accs):
    """Draw the loss curves (left) and accuracy curves (right) in one 20x10 figure."""
    plt.figure(figsize=(20,10))

    panels = (
        (plot_loss_vs_epochs, training_losses, validation_losses),
        (plot_acc_vs_epochs, training_accs, validation_accs),
    )
    for position, (draw, training_curve, validation_curve) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        draw(training_curve, validation_curve)

In [None]:
# Draw the loss and accuracy curves returned by train_nn().
plots(training_losses, validation_losses, training_accs, validation_accs)

In [None]:
# 1 where the predicted label equals the true label, else 0.
a = [1 if x==y else 0 for (x,y) in zip(test_predictions.tolist(), y_test)] 

In [None]:
# Test accuracy: the mean of the 0/1 flags in `a`.
sum(a)/len(a)

In [None]:
# Confusion matrix of the true test labels against the network's predictions.
skplt.plot_confusion_matrix(y_test, test_predictions.tolist())

In [None]:
# Precision-recall curves from the true test labels and `probas`.
scikitplot.metrics.plot_precision_recall(y_test, probas)

In [None]:
# ROC curves from the true test labels and `probas`.
scikitplot.metrics.plot_roc(y_test, probas)

## LSTM

In [None]:
# Use the Keras tokenizer
# The tokenizer is limited to num_words = 2000.
# NOTE(review): X_train must expose `.values` here (e.g. a pandas Series of
# essay text) — confirm it has not been rebound to an array by an earlier cell.
num_words = 2000
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(X_train.values)
# Pad the data 
# Every sequence is brought to a fixed length of 2000 tokens.
X = tokenizer.texts_to_sequences(X_train.values)
X = pad_sequences(X, maxlen=2000)

In [None]:
# Build out our simple LSTM
embed_dim = 128
lstm_out = 196
num_words = 2000
# Model saving callback: writes the model to 'keras_model' whenever the
# monitored validation loss improves.
ckpt_callback = ModelCheckpoint('keras_model', 
                                 monitor='val_loss', 
                                 verbose=1, 
                                 save_best_only=True, 
                                 mode='auto')

# Embedding -> LSTM with input and recurrent dropout -> two-way softmax.
model = Sequential()
model.add(Embedding(num_words, embed_dim, input_length = X.shape[1]))
model.add(LSTM(lstm_out, recurrent_dropout=0.5, dropout=0.5))
model.add(Dense(2,activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam', metrics = ['categorical_crossentropy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line.
model.summary()

In [None]:
# One-hot encode the labels, then make a stratified 80/20 split of the padded
# sequences. Note that this rebinds X_train and X_test.
# Y = np.array(y_train)
Y = to_categorical(np.array(y_train))
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 42, stratify=Y)
print(X_train.shape, Y_train.shape)
print(X_test.shape, Y_test.shape)

In [None]:
# Train for 3 epochs, holding out 20% of the training rows for validation.
# NOTE: model.fit() returns a History object, so `trained_model` holds the
# training history here until the next cell rebinds it to the loaded model.
batch_size = 32
trained_model = model.fit(X_train, Y_train, epochs=3, batch_size=batch_size, validation_split=0.2, callbacks=[ckpt_callback])

In [None]:
# Reload the checkpoint that ModelCheckpoint wrote to 'keras_model'.
trained_model = load_model('keras_model')

In [None]:
# Model outputs for the test sequences.
pred = trained_model.predict(X_test)

In [None]:
# Model outputs for the training sequences.
train_res = trained_model.predict(X_train)

In [None]:
# Display the test-set outputs.
pred

In [None]:
# Training accuracy: the arg-max of the model output against the arg-max of
# the one-hot training labels.
probas = train_res
pred_indices = np.argmax(probas, axis=1)
# classes is [0, 1], so indexing it with the arg-max returns the index itself.
classes = np.array(range(2))
preds = classes[pred_indices]
accuracy_score(classes[np.argmax(Y_train, axis=1)], preds)

In [None]:
# Test-set metrics: log loss, accuracy and confusion matrix, with the true
# class taken as the arg-max of the one-hot test labels.
probas = pred
pred_indices = np.argmax(probas, axis=1)
classes = np.array(range(2))
preds = classes[pred_indices]
print('Log loss: {}'.format(log_loss(classes[np.argmax(Y_test, axis=1)], probas)))
print('Accuracy: {}'.format(accuracy_score(classes[np.argmax(Y_test, axis=1)], preds)))
skplt.plot_confusion_matrix(classes[np.argmax(Y_test, axis=1)], preds)

In [None]:
# Precision-recall curves; the true labels are column 1 of the one-hot Y_test.
scikitplot.metrics.plot_precision_recall(Y_test[:,1], pred)

In [None]:
# ROC curves; the true labels are column 1 of the one-hot Y_test.
scikitplot.metrics.plot_roc(Y_test[:,1], pred)