# Classification Experiments

In [1]:
# Change working directory to be project root
import os
#os.chdir("..")
os.getcwd()

'/Users/aaronquinton/Documents/UBC-MDS/Capstone/BCstats/DSCI_591_capstone-BCStats'

In [529]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
import time

# Custom functions for preprocessing and data preparation
from src.data.preprocessing_text import (
    clean_text, clean_numbers, replace_typical_misspell, remove_stopwords,
    balance_themes, preprocess_for_embed, preprocess_for_bow
)

from src.features.word_vectors import (
    build_vocab, check_coverage, get_average_embeddings
)

from src.models.eval import theme_results, investigate_results

# Functions for preprocessing and data preparation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_validate
import sklearn.metrics as metrics

# Training Word embeddings and pre-trained embeddings
import gensim
from gensim.models import Word2Vec, KeyedVectors

# Training LSTM Model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPooling1D, Conv1D, GlobalAveragePooling1D
from keras.layers import GRU, concatenate
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers

# Classification alogrithms
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neural_network import multilayer_perceptron
from sklearn.tree import DecisionTreeClassifier


## <span style = "color:Darkblue"> Read in Data and Embeddings </span>

In [316]:
# Input Filepaths
# Data Files
fname_rawdata2015 = "data/interim/train_2015-qualitative-data.csv"
fname_rawdata2018 = "data/interim/train_2018-qualitative-data.csv"
fname_quant = "data/processed/tidy_quant_questions.csv"
fname_legend = "references/data-dictionaries/survey_mc_legend.csv"

# Pre-trained Embeddings
fname_fasttext_crawl = "./references/pretrained_embeddings.nosync/fasttext/" \
                       "crawl-300d-2M.vec"
fname_fasttext_wiki = "./references/pretrained_embeddings.nosync/fasttext/" \
                      "wiki-news-300d-1M.vec"
fname_w2v_googlenews = "./references/pretrained_embeddings.nosync/" \
                       "GoogleNews-vectors-negative300.bin"
fname_glove_twitter = "./references/pretrained_embeddings.nosync/glove/" \
                      "glove.twitter.27B.200d.w2v.txt"
fname_glove_wiki = "./references/pretrained_embeddings.nosync/glove/" \
                      "glove.6B.300d.w2v.txt"
fname_glove_crawl = "./references/pretrained_embeddings.nosync/glove/" \
                      "glove.840B.300d.w2v.txt"

In [275]:
# Read in raw data
df = pd.read_csv(fname_rawdata2018)

In [5]:
t_start = time.time()

# Read in pre-trained embeddings
w2v_google_news = KeyedVectors.load_word2vec_format(fname_w2v_googlenews,
                                                    binary=True)
fasttext_crawl = KeyedVectors.load_word2vec_format(fname_fasttext_crawl,
                                                   unicode_errors='ignore')
fasttext_wiki = KeyedVectors.load_word2vec_format(fname_fasttext_wiki,
                                                  unicode_errors='ignore')
glove_twitter = KeyedVectors.load_word2vec_format(fname_glove_twitter,
                                                  unicode_errors='ignore')
glove_wiki = KeyedVectors.load_word2vec_format(fname_glove_wiki,
                                               unicode_errors='ignore')
glove_crawl = KeyedVectors.load_word2vec_format(fname_glove_crawl,
                                                unicode_errors='ignore')

t_end = time.time()
print("Elapsed time to load embeddings: %.1f s" % (t_end - t_start))

Elapsed time to load embeddings: 2958.4 s


## <span style = "color:Darkblue"> Preprocessing and Data Preperation</span>

### Preparing Comment data

In [276]:
df_userid = df[['_telkey', '2018 Comment']]
df_userid = df_userid.rename(columns = {'_telkey':'USERID'})

df = df[['2018 Comment']].join(df.loc[:,'CPD':'OTH'])
df = df.rename(columns = {'2018 Comment' : 'comment'})

Y = np.array(df.loc[:,"CPD":"OTH"])

themes = df.loc[:,'CPD':'OTH'].columns.tolist()

# Split the data
df_X_train, df_X_valid, Y_train, Y_valid = train_test_split(
        df.comment, Y, test_size=0.25, random_state=2019)

In [339]:
print(df_X_train.shape)
print(Y_train.shape)

(9958,)
(9958, 12)


### Prepare Quantitative Data

In [298]:
df_quant = pd.read_csv(fname_quant).query("survey_year == 2018")
df_legend = pd.read_csv(fname_legend)

In [340]:
df_quant_train = df_userid.loc[df_X_train.index] \
                          .merge(df_quant, how='left', on='USERID')

df_quant_valid = df_userid.loc[df_X_valid.index] \
                          .merge(df_quant, how='left', on='USERID')

In [341]:
# Grab relevant question responses for each theme
X_train_quant = {}
X_valid_quant = {}

for i, theme in enumerate(themes):
    
    codel = (i)*10
    codeu = (i+2)*10 - 1
    col_quant = list(df_legend.loc[df_legend.subtheme_code.between(codel,codeu)] \
                              .new_column_name)

    X_train_quant[theme] = np.array(df_quant_train[col_quant])
    X_valid_quant[theme] = np.array(df_quant_valid[col_quant])

In [464]:
# Fill missing values with the mean response for that question
for theme in themes:
    for i in range(X_valid_quant[theme].shape[1]):
        mean_replace = np.nanmean(X_valid_quant[theme], axis = 0)[i]
        X_valid_quant[theme][:,i][np.isnan(X_valid_quant[theme][:,i])] = mean_replace

In [622]:
X_valid_quant = np.array(df_quant_valid.iloc[:,3:20])

In [707]:
# Fill missing values with the mean response for that column
for i in range(X_valid_quant.shape[1]):
    mean_replace = np.nanmean(X_valid_quant, axis = 0)[i]
    X_valid_quant[:,i][np.isnan(X_valid_quant[:,i])] = mean_replace

In [709]:
X_valid_quant.shape

(3320, 17)

### Train Word Vectors

In [241]:
comments = preprocess_for_embed(df.comment, 'w2v_base_model')

w2v_base_model = Word2Vec(comments, 
                     size=300, 
                     window=5, 
                     min_count=1,
                     sg=1, 
                     negative=4)        

In [242]:
# Dictionary of pretrained embeddings
embeddings = {'w2v_base_model': w2v_base_model,
              'w2v_google_news': w2v_google_news, 
              'fasttext_crawl': fasttext_crawl,
              'fasttext_wiki': fasttext_wiki,
              'glove_twitter': glove_twitter,
              'glove_wiki': glove_wiki,
              'glove_crawl': glove_crawl}

### Word Embedding Vocab Coverage

In [243]:
# Check coverage of vocab words in embedding
oov = {}
vocab_coverage = []
text_coverage = []

for embedding in embeddings.keys():
    
    comments = preprocess_for_embed(df.comment, embedding)
    vocab = build_vocab(comments)
        
    a, b, oov[embedding] = check_coverage(vocab, embeddings[embedding])
    
    vocab_coverage.append(a)
    text_coverage.append(b)

pd.DataFrame({'embedding': list(embeddings.keys()),
              'vocab_coverage': vocab_coverage, 
              'text_coverage': text_coverage})

100%|██████████| 13278/13278 [00:00<00:00, 86084.02it/s]
100%|██████████| 13673/13673 [00:00<00:00, 40466.79it/s]
100%|██████████| 13278/13278 [00:00<00:00, 88164.98it/s]
100%|██████████| 17246/17246 [00:03<00:00, 5679.35it/s] 
100%|██████████| 13278/13278 [00:00<00:00, 91469.55it/s]
100%|██████████| 17500/17500 [00:01<00:00, 10368.71it/s]
100%|██████████| 13278/13278 [00:00<00:00, 88333.35it/s]
100%|██████████| 17500/17500 [00:02<00:00, 6214.14it/s]
100%|██████████| 13278/13278 [00:00<00:00, 90237.65it/s]
100%|██████████| 13673/13673 [00:02<00:00, 4575.48it/s]
100%|██████████| 13278/13278 [00:00<00:00, 92607.72it/s]
100%|██████████| 13673/13673 [00:02<00:00, 4979.80it/s]
100%|██████████| 13278/13278 [00:00<00:00, 89359.93it/s]
100%|██████████| 17500/17500 [00:06<00:00, 2735.08it/s]


Unnamed: 0,embedding,vocab_coverage,text_coverage
0,w2v_base_model,1.0,1.0
1,w2v_google_news,0.93987,0.996661
2,fasttext_crawl,0.953943,0.997412
3,fasttext_wiki,0.938514,0.996345
4,glove_twitter,0.887954,0.990666
5,glove_wiki,0.913479,0.994892
6,glove_crawl,0.953543,0.997421


In [244]:
# Take a look at the out of vocab words for each embedding
for i in oov.keys():
    print(i)
    print(oov[i][:5])

w2v_base_model
[]
w2v_google_news
[('CYMH', 54), ('FLNRORD', 35), ('GCPE', 33), ('CSNR', 32), ('BCWS', 23)]
fasttext_crawl
[('CYMH', 54), ('BCTS', 37), ('FLNRORD', 35), ('GCPE', 33), ('CSNR', 32)]
fasttext_wiki
[('MCFD', 128), ('CYMH', 54), ('BCTS', 37), ('FLNRORD', 35), ('GCPE', 33)]
glove_twitter
[('2', 402), ('1', 302), ('3', 236), ('4', 171), ('5', 151)]
glove_wiki
[('####', 181), ('mcfd', 131), ('cymh', 54), ('#####', 49), ('bcts', 37)]
glove_crawl
[('CYMH', 54), ('FLNRORD', 35), ('GCPE', 33), ('CSNR', 32), ('STIIP', 20)]


## <span style = "color:Darkblue"> Feature Engineering </span>

### Bag of Words


In [245]:
# Use Count Vectorizer to build bag of word arrays to train on
vectorizer = CountVectorizer(stop_words= 'english',
                             ngram_range=(1,5), 
                             min_df=2)   

X_train_bow = vectorizer.fit_transform(preprocess_for_bow(df_X_train))
X_valid_bow = vectorizer.transform(preprocess_for_bow(df_X_valid))

In [246]:
print(X_train_bow.shape)
print(X_valid_bow.shape)

(9958, 31422)
(3320, 31422)


### Get Average Word Vectors per Comment

In [247]:
X_train_avg_wv = {}
X_valid_avg_wv = {}

for embedding in embeddings.keys():
    
    # Adjust features based on twitter embeddings 
    if embedding == 'glove_twitter':
        n_features = 200
    else:
        n_features = 300
    
    # Preprocess comment data
    comments_train = preprocess_for_embed(df_X_train, embedding)
    comments_valid = preprocess_for_embed(df_X_valid, embedding)
    
    # Get average embeddings for each comment
    # train
    X_train_avg_wv[embedding] = np.array(
        [get_average_embeddings(comment, embeddings[embedding], n_features)
         for comment in comments_train])
    
    # valid
    X_valid_avg_wv[embedding] = np.array(
        [get_average_embeddings(comment, embeddings[embedding], n_features)
         for comment in comments_valid])

In [248]:
print(X_valid_avg_wv['w2v_base_model'].shape)
print(X_valid_avg_wv['glove_twitter'].shape)

(3320, 300)
(3320, 200)


## <span style = "color:Darkblue"> Classification Models </span>
### Baseline Classifier - BOW | Linear SVC 

#### Optimize Hyper Parameters for BOW | Linear SVC

In [829]:
# C = (5.0**np.arange(-1,4)/10).tolist()
# C.append(1)

# tol = (5.0**np.arange(-3,2)/100).tolist()
# tol.append(0.0001)

# print('C:', C)
# print('tol:', tol)

C: [0.02, 0.1, 0.5, 2.5, 12.5, 1]
tol: [8e-05, 0.0004, 0.002, 0.01, 0.05, 0.0001]


In [830]:
# t_start = time.time()
# print("Grid Search for BOW | Linear SVC")

# parameters = [
#     {
#         'classifier':[LinearSVC(max_iter=2000)],
#         'classifier__tol': tol,
#         'classifier__C': C,    
#     }
# ]

# clf1 = GridSearchCV(BinaryRelevance(), parameters, scoring = 'accuracy', cv = 2)
# clf1.fit(X_train_bow, Y_train)

# t_end_train = time.time()

# print(clf1.best_params_, clf1.best_score_)
# # Calculate and print elapsed time
# t_end = time.time()
# print("Elapsed Training time: %.1f s" % (t_end_train - t_start),
#       "\nElapsed Predict time: %.1f s" % (t_end - t_end_train))

Grid Search for BOW | Linear SVC
{'classifier': LinearSVC(C=0.5, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=2000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.05,
     verbose=0), 'classifier__C': 0.5, 'classifier__tol': 0.05} 0.4243824061056437
Elapsed Training time: 5454.6 s 
Elapsed Predict time: 0.1 s


#### Final BOW | Linear SVC Model

In [837]:
t_start = time.time()
print("Training Bag of words Model with Linear SVC")

model_bow = BinaryRelevance(
    classifier = LinearSVC(C = 0.5, tol = 0.2)
)

model_bow.fit(X_train_bow, Y_train)
t_end_train = time.time()

Y_pred_bow = model_bow.predict(X_valid_bow).toarray()

# Calculate and print elapsed time
t_end = time.time()
print("Elapsed Training time: %.1f s" % (t_end_train - t_start),
      "\nElapsed Predict time: %.1f s" % (t_end - t_end_train))

Training Bag of words Model with Linear SVC
Elapsed Training time: 71.4 s 
Elapsed Predict time: 13.1 s


In [838]:
theme_results(Y_valid, Y_pred_bow)

Overall Accuracy: 0.4512 
Hamming Loss: 0.0721 
Hamming Loss (pred. zeros): 0.1191


Unnamed: 0,Label,Y_proportion,Pred_proportion,Error,Dummy_Diff,Accuarcy,Precision,Recall
0,CPD,0.12741,0.112048,0.073193,0.054217,0.926807,0.741935,0.652482
1,CB,0.184639,0.174096,0.043072,0.141566,0.956928,0.906574,0.854812
2,EWC,0.084337,0.056325,0.067771,0.016566,0.932229,0.647059,0.432143
3,Exec,0.103012,0.089157,0.081325,0.021687,0.918675,0.621622,0.538012
4,FWE,0.062048,0.054217,0.025904,0.036145,0.974096,0.833333,0.728155
5,SP,0.096386,0.082229,0.065964,0.030422,0.934036,0.684982,0.584375
6,RE,0.085542,0.062952,0.075602,0.00994,0.924398,0.578947,0.426056
7,Sup,0.127711,0.111145,0.105723,0.021988,0.894277,0.598916,0.521226
8,SW,0.165964,0.134639,0.11506,0.050904,0.88494,0.689038,0.558984
9,TEPE,0.228614,0.211145,0.071687,0.156928,0.928313,0.871612,0.805007


In [839]:
print("Shape of Y_pred:",Y_pred_bow.shape)
print("Zeros predicted:",Y_pred_bow[Y_pred_bow.sum(axis = 1) == 0,:].shape)

Shape of Y_pred: (3320, 12)
Zeros predicted: (466, 12)


### Average Word Vectors | LogReg SVC

#### Optimize Hyperparameters for Avg WV | LogReg

In [None]:
# C = (5.0**np.arange(-1,4)/10).tolist()
# C.append(1)

# tol = (5.0**np.arange(-3,2)/100).tolist()
# tol.append(0.0001)

# print('C:', C)
# print('tol:', tol)

In [None]:
# for embedding in embeddings.keys():
    
#     print("Grid Search on: ", embedding)
#     t_start = time.time()

#     parameters = [
#         {
#             'classifier':[LogisticRegression(solver = 'lbfgs', max_iter=500)],
#             'classifier__tol': tol,
#             'classifier__C': C,
        
#         }
#     ]

#     clf2 = GridSearchCV(BinaryRelevance(), parameters, scoring = 'accuracy', cv = 2)
#     clf2.fit(X_train_avg_wv[embedding], Y_train)

#     t_end_train = time.time()

#     print(clf2.best_params_, clf2.best_score_)
#     # Calculate and print elapsed time
#     t_end = time.time()
#     print("Elapsed Training time: %.1f s" % (t_end_train - t_start),
#       "\nElapsed Predict time: %.1f s" % (t_end - t_end_train))


#### Final Avg WV | LogReg Model

In [253]:
Y_pred_avg_wv = {}
model_avg_wv = {}
train_time = []
predict_time = []
accuarcies = []

for embedding in embeddings.keys():
    t_start = time.time()

    clf = BinaryRelevance(
        classifier = LogisticRegression(solver = 'lbfgs', max_iter=500, C = 7.5,
                                        tol = 0.05)
    )

    clf.fit(X_train_avg_wv[embedding], Y_train)
    t_end_train = time.time()

    Y_pred_avg_wv[embedding] = clf.predict_proba(X_valid_avg_wv[embedding]) \
                                  .toarray()
    model_avg_wv[embedding] = clf
    
    # Calculate elapsed time
    t_end = time.time()
    train_time.append(t_end_train - t_start)
    predict_time.append(t_end - t_end_train)
    
    accuarcies.append(metrics.accuracy_score(Y_valid,
                                             Y_pred_avg_wv[embedding]))

results_avg_wv = pd.DataFrame({'embedding': list(embeddings.keys()),
                               'train_time': train_time,
                               'predict_time': predict_time,
                               'overall_accuracy': accuarcies})

results_avg_wv

Unnamed: 0,embedding,train_time,predict_time,overall_accuracy
0,w2v_base_model,16.752789,0.163054,0.345181
1,w2v_google_news,10.903381,0.12926,0.402108
2,fasttext_crawl,20.40469,0.119333,0.408133
3,fasttext_wiki,7.21436,0.103724,0.39247
4,glove_twitter,51.91293,0.060164,0.340964
5,glove_wiki,46.241062,0.091611,0.389157
6,glove_crawl,39.054376,0.114515,0.400301


### LSTM Model

In [840]:
# Build Embedding Matrices and prepare data for deep 
# learning Models
max_words = 12000
maxlen = 700

# dictionaries for each embedding
embedding_matrix = {}
tokenizer = {}
X_train_lstm = {}
X_valid_lstm = {}

for embedding in embeddings.keys():

    # Preprocess text data based on embedding
    X_train = np.array(preprocess_for_embed(df_X_train,
                                            embedding,
                                            split = False))
    
    X_valid = np.array(preprocess_for_embed(df_X_valid,
                                            embedding,
                                            split = False))
    
    # Tokenize and pad numbers for LSTM Model
    tokenizer[embedding] = Tokenizer(num_words=max_words)
    tokenizer[embedding].fit_on_texts(X_train)
    
    tokenized_train = tokenizer[embedding].texts_to_sequences(X_train)
    tokenized_test = tokenizer[embedding].texts_to_sequences(X_valid)

    X_train_lstm[embedding] = pad_sequences(tokenized_train, maxlen=maxlen)
    X_valid_lstm[embedding] = pad_sequences(tokenized_test, maxlen=maxlen)
    
    
    # Build Embedding Matrices
    if embedding == 'glove_twitter':
        embed_size = 200
    else:
        embed_size = 300

    word_index = tokenizer[embedding].word_index
    
    num_words = min(max_words, len(word_index) + 1)
    embedding_matrix[embedding] = np.zeros((num_words, embed_size),
                                           dtype='float32')

    for word, i in word_index.items():

        if i >= max_words:
            continue

        try:
            embedding_vector = embeddings[embedding][word]

            if embedding_vector is not None:
                embedding_matrix[embedding][i] = embedding_vector
        except:
            continue



In [None]:
# Build LSTM Model and train and validate
Y_pred_lstm = {}
model_lstm = {}
train_time = []
predict_time = []
accuarcies = []

for embedding in embeddings.keys():
    t_start = time.time()
    print("Training LSTM on the ", embedding)
    
    if embedding == 'glove_twitter':
        embed_size = 200
    else:
        embed_size = 300
    
    # Deep Learning Architecture
    inp = Input(shape=(maxlen, ))
    
    x = Embedding(max_words, embed_size, 
                  weights=[embedding_matrix[embedding]], 
                  trainable=False)(inp)

    x = Bidirectional(GRU(128, return_sequences=True, dropout=0.1,
                          recurrent_dropout=0.1))(x)

    x = Conv1D(64, kernel_size=3, padding="valid", 
               kernel_initializer="glorot_uniform")(x)

    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)

    x = concatenate([avg_pool, max_pool])

    preds = Dense(12, activation="sigmoid")(x)

    model = Model(inp, preds)

    model.compile(loss='binary_crossentropy',
                  optimizer= 'adam',
                  metrics=['accuracy'])

    # Train and Predict Model
    batch_size = 128
    epochs = 12
    model.fit(X_train_lstm[embedding],
              Y_train,
              batch_size=batch_size,
              epochs=epochs, 
              validation_split=0.15)
    t_end_train = time.time()
    
    Y_pred_lstm[embedding] = model.predict(X_valid_lstm[embedding])
    model_lstm[embedding] = model

    # Calculate and report results
    t_end = time.time()
    train_time.append(t_end_train - t_start)
    predict_time.append(t_end - t_end_train)
    
    accuarcies.append(metrics.accuracy_score(Y_valid,
                                             np.round(Y_pred_lstm[embedding])))

Training LSTM on the  w2v_base_model
Train on 8464 samples, validate on 1494 samples
Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12
Training LSTM on the  w2v_google_news
Train on 8464 samples, validate on 1494 samples
Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12
Training LSTM on the  fasttext_crawl
Train on 8464 samples, validate on 1494 samples
Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12

In [None]:
results_lstm = pd.DataFrame({'embedding': list(embeddings.keys()),
                             'train_time': train_time,
                             'predict_time': predict_time,
                             'overall_accuracy': accuarcies})

results_lstm

In [441]:
theme_results(Y_valid, np.round(Y_pred_lstm['fasttext_crawl']))

Overall Accuracy: 0.491 
Hamming Loss: 0.0653 
Hamming Loss (pred. zeros): 0.1191


Unnamed: 0,Label,Y_proportion,Pred_proportion,Error,Dummy_Diff,Accuarcy,Precision,Recall
0,CPD,0.12741,0.100301,0.065663,0.061747,0.934337,0.807808,0.635934
1,CB,0.184639,0.176205,0.038554,0.146084,0.961446,0.91453,0.872757
2,EWC,0.084337,0.037048,0.062952,0.021386,0.937048,0.788618,0.346429
3,Exec,0.103012,0.098494,0.079217,0.023795,0.920783,0.620795,0.593567
4,FWE,0.062048,0.056928,0.02259,0.039458,0.97741,0.846561,0.776699
5,SP,0.096386,0.068072,0.062651,0.033735,0.937349,0.747788,0.528125
6,RE,0.085542,0.051506,0.070181,0.015361,0.929819,0.649123,0.390845
7,Sup,0.127711,0.135542,0.106627,0.021084,0.893373,0.577778,0.613208
8,SW,0.165964,0.111747,0.098795,0.067169,0.901205,0.800539,0.53902
9,TEPE,0.228614,0.212349,0.056627,0.171988,0.943373,0.904965,0.84058


## Stacking Classifiers

In [812]:
# Stacked predictions of various models
X_train_stack_wv = np.hstack(tuple(Y_pred_avg_wv.values()))
X_train_stack_lstm = np.hstack(tuple(Y_pred_lstm.values()))
X_train_stack_bow = Y_pred_bow

X_train_stack = np.hstack((X_train_stack_bow,
                           X_train_stack_wv,
                           X_train_stack_lstm,
                           X_valid_quant))

# Shapes of each model
print(X_train_stack_bow.shape)
print(X_train_stack_lstm.shape)
print(X_train_stack_wv.shape)
print(X_valid_quant.shape)

X_train_stack.shape

(3320, 12)
(3320, 84)
(3320, 84)
(3320, 17)


(3320, 197)

In [813]:
X_train_meta, X_valid_meta, Y_train_meta, Y_valid_meta = train_test_split(
    X_train_stack, Y_valid, test_size = 0.30)

In [814]:
print(X_train_meta.shape)
print(X_valid_meta.shape)
print(Y_train_meta.shape)
print(Y_valid_meta.shape)

(2324, 197)
(996, 197)
(2324, 12)
(996, 12)


#### Optimize Hyper Parameters of Stacking | LogReg

In [793]:
C = [0.25, 0.5, 0.75, 1]
C.append(1)

tol = [0.005, 0.01, 0.02, 0.05]
tol.append(0.0001)

print('C:', C)
print('tol:', tol)

C: [0.02, 0.1, 0.5, 2.5, 12.5, 1]
tol: [8e-05, 0.0004, 0.002, 0.01, 0.05, 0.0001]


In [792]:
t_start = time.time()
print("Grid Search for stacked LogReg")

parameters = [
    {
        'classifier':[LogisticRegression(solver = 'liblinear', max_iter=500,
                                         penalty = 'l1')],
        'classifier__tol': tol,
        'classifier__C': C,
        
    }
]

clf = GridSearchCV(BinaryRelevance(), parameters, scoring = 'accuracy', cv = 3)
clf.fit(X_train_meta, Y_train_meta)

t_end_train = time.time()

print(clf.best_params_, clf.best_score_)
# Calculate and print elapsed time
t_end = time.time()
print("Elapsed Training time: %.1f s" % (t_end_train - t_start),
      "\nElapsed Predict time: %.1f s" % (t_end - t_end_train))

Training on Stack
{'classifier': LogisticRegression(C=0.5, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=500, multi_class='warn',
          n_jobs=None, penalty='l1', random_state=None, solver='liblinear',
          tol=0.01, verbose=0, warm_start=False), 'classifier__C': 0.5, 'classifier__penalty': 'l1', 'classifier__tol': 0.01} 0.5266781411359724
Elapsed Training time: 449.0 s 
Elapsed Predict time: 0.0 s


#### Final Stacked Model

In [815]:
t_start = time.time()
print("Training on Stack")

model_stack = BinaryRelevance(
    classifier = LogisticRegression(penalty='l1', solver='liblinear', C = 0.5, tol = 0.01)
)

model_stack.fit(X_train_meta, Y_train_meta)

t_end_train = time.time()

Y_pred_stack = model_stack.predict_proba(X_valid_meta).toarray()

# Calculate and print elapsed time
t_end = time.time()
print("Elapsed Training time: %.1f s" % (t_end_train - t_start),
      "\nElapsed Predict time: %.1f s" % (t_end - t_end_train))

Training on Stack
Elapsed Training time: 3.9 s 
Elapsed Predict time: 0.1 s


In [816]:
theme_results(Y_valid_meta, np.round(Y_pred_stack))

Overall Accuracy: 0.5291 
Hamming Loss: 0.0582 
Hamming Loss (pred. zeros): 0.1166


Unnamed: 0,Label,Y_proportion,Pred_proportion,Error,Dummy_Diff,Accuarcy,Precision,Recall
0,CPD,0.124498,0.105422,0.061245,0.063253,0.938755,0.8,0.677419
1,CB,0.179719,0.183735,0.034137,0.145582,0.965863,0.896175,0.916201
2,EWC,0.087349,0.046185,0.057229,0.03012,0.942771,0.826087,0.436782
3,Exec,0.092369,0.080321,0.058233,0.034137,0.941767,0.7125,0.619565
4,FWE,0.066265,0.052209,0.026104,0.040161,0.973896,0.884615,0.69697
5,SP,0.094378,0.071285,0.049197,0.045181,0.950803,0.816901,0.617021
6,RE,0.079317,0.044177,0.059237,0.02008,0.940763,0.727273,0.405063
7,Sup,0.125502,0.098394,0.089357,0.036145,0.910643,0.683673,0.536
8,SW,0.158635,0.124498,0.098394,0.060241,0.901606,0.741935,0.582278
9,TEPE,0.233936,0.218876,0.059237,0.174699,0.940763,0.899083,0.841202


### Increase the Precision! 

In [825]:
predictions = np.round(Y_pred_stack - 0.40)
#predictions = np.round(Y_pred_lstm['fasttext_crawl']-0.495)

a = theme_results(Y_valid_meta, predictions)
size = predictions.shape[0]
zero_size = (predictions[predictions.sum(axis = 1) == 0,:].shape[0])
print("Total comments:", size, 
      "\nTotal Predictions:", size - zero_size, 
      "\nPercent Pred non-zero:", round(1 - zero_size/size, 4))
a

Overall Accuracy: 0.4167 
Hamming Loss: 0.0709 
Hamming Loss (pred. zeros): 0.1166
Total comments: 996 
Total Predictions: 629 
Percent Pred non-zero: 0.6315


Unnamed: 0,Label,Y_proportion,Pred_proportion,Error,Dummy_Diff,Accuarcy,Precision,Recall
0,CPD,0.124498,0.058233,0.078313,0.046185,0.921687,0.896552,0.419355
1,CB,0.179719,0.156627,0.045181,0.134538,0.954819,0.929487,0.810056
2,EWC,0.087349,0.024096,0.071285,0.016064,0.928715,0.833333,0.229885
3,Exec,0.092369,0.032129,0.070281,0.022088,0.929719,0.84375,0.293478
4,FWE,0.066265,0.044177,0.032129,0.034137,0.967871,0.886364,0.590909
5,SP,0.094378,0.031124,0.071285,0.023092,0.928715,0.870968,0.287234
6,RE,0.079317,0.014056,0.069277,0.01004,0.930723,0.857143,0.151899
7,Sup,0.125502,0.028112,0.103414,0.022088,0.896586,0.892857,0.2
8,SW,0.158635,0.057229,0.113454,0.045181,0.886546,0.894737,0.322785
9,TEPE,0.233936,0.180723,0.073293,0.160643,0.926707,0.944444,0.729614


In [826]:
non_zero_pred = predictions[predictions.sum(axis = 1) != 0,:]
non_zero_valid = Y_valid_meta[predictions.sum(axis = 1) != 0,:]

theme_results(non_zero_valid, non_zero_pred)

Overall Accuracy: 0.6486 
Hamming Loss: 0.0441 
Hamming Loss (pred. zeros): 0.1166


Unnamed: 0,Label,Y_proportion,Pred_proportion,Error,Dummy_Diff,Accuarcy,Precision,Recall
0,CPD,0.125596,0.09221,0.052464,0.073132,0.947536,0.896552,0.658228
1,CB,0.249603,0.248013,0.036566,0.213037,0.963434,0.929487,0.923567
2,EWC,0.062003,0.038156,0.036566,0.025437,0.963434,0.833333,0.512821
3,Exec,0.066773,0.050874,0.031797,0.034976,0.968203,0.84375,0.642857
4,FWE,0.085851,0.069952,0.031797,0.054054,0.968203,0.886364,0.722222
5,SP,0.09539,0.049285,0.058824,0.036566,0.941176,0.870968,0.45
6,RE,0.060413,0.022258,0.044515,0.015898,0.955485,0.857143,0.315789
7,Sup,0.09539,0.044515,0.060413,0.034976,0.939587,0.892857,0.416667
8,SW,0.146264,0.09062,0.074722,0.071542,0.925278,0.894737,0.554348
9,TEPE,0.305246,0.286169,0.050874,0.254372,0.949126,0.944444,0.885417


### Model Summary

### Save Models