# Classification Experiments

In [1]:
# Change working directory to be project root
import os
#os.chdir("..")
os.getcwd()

'/Users/aaronquinton/Documents/UBC-MDS/Capstone/BCstats/DSCI_591_capstone-BCStats'

In [51]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
import time

# Custom functions for preprocessing and data preparation
from src.data.preprocessing_text import (
    clean_text, clean_numbers, replace_typical_misspell, remove_stopwords,
    balance_themes, preprocess_for_embed, preprocess_for_bow
)

from src.features.word_vectors import (
    build_vocab, check_coverage, get_average_embeddings
)

from src.models.eval import theme_results, investigate_results

# Functions for preprocessing and data preparation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_validate
import sklearn.metrics as metrics

# Training Word embeddings and pre-trained embeddings
import gensim
from gensim.models import Word2Vec, KeyedVectors

# Training LSTM Model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPooling1D, Conv1D, GlobalAveragePooling1D
from keras.layers import GRU, concatenate
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers

# Classification alogrithms
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neural_network import multilayer_perceptron
from sklearn.tree import DecisionTreeClassifier


## <span style = "color:Darkblue"> Read in Data and Embeddings </span>

In [236]:
# Input Filepaths
fname_rawdata2015 = "data/interim/train_2015-qualitative-data.csv"
fname_rawdata2018 = "data/interim/train_2018-qualitative-data.csv"
fname_fasttext_crawl = "./references/pretrained_embeddings.nosync/fasttext/" \
                       "crawl-300d-2M.vec"
fname_fasttext_wiki = "./references/pretrained_embeddings.nosync/fasttext/" \
                      "wiki-news-300d-1M.vec"
fname_w2v_googlenews = "./references/pretrained_embeddings.nosync/" \
                       "GoogleNews-vectors-negative300.bin"
fname_glove_twitter = "./references/pretrained_embeddings.nosync/glove/" \
                      "glove.twitter.27B.200d.w2v.txt"
fname_glove_wiki = "./references/pretrained_embeddings.nosync/glove/" \
                      "glove.6B.300d.w2v.txt"
fname_glove_crawl = "./references/pretrained_embeddings.nosync/glove/" \
                      "glove.840B.300d.w2v.txt"

In [237]:
# Read in raw data
df = pd.read_csv(fname_rawdata2018)

In [5]:
t_start = time.time()

# Read in pre-trained embeddings
w2v_google_news = KeyedVectors.load_word2vec_format(fname_w2v_googlenews,
                                                    binary=True)
fasttext_crawl = KeyedVectors.load_word2vec_format(fname_fasttext_crawl,
                                                   unicode_errors='ignore')
fasttext_wiki = KeyedVectors.load_word2vec_format(fname_fasttext_wiki,
                                                  unicode_errors='ignore')
glove_twitter = KeyedVectors.load_word2vec_format(fname_glove_twitter,
                                                  unicode_errors='ignore')
glove_wiki = KeyedVectors.load_word2vec_format(fname_glove_wiki,
                                               unicode_errors='ignore')
glove_crawl = KeyedVectors.load_word2vec_format(fname_glove_crawl,
                                                unicode_errors='ignore')

t_end = time.time()
print("Elapsed time to load embeddings: %.1f s" % (t_end - t_start))

Elapsed time to load embeddings: 2958.4 s


## <span style = "color:Darkblue"> Preprocessing and Data Preperation</span>

### Preparing Comment data

In [186]:
# df_2015 = pd.read_csv(fname_rawdata2015)
# df_2018 = pd.read_csv(fname_rawdata2018)

# df_2015['year_of_survey'] = 2015
# df_2015.rename(columns={'2015 Comments':'comment',
#             'Career_Personal_Development':'CPD','Compensation_Benefits':'CB',
#             'Engagement_Workplace_Culture':'EWC','Executives':'Exec',
#             'Flexible_Work_Environment':'FWE','Hiring_Promotion':'SP',
#             'Recognition_Empowerment':'RE','Supervisors':'Sup',
#             'Stress_Workload':'SW','Tools_Equipment_Physical_Environment':'TEPE',
#             'Vision_Mission_Goals':'VMG','Other':'OTH'}, inplace=True)

# selected_columns = ["comment","CPD","CB","EWC","Exec","FWE","SP","RE",
#                        "Sup","SW","TEPE","VMG","OTH"]

# df_2015_selected = df_2015[selected_columns]

# df_2018.rename(columns={'2018 Comment':'comment'}, inplace=True)
# df_2018_selected = df_2018[selected_columns]

# df = df_2015_selected.append(df_2018_selected, ignore_index=True)

In [238]:
df = df[['2018 Comment']].join(df.loc[:,'CPD':'OTH'])
df = df.rename(columns = {'2018 Comment' : 'comment'})

Y = np.array(df.loc[:,"CPD":"OTH"])

themes = df.loc[:,'CPD':'OTH'].columns.tolist()

# Split the data
df_X_train, df_X_valid, Y_train, Y_valid = train_test_split(
        df.comment, Y, test_size=0.25, random_state=2019)

In [239]:
df_X_train.shape

(9958,)

In [240]:
Y_train.shape

(9958, 12)

### Train Word Vectors

In [241]:
comments = preprocess_for_embed(df.comment, 'w2v_base_model')

w2v_base_model = Word2Vec(comments, 
                     size=300, 
                     window=5, 
                     min_count=1,
                     sg=1, 
                     negative=4)        

In [242]:
# Dictionary of pretrained embeddings
embeddings = {'w2v_base_model': w2v_base_model,
              'w2v_google_news': w2v_google_news, 
              'fasttext_crawl': fasttext_crawl,
              'fasttext_wiki': fasttext_wiki,
              'glove_twitter': glove_twitter,
              'glove_wiki': glove_wiki,
              'glove_crawl': glove_crawl}

### Word Embedding Vocab Coverage

In [243]:
# Check coverage of vocab words in embedding
oov = {}
vocab_coverage = []
text_coverage = []

for embedding in embeddings.keys():
    
    comments = preprocess_for_embed(df.comment, embedding)
    vocab = build_vocab(comments)
        
    a, b, oov[embedding] = check_coverage(vocab, embeddings[embedding])
    
    vocab_coverage.append(a)
    text_coverage.append(b)

pd.DataFrame({'embedding': list(embeddings.keys()),
              'vocab_coverage': vocab_coverage, 
              'text_coverage': text_coverage})

100%|██████████| 13278/13278 [00:00<00:00, 86084.02it/s]
100%|██████████| 13673/13673 [00:00<00:00, 40466.79it/s]
100%|██████████| 13278/13278 [00:00<00:00, 88164.98it/s]
100%|██████████| 17246/17246 [00:03<00:00, 5679.35it/s] 
100%|██████████| 13278/13278 [00:00<00:00, 91469.55it/s]
100%|██████████| 17500/17500 [00:01<00:00, 10368.71it/s]
100%|██████████| 13278/13278 [00:00<00:00, 88333.35it/s]
100%|██████████| 17500/17500 [00:02<00:00, 6214.14it/s]
100%|██████████| 13278/13278 [00:00<00:00, 90237.65it/s]
100%|██████████| 13673/13673 [00:02<00:00, 4575.48it/s]
100%|██████████| 13278/13278 [00:00<00:00, 92607.72it/s]
100%|██████████| 13673/13673 [00:02<00:00, 4979.80it/s]
100%|██████████| 13278/13278 [00:00<00:00, 89359.93it/s]
100%|██████████| 17500/17500 [00:06<00:00, 2735.08it/s]


Unnamed: 0,embedding,vocab_coverage,text_coverage
0,w2v_base_model,1.0,1.0
1,w2v_google_news,0.93987,0.996661
2,fasttext_crawl,0.953943,0.997412
3,fasttext_wiki,0.938514,0.996345
4,glove_twitter,0.887954,0.990666
5,glove_wiki,0.913479,0.994892
6,glove_crawl,0.953543,0.997421


In [244]:
# Take a look at the out of vocab words for each embedding
for i in oov.keys():
    print(i)
    print(oov[i][:5])

w2v_base_model
[]
w2v_google_news
[('CYMH', 54), ('FLNRORD', 35), ('GCPE', 33), ('CSNR', 32), ('BCWS', 23)]
fasttext_crawl
[('CYMH', 54), ('BCTS', 37), ('FLNRORD', 35), ('GCPE', 33), ('CSNR', 32)]
fasttext_wiki
[('MCFD', 128), ('CYMH', 54), ('BCTS', 37), ('FLNRORD', 35), ('GCPE', 33)]
glove_twitter
[('2', 402), ('1', 302), ('3', 236), ('4', 171), ('5', 151)]
glove_wiki
[('####', 181), ('mcfd', 131), ('cymh', 54), ('#####', 49), ('bcts', 37)]
glove_crawl
[('CYMH', 54), ('FLNRORD', 35), ('GCPE', 33), ('CSNR', 32), ('STIIP', 20)]


## <span style = "color:Darkblue"> Feature Engineering </span>

### Bag of Words


In [245]:
# Use Count Vectorizer to build bag of word arrays to train on
vectorizer = CountVectorizer(stop_words= 'english',
                             ngram_range=(1,5), 
                             min_df=2)   

X_train_bow = vectorizer.fit_transform(preprocess_for_bow(df_X_train))
X_valid_bow = vectorizer.transform(preprocess_for_bow(df_X_valid))

In [246]:
print(X_train_bow.shape)
print(X_valid_bow.shape)

(9958, 31422)
(3320, 31422)


### Get Average Word Vectors per Comment

In [247]:
X_train_avg_wv = {}
X_valid_avg_wv = {}

for embedding in embeddings.keys():
    
    # Adjust features based on twitter embeddings 
    if embedding == 'glove_twitter':
        n_features = 200
    else:
        n_features = 300
    
    # Preprocess comment data
    comments_train = preprocess_for_embed(df_X_train, embedding)
    comments_valid = preprocess_for_embed(df_X_valid, embedding)
    
    # Get average embeddings for each comment
    # train
    X_train_avg_wv[embedding] = np.array(
        [get_average_embeddings(comment, embeddings[embedding], n_features)
         for comment in comments_train])
    
    # valid
    X_valid_avg_wv[embedding] = np.array(
        [get_average_embeddings(comment, embeddings[embedding], n_features)
         for comment in comments_valid])

In [248]:
print(X_valid_avg_wv['w2v_base_model'].shape)
print(X_valid_avg_wv['glove_twitter'].shape)

(3320, 300)
(3320, 200)


## <span style = "color:Darkblue"> Classification Models </span>
### Baseline Classifier - BOW | Linear SVC 

In [249]:
################################################################################
# Final Train and Predict Model                                                #
################################################################################
t_start = time.time()
print("Training Bag of words Model with Linear SVC")

model_bow = BinaryRelevance(
    classifier = LinearSVC()
)

model_bow.fit(X_train_bow, Y_train)
t_end_train = time.time()

Y_pred_bow = model_bow.predict(X_valid_bow).toarray()

# Calculate and print elapsed time
t_end = time.time()
print("Elapsed Training time: %.1f s" % (t_end_train - t_start),
      "\nElapsed Predict time: %.1f s" % (t_end - t_end_train))

Training Bag of words Model with Linear SVC
Elapsed Training time: 82.8 s 
Elapsed Predict time: 13.0 s


In [250]:
theme_results(Y_valid, Y_pred_bow)

Overall Accuracy: 0.4509 
Hamming Loss: 0.0737 
Hamming Loss (pred. zeros): 0.1191


Unnamed: 0,Label,Y_proportion,Pred_proportion,Error,Dummy_Diff,Accuarcy,Precision,Recall
0,CPD,0.12741,0.113253,0.075602,0.051807,0.924398,0.728723,0.647754
1,CB,0.184639,0.175,0.044578,0.14006,0.955422,0.900172,0.853181
2,EWC,0.084337,0.05994,0.067771,0.016566,0.932229,0.638191,0.453571
3,Exec,0.103012,0.090964,0.08253,0.020482,0.91747,0.612583,0.540936
4,FWE,0.062048,0.055723,0.02741,0.034639,0.97259,0.810811,0.728155
5,SP,0.096386,0.084639,0.067771,0.028614,0.932229,0.669039,0.5875
6,RE,0.085542,0.065663,0.075301,0.010241,0.924699,0.577982,0.443662
7,Sup,0.127711,0.11506,0.107831,0.01988,0.892169,0.586387,0.528302
8,SW,0.165964,0.143072,0.120482,0.045482,0.879518,0.658947,0.568058
9,TEPE,0.228614,0.215361,0.073494,0.15512,0.926506,0.86014,0.810277


In [251]:
Y_pred_bow[Y_pred_bow.sum(axis = 1) == 0,:].shape

(421, 12)

In [252]:
Y_pred_bow.shape

(3320, 12)

### Average Word Vectors | Linear SVC

In [253]:
Y_pred_avg_wv = {}
model_avg_wv = {}
train_time = []
predict_time = []
accuarcies = []

for embedding in embeddings.keys():
    t_start = time.time()

    clf = BinaryRelevance(
        classifier = LinearSVC(max_iter = 2000)
    )

    clf.fit(X_train_avg_wv[embedding], Y_train)
    t_end_train = time.time()

    Y_pred_avg_wv[embedding] = clf.predict(X_valid_avg_wv[embedding]) \
                                  .toarray()
    model_avg_wv[embedding] = clf
    
    # Calculate elapsed time
    t_end = time.time()
    train_time.append(t_end_train - t_start)
    predict_time.append(t_end - t_end_train)
    
    accuarcies.append(metrics.accuracy_score(Y_valid,
                                             Y_pred_avg_wv[embedding]))

results_avg_wv = pd.DataFrame({'embedding': list(embeddings.keys()),
                               'train_time': train_time,
                               'predict_time': predict_time,
                               'overall_accuracy': accuarcies})

results_avg_wv

Unnamed: 0,embedding,train_time,predict_time,overall_accuracy
0,w2v_base_model,16.752789,0.163054,0.345181
1,w2v_google_news,10.903381,0.12926,0.402108
2,fasttext_crawl,20.40469,0.119333,0.408133
3,fasttext_wiki,7.21436,0.103724,0.39247
4,glove_twitter,51.91293,0.060164,0.340964
5,glove_wiki,46.241062,0.091611,0.389157
6,glove_crawl,39.054376,0.114515,0.400301


### LSTM Model

In [254]:
# Build Embedding Matrices and prepare data for deep 
# learning Models
max_words = 12000
maxlen = 700

# dictionaries for each embedding
embedding_matrix = {}
tokenizer = {}
X_train_lstm = {}
X_valid_lstm = {}

for embedding in embeddings.keys():

    # Preprocess text data based on embedding
    X_train = np.array(preprocess_for_embed(df_X_train,
                                            embedding,
                                            split = False))
    
    X_valid = np.array(preprocess_for_embed(df_X_valid,
                                            embedding,
                                            split = False))
    
    # Tokenize and pad numbers for LSTM Model
    tokenizer[embedding] = Tokenizer(num_words=max_words)
    tokenizer[embedding].fit_on_texts(X_train)
    
    tokenized_train = tokenizer[embedding].texts_to_sequences(X_train)
    tokenized_test = tokenizer[embedding].texts_to_sequences(X_valid)

    X_train_lstm[embedding] = pad_sequences(tokenized_train, maxlen=maxlen)
    X_valid_lstm[embedding] = pad_sequences(tokenized_test, maxlen=maxlen)
    
    
    # Build Embedding Matrices
    if embedding == 'glove_twitter':
        embed_size = 200
    else:
        embed_size = 300

    word_index = tokenizer[embedding].word_index
    
    num_words = min(max_words, len(word_index) + 1)
    embedding_matrix[embedding] = np.zeros((num_words, embed_size),
                                           dtype='float32')

    for word, i in word_index.items():

        if i >= max_words:
            continue

        try:
            embedding_vector = embeddings[embedding][word]

            if embedding_vector is not None:
                embedding_matrix[embedding][i] = embedding_vector
        except:
            continue



In [255]:
# Build LSTM Model and train and validate
Y_pred_lstm = {}
model_lstm = {}
train_time = []
predict_time = []
accuarcies = []

for embedding in embeddings.keys():
    t_start = time.time()
    print("Training LSTM on the ", embedding)
    
    if embedding == 'glove_twitter':
        embed_size = 200
    else:
        embed_size = 300
    
    # Deep Learning Architecture
    inp = Input(shape=(maxlen, ))
    
    x = Embedding(max_words, embed_size, 
                  weights=[embedding_matrix[embedding]], 
                  trainable=False)(inp)

    x = Bidirectional(GRU(128, return_sequences=True, dropout=0.1,
                          recurrent_dropout=0.1))(x)

    x = Conv1D(64, kernel_size=3, padding="valid", 
               kernel_initializer="glorot_uniform")(x)

    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)

    x = concatenate([avg_pool, max_pool])

    preds = Dense(12, activation="sigmoid")(x)

    model = Model(inp, preds)

    model.compile(loss='binary_crossentropy',
                  optimizer= 'adam',
                  metrics=['accuracy'])

    # Train and Predict Model
    batch_size = 128
    epochs = 16
    model.fit(X_train_lstm[embedding],
              Y_train,
              batch_size=batch_size,
              epochs=epochs, 
              validation_split=0.15)
    t_end_train = time.time()
    
    Y_pred_lstm[embedding] = model.predict(X_valid_lstm[embedding])
    model_lstm[embedding] = model

    # Calculate and report results
    t_end = time.time()
    train_time.append(t_end_train - t_start)
    predict_time.append(t_end - t_end_train)
    
    accuarcies.append(metrics.accuracy_score(Y_valid,
                                             np.round(Y_pred_lstm[embedding])))

Training LSTM on the  w2v_base_model
Train on 8962 samples, validate on 996 samples
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
Training LSTM on the  w2v_google_news
Train on 8962 samples, validate on 996 samples
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
Training LSTM on the  fasttext_crawl
Train on 8962 samples, validate on 996 samples
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
Training LSTM on the  fasttext_wiki
Train on 8962 samples, validate on 996 samples
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
Training LSTM on the  glove_twitter
Train on 8962 samples, validate on 996 samples
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
Training LSTM on the  glove_wiki
Train on 8962 samples, validate on 996 samples
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
Training LSTM on the 

In [256]:
results_lstm = pd.DataFrame({'embedding': list(embeddings.keys()),
                             'train_time': train_time,
                             'predict_time': predict_time,
                             'overall_accuracy': accuarcies})

results_lstm

Unnamed: 0,embedding,train_time,predict_time,overall_accuracy
0,w2v_base_model,2324.503006,44.884675,0.441566
1,w2v_google_news,2215.871378,42.84958,0.470181
2,fasttext_crawl,2403.690729,44.079534,0.490964
3,fasttext_wiki,2126.471193,43.266172,0.460241
4,glove_twitter,1796.038347,35.112366,0.474398
5,glove_wiki,2120.624354,44.435506,0.472892
6,glove_crawl,2115.557431,44.032099,0.480723


## Stacking Classifiers

In [257]:
X_train_stack_wv = np.hstack(tuple(Y_pred_avg_wv.values()))

X_train_stack_lstm = np.hstack(tuple(Y_pred_lstm.values()))

X_train_stack_bow = Y_pred_bow

X_train_stack = np.hstack((X_train_stack_bow,
                           X_train_stack_wv,
                           X_train_stack_lstm))
                          

In [258]:
X_train_stack.shape

(3320, 180)

In [264]:
################################################################################
# Final Train and Predict Model                                                #
################################################################################
t_start = time.time()
print("Training LogReg on Stack")

model_stack = BinaryRelevance(
    classifier = LogisticRegression(solver = 'liblinear', penalty='l1')
)

cv_results = cross_validate(model_stack, X_train_stack, Y_valid, cv = 10)
t_end_train = time.time()

#Y_pred_bow = model_bow.predict(X_valid_bow).toarray()

# Calculate and print elapsed time
t_end = time.time()
print("Elapsed Training time: %.1f s" % (t_end_train - t_start),
      "\nElapsed Predict time: %.1f s" % (t_end - t_end_train))

Training LogReg on Stack
Elapsed Training time: 6.1 s 
Elapsed Predict time: 0.0 s


In [265]:
cv_results



{'fit_time': array([0.70575595, 0.60740614, 0.56916618, 0.55413079, 0.55089784,
        0.58229303, 0.5561769 , 0.56042814, 0.58310485, 0.55386686]),
 'score_time': array([0.01045513, 0.00864315, 0.00819778, 0.00856209, 0.00759125,
        0.00755572, 0.00776291, 0.00739312, 0.00740504, 0.00758815]),
 'test_score': array([0.49096386, 0.54819277, 0.5       , 0.50903614, 0.46686747,
        0.52409639, 0.51506024, 0.5       , 0.53313253, 0.52710843]),
 'train_score': array([0.54250335, 0.53781794, 0.54618474, 0.54718876, 0.55087015,
        0.54484605, 0.541834  , 0.54718876, 0.54250335, 0.54317269])}

In [222]:
Y_pred_lstm

{'w2v_base_model': array([[7.52391756e-01, 3.34253572e-02, 3.46884574e-03, ...,
         1.54801635e-02, 2.49574939e-03, 1.09362903e-04],
        [8.92291218e-02, 7.26498605e-04, 3.59270684e-02, ...,
         1.00768614e-03, 6.30116642e-01, 6.39148429e-03],
        [3.40824714e-04, 7.77504581e-04, 5.06505417e-03, ...,
         2.06370965e-01, 4.92959082e-01, 7.18037947e-04],
        ...,
        [2.71791779e-03, 1.53935154e-03, 7.52969272e-03, ...,
         1.03432089e-02, 1.68368369e-01, 1.44813256e-03],
        [5.85450158e-02, 1.96871050e-02, 3.46192658e-01, ...,
         3.75147127e-02, 6.53807865e-03, 1.09878345e-03],
        [3.25757498e-03, 1.23962655e-03, 3.19494233e-02, ...,
         9.94157605e-03, 2.71991566e-02, 2.67792889e-03]], dtype=float32),
 'w2v_google_news': array([[6.6701716e-01, 9.1943191e-03, 1.6319435e-03, ..., 2.5237944e-02,
         2.0723536e-03, 1.1605593e-04],
        [2.3216338e-01, 3.1242725e-03, 1.0765996e-01, ..., 3.7268500e-03,
         4.2385617e-01, 7

In [61]:
test = model_stack.fit(X_train_stack,Y_valid)

In [115]:
a = test.classifiers_[0]

In [124]:
a.coef_[0,np.arange(0,180, 12)]

array([ 0.69109725,  3.14212075,  0.27725472,  0.        ,  3.30882647,
       -5.07801455,  1.76126742,  0.97567224,  0.02674363, -0.4414858 ,
        0.37500527, -0.15995709,  0.6882352 ,  0.13877029,  0.47648083])

In [77]:
embeddings.keys()

dict_keys(['w2v_base_model', 'w2v_google_news', 'fasttext_crawl', 'fasttext_wiki', 'glove_twitter', 'glove_wiki', 'glove_crawl'])

In [223]:
X_train_meta, X_valid_meta, Y_train_meta, Y_valid_meta = train_test_split(
    X_train_stack, Y_valid, test_size = 0.25)

In [224]:
X_train_meta.shape

(4587, 180)

In [225]:
t_start = time.time()
print("Training on Stack")

model_stack = BinaryRelevance(
    classifier = LogisticRegression(penalty='l1', solver='liblinear')
)

#cv_results = cross_validate(model_stack, X_train_stack_lstm, Y_valid, cv = 10)
model_stack.fit(X_train_meta, Y_train_meta)

t_end_train = time.time()

Y_pred_stack = model_stack.predict_proba(X_valid_meta).toarray()

# Calculate and print elapsed time
t_end = time.time()
print("Elapsed Training time: %.1f s" % (t_end_train - t_start),
      "\nElapsed Predict time: %.1f s" % (t_end - t_end_train))

Training on Stack
Elapsed Training time: 1.0 s 
Elapsed Predict time: 0.3 s


In [234]:
predictions = np.round(Y_pred_stack-0.40)

a = theme_results(Y_valid_meta, predictions)
print('average Precision: ', a.Precision.mean())
print(predictions.shape)
print(predictions[predictions.sum(axis = 1) == 0,:].shape)
a

Overall Accuracy: 0.4094 
Hamming Loss: 0.0778 
Hamming Loss (pred. zeros): 0.129
average Precision:  0.904857942114193
(1529, 12)
(512, 12)


Unnamed: 0,Label,Y_proportion,Pred_proportion,Error,Dummy_Diff,Accuarcy,Precision,Recall
0,CPD,0.139961,0.068672,0.079137,0.060824,0.920863,0.942857,0.462617
1,CB,0.164814,0.141269,0.037933,0.12688,0.962067,0.949074,0.813492
2,EWC,0.092871,0.025507,0.079137,0.013734,0.920863,0.769231,0.211268
3,Exec,0.135383,0.033355,0.103336,0.032047,0.896664,0.980392,0.241546
4,FWE,0.041203,0.028123,0.017005,0.024199,0.982995,0.930233,0.634921
5,SP,0.121648,0.052322,0.07325,0.048398,0.92675,0.9625,0.413978
6,RE,0.102681,0.025507,0.083715,0.018967,0.916285,0.871795,0.216561
7,Sup,0.151733,0.052976,0.115762,0.035971,0.884238,0.839506,0.293103
8,SW,0.206017,0.080445,0.138653,0.067364,0.861347,0.918699,0.35873
9,TEPE,0.213211,0.170046,0.062786,0.150425,0.937214,0.942308,0.751534


In [235]:
non_zero_true = Y_valid_meta[predictions.sum(axis = 1) != 0,:]
non_zero_pred = predictions[predictions.sum(axis = 1) != 0,:]

theme_results(non_zero_true, non_zero_pred)

Overall Accuracy: 0.6087 
Hamming Loss: 0.0549 
Hamming Loss (pred. zeros): 0.1318


Unnamed: 0,Label,Y_proportion,Pred_proportion,Error,Dummy_Diff,Accuarcy,Precision,Recall
0,CPD,0.155359,0.103245,0.063913,0.091445,0.936087,0.942857,0.626582
1,CB,0.222222,0.212389,0.031465,0.190757,0.968535,0.949074,0.90708
2,EWC,0.079646,0.038348,0.058997,0.020649,0.941003,0.769231,0.37037
3,Exec,0.113078,0.050147,0.064897,0.048181,0.935103,0.980392,0.434783
4,FWE,0.050147,0.042281,0.013766,0.036382,0.986234,0.930233,0.784314
5,SP,0.12586,0.078663,0.053097,0.072763,0.946903,0.9625,0.601562
6,RE,0.094395,0.038348,0.06588,0.028515,0.93412,0.871795,0.354167
7,Sup,0.127827,0.079646,0.073746,0.054081,0.926254,0.839506,0.523077
8,SW,0.207473,0.120944,0.106195,0.101278,0.893805,0.918699,0.535545
9,TEPE,0.273353,0.255654,0.047198,0.226155,0.952802,0.942308,0.881295


In [102]:
Y_valid_meta

array([[1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

### Compare Indices

In [35]:
df_results = investigate_results(df_X_valid, Y_valid, Y_pred_bow)

indices_bow = set(df_results \
               .query("correct == False") \
               .base_index \
               .values)

In [34]:
test = Y_pred_avg_wv['w2v_base_model']

In [30]:
wrong_index = {}

for embedding in embeddings.keys():
    df_results = investigate_results(df_X_valid, Y_valid, Y_pred_avg_wv[embedding])
    
    wrong_index[embedding] = set(df_results \
                                .query("correct == False") \
                                .base_index \
                                .values)

In [31]:
indices_wv = wrong_index['w2v_base_model']

for embedding in embeddings.keys():
    
    indices_wv = indices_wv.intersection(wrong_index[embedding])
    print(len(indices_wv))

2177
1862
1739
1693
1665
1641
1604


In [37]:
1 - 1053/3320

0.6828313253012048

In [58]:
1 - 1595/3320

0.5195783132530121

In [88]:
len(indices_wv.intersection(indices_bow).intersection(indices_lstm))

1053

In [53]:
df_results = investigate_results(df_X_valid, Y_valid, Y_pred_bow)

indices_bow = set(df_results \
                  .query("correct == False") \
                  .base_index \
                  .values)

In [54]:
len(indices_bow)

1823

In [218]:
wrong_index = {}

for embedding in embeddings.keys():
    df_results = investigate_results(df_X_valid, Y_valid, np.round(Y_pred_lstm[embedding]))
    
    wrong_index[embedding] = set(df_results \
                                .query("correct == False") \
                                .base_index \
                                .values)

In [None]:
investigate_results(df_X_valid, Y_valid, Y)

In [219]:
indices_lstm = wrong_index['w2v_base_model']

for embedding in embeddings.keys():
    
    indices_lstm = indices_lstm.intersection(wrong_index[embedding])
    print(len(indices_lstm))

3052
2540
2280
2148
2035
1917
1829


In [221]:
1 - 1829/6116

0.7009483322432963

In [220]:
Y_valid.shape

(6116, 12)