# Classification Experiments

In [1]:
# Make the project root the working directory so that the `src.*` imports and
# the relative "data/..." paths used by later cells resolve regardless of the
# directory the kernel was started in.
import os


def find_project_root(start, markers=("src", "data")):
    """Return the nearest directory at or above `start` that holds all `markers`.

    Parameters
    ----------
    start : str
        Directory path to begin the upward search from.
    markers : tuple of str
        Sub-directory names that must all exist inside the returned directory.

    Returns
    -------
    str or None
        Absolute path of the matching directory, or None when the filesystem
        root is reached without a match.
    """
    current = os.path.abspath(start)
    while True:
        if all(os.path.isdir(os.path.join(current, name)) for name in markers):
            return current
        parent = os.path.dirname(current)
        if parent == current:  # reached the filesystem root
            return None
        current = parent


project_root = find_project_root(os.getcwd())
if project_root is not None:
    # Changing to the directory we are already in is harmless, so this cell can
    # be re-run safely (a bare os.chdir("..") would climb one level per run).
    os.chdir(project_root)
os.getcwd()

'/Users/aaronquinton/Documents/UBC-MDS/Capstone/BCstats/DSCI_591_capstone-BCStats'

In [19]:
import pandas as pd
import numpy as np
import nltk
import time

# Custom functions for preprocessing and data preparation
from src.data.preprocessing_text import (
    clean_text, clean_numbers, replace_typical_misspell, remove_stopwords,
    balance_themes
)

from src.features.word_vectors import (
    build_vocab, check_coverage, get_average_embeddings
)

from src.models.eval import theme_results

# Functions for preprocessing and data preparation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics


# Classification algorithms
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression


## <span style = "color:Darkblue"> Preprocessing Data & Feature Extraction </span>

In [3]:
# Load the 2018 training comments, keep only the free-text column plus the
# columns spanning 'CPD' through 'OTH', and give the text column a short name.
df = pd.read_csv("data/interim/train_2018-qualitative-data.csv")
label_columns = df.loc[:, 'CPD':'OTH']
df = (
    df[['2018 Comment']]
    .join(label_columns)
    .rename(columns={'2018 Comment': 'comment'})
)

In [4]:
# Clean the comment text in three passes, applied in this order:
# clean_text, clean_numbers, replace_typical_misspell.
# NOTE(review): `progress_apply` comes from tqdm's pandas integration, which is
# not registered anywhere in this notebook — presumably one of the `src`
# imports calls `tqdm.pandas()`; confirm.
for cleaner in (clean_text, clean_numbers, replace_typical_misspell):
    df["comment"] = df["comment"].progress_apply(cleaner)


100%|██████████| 13278/13278 [00:00<00:00, 63450.65it/s]
100%|██████████| 13278/13278 [00:00<00:00, 32687.05it/s]
100%|██████████| 13278/13278 [00:00<00:00, 47329.10it/s]


### Bag of Words

In [5]:
# Comment text goes in X; the columns 'CPD' through 'OTH' form the target matrix Y.
X = np.array(df["comment"])
Y = np.array(df.loc[:, "CPD":"OTH"])

# Hold out 25% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_valid, Y_train, Y_valid = train_test_split(
    X, Y, test_size=0.25, random_state=2019
)

In [6]:
# Bag-of-words features: counts of 1- to 4-grams with English stop words
# removed, capped at 15000 features.
vectorizer = CountVectorizer(
    stop_words='english',
    ngram_range=(1, 4),
    max_features=15000,
)

# Learn the vocabulary on the training split only, then reuse it for validation.
X_train_bow = vectorizer.fit_transform(X_train)
X_valid_bow = vectorizer.transform(X_valid)

In [7]:
#X_train_bow, Y_train = balance_themes(X_train_bow.toarray(), Y_train)

### Average Word Vectors

In [8]:
# Load the pretrained word vectors used for the averaged word-vector features.
# The file is read in word2vec binary format from the local `references/`
# directory (relative to the current working directory).
from gensim.models import KeyedVectors

news_path = "./references/GoogleNews-vectors-negative300.bin"
google_news = KeyedVectors.load_word2vec_format(news_path, binary=True)

In [9]:
# Split each cleaned comment on whitespace, then drop stop words.
sentences = remove_stopwords(
    df["comment"].apply(lambda comment: comment.split())
)

# Cross-check the resulting vocabulary against the google_news embeddings.
vocab = build_vocab(sentences)
oov = check_coverage(vocab, google_news)

# Display the first ten out-of-vocabulary entries.
oov[:10]

100%|██████████| 13278/13278 [00:00<00:00, 95443.37it/s]
100%|██████████| 17246/17246 [00:03<00:00, 4716.57it/s]

Found embeddings for 93.99% of vocab
Found embeddings for  99.67% of all text





[('CYMH', 54),
 ('FLNRORD', 35),
 ('GCPE', 33),
 ('CSNR', 32),
 ('BCWS', 23),
 ('MIRR', 20),
 ('STIIP', 20),
 ('CVSE', 19),
 ('MyPerformance', 18),
 ('FLNRO', 17)]

In [10]:
# One averaged embedding per comment, stacked into a single feature array.
averaged = [
    get_average_embeddings(tokens, embeddings_index=google_news)
    for tokens in sentences
]
X_wv = np.array(averaged)
Y = np.array(df.loc[:, "CPD":"OTH"])

# Same test_size and random_state as the bag-of-words split, so the rows
# should be partitioned identically and Y_train / Y_valid keep the same content.
X_train_wv, X_valid_wv, Y_train, Y_valid = train_test_split(
    X_wv, Y, test_size=0.25, random_state=2019
)

## <span style = "color:Darkblue"> Classification Models </span>
### Baseline Classifier - BOW & Linear SVC 

In [29]:
# ------------------------------------------------------------------------------
# Classifier 1: BinaryRelevance around LinearSVC on the bag-of-words features
# ------------------------------------------------------------------------------
t_start = time.time()
print("Training Classifier 1")

# Fit on the training split
clf1 = BinaryRelevance(classifier=LinearSVC())
clf1.fit(X_train_bow, Y_train)
t_end_train = time.time()

# Predict on the validation split and convert the result to a dense array
Y_pred1 = clf1.predict(X_valid_bow).toarray()
t_end = time.time()

# Report wall-clock time spent in fit and in predict
train_secs = t_end_train - t_start
predict_secs = t_end - t_end_train
print("Elapsed Training time: %.1f s" % train_secs,
      "\nElapsed Predict time: %.1f s" % predict_secs)

Training Classifier 1
Elapsed Training time: 24.0 s 
Elapsed Predict time: 6.3 s


In [34]:
# Evaluate the baseline (Classifier 1) predictions against the validation labels.
theme_results(Y_valid, Y_pred1)

Overall Accuracy: 0.4337 
Hamming Loss: 0.0775 
Hamming Loss (pred. zeros): 0.1191


Unnamed: 0,Label,Y_proportion,Pred_proportion,Error,Dummy_Diff,Accuarcy,Precision,Recall
0,CPD,0.12741,0.115663,0.07741,0.05,0.92259,0.716146,0.650118
1,CB,0.184639,0.17741,0.048193,0.136446,0.951807,0.88455,0.849918
2,EWC,0.084337,0.065964,0.070783,0.013554,0.929217,0.60274,0.471429
3,Exec,0.103012,0.092771,0.088554,0.014458,0.911446,0.577922,0.520468
4,FWE,0.062048,0.057229,0.027108,0.03494,0.972892,0.805263,0.742718
5,SP,0.096386,0.090964,0.072892,0.023494,0.927108,0.629139,0.59375
6,RE,0.085542,0.070482,0.081928,0.003614,0.918072,0.525641,0.433099
7,Sup,0.127711,0.120783,0.115361,0.012349,0.884639,0.551122,0.521226
8,SW,0.165964,0.146988,0.125602,0.040361,0.874398,0.637295,0.564428
9,TEPE,0.228614,0.216867,0.073795,0.154819,0.926205,0.856944,0.812912


In [13]:
# Select the validation rows for which the baseline classifier predicted no
# label at all (row sum of zero); the first element of the shape is how many
# such rows there are. Uses Y_pred1 from the Classifier 1 cell — the bare name
# `Y_pred` is never defined in this notebook.
Y_pred1[Y_pred1.sum(axis=1) == 0, :].shape

(396, 12)

### Classifier 2 - BOW & Ensemble 

In [26]:
# Base estimators for the ensemble.
clf2a = LinearSVC()
clf2b = RandomForestClassifier(n_estimators=50, random_state=1)
clf2c = LogisticRegression(solver='lbfgs')

# Hard voting: each estimator casts one vote per sample and the majority wins.
# The logistic regression is registered as 'lr' so the estimator name matches
# the model (no naive Bayes classifier is part of this ensemble).
eclf = VotingClassifier(
    estimators=[('svc', clf2a), ('rf', clf2b), ('lr', clf2c)],
    voting='hard',
)

In [28]:
# ------------------------------------------------------------------------------
# Classifier 2: BinaryRelevance around the voting ensemble on bag-of-words
# ------------------------------------------------------------------------------
t_start = time.time()
print("Training Classifier 2")

# Fit on the training split
clf2 = BinaryRelevance(classifier=eclf)
clf2.fit(X_train_bow, Y_train)
t_end_train = time.time()

# Predict on the validation split and convert the result to a dense array
Y_pred2 = clf2.predict(X_valid_bow).toarray()
t_end = time.time()

# Report wall-clock time spent in fit and in predict
train_secs = t_end_train - t_start
predict_secs = t_end - t_end_train
print("Elapsed Training time: %.1f s" % train_secs,
      "\nElapsed Predict time: %.1f s" % predict_secs)

Training Classifier 2




Elapsed Training time: 1619.2 s 
Elapsed Predict time: 17.3 s


In [35]:
# Evaluate the ensemble (Classifier 2) predictions against the validation labels.
# NOTE(review): Y_pred2 only exists after the Classifier 2 training cell has run
# in the current kernel; the saved NameError output was presumably produced in a
# session where it had not — re-run that cell first, then this one.
theme_results(Y_valid, Y_pred2)

NameError: name 'Y_pred2' is not defined

### Classifier 3 - WV & LinearSVC

In [32]:
# ------------------------------------------------------------------------------
# Classifier 3: BinaryRelevance around LinearSVC on averaged word vectors
# ------------------------------------------------------------------------------
t_start = time.time()
print("Training Classifier 3")

# Fit on the training split
clf3 = BinaryRelevance(classifier=LinearSVC())
clf3.fit(X_train_wv, Y_train)
t_end_train = time.time()

# Predict on the validation split and convert the result to a dense array
Y_pred3 = clf3.predict(X_valid_wv).toarray()
t_end = time.time()

# Report wall-clock time spent in fit and in predict
train_secs = t_end_train - t_start
predict_secs = t_end - t_end_train
print("Elapsed Training time: %.1f s" % train_secs,
      "\nElapsed Predict time: %.1f s" % predict_secs)

Training Classifier 3
Elapsed Training time: 10.5 s 
Elapsed Predict time: 0.1 s


In [36]:
# Evaluate the word-vector (Classifier 3) predictions against the validation labels.
theme_results(Y_valid, Y_pred3)

Overall Accuracy: 0.4021 
Hamming Loss: 0.0821 
Hamming Loss (pred. zeros): 0.1191


Unnamed: 0,Label,Y_proportion,Pred_proportion,Error,Dummy_Diff,Accuarcy,Precision,Recall
0,CPD,0.12741,0.06506,0.093072,0.034337,0.906928,0.763889,0.390071
1,CB,0.184639,0.143976,0.071386,0.113253,0.928614,0.893305,0.696574
2,EWC,0.084337,0.021386,0.073795,0.010542,0.926205,0.746479,0.189286
3,Exec,0.103012,0.040964,0.086145,0.016867,0.913855,0.705882,0.280702
4,FWE,0.062048,0.033133,0.040964,0.021084,0.959036,0.818182,0.436893
5,SP,0.096386,0.04006,0.076807,0.019578,0.923193,0.744361,0.309375
6,RE,0.085542,0.014458,0.080723,0.004819,0.919277,0.666667,0.112676
7,Sup,0.127711,0.043373,0.110843,0.016867,0.889157,0.694444,0.235849
8,SW,0.165964,0.072289,0.137651,0.028313,0.862349,0.695833,0.303085
9,TEPE,0.228614,0.198193,0.078614,0.15,0.921386,0.878419,0.761528
