In [1]:
import os
import sys
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from skmultilearn.problem_transform import BinaryRelevance
from xgboost import XGBClassifier
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier


In [2]:
os.getcwd()

'C:\\Users\\fanfan\\Documents\\Capstone\\DSCI_591_capstone-BCStats\\notebooks'

In [3]:
os.chdir('../')

In [4]:
os.getcwd()

'C:\\Users\\fanfan\\Documents\\Capstone\\DSCI_591_capstone-BCStats'

In [5]:
from src.models.eval import theme_results

In [6]:
df_combined_train = pd.read_csv("data/interim/qual_combined_train.csv",index_col=0)
#df_combined_train['clean_comments'] = df_combined_train['comments'].apply(preprocess)

In [7]:
df_combined_train.head()

Unnamed: 0,comments,CPD,CB,EWC,Exec,FWE,SP,RE,Sup,SW,TEPE,VMG,OTH
0,I believe the funding the ministry receives is...,0,0,0,0,0,0,0,0,1,0,0,0
1,"I would appreciate a less chaotic, fear driven...",0,0,1,0,0,0,0,0,1,1,1,0
2,Wage increases cost of living lift freezes. ...,0,1,0,0,0,0,0,0,0,0,0,0
3,Walk the talk with respect to honesty and inte...,0,0,1,0,0,0,1,0,0,0,0,0
4,We require more funding for more FTEs.,0,0,0,0,0,0,0,0,1,0,0,0


### Split the dataset into train and validation

In [8]:
train, validation = train_test_split(df_combined_train, random_state=42, test_size=0.30, shuffle=True)

### Balance the train dataset

In [9]:
x = np.array(train.comments)
y = np.array(train.loc[:,"CPD":"OTH"])

In [10]:
counts = train.sum(axis = 0)[1:13]

In [11]:
counts

CPD     2282
CB      2897
EWC     1656
Exec    2205
FWE      901
SP      1849
RE      1669
Sup     2341
SW      3437
TEPE    3798
VMG     2523
OTH      418
dtype: object

In [13]:
sample_counts = [max(counts) - i for i in counts]

In [14]:
sample_counts

[1516, 901, 2142, 1593, 2897, 1949, 2129, 1457, 361, 0, 1275, 3380]

In [15]:
labels = list(train.columns)[1:13]
df_balanced = train

for index, label in enumerate(labels):
    df_balanced = df_balanced.append(train[train[label]==1].sample(n=sample_counts[index],
                                                             replace = True))

In [16]:
df_balanced.sum(axis = 0)

comments    Increase salaryMy 'department' usually has two...
CPD                                                      5888
CB                                                       5955
EWC                                                      5578
Exec                                                     5871
FWE                                                      4266
SP                                                       5401
RE                                                       5464
Sup                                                      6270
SW                                                       6738
TEPE                                                     6044
VMG                                                      5532
OTH                                                      4033
dtype: object

In [17]:
x_train = np.array(df_balanced['comments'])
x_valid = np.array(validation['comments'])
y_train = np.array(df_balanced.loc[:,"CPD":"OTH"])
y_valid = np.array(validation.loc[:,"CPD":"OTH"])

In [20]:
x_train

array(['Increase salary',
       "My 'department' usually has two persons in it.  I've been without a second person for months (they have several other hires to complete before posting the one that will aid me).  I've been feeling a little swamped with my work, and have gotten overwhelmed at time with work challenges, and I often feel like I'm causing delays (things take so long when you're doing the work of two!).",
       'I love my Job and the work that I do here at NCC, however in the past year I have been only working with one supervisor and his lack of leadership and poor communication skills have made it extremely difficult to feel supported and respected. I have talked to Management about my concerns to see these concerns swept under the rug. and for these reasons I could not give my WES survey the positive marks that I have done in the past. On a positive note I would like to comment that I feel our Warden is moving NCC forward. and in time we will achieve our goal to be on of

In [21]:
y_train

array([[0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 0, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 1, 0, ..., 0, 0, 1]], dtype=int64)

### Use CountVectorizer

In [22]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(strip_accents='unicode', analyzer='word', ngram_range=(1,4), 
                              stop_words="english", max_features=15000)

In [23]:
x_train = vectorizer.fit_transform(x_train)
x_valid = vectorizer.transform(x_valid)

#### 0. Dummy Classifier

In [24]:
dummy = DummyClassifier()
classifier = BinaryRelevance(classifier=dummy)
classifier.fit(x_train,y_train)
prediction = classifier.predict(x_valid).toarray()

In [25]:
theme_results(y_valid,prediction)

Overall Accuracy: 0.0204 
Hamming Loss: 0.2388 
Hamming Loss (pred. zeros): 0.1283


Unnamed: 0,Label,Y_proportion,Pred_proportion,Accuarcy,Precision,Recall
0,CPD,0.138575,0.156833,0.749012,0.141616,0.160275
1,CB,0.167325,0.170187,0.715629,0.156125,0.158795
2,EWC,0.101104,0.152064,0.780352,0.110215,0.165768
3,Exec,0.129718,0.156833,0.752419,0.12424,0.15021
4,FWE,0.056002,0.121679,0.837308,0.06159,0.13382
5,SP,0.101785,0.153563,0.780079,0.11535,0.174029
6,RE,0.099196,0.143208,0.784303,0.093245,0.134615
7,Sup,0.14021,0.176182,0.734296,0.143852,0.180758
8,SW,0.206568,0.175228,0.696416,0.223173,0.189314
9,TEPE,0.215152,0.159967,0.693827,0.215503,0.160228


In [26]:
prediction[prediction.sum(axis = 1) == 0,:].shape

(978, 12)

In [27]:
prediction.shape

(7339, 12)

#### 1. XGBoost

In [25]:
xgb = XGBClassifier()
classifier = BinaryRelevance(classifier=xgb)
classifier.fit(x_train,y_train)
prediction = classifier.predict(x_valid).toarray()

KeyboardInterrupt: 

### Oops

#### 2. LogisticRegression

In [28]:
from sklearn.linear_model import LogisticRegression
classifier = BinaryRelevance(classifier=LogisticRegression(random_state=0, solver='sag',multi_class='ovr'))
classifier.fit(x_train,y_train)
prediction = classifier.predict(x_valid).toarray()



In [29]:
theme_results(y_valid,prediction)

Overall Accuracy: 0.4299 
Hamming Loss: 0.0742 
Hamming Loss (pred. zeros): 0.1283


Unnamed: 0,Label,Y_proportion,Pred_proportion,Accuarcy,Precision,Recall
0,CPD,0.138575,0.112822,0.928464,0.797101,0.648968
1,CB,0.167325,0.150838,0.958169,0.915989,0.825733
2,EWC,0.101104,0.068538,0.924649,0.687873,0.466307
3,Exec,0.129718,0.109552,0.908162,0.672886,0.568277
4,FWE,0.056002,0.050007,0.975746,0.817439,0.729927
5,SP,0.101785,0.083935,0.938547,0.74026,0.610442
6,RE,0.099196,0.074533,0.923014,0.648995,0.487637
7,Sup,0.14021,0.10669,0.898079,0.679438,0.517007
8,SW,0.206568,0.155335,0.875187,0.763158,0.573879
9,TEPE,0.215152,0.184494,0.929554,0.892171,0.765041


#### 3.Random Forests

>A random forest is a meta estimator that fits a number of decision tree classifiers on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting. 

In [30]:
forest = RandomForestClassifier()
classifier = BinaryRelevance(forest)
classifier.fit(x_train,y_train)
prediction = classifier.predict(x_valid).toarray()



In [32]:
theme_results(y_valid,prediction)

Overall Accuracy: 0.4065 
Hamming Loss: 0.0802 
Hamming Loss (pred. zeros): 0.1283


Unnamed: 0,Label,Y_proportion,Pred_proportion,Accuarcy,Precision,Recall
0,CPD,0.138575,0.111187,0.926284,0.791667,0.635202
1,CB,0.167325,0.162284,0.956261,0.880772,0.854235
2,EWC,0.101104,0.05464,0.916474,0.660848,0.357143
3,Exec,0.129718,0.082028,0.896716,0.66113,0.418067
4,FWE,0.056002,0.043058,0.972612,0.832278,0.639903
5,SP,0.101785,0.067175,0.934051,0.766734,0.506024
6,RE,0.099196,0.046192,0.920016,0.707965,0.32967
7,Sup,0.14021,0.079984,0.892083,0.701874,0.400389
8,SW,0.206568,0.149067,0.869192,0.754113,0.544195
9,TEPE,0.215152,0.210928,0.929554,0.843023,0.826472


#### 4.Gradient Boosting classifier
>Boosting is an ensemble technique to combine a set of weak learners into a strong learner.

>GB builds an additive model in a forward stage-wise fashion; it allows for the optimization of arbitrary differentiable loss functions. 

In [26]:
from sklearn.ensemble import GradientBoostingClassifier
classifier = BinaryRelevance(classifier=GradientBoostingClassifier())
classifier.fit(x_train,y_train)
prediction = classifier.predict(x_valid).toarray()

MemoryError: 

In [None]:
Yuck!

#### Try XGBoost again

In [33]:
categories = list(train.columns.values)
categories = categories[1:]
categories

['CPD',
 'CB',
 'EWC',
 'Exec',
 'FWE',
 'SP',
 'RE',
 'Sup',
 'SW',
 'TEPE',
 'VMG',
 'OTH']

In [35]:
xgb = XGBClassifier()

prediction_df_xgb = pd.DataFrame()

for category in categories:
        
    # Training Gradient radient Tree Boosting classifier model on train data
    xgb.fit(x_train, y_train[category])
    
    # calculating test accuracy
    prediction = xgb.predict(x_valid)
    prediction_df_xgb[category] = prediction
   

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [34]:
prediction_xgb = np.array(prediction_df_xgb)

In [35]:
theme_results(y_valid,prediction_xgb)

Overall Accuracy: 0.2834 
Hamming Loss: 0.0913 
Hamming Loss (pred. zeros): 0.1536


Unnamed: 0,Label,Y_proportion,Pred_proportion,Accuarcy,Precision,Recall
0,CPD,0.159857,0.133342,0.922621,0.809273,0.67504
1,CB,0.165721,0.140162,0.948435,0.907231,0.767308
2,EWC,0.154631,0.064695,0.894002,0.875862,0.366447
3,Exec,0.16451,0.084518,0.881382,0.771493,0.396358
4,FWE,0.116961,0.089936,0.960227,0.929128,0.714441
5,SP,0.143668,0.093123,0.924979,0.868583,0.562999
6,RE,0.148639,0.079992,0.905348,0.83745,0.450686
7,Sup,0.173242,0.104851,0.89209,0.81155,0.49117
8,SW,0.182739,0.090955,0.877111,0.829012,0.412626
9,TEPE,0.163618,0.088725,0.906113,0.89296,0.484223
