<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Split-data-into-X,-y-variables" data-toc-modified-id="Split-data-into-X,-y-variables-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Split data into X, y variables</a></span></li><li><span><a href="#Define-pipelines" data-toc-modified-id="Define-pipelines-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Define pipelines</a></span></li><li><span><a href="#Test-fit-and-evaluate-models" data-toc-modified-id="Test-fit-and-evaluate-models-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Test fit and evaluate models</a></span><ul class="toc-item"><li><span><a href="#Binary-Vectorization,-Logistic-Regression-classifier" data-toc-modified-id="Binary-Vectorization,-Logistic-Regression-classifier-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Binary Vectorization, Logistic Regression classifier</a></span></li><li><span><a href="#Binary-Vectorization" data-toc-modified-id="Binary-Vectorization-3.2"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>Binary Vectorization</a></span></li><li><span><a href="#TfIdf-Vectorization,-Logistic-Regression-classifier" data-toc-modified-id="TfIdf-Vectorization,-Logistic-Regression-classifier-3.3"><span class="toc-item-num">3.3&nbsp;&nbsp;</span>TfIdf Vectorization, Logistic Regression classifier</a></span></li></ul></li><li><span><a href="#Predictions-and-results" data-toc-modified-id="Predictions-and-results-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Predictions and results</a></span></li></ul></div>

 # Train classifier based on semi-supervised labels

In [14]:
import numpy as np
import pandas as pd
import re
import copy

import random
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from sklearn.metrics import classification_report, recall_score
from sklearn.pipeline import Pipeline 

from nltk.tokenize import TreebankWordTokenizer
from nltk.tokenize import sent_tokenize, PunktSentenceTokenizer


In [60]:
nonum_pat = re.compile(r"^([^\d]|[$][\d]+)[^\d]*$", re.I)

In [76]:
train_df = pd.read_csv('data/classifier_input_train.csv', index_col=0)
train_df = train_df[train_df.len > 19][train_df.para_text.apply(lambda x: not re.match(nonum_pat, x))]
val_df = pd.read_csv('data/classifier_input_val.csv', index_col=0)
val_df = val_df[val_df.len > 19][val_df.para_text.apply(lambda x: not re.match(nonum_pat, x))]

  
  after removing the cwd from sys.path.


## Split data into X, y variables

Tokenize dollar values as dollar tokens, numerics as numeric tokens, but leave years as they are

In [78]:
year_pat = re.compile(r"([^0-9])((?:20|19)[0,1,2,9][0-9])([^0-9])")
dollar_pat = re.compile(r"[$]([0-9]{1,3},)*[0-9]{1,3}[.]?[0-9]{,4}")
num_pat = re.compile(
r"(y(?:20|19)[0,1,2,9][0-9])|(?:(?P<bound1>[\s,.])(?:(?:[0-9]{1,3}[,])*(?:[0-9]{1,3}))(?:[.][0-9]{1,4})?[%]?(?P<bound2>[\s,.]))")
year_fix_pat = re.compile(r"y((?:20|19)[0,1,2,9][0-9])num_tok")

def replace_numeric_toks(s):
    s1 = re.sub(year_pat, r'\1y\2\3', s )
    s2 = re.sub(dollar_pat, r'dollar_tok', s1)
    s3 = re.sub(num_pat, r'\1\g<bound1>num_tok\g<bound2>', s2)
    s4 = re.sub(year_fix_pat, r'\1', s3)
    return s4

In [79]:
def get_x_text(df, text_col = 'para_text', text_prep_func = replace_numeric_toks):
    """Return column of text ready for vectorization"""
    x_text = df[text_col].apply(text_prep_func)
    return x_text

def get_x_y(df, x_cols = ['para_text'], y_col = 'label'):
    """Split df into X,y, performing transformations as needed"""
    X_text = get_x_text(df)
    y = df[y_col]
    X =X_text 
    return X,y

In [80]:
X_train, y_train = get_x_y(train_df)

In [81]:
X_val, y_val = get_x_y(val_df)

Function to cross-validate models

In [15]:
def cv_acc(model, X, y, cv=5, scoring='accuracy'):
    cv_dict = {}
    cvs = cross_val_score(model, X, y, cv=cv, scoring=scoring)
    cv_dict['cv_mean'] = np.mean(cvs)
    cv_dict['cvs'] = cvs
    return cv_dict

In [16]:
model_params_dict = {}
model_cvs = {}

## Define pipelines

In [17]:
count_vec = CountVectorizer(ngram_range=(1,6), max_df=0.6, min_df=.025, max_features=5000)
bin_vec = CountVectorizer(ngram_range=(1,6), max_df=0.2, min_df=.01, binary=True, max_features=5000)
tfidf_vec = TfidfVectorizer(ngram_range=(1,6), max_df=0.62, min_df=.025, max_features=5000)

# Define a pipeline combining a text feature extractor with a simple
# classifier

# Logistic Regression 

lr_bin_pl = Pipeline([
    ('vec', bin_vec),
    ('lr', LogisticRegression(random_state=14, max_iter=1000))
])

lr_count_pl = Pipeline([
    ('vec', count_vec),
    ('lr', LogisticRegression(random_state=14, max_iter=1000))
])

lr_tfidf_pl = Pipeline([
    ('vec', tfidf_vec),
    ('lr', LogisticRegression(random_state=14, max_iter=1000))
])

## Test fit and evaluate models

### Binary Vectorization, Logistic Regression classifier

In [46]:
    # Result: 
#    'vec__max_df': (.5, .75, .9), 
#    'vec__min_df': (.01, .05, .10, .20),
#    'vec__ngram_range' : [(1,6)]

    # {'vec__max_df': 0.9, 'vec__min_df': 0.01, 'vec__ngram_range': (1, 6)}
# max GS test score: 0.908048417923126
#{'cv_mean': 0.8742884854235194, 'cvs': array([0.86956522, 0.87685775, 0.8566879 , 0.89904357, 0.86928799])}
# val score: 0.9275747508305647


#    'vec__max_df': (.9, .95), 
#    'vec__min_df': (.005, .01),
#    'vec__ngram_range' : [(1,5), (1,6), (1,7), (1,8)]
# {'vec__max_df': 0.9, 'vec__min_df': 0.005, 'vec__ngram_range': (1, 5)}
# max GS test score: 0.9125079634741984
# {'cv_mean': 0.9093241854291755, 'cvs': array([0.90774125, 0.91082803, 0.90339703, 0.91710946, 0.90754516])}7
# val score: 0.93421926910299

#    'vec__max_df': (.85, .9, .95), 
#    'vec__min_df': (.005, .01),
#    'vec__ngram_range' : [(1,4), (1,5)]
#{'vec__max_df': 0.85, 'vec__min_df': 0.005, 'vec__ngram_range': (1, 4)}
# max GS test score: 0.9135697600339775
# {'cv_mean': 0.9105967194151221, 'cvs': array([0.91304348, 0.91082803, 0.90552017, 0.92029756, 0.90329437])}
# val score: 0.9335548172757475
param_grid_bin = {
    'vec__max_df': (.85, .9, .95), 
    'vec__min_df': (.005, .01),
    'vec__ngram_range' : [(1,4), (1,5)]

    
    #'vec__ngram_range' : [(1,5), (1,6), (1,7)]
}

### Binary Vectorization

In [47]:
lr_bin_gs = GridSearchCV(lr_bin_pl, param_grid=param_grid_bin, cv=4, return_train_score=True)

In [48]:
lr_bin_gs.fit(X_train, y_train)

GridSearchCV(cv=4, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vec', CountVectorizer(analyzer='word', binary=True, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.9, max_features=5000, min_df=0.005,
        ngram_range=(1, 5), preprocessor=None, stop_words=None,
        str...alty='l2', random_state=14, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'vec__max_df': (0.85, 0.9, 0.95), 'vec__min_df': (0.005, 0.01), 'vec__ngram_range': [(1, 4), (1, 5)]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [49]:
lr_bin_gs.cv_results_['mean_test_score'].max()

0.9135697600339775

In [50]:
# Best hyperparameter settings, which were found by the GridSearchCV - want to keep this:
model_params_dict['lr_bin'] = lr_bin_gs.best_params_
lr_bin_gs.best_params_

{'vec__max_df': 0.85, 'vec__min_df': 0.005, 'vec__ngram_range': (1, 4)}

In [51]:
lr_bin_pl.set_params(**model_params_dict['lr_bin'])
lr_bin_pl.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('vec', CountVectorizer(analyzer='word', binary=True, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.85, max_features=5000, min_df=0.005,
        ngram_range=(1, 4), preprocessor=None, stop_words=None,
        st...alty='l2', random_state=14, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

Get and print model cross validation scores:

In [52]:
model_cvs['lr_bin'] = cv_acc(lr_bin_pl, X_train, y_train)
print(model_cvs['lr_bin'])

{'cv_mean': 0.9105967194151221, 'cvs': array([0.91304348, 0.91082803, 0.90552017, 0.92029756, 0.90329437])}


Against the unseen holdout/validation set:

In [53]:
lr_bin_pl.score(X_val, y_val)

0.9335548172757475

### TfIdf Vectorization, Logistic Regression classifier

In [19]:
param_grid_tfidf = {
    'vec__max_df': (0.5, .7, .9),
    'vec__min_df': (.005, .01, .015, .025),
    'vec__ngram_range' : [(1,6)]
}

In [39]:
lr_tfidf_gs = GridSearchCV(lr_tfidf_pl, param_grid=param_grid_tfidf, cv=4, return_train_score=True)

In [None]:
lr_tfidf_gs.fit(X_train, y_train)

In [None]:
lr_tfidf_gs.['mean_test_score'].max()

In [None]:
# Best hyperparameter settings, which were found by the GridSearchCV - want to keep this:
model_params_dict['lr_tfidf'] = lr_tfidf_gs.best_params_
lr_tfidf_gs.best_params_

In [None]:
lr_tfidf_pl.set_params(**model_params_dict['lr_tfidf'])
lr_tfidf_pl.fit(X_train, y_train)

In [None]:
model_cvs['lr_tfidf'] = cv_acc(lr_tfidf_pl, X_train, y_train)
print(model_cvs['lr_tfidf'])

## Predictions and results

In [83]:
y_pred = lr_bin_pl.predict(X_train)
y_val_pred = lr_bin_pl.predict(X_val)
y_proba = lr_bin_pl.predict_proba(X_train)

In [84]:
train_df['rel_proba'] = y_proba[:,1].tolist()

In [55]:
print(classification_report(y_val, y_val_pred))

             precision    recall  f1-score   support

          0       0.87      0.86      0.87       385
          1       0.95      0.96      0.96      1120

avg / total       0.93      0.93      0.93      1505



In [91]:
len(X_train)

4066

In [94]:
misses = y_pred != y_train
#X_test_misses = [X_test[i] for i in range(0, len(misses)) if misses[i] == True]
X_misses = X_train[misses].tolist()
len(X_misses)

11

In [96]:
for idx, tup in enumerate(zip(X_misses, y_proba[misses][1].tolist(), y_train[misses])):
    print("--------------------    " + str(idx) + "    --------------------" )
    print("True value :  " + str(tup[2]))
    print("Predicted probability :  " + str(tup[1]))
    print(tup[0])

--------------------    0    --------------------
True value :  0
Predicted probability :  0.3458700339515848
5.num_tok  Employees Eligible for Matching Contributions
--------------------    1    --------------------
True value :  0
Predicted probability :  0.6541299660484152
15.num_tok  Employees Covered by Collective Bargaining Agreement


In [56]:
coef_bin_df = pd.DataFrame.from_records(list(zip(lr_bin_pl.named_steps.vec.get_feature_names(), 
                                             lr_bin_pl.named_steps.lr.coef_.tolist()[0])),
                                   columns = ['token', 'coef']).sort_values('coef', ascending=False).reset_index(drop=True)

In [59]:
coef_bin_df.tail(20)

Unnamed: 0,token,coef
4004,associates and,-0.64192
4005,another,-0.647143
4006,remain,-0.649336
4007,november num_tok,-0.669
4008,by collective bargaining agreement,-0.671258
4009,hold,-0.67172
4010,than num_tok people,-0.673273
4011,many,-0.684738
4012,upon,-0.689394
4013,senior,-0.696885


In [86]:
def print_row_detail(df=train_df, nrow=10, header_list = ['acc_id'],
                    detail_list = [ 'len', 'first_emp_head_block', 'split' ,'para_text'],
                    sortby=['acc_id', 'first_emp_head_block', 'len'], ascending=False):
    df_sorted = df.sort_values(sortby, ascending=ascending).reset_index()
    nrow = min(len(df_sorted), nrow)
    for i in range(0, nrow):
        for h in header_list:
            print('-'*35  + ' ' +  str(df_sorted[h][i]) + ' ' + '-'*35)
        for d in detail_list:
            print(d + '  :' + str(df_sorted[d][i]))
            print('')

In [87]:
print_row_detail(df=train_df, nrow=20, header_list = ['acc_id'],
                    detail_list = ['label', 'rel_proba', 'para_text'],
                    sortby=['acc_id', 'rel_proba'], ascending=True)

----------------------------------- 0000004127-16-000068 -----------------------------------
label  :1

rel_proba  :0.9999990903768085

para_text  :As of September 30, 2016, we employed approximately 7,300 employees world-wide. Approximately 860 of our employees in Mexico, 450 employees in Singapore, and 200 employees in Japan are covered by collective bargaining and other union agreements.

----------------------------------- 0000004904-17-000019 -----------------------------------
label  :1

rel_proba  :0.992104039502212

para_text  :AEP also owns a service company subsidiary, AEPSC. AEPSC provides accounting, administrative, information systems, engineering, financial, legal, maintenance and other services at cost to AEP subsidiaries. The executive officers of AEP and certain of its public utility subsidiaries are employees of AEPSC. As of December 31, 2016, AEPSC had 5,805 employees.

----------------------------------- 0000004904-17-000019 -----------------------------------
label