## Train classifier based on semi-supervised labels

In [14]:
import numpy as np
import pandas as pd
import re
import copy

import random
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from sklearn.metrics import classification_report, recall_score
from sklearn.pipeline import Pipeline 

from nltk.tokenize import TreebankWordTokenizer
from nltk.tokenize import sent_tokenize, PunktSentenceTokenizer


In [22]:
# Read the train / validation inputs (first CSV column is the index) and keep
# only rows whose `len` column is greater than 19.
# NOTE(review): `len` is presumably a text-length column -- confirm against the
# script that writes these CSVs.
train_df = (
    pd.read_csv('data/classifier_input_train.csv', index_col=0)
    .loc[lambda frame: frame['len'] > 19]
)
val_df = (
    pd.read_csv('data/classifier_input_val.csv', index_col=0)
    .loc[lambda frame: frame['len'] > 19]
)

Replace dollar amounts with a `dollar_tok` token and other numbers with a `num_tok` token, but leave four-digit years as they are.

In [9]:
# A four-digit year (19xx / 20xx whose third digit is 0, 1, 2 or 9) with a
# non-digit character on each side. The third-digit class is written [0129]:
# the earlier spelling [0,1,2,9] also admitted a literal comma, so text such
# as "19,2" was mistaken for a year.
year_pat = re.compile(r"([^0-9])((?:20|19)[0129][0-9])([^0-9])")

# A dollar amount such as $5, $1,200 or $3.75.
dollar_pat = re.compile(r"[$]([0-9]{1,3},)*[0-9]{1,3}[.]?[0-9]{,4}")

# Either a protected year marker "y2019" (group 1), or a plain number
# (optional thousands separators, optional decimals, optional trailing %)
# that follows a whitespace / comma / period (group `bound1`) and is followed
# by one. The trailing boundary is a lookahead so it is NOT consumed: it can
# then serve as the leading boundary of the next number ("12 34 56").
num_pat = re.compile(
    r"(y(?:20|19)[0129][0-9])"
    r"|(?:(?P<bound1>[\s,.])(?:(?:[0-9]{1,3}[,])*(?:[0-9]{1,3}))(?:[.][0-9]{1,4})?[%]?(?=[\s,.]))")

# Undoes the year protection once numbers have been replaced.
year_fix_pat = re.compile(r"y((?:20|19)[0129][0-9])num_tok")


def replace_numeric_toks(s):
    """Replace dollar amounts and numbers in ``s`` with placeholder tokens.

    Steps, in order:
      1. Years matched by ``year_pat`` are prefixed with ``y`` to protect them.
      2. Dollar amounts become ``dollar_tok``.
      3. Remaining delimited numbers (including percentages) become
         ``num_tok``; protected years become ``y<year>num_tok``.
      4. ``y<year>num_tok`` is turned back into the bare year.

    Parameters
    ----------
    s : str
        Text to normalise.

    Returns
    -------
    str
        ``s`` with dollar amounts and numbers replaced; years are left as is.
    """
    marked = year_pat.sub(r'\1y\2\3', s)
    without_dollars = dollar_pat.sub('dollar_tok', marked)
    # Only one alternative of num_pat matches at a time; the unmatched group
    # expands to an empty string in the template (Python 3.5+).
    tokenized = num_pat.sub(r'\1\g<bound1>num_tok', without_dollars)
    return year_fix_pat.sub(r'\1', tokenized)

In [10]:
def get_x_text(df, text_col = 'para_text', text_prep_func = replace_numeric_toks):
    """Apply ``text_prep_func`` to every entry of ``df[text_col]``.

    Returns the transformed column, i.e. the text that is later passed to
    the vectorizers.
    """
    raw_text = df[text_col]
    return raw_text.apply(text_prep_func)

def get_x_y(df, x_cols = ['para_text'], y_col = 'label'):
    """Split ``df`` into X (prepared text) and y (labels).

    Parameters
    ----------
    df : DataFrame
        Must contain the text column named by ``x_cols`` and the ``y_col`` column.
    x_cols : str or one-element sequence of str
        Name of the text column to prepare. Previously this argument was
        accepted but ignored; it is now passed on to ``get_x_text``.
        (The default list is never mutated.)
    y_col : str
        Name of the label column.

    Returns
    -------
    (X, y)
        ``X`` is the text column after ``get_x_text`` preprocessing,
        ``y`` is ``df[y_col]``.

    Raises
    ------
    ValueError
        If ``x_cols`` does not name exactly one column; only a single text
        column can be prepared here.
    """
    text_cols = [x_cols] if isinstance(x_cols, str) else list(x_cols)
    if len(text_cols) != 1:
        raise ValueError(
            "get_x_y expects exactly one text column in x_cols, got %r" % (x_cols,)
        )
    X = get_x_text(df, text_col=text_cols[0])
    y = df[y_col]
    return X, y

In [None]:
# Training split: preprocessed text (numeric tokens replaced) and its labels.
X_train, y_train = get_x_y(train_df)

In [34]:
# Validation split, prepared with the same preprocessing as the training split.
X_val, y_val = get_x_y(val_df)

Function to cross-validate models

In [15]:
def cv_acc(model, X, y, cv=5, scoring='accuracy'):
    """Cross-validate ``model`` on ``(X, y)`` and summarise the fold scores.

    Returns a dict with two entries:
      'cv_mean' -- mean of the per-fold scores
      'cvs'     -- the per-fold scores returned by ``cross_val_score``
    """
    fold_scores = cross_val_score(model, X, y, cv=cv, scoring=scoring)
    return {'cv_mean': np.mean(fold_scores), 'cvs': fold_scores}

In [16]:
# Per-model registries keyed by model name (e.g. 'lr_bin', 'lr_tfidf'):
# model_params_dict holds the best grid-search parameters,
# model_cvs holds the cross-validation summaries produced by cv_acc.
model_params_dict, model_cvs = {}, {}

In [17]:
# Three text feature extractors, all over word 1- to 6-grams with the
# vocabulary capped at 5000 features: raw counts, binary presence, and TF-IDF.
count_vec = CountVectorizer(
    ngram_range=(1,6), max_df=0.6, min_df=.025, max_features=5000,
)
bin_vec = CountVectorizer(
    ngram_range=(1,6), max_df=0.2, min_df=.01, binary=True, max_features=5000,
)
tfidf_vec = TfidfVectorizer(
    ngram_range=(1,6), max_df=0.62, min_df=.025, max_features=5000,
)


def _lr_pipeline(vectorizer):
    """Chain ``vectorizer`` (step 'vec') with a logistic-regression classifier (step 'lr')."""
    return Pipeline([
        ('vec', vectorizer),
        ('lr', LogisticRegression(random_state=14, max_iter=1000)),
    ])


# Logistic Regression pipelines, one per feature extractor.
lr_bin_pl = _lr_pipeline(bin_vec)
lr_count_pl = _lr_pipeline(count_vec)
lr_tfidf_pl = _lr_pipeline(tfidf_vec)

In [38]:
# Result of an earlier grid search over:
#    'vec__max_df': (.5, .75, .9),
#    'vec__min_df': (.01, .05, .10, .20),
#    'vec__ngram_range' : [(1,6)]
#
# best params: {'vec__max_df': 0.9, 'vec__min_df': 0.01, 'vec__ngram_range': (1, 6)}
# max GS test score: 0.908048417923126
# {'cv_mean': 0.8742884854235194, 'cvs': array([0.86956522, 0.87685775, 0.8566879 , 0.89904357, 0.86928799])}
# val score: 0.9275747508305647
# Search space for the binary-vectorizer pipeline; keys address the 'vec' step.
param_grid_bin = dict(
    vec__max_df=(.9, .95),
    vec__min_df=(.005, .01),
    vec__ngram_range=[(1,5), (1,6), (1,7), (1,8)],
    # alternative left disabled: 'vec__ngram_range' : [(1,5), (1,6), (1,7)]
)

### Binary Vectorization

In [None]:
# Grid search over param_grid_bin for the binary-vectorizer pipeline, using
# 4-fold cross-validation and recording training-fold scores as well.
lr_bin_gs = GridSearchCV(
    estimator=lr_bin_pl,
    param_grid=param_grid_bin,
    cv=4,
    return_train_score=True,
)

In [40]:
# Run the grid search on the training data (every combination in
# param_grid_bin is cross-validated).
lr_bin_gs.fit(X_train, y_train)

GridSearchCV(cv=4, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vec', CountVectorizer(analyzer='word', binary=True, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.9, max_features=5000, min_df=0.01,
        ngram_range=(1, 6), preprocessor=None, stop_words=None,
        stri...alty='l2', random_state=14, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'vec__max_df': (0.9, 0.95), 'vec__min_df': (0.005, 0.01), 'vec__ngram_range': [(1, 5), (1, 6), (1, 7), (1, 8)]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [41]:
# Highest mean cross-validated test score over all parameter combinations.
lr_bin_gs.cv_results_['mean_test_score'].max()

0.9125079634741984

In [42]:
# Best hyperparameter settings, which were found by the GridSearchCV - want to keep this:
# Record the best hyperparameters found by the grid search under 'lr_bin'
# so they can be re-applied to lr_bin_pl, then display them.
model_params_dict['lr_bin'] = lr_bin_gs.best_params_
lr_bin_gs.best_params_

{'vec__max_df': 0.9, 'vec__min_df': 0.005, 'vec__ngram_range': (1, 5)}

In [36]:
# Apply the stored best hyperparameters to the standalone pipeline and fit it
# on the full training set.
# NOTE(review): this cell's execution count (36) is lower than the grid-search
# cells (40-42), and the saved output shows min_df=0.01 / ngram_range=(1, 6)
# rather than the best_params_ displayed above. Re-run top to bottom to
# confirm which parameters the reported scores belong to.
lr_bin_pl.set_params(**model_params_dict['lr_bin'])
lr_bin_pl.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('vec', CountVectorizer(analyzer='word', binary=True, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.9, max_features=5000, min_df=0.01,
        ngram_range=(1, 6), preprocessor=None, stop_words=None,
        stri...alty='l2', random_state=14, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

Get and print model cross validation scores:

In [32]:
# Cross-validate the tuned pipeline on the training data with cv_acc's
# defaults (cv=5, scoring='accuracy') and keep the summary under 'lr_bin'.
model_cvs['lr_bin'] = cv_acc(lr_bin_pl, X_train, y_train)
print(model_cvs['lr_bin'])

{'cv_mean': 0.8742884854235194, 'cvs': array([0.86956522, 0.87685775, 0.8566879 , 0.89904357, 0.86928799])}


Against the unseen holdout/validation set:

In [37]:
# Score the fitted pipeline on the validation split (for a classifier
# pipeline, `score` reports mean accuracy).
lr_bin_pl.score(X_val, y_val)

0.9275747508305647

### TF-IDF Vectorization

In [19]:
# Search space for the TF-IDF pipeline; keys address the 'vec' step.
# The n-gram range is held fixed at (1, 6).
param_grid_tfidf = dict(
    vec__max_df=(0.5, .7, .9),
    vec__min_df=(.005, .01, .015, .025),
    vec__ngram_range=[(1,6)],
)

In [39]:
# Grid search over param_grid_tfidf for the TF-IDF pipeline, using 4-fold
# cross-validation and recording training-fold scores as well.
lr_tfidf_gs = GridSearchCV(
    estimator=lr_tfidf_pl,
    param_grid=param_grid_tfidf,
    cv=4,
    return_train_score=True,
)

In [None]:
# Run the TF-IDF grid search on the training data.
lr_tfidf_gs.fit(X_train, y_train)

In [None]:
# Highest mean cross-validated test score over all parameter combinations.
# (The attribute name `cv_results_` was missing, which made this cell a
# syntax error.)
lr_tfidf_gs.cv_results_['mean_test_score'].max()

In [None]:
# Best hyperparameter settings, which were found by the GridSearchCV - want to keep this:
# Record the best hyperparameters found by the grid search under 'lr_tfidf'
# so they can be re-applied to lr_tfidf_pl, then display them.
model_params_dict['lr_tfidf'] = lr_tfidf_gs.best_params_
lr_tfidf_gs.best_params_

In [None]:
# Apply the stored best hyperparameters to the standalone TF-IDF pipeline and
# fit it on the full training set.
lr_tfidf_pl.set_params(**model_params_dict['lr_tfidf'])
lr_tfidf_pl.fit(X_train, y_train)

In [None]:
# Cross-validate the tuned TF-IDF pipeline on the training data with cv_acc's
# defaults (cv=5, scoring='accuracy') and keep the summary under 'lr_tfidf'.
model_cvs['lr_tfidf'] = cv_acc(lr_tfidf_pl, X_train, y_train)
print(model_cvs['lr_tfidf'])

## Predictions and results

In [None]:
# Predictions from the fitted binary-vectorizer pipeline:
# predicted labels for the training split,
y_pred = lr_bin_pl.predict(X_train)
# predicted labels for the validation split,
y_val_pred = lr_bin_pl.predict(X_val)
# and predicted class probabilities for the training split.
y_proba = lr_bin_pl.predict_proba(X_train)

In [None]:
# Text summary of classification metrics: validation labels vs. the
# binary-vectorizer pipeline's predictions.
print(classification_report(y_val, y_val_pred))