In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
from sklearn.base import BaseEstimator
from sklearn.model_selection import train_test_split, cross_val_score, RepeatedStratifiedKFold, GridSearchCV
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline
#from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBRegressor
from sklearn import metrics
from sklearn.metrics import accuracy_score, plot_confusion_matrix
import joblib
from sklearn.model_selection import RandomizedSearchCV

In [2]:
class ClfSwitcher(BaseEstimator):

    
    def __init__(self, estimator = MultinomialNB()):
        """
        A Custom BaseEstimator that can switch between classifiers.
        :param estimator: sklearn object - The classifier
        """ 
        self.estimator = estimator

    def fit(self, X, y=None, **kwargs):
        self.estimator.fit(X, y)
        return self

    def predict(self, X, y=None):
        return self.estimator.predict(X)

    def predict_proba(self, X):
        return self.estimator.predict_proba(X)

    def score(self, X, y):
        return self.estimator.score(X, y)

In [2]:
df = pd.read_csv("shuffled-full-set-hashed.csv", sep = ",", names = ['label', 'text'])
df.dropna(inplace=True)
#df.columns
#df.head()
#df.shape
#df["label"].unique()
#len(df["label"].unique())
#df.isnull().any()
#df[df['text'].isnull()].shape
#df.groupby('label').size()

In [3]:
# training and testing set split
X_train, X_test, y_train, y_test = train_test_split(
    df['text'], 
    df['label'], 
    test_size=0.3,
    random_state=999, 
    stratify= df['label']
)

In [5]:
# define pipeline
pipeline = Pipeline(
    [('tfidf', TfidfVectorizer()),
     ('over', RandomOverSampler(sampling_strategy='minority')),
     ('clf', ClfSwitcher())
    ])

In [6]:
parameters = [
    {'clf__estimator': [MultinomialNB()]},
    #{'clf__estimator': [AdaBoostClassifier()]},
    #{'clf__estimator': [MLPClassifier()]},
    #{'clf__estimator': [GradientBoostingClassifier()]},
    {'clf__estimator': [RandomForestClassifier(max_depth = 20)]}
]
cv = RepeatedStratifiedKFold(n_splits=4, n_repeats=2)
gscv = GridSearchCV(
    pipeline, 
    parameters, 
    cv=cv, 
    n_jobs=8, 
    scoring='f1_micro', 
    return_train_score=False, 
    verbose=3)

In [7]:
%timeit 
gscv.fit(X_train, y_train)

Fitting 8 folds for each of 2 candidates, totalling 16 fits


GridSearchCV(cv=RepeatedStratifiedKFold(n_repeats=2, n_splits=4, random_state=None),
             estimator=Pipeline(steps=[('tfidf', TfidfVectorizer()),
                                       ('over',
                                        RandomOverSampler(sampling_strategy='minority')),
                                       ('clf', ClfSwitcher())]),
             n_jobs=8,
             param_grid=[{'clf__estimator': [MultinomialNB()]},
                         {'clf__estimator': [RandomForestClassifier(max_depth=20)]}],
             scoring='f1_micro', verbose=3)

In [None]:
#joblib.dump(gscv.best_estimator_, 'doc_clf_best_estimator.pkl')

In [8]:
print(gscv.best_score_)

0.7223691496854725


In [9]:
print(gscv.best_estimator_)

Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('over', RandomOverSampler(sampling_strategy='minority')),
                ('clf',
                 ClfSwitcher(estimator=RandomForestClassifier(max_depth=20)))])


In [10]:
gscv.cv_results_

{'mean_fit_time': array([ 23.08624375, 156.48529518]),
 'std_fit_time': array([0.54787634, 5.4427833 ]),
 'mean_score_time': array([6.23491243, 7.53398657]),
 'std_score_time': array([0.29597938, 1.1109708 ]),
 'param_clf__estimator': masked_array(data=[MultinomialNB(), RandomForestClassifier(max_depth=20)],
              mask=[False, False],
        fill_value='?',
             dtype=object),
 'params': [{'clf__estimator': MultinomialNB()},
  {'clf__estimator': RandomForestClassifier(max_depth=20)}],
 'split0_test_score': array([0.50726236, 0.72219158]),
 'split1_test_score': array([0.50441258, 0.73129252]),
 'split2_test_score': array([0.51176687, 0.72421401]),
 'split3_test_score': array([0.50298796, 0.72418865]),
 'split4_test_score': array([0.50900901, 0.713918  ]),
 'split5_test_score': array([0.5       , 0.71483729]),
 'split6_test_score': array([0.502574  , 0.71961758]),
 'split7_test_score': array([0.51071067, 0.72869357]),
 'mean_test_score': array([0.50609043, 0.72236915]),


In [12]:
pipeline_rf = Pipeline(
    [('tfidf', TfidfVectorizer()),
     ('over', RandomOverSampler(sampling_strategy='minority')),
     ('rf', RandomForestClassifier())
    ])

In [14]:
#sorted(pipeline_rf.get_params().keys())

In [47]:
# hyper parameter for random forest tuning

n_estimators = [50, 100, 150, 200]

max_depth = [15, 20, 25, 30, 35, 40]

# Create the random grid
params = {'rf__n_estimators': n_estimators,
          'rf__max_depth': max_depth
         }
cv_rf = RepeatedStratifiedKFold(n_splits=4, n_repeats=2)

In [48]:
rf_random = GridSearchCV(estimator = pipeline_rf, 
                         param_grid = params, 
                         cv = cv_rf,
                         n_jobs = 8,
                         scoring='f1_micro',
                         return_train_score=False,
                         verbose=3)

In [49]:
# Fit the random search model
rf_random.fit(X_train, y_train)
# took about 3 hours

Fitting 8 folds for each of 24 candidates, totalling 192 fits


GridSearchCV(cv=RepeatedStratifiedKFold(n_repeats=2, n_splits=4, random_state=None),
             estimator=Pipeline(steps=[('tfidf', TfidfVectorizer()),
                                       ('over',
                                        RandomOverSampler(sampling_strategy='minority')),
                                       ('rf', RandomForestClassifier())]),
             n_jobs=8,
             param_grid={'rf__max_depth': [15, 20, 25, 30, 35, 40],
                         'rf__n_estimators': [50, 100, 150, 200]},
             scoring='f1_micro', verbose=3)

In [50]:
best_rf= rf_random.best_estimator_
rf_random.best_params_

{'rf__max_depth': 40, 'rf__n_estimators': 200}

In [56]:
joblib.dump(best_rf, 'doc_clf_best_estimator.pkl')

['doc_clf_best_estimator.pkl']

In [52]:
rf_random.best_score_

0.818770071309834

In [51]:
best_rf.score(X_test, y_test)

0.8168704418704419

In [53]:
predicted = best_rf.predict(X_test)

In [54]:
metrics.accuracy_score(y_test, predicted)

0.8168704418704419

In [55]:
print(metrics.classification_report(y_test, predicted)),
metrics.confusion_matrix(y_test, predicted)

                         precision    recall  f1-score   support

            APPLICATION       0.80      0.57      0.66        69
                   BILL       0.84      0.93      0.88      5688
            BILL BINDER       0.52      0.13      0.20        87
                 BINDER       0.83      0.87      0.85      2685
    CANCELLATION NOTICE       0.71      0.82      0.76      2919
     CHANGE ENDORSEMENT       0.86      0.78      0.81       267
            DECLARATION       0.55      0.06      0.10       290
   DELETION OF INTEREST       0.95      0.74      0.83      1448
      EXPIRATION NOTICE       0.88      0.40      0.55       220
INTENT TO CANCEL NOTICE       0.83      0.22      0.35        68
     NON-RENEWAL NOTICE       1.00      0.10      0.18       187
          POLICY CHANGE       0.79      0.82      0.80      3185
   REINSTATEMENT NOTICE       0.95      0.79      0.86      1310
         RETURNED CHECK       0.96      0.59      0.73       225

               accuracy

array([[  39,    3,    0,   21,    0,    0,    0,    0,    0,    0,    0,
           5,    0,    1],
       [   2, 5275,    9,   86,  140,    0,    6,    0,    4,    1,    0,
         157,    6,    2],
       [   0,   41,   11,   17,    0,    0,    0,    0,    0,    0,    0,
          18,    0,    0],
       [   3,  102,    0, 2327,   11,    1,    4,    1,    0,    0,    0,
         233,    2,    1],
       [   0,  317,    0,   25, 2400,    0,    1,   43,    3,    2,    0,
          99,   29,    0],
       [   0,   11,    0,    4,    8,  207,    0,    0,    0,    0,    0,
          37,    0,    0],
       [   2,   48,    0,  154,   18,    0,   16,    8,    0,    0,    0,
          38,    4,    2],
       [   0,    9,    0,    1,  354,    3,    0, 1067,    0,    0,    0,
          14,    0,    0],
       [   0,   70,    0,    1,   51,    0,    0,    1,   88,    0,    0,
           8,    1,    0],
       [   0,   27,    0,    2,   22,    0,    0,    1,    0,   15,    0,
           1,    

In [11]:
# Train a light version since AWS lambda would time out with the above model
pipe_rf_light = Pipeline(
    [('tfidf', TfidfVectorizer()),
     ('over', RandomOverSampler(sampling_strategy='minority')),
     ('rf', RandomForestClassifier(n_estimators = 100, max_depth = 30, n_jobs = 8))
    ])

In [12]:
pipe_rf_light.fit(X_train, y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('over', RandomOverSampler(sampling_strategy='minority')),
                ('rf', RandomForestClassifier(max_depth=30, n_jobs=8))])

In [13]:
joblib.dump(pipe_rf_light, 'doc_clf_best_estimator.pkl')

['doc_clf_best_estimator.pkl']

In [14]:
predicted = pipe_rf_light.predict(X_test)
metrics.accuracy_score(y_test, predicted)

0.7874839124839125

In [15]:
print(metrics.classification_report(y_test, predicted)),
metrics.confusion_matrix(y_test, predicted)

                         precision    recall  f1-score   support

            APPLICATION       0.78      0.55      0.64        69
                   BILL       0.77      0.94      0.85      5688
            BILL BINDER       0.29      0.02      0.04        87
                 BINDER       0.84      0.85      0.84      2685
    CANCELLATION NOTICE       0.66      0.78      0.72      2919
     CHANGE ENDORSEMENT       0.88      0.77      0.82       267
            DECLARATION       0.58      0.02      0.05       290
   DELETION OF INTEREST       0.94      0.69      0.80      1448
      EXPIRATION NOTICE       0.95      0.25      0.39       220
INTENT TO CANCEL NOTICE       1.00      0.18      0.30        68
     NON-RENEWAL NOTICE       1.00      0.07      0.13       187
          POLICY CHANGE       0.79      0.78      0.79      3185
   REINSTATEMENT NOTICE       0.96      0.65      0.78      1310
         RETURNED CHECK       0.97      0.40      0.57       225

               accuracy

array([[  38,    4,    0,   21,    0,    0,    0,    0,    0,    0,    0,
           5,    0,    1],
       [   2, 5349,    5,   77,  122,    0,    2,    0,    1,    0,    0,
         124,    5,    1],
       [   0,   58,    2,   18,    0,    0,    0,    0,    0,    0,    0,
           9,    0,    0],
       [   3,  143,    0, 2279,   10,    0,    2,    0,    0,    0,    0,
         247,    1,    0],
       [   1,  448,    0,   22, 2282,    0,    0,   48,    2,    0,    0,
          94,   22,    0],
       [   0,   21,    0,    5,    6,  206,    0,    0,    0,    0,    0,
          29,    0,    0],
       [   2,   69,    0,  141,   18,    0,    7,    8,    0,    0,    0,
          40,    4,    1],
       [   0,   27,    0,    1,  402,    3,    0,  998,    0,    0,    0,
          17,    0,    0],
       [   0,  100,    0,    1,   50,    0,    0,    1,   54,    0,    0,
          12,    2,    0],
       [   0,   29,    0,    3,   23,    0,    0,    1,    0,   12,    0,
           0,    