In [1]:
# sources
# example - https://medium.com/@chrisfotache/text-classification-in-python-pipelines-nlp-nltk-tf-idf-xgboost-and-more-b83451a327e0
# https://github.com/ameasure/autocoding-class/blob/master/machine_learning.ipynb

In [2]:
import pandas as pd
from pandas import DataFrame
import numpy as np
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [3]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

  data = yaml.load(f.read()) or {}
  defaults = yaml.load(f)


In [4]:
# read the csv file into a DataFrame
df = pd.read_csv(r'Health_Care_Ticket.csv', encoding='iso-8859-1')
#supplemental = pd.read_csv(r'jobboard_reed_uk_secondary.csv')

In [5]:
# cut down the training data to 10K for manageability
print(df.shape[0])
df = df.dropna()
print(df.shape[0])
df = df.iloc[0:10000,:]
df.head()

57280
53932


Unnamed: 0,fileid,SUMMARY,DATA,categories,sub_categories,previous_appointment,ID
0,2015561331001,Pt aware that he needs ROV for refill,{\rtf1\ansi\ftnbj{\fonttbl{\f0 \fswiss Arial;}...,PRESCRIPTION,REFILL,No,2015_5_6133_1001
1,2015561341001,Mom wants to know if the Focalin needs some do...,{\rtf1\ansi\ftnbj{\fonttbl{\f0 \fswiss Arial;}...,ASK_A_DOCTOR,MEDICATION RELATED,No,2015_5_6134_1001
2,2015561351001,pt called to discuss nortryptiline. she says s...,xxxx-xxxx\f0 \fswiss Arial;}}{\colortbl ;\red2...,ASK_A_DOCTOR,MEDICATION RELATED,No,2015_5_6135_1001
3,2015561361001,FYI Nortryptline medication.,xxxx-xxxx\f0 \fswiss Arial;}}{\colortbl ;\red2...,MISCELLANEOUS,OTHERS,No,2015_5_6136_1001
4,2015561371001,Letter of patient establishment request,{\rtf1\ansi\ftnbj{\fonttbl{\f0 \fswiss Arial;}...,MISCELLANEOUS,"SHARING OF HEALTH RECORDS (FAX, E-MAIL, ETC.)",No,2015_5_6137_1001


#### Split the data in train, valid, test

In [7]:
# first lets stratify the train (+valid) and test data, such that the test data is reflective of the classes we might see
from sklearn.model_selection import train_test_split

train_valid, test = train_test_split(df, test_size=0.2, stratify=df["sub_categories"], random_state=42) 
train, valid = train_test_split(train_valid, test_size=0.25, stratify=train_valid["sub_categories"], random_state=42) 

#train_valid, test = train_test_split(df, test_size=0.2, random_state=42) 
#train, valid = train_test_split(train_valid, test_size=0.25, random_state=42) 

print("training dataset: ", train.shape[0])
print("validation dataset: ", valid.shape[0])
print("testing dataset: ", test.shape[0])
#print("supplemental dataset: ", supplemental.shape[0])

# display class sizes of train_valid split
df_group = train_valid.groupby(['sub_categories']).size().reset_index(name='labels').sort_values(by=['labels'], ascending=False)
df_group.head()

training dataset:  6000
validation dataset:  2000
testing dataset:  2000


Unnamed: 0,sub_categories,labels
15,REFILL,1612
7,MEDICATION RELATED,1449
8,NEW APPOINTMENT,1325
9,OTHERS,633
18,"SHARING OF HEALTH RECORDS (FAX, E-MAIL, ETC.)",562


In [19]:
#df_train = train.copy()
df_train = train.iloc[0:1000,:]
df_test = test.copy()
df_valid = valid.copy()

# rename classes and labels
df_train = df_train.rename(columns={'SUMMARY': 'text', 'sub_categories':'code'})
df_test = df_test.rename(columns={'SUMMARY': 'text', 'sub_categories':'code'})
df_valid = df_valid.rename(columns={'SUMMARY': 'text', 'sub_categories':'code'})

## Custom Analysis Function

In [20]:
# based on the error rate we want to predict at we find what our autocoding rate will be lowered to
def threshold_byerror(error_tolerance, code, pred, prob):
    df = pd.DataFrame({'code': code, 'pred': pred, 'prob': prob})
    
    # create a threshold report
    range = np.arange(0.5, 1.0, 0.01).tolist()
    range.sort(reverse=True)
    df_scores = pd.DataFrame(range, columns=['confidence'])

    # find the threshold 
    def threshold_error(data, pred, prob, value):
        df_temp = data[data[prob] > value]
        accuracy = round(accuracy_score(df_temp['code'], df_temp[pred]) * 100, 2)
        error_rate = 100 - accuracy
        return error_rate

    def threshold_percent(data, prob, value):
        count_overall = data.shape[0]
        df_temp = data[data[prob] > value]
        count_threshold = df_temp.shape[0]
        percent = round((count_threshold / count_overall) * 100, 2)
        return percent
    
    df_scores['error'] =  df_scores.apply(lambda row: threshold_error(df, 'pred', 'prob', row['confidence']), axis=1)
    df_scores['rate'] =  df_scores.apply(lambda row: threshold_percent(df, 'prob', row['confidence']), axis=1)
    df_scores = df_scores.replace(np.nan,0)
    
    df_selected = df_scores.iloc[(df_scores['error'] - error_tolerance).abs().argsort()[:2]]
    threshold_selected = round(df_selected.confidence.iloc[0]*100,2)
    error_selected = round(df_selected.error.iloc[0],2)
    rate_selected = round(df_selected.rate.iloc[0],2)
    #text = ("Threshold: " + str(threshold_selected) +
    #       "%. Error: " + str(error_selected) + "%. Autocoding: " + str(rate_selected) + "%.")
    return threshold_selected, error_selected, rate_selected

## Text Preprocessing

In [21]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer


In [22]:
# source: https://stackoverflow.com/questions/37473219/how-to-remove-dates-from-a-list-in-python
# source: https://stackoverflow.com/questions/26294245/python-regex-find-all-matches-in-string-and-replace
import re

def replace_date(text):
    # order matters
    if re.search(r'[0-9]{2}[\/,:][0-9]{2}[\/,:][0-9]{2,4}', text, re.IGNORECASE):
        r = re.compile(r'[0-9]{2}[\/,:][0-9]{2}[\/,:][0-9]{2,4}', re.IGNORECASE)
        text = r.sub(r'date', text)
    if re.search(r'[0-9]{1,2}[\/,:][0-9]{2,4}', text, re.IGNORECASE):
        r = re.compile(r'[0-9]{1,2}[\/,:][0-9]{2,4}', re.IGNORECASE)
        text = r.sub(r'date', text)
    if re.search(r'[0-9]{4}', text, re.IGNORECASE):
        r = re.compile(r'[0-9]{4}', re.IGNORECASE)
        text = r.sub(r'date', text)
    
    return text

string = "11/23 @10 10:25 is a date"
print(string)
print(replace_date(string))


11/23 @10 10:25 is a date
date @10 date is a date


In [23]:
# source: https://ryan-cranfill.github.io/sentiment-pipeline-sklearn-3/

from sklearn.preprocessing import FunctionTransformer

def pipelinize(function, active=True):
    def list_comprehend_a_function(list_or_series, active=True):
        if active:
            return [function(i) for i in list_or_series]
        else: # if it's not active, just pass it right back
            return list_or_series
    return FunctionTransformer(list_comprehend_a_function, validate=False, kw_args={'active':active})


In [24]:
# function for text cleaning 
import re

def clean_text(text): 
    text = re.sub("\'", "", text)           # remove backslash-apostrophe
    text = re.sub("[^a-zA-Z]"," ",text)     # remove everything except alphabets 
    text = ' '.join(text.split())           # remove whitespaces 
    text = text.lower()                     # convert text to lowercase 
    return text

In [25]:
# vectorizer
vect = CountVectorizer(strip_accents = ascii,
                       lowercase = False,
                       analyzer = 'word',
                       binary=True) # one-hot encoding (true)


# resource: https://www.oreilly.com/library/view/applied-text-analysis/9781491963036/ch04.html

# https://stackoverflow.com/questions/36253258/how-to-fit-different-inputs-into-an-sklearn-pipeline
# TfidfTransformer and POSTransformer


In [26]:
# weight vectors using tf-idf
tfidf = TfidfTransformer()

In [27]:
# XGBoost model
xgb = XGBClassifier(
                    eta = 0.2, # learning rate
                    nthread = -1,
                    seed = 42,
                    
                    # values to tune
                    max_depth=5, # higher depth allow model to learn relations very specific to a particular sample (3-10)
                    gamma = 0, # specifies the minimum loss reduction required to make a split
                    scale_pos_weight = 3, # >0 should be used in case of high class imbalance as it helps in faster convergence.
                    subsample = 0.5, # (0.5-1) lower to help overfitting, fraction of obs randomly samples for each tree
                    min_child_weight=1, # higher value helps overfitting (min “number of observations”)
                    
                    # overfitting adjustment
                    alpha=0, # L1 regularization term on weight
                    lamda=1 # L2 regularization term on weights
                    
                    ) # set to use all

# resource: https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/

## Pipeline steps

In [33]:
text_clf = Pipeline(steps=[ ('replace_date',pipelinize(replace_date)),
                            ('clean',pipelinize(clean_text)),
                            ('vect', vect),
                            ('tfidf', tfidf),
                            ('clf', xgb),
                          ])

In [34]:
model = text_clf.fit(df_train.text, df_train.code) # create the model

In [35]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

y_train = df_train.code
y_train_pred = model.predict(df_train.text)
y_train_prob = model.predict_proba(df_train.text) # gives an array of all class probabilities
y_train_prob_max = y_train_prob.max(axis=1) # find the highest prob, associated with clf.predict()

y_valid = df_valid.code
y_valid_pred = model.predict(df_valid.text)
y_valid_prob = model.predict_proba(df_valid.text) # gives an array of all class probabilities
y_valid_prob_max = y_valid_prob.max(axis=1) # find the highest prob, associated with clf.predict()


# metrics
train_accuracy = accuracy_score(y_train, y_train_pred)
train_w_f1 = f1_score(y_train, y_train_pred, average = "weighted")
train_w_precision = precision_score(y_train, y_train_pred, average = "weighted")
train_w_recall = recall_score(y_train, y_train_pred, average = "weighted") 
train_threshold, train_error, train_rate = threshold_byerror(10,y_train,y_train_pred,y_train_prob_max)

valid_accuracy = accuracy_score(y_valid, y_valid_pred)
valid_w_f1 = f1_score(y_valid, y_valid_pred, average = "weighted")
valid_w_precision = precision_score(y_valid, y_valid_pred, average = "weighted")
valid_w_recall = recall_score(y_valid, y_valid_pred, average = "weighted")    
valid_threshold, valid_error, valid_rate = threshold_byerror(10,y_valid,y_valid_pred,y_valid_prob_max)

# create a dataframe from the logged lists
df_train_valid_results = pd.DataFrame(
    {'dataset': ['train','valid'],
     'accuracy': [train_accuracy, valid_accuracy],
      'weighted_f1': [train_w_f1,valid_w_f1],
      'weighted_precision': [train_w_precision, valid_w_precision],
      'weighted_recall': [train_w_recall, valid_w_recall],
      'threshold_selected':[train_threshold,valid_threshold],
      'error_rate':[train_error, valid_error],
     'autocoding_rate':[train_rate, valid_rate]
        
     })
df_train_valid_results.head()


  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,dataset,accuracy,weighted_f1,weighted_precision,weighted_recall,threshold_selected,error_rate,autocoding_rate
0,train,0.802,0.79727,0.813766,0.802,50.0,5.71,70.1
1,valid,0.5005,0.486058,0.491418,0.5005,86.0,10.05,19.4


## HyperParameter Tuning

#### Tune Using RandomSearch

In [36]:
from sklearn.model_selection import RandomizedSearchCV

In [225]:
parameters = {
                'vect__analyzer':('word', 'char', 'char_wb'),
                'vect__ngram_range': [(1, 1), (1, 2)],
                'tfidf__use_idf': (True, False),
                'clf__max_depth': range(3, 10),
                'clf__gamma':(0,1,5,10),
                'clf__min_child_weight':(1,5,10),
                'clf__scale_pos_weight': range(1,5),
                'clf__subsample': (0.5,0.75,1)        
             }

In [226]:
# n_iters = iterations, cv = folds
rs_clf = RandomizedSearchCV(text_clf, parameters, n_iter=20, cv = 3, 
                            verbose=10,
                            n_jobs=-1,
                            random_state = 42)
rs_clf = rs_clf.fit(df_train.text, df_train.code)

Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    6.8s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   11.5s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   17.3s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   22.9s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   26.7s
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:   34.9s
[Parallel(n_jobs=-1)]: Done  52 out of  60 | elapsed:   45.2s remaining:    6.9s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:   55.7s finished


In [227]:
randomsearch = pd.DataFrame(rs_clf.cv_results_)
randomsearch = randomsearch.sort_values(by=['rank_test_score'], ascending=True)
randomsearch.head()
#randomsearch.to_csv(r'xgb_random_search.csv')

#rs_clf.best_estimator_.get_params()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_vect__ngram_range,param_vect__analyzer,param_clf__subsample,param_clf__scale_pos_weight,param_clf__min_child_weight,param_clf__max_depth,param_clf__gamma,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
17,19.284107,0.825339,0.049586,0.007295,"(1, 2)",char,1.0,1,1,7,1,"{'vect__ngram_range': (1, 2), 'vect__analyzer'...",0.479042,0.495495,0.45045,0.474996,0.018611,1
13,8.085725,0.05187,0.073142,0.001427,"(1, 2)",char,0.5,3,5,9,1,"{'vect__ngram_range': (1, 2), 'vect__analyzer'...",0.476048,0.486486,0.42042,0.460985,0.028998,2
15,6.843483,0.067421,0.081121,0.003507,"(1, 2)",char_wb,0.75,2,10,3,1,"{'vect__ngram_range': (1, 2), 'vect__analyzer'...",0.458084,0.471471,0.441441,0.456999,0.012284,3
19,11.091089,0.548493,0.060845,0.011191,"(1, 2)",char,0.5,4,1,7,1,"{'vect__ngram_range': (1, 2), 'vect__analyzer'...",0.488024,0.459459,0.423423,0.456969,0.026432,4
3,16.278071,0.618233,0.050368,0.002479,"(1, 2)",word,0.75,1,1,6,10,"{'vect__ngram_range': (1, 2), 'vect__analyzer'...",0.434132,0.387387,0.423423,0.414981,0.019995,5


#### Tune Using Gridsearch

In [None]:
from sklearn.model_selection import GridSearchCV
parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
              'tfidf__use_idf': (True, False),
              'clf__alpha': (1e-2, 1e-3)}

gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1, verbose=10) # verbose adds details during logging, n_jobs(-1) = in parallel
gs_clf = gs_clf.fit(df_train.text, df_train.code)

# results of gridsearch
print(gs_clf.best_score_)
print(gs_clf.best_params_)
gridsearch = pd.DataFrame(gs_clf.cv_results_)
gridsearch.head()

Grid search is very long, a bayesian tuning or random search would be nicer

## Final Model (Train with Entire Dataset)

In [44]:
df_train = train.copy()
df_train = df_train.rename(columns={'SUMMARY': 'text', 'sub_categories':'code'})

In [45]:
# XGBoost model
vect = CountVectorizer(strip_accents = ascii,
                       lowercase = True,
                       ngram_range = (1,2),
                       analyzer = 'char',
                       
                       binary=True) # one-hot encoding (true)

xgb = XGBClassifier(eta = 0.2, 
                    nthread = -1,
                    seed = 42,
                    max_depth=7,
                    gamma = 3,
                    scale_pos_weight = 1,
                    subsample = 0.75,
                    min_child_weight=1,
                    alpha=0, 
                    lamda=1)

text_clf = Pipeline(steps=[('replace_date',pipelinize(replace_date)),
                            ('clean',pipelinize(clean_text)),
                            ('vect', vect),
                            ('tfidf', tfidf),
                            ('clf', xgb),])

model = text_clf.fit(df_train.text, df_train.code) # create the model


In [46]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

y_train = df_train.code
y_train_pred = model.predict(df_train.text)
y_train_prob = model.predict_proba(df_train.text) # gives an array of all class probabilities
y_train_prob_max = y_train_prob.max(axis=1) # find the highest prob, associated with clf.predict()

y_test = df_test.code
y_test_pred = model.predict(df_test.text)
y_test_prob = model.predict_proba(df_test.text) # gives an array of all class probabilities
y_test_prob_max = y_test_prob.max(axis=1) # find the highest prob, associated with clf.predict()


# metrics
train_accuracy = accuracy_score(y_train, y_train_pred)
train_w_f1 = f1_score(y_train, y_train_pred, average = "weighted")
train_w_precision = precision_score(y_train, y_train_pred, average = "weighted")
train_w_recall = recall_score(y_train, y_train_pred, average = "weighted") 
train_threshold, train_error, train_rate = threshold_byerror(10,y_train,y_train_pred,y_train_prob_max)

test_accuracy = accuracy_score(y_test, y_test_pred)
test_w_f1 = f1_score(y_test, y_test_pred, average = "weighted")
test_w_precision = precision_score(y_test, y_test_pred, average = "weighted")
test_w_recall = recall_score(y_test, y_test_pred, average = "weighted")    
test_threshold, test_error, test_rate = threshold_byerror(10,y_test,y_test_pred,y_test_prob_max)

# create a dataframe from the logged lists
df_train_test_results = pd.DataFrame(
    {'dataset': ['train','test'],
     'accuracy': [train_accuracy, test_accuracy],
      'weighted_f1': [train_w_f1,test_w_f1],
      'weighted_precision': [train_w_precision, test_w_precision],
      'weighted_recall': [train_w_recall, test_w_recall],
      'threshold_selected':[train_threshold,test_threshold],
      'error_rate':[train_error, test_error],
     'autocoding_rate':[train_rate, test_rate]
        
     })
df_train_test_results.head()

  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)


Unnamed: 0,dataset,accuracy,weighted_f1,weighted_precision,weighted_recall,threshold_selected,error_rate,autocoding_rate
0,train,0.835333,0.835724,0.851122,0.835333,50.0,5.12,68.95
1,test,0.5695,0.549188,0.555195,0.5695,79.0,10.03,29.4
