# <h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Import Requirements" data-toc-modified-id="Import-Requirements-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Import Requirements</a></span></li><li><span><a href="#Prepare Training Data" data-toc-modified-id="Prepare-Training-Data-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Prepare Training Data</a></span><ul class="toc-item"></ul></li><li><span><a href="#Model Training" data-toc-modified-id="Model Training-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Model Training</a></span></li><li><span><a href="#Model Saving" data-toc-modified-id="Model Saving-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Model Saving</a></span><ul class="toc-item"></ul></li><li><span><a href="#Validation and Results" data-toc-modified-id="Validation and Results-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Validation and Results</a></span><ul class="toc-item"></ul></div>

<a id='Import Requirements'></a>

# Import Requirements

In [1]:
import pandas as pd
import numpy as np
import json
import os
import re
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc, confusion_matrix
from sklearn.pipeline import Pipeline
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import wordnet
import string
from sklearn.naive_bayes import MultinomialNB
from collections import Counter
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer, RegexpTokenizer

In [2]:
#pd.set_option('max_rows',None)

<a id='Prepare Training Data'></a>

# Prepare Training Data

Input data for training consists of both historical data and CICD data( Production run data for which manual agent validation has been done for the ML prediction)

In [3]:
def preprocess_text(message):

    #stopwords
    new_stopwords=['default']
    stpwrd = nltk.corpus.stopwords.words('english')
    stpwrd.extend(new_stopwords)
    # 1. Init Lemmatizer
    lemmatizer = WordNetLemmatizer()
    #lowering and removing punctuation
    #lowering and removing punctuation
    message = re.sub(r'[^\w\s]','', message.lower())
    #removing the numerical values and working only with text values
    message = re.sub('[^a-zA-Z]', " ", message )
    #removing extra spaces
    message=re.sub(' +', ' ',message)
    #removing the stopwords
    message = ' '.join([word for word in message.split() if word not in stpwrd and len(word)>2])
    message1 = [lemmatizer.lemmatize(w) for w in nltk.word_tokenize(message) if w not in string.punctuation]
    #lemmatizing the text
    message2 =  " ".join([lemmatizer.lemmatize(w) for w in nltk.word_tokenize(message) if w not in string.punctuation])
    #print("message is : ",message)

    return message1,message2

In [5]:
#read input from cicd data into dataframe
data_cicd=pd.read_csv('../data/Old Records Prod TaxML Restaurant-CICD - Prod_Data.csv', usecols = ['Item','Description','establishment_type','Confidence Score','Agent Corrected CAT Name', 'Agent Corrected Integer','CAT NAME_ ValidationScore [0-100]','Integer_ValidationScore[0-100]'])
print(data_cicd.shape)
#low conf and correcr predicted data
#data_cicd_low_conf=data_cicd[(data_cicd['Confidence Score']<=1)]
#data_cicd_low_conf=data_cicd_low_conf[(data_cicd_low_conf['CAT NAME_ ValidationScore [0-100]']==0) &(data_cicd_low_conf['Integer_ValidationScore[0-100]']==0)]
#misclassified data                                        
data_cicd_misclassification=data_cicd[(data_cicd['CAT NAME_ ValidationScore [0-100]']!=100) &(data_cicd['Integer_ValidationScore[0-100]']!=100)]
data_cicd_latest=data_cicd_misclassification
data_cicd_latest=data_cicd_latest[['Item','Description','establishment_type','Agent Corrected CAT Name', 'Agent Corrected Integer']]
#remove empty rows from dataframe
data_cicd_latest.dropna(how='all',inplace=True)
#remove rows having empty 'Agent Corrected CAT Name', ''Agent Corrected Integer'
data_cicd_latest.dropna(subset=['Agent Corrected CAT Name', 'Agent Corrected Integer'],inplace=True)
#removing all duplicate rows
data_cicd_latest=data_cicd_latest.drop_duplicates(subset=['Item','establishment_type','Agent Corrected CAT Name', 'Agent Corrected Integer'],keep='first')
print(data_cicd_latest.shape)
#fresh data of cicd
data_cicd_without_duplicate=data_cicd_latest.drop_duplicates(subset=['Item','establishment_type'],keep=False)
print(data_cicd_without_duplicate.shape)
#append two dataframe
data_cicd_final=data_cicd_without_duplicate
#creat target string which will be used for prediction 
data_cicd_final['target']= data_cicd_final['Agent Corrected CAT Name'] + ":" + data_cicd_final['Agent Corrected Integer']
print(data_cicd_final.shape)


  exec(code_obj, self.user_global_ns, self.user_ns)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


(553860, 8)
(89928, 5)
(79442, 5)
(79442, 6)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [6]:
# combine the columns Item, Description and establishment_type into one column 'combined_text'
data_cicd['combined_text'] = data_cicd[['Item','Description','establishment_type']].apply(lambda x: ' '.join(x[x.notnull()]), axis = 1)

# apply data preprocessing steps on the prepared column
data_cicd['processed_text']= data_cicd['combined_text'].map(lambda s:preprocess_text(s)[1]) 

data_cicd = data_cicd.reset_index(drop=True)
# prepare the target column by combining 'Agent Corrected CAT Name' and 'Agent Corrected Integer'
data_cicd['target']= data_cicd['Agent Corrected CAT Name'] + ":" + data_cicd['Agent Corrected Integer']

#remove rows having empty target column
data_cicd.dropna(subset=['target'],inplace=True)

data_cicd = data_cicd[data_cicd['target']!= '#REF!:#REF!']

X_cicd= data_cicd[['Item','Description','establishment_type','processed_text']]
y_cicd= data_cicd['target']

# split the cicd data into train and test 
X_train_cicd, X_test_cicd, y_train_cicd, y_test_cicd = train_test_split(X_cicd, y_cicd, test_size =.01, random_state = 42)



In [7]:
#read input from historical data into dataframe
data_df = pd.read_csv('../data/final_historical_data.csv', encoding='utf8',engine='python',usecols=['Item','Description','establishment_type','target'])
#choose sample data from entire data
data_df = data_df.sample(frac=1, random_state=42)
print(data_df.shape)
#fill blanks with ''
data_df = data_df.fillna('')
# combine the columns Item, Description and establishment_type into one column 'combined_text'
data_df['combined_text'] = data_df[['Item','Description','establishment_type']].apply(lambda x: ' '.join(x[x.notnull()]), axis = 1)
# apply data preprocessing steps on the prepared column
data_df['processed_text'] = data_df['combined_text'].map(lambda s:preprocess_text(s)[1])
print(data_df.shape)
data_df.drop_duplicates(subset=['processed_text','target'],inplace=True)
print(data_df.shape)
data_df = data_df.reset_index(drop=True)
X = data_df[['Item','Description','establishment_type','processed_text']]
y = data_df['target']
# split the cicd data into train and test 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .20, random_state = 42)

(100820, 4)
(100820, 6)
(91447, 6)


We will append the CICD data to the historical data to create the final train and test data.
Train set has 80% of all historical data and 90% of all cicd data.
Test set consists of 20% of historic data and 10% of all cicd data.

In [8]:
X_train_final = X_train.append(X_train_cicd)
X_test_final = X_test.append(X_test_cicd)
y_train_final = y_train.append(y_train_cicd)
y_test_final = y_test.append(y_test_cicd)

<a id='Model Training'></a>

In [9]:
print('Training data size: {}'.format(len(X_train)))
print('Test data size: {}'.format(len(X_test)))

Training data size: 73157
Test data size: 18290


In [10]:
print('Number of unique labels : {}'.format(len(y_train.unique().tolist())))

Number of unique labels : 102


# Model Training

The Model Pipeline consists of 1. CountVectorizer, 2. Tfidf-Transformer 3. RandomForestClassifier 

In [11]:
#create the model pipeline
rf = Pipeline([('vect', CountVectorizer(token_pattern=u'(?ui)\\b\\w*[a-z]+\\w*\\b',stop_words='english',max_df=0.85)),
            ('tfidf', TfidfTransformer()),
            #('mnb', MultinomialNB(alpha= 0.05,fit_prior= False))])
            ('clf', RandomForestClassifier(n_jobs=-1, random_state=42,class_weight='balanced',max_depth=400))])




In [12]:
# perform model training
rf.fit(X_train_final['processed_text'].values, y_train_final.values)

  if LooseVersion(joblib_version) < '0.12':
  if _joblib.__version__ >= LooseVersion('0.12'):


Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=0.85,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words='english', strip_accents=None,
                                 token_pattern='(?ui)\\b\\w*[a-z]+\\w*\\b',
                                 tokenizer=None,...
                 RandomForestClassifier(bootstrap=True, class_weight='balanced',
                                        criterion='gini', max_depth=400,
                                        max_features='auto',
                                        max_leaf_nodes=None,
                                        min_impurity_decrease

In [13]:
# create a result dataframe to store final results
result=X_test_final
# model prediction
y_pred = rf.predict(X_test_final['processed_text'].values)

result['original_cat']= y_test_final.values
result['predicted_cat'] = y_pred
result['prediction_cat_confscore'] = np.round_(np.max(rf.predict_proba(X_test_final['processed_text']), axis=1), decimals=2)

#
output = {'accuracy': accuracy_score(y_pred,y_test_final),'precision_score':precision_score(y_pred,y_test_final,average='weighted'),'recall_score':recall_score(y_pred,y_test_final,average='weighted')
,'f1_score':f1_score(y_pred,y_test_final,average='weighted')}

result['confusion_matrix'] = str(output)

  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [14]:
len(rf.named_steps['vect'].get_feature_names())

47576

In [15]:
print(rf.named_steps['clf'].estimators_[0].tree_.max_depth)

400


In [16]:
depths = [tree.tree_.max_depth for tree in rf.named_steps['clf'].estimators_]
print(f"Mean tree depth in the Random Forest: {np.round(np.mean(depths))}")

Mean tree depth in the Random Forest: 400.0


<a id='Model Saving'></a>

In [17]:
result['confusion_matrix'][5:6].values

array(["{'accuracy': 0.8072686733556299, 'precision_score': 0.8029800433559108, 'recall_score': 0.8072686733556299, 'f1_score': 0.7904798356764975}"],
      dtype=object)

In [16]:
array(["{'accuracy': 0.8214672148885522, 'precision_score': 0.8704898295343778, 'recall_score': 0.8214672148885522, 'f1_score': 0.8401291751124788}"],
      dtype=object)

NameError: name 'array' is not defined

# Model Saving

In [18]:
import joblib
import datetime
# save the model to disk
filename_primary= 'finalized_model.sav'
joblib.dump(rf, open(filename_primary, 'wb'))

<a id='Validation and Results'></a>

# Validation and Results

In [None]:
#accuracy score of the model
accuracy = rf.score(X_test['processed_text'].values, y_test)
print("Accuracy = {}".format(accuracy))

In [None]:
from sklearn import metrics
import matplotlib.pyplot as plt

In [None]:
#classification report 
classification_report = metrics.classification_report(y_test_final, y_pred, output_dict=True)

In [None]:
display(pd.DataFrame(classification_report).transpose())

In [None]:
# check the misclassifications
misclassifications= result.loc[result['original_cat']!=result['predicted_cat']]

In [None]:
misclassifications

In [None]:
misclassifications.to_csv('../output/misclassifications.csv')

In [None]:
misclassifications.groupby(['establishment_type']).size()

In [19]:
train_size = 0.8
train_end = int(len(data_df)*train_size)
df_train = data_df[:train_end]
df_test = data_df[train_end:]
df_train = df_train[['Item','Description','establishment_type','target']]
df_test = df_test[['Item','Description','establishment_type','target']]
train_size_cicd=0.02
train_end_cicd = int(len(data_cicd_final)*train_size_cicd)
df2_train = data_cicd_final[:train_end_cicd]
df2_test = data_cicd_final[train_end_cicd:]
df2_train = df2_train[['Item','Description','establishment_type','target']]
df2_test = df2_test[['Item','Description','establishment_type','target']]
X_train_save = df_train.append(df2_train)
X_test_save = df_test.append(df2_test)
X_train_save['label'] = 'train'
X_test_save['label'] = 'test'
X_data = X_train_save.append(X_test_save)
X_data.to_csv('df_traintestdata.csv')

In [None]:
X_data