# <h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Import Requirements" data-toc-modified-id="Import-Requirements-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Import Requirements</a></span></li><li><span><a href="#Prepare Training Data" data-toc-modified-id="Prepare-Training-Data-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Prepare Training Data</a></span><ul class="toc-item"></ul></li><li><span><a href="#Model Training" data-toc-modified-id="Model Training-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Model Training</a></span></li><li><span><a href="#Model Saving" data-toc-modified-id="Model Saving-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Model Saving</a></span><ul class="toc-item"></ul></li><li><span><a href="#Validation and Results" data-toc-modified-id="Validation and Results-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Validation and Results</a></span><ul class="toc-item"></ul></div>

<a id='Import Requirements'></a>

# Import Requirements

In [1]:
import pandas as pd
import numpy as np
import json
import os
import re
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc, confusion_matrix
from sklearn.pipeline import Pipeline
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import wordnet
import string
from sklearn.naive_bayes import MultinomialNB
from collections import Counter
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer, RegexpTokenizer

In [2]:
pd.__version__

'1.3.5'

In [150]:
#pd.set_option('max_rows',None)

<a id='Prepare Training Data'></a>

# Prepare Training Data

Input data for training consists of both historical data and CICD data( Production run data for which manual agent validation has been done for the ML prediction)

In [151]:
def preprocess_text(message):

    #stopwords
    stpwrd = nltk.corpus.stopwords.words('english')
    #stpwrd.extend(new_stopwords)
    # 1. Init Lemmatizer
    lemmatizer = WordNetLemmatizer()
    #lowering and removing punctuation
    message = re.sub(r'[^\w\s]',' ', message.lower())
    #removing the numerical values and working only with text values
    message = re.sub('[^a-zA-Z]', " ", message )
    #removing the stopwords
    message = ' '.join([word for word in message.split() if word not in stpwrd and len(word)>1])
    #lemmatizing the text
    message =  " ".join([lemmatizer.lemmatize(w) for w in nltk.word_tokenize(message) if w not in string.punctuation])
    return message

In [203]:
del data_cicd_old

In [204]:
#read input from cicd data into dataframe
data_cicd_old=pd.read_csv('../data/Old Records Prod TaxML Restaurant-CICD - Prod_Data.csv', usecols = ['Item','Description','establishment_type','Confidence Score','Agent Corrected CAT Name', 'Agent Corrected Integer','CAT NAME_ ValidationScore [0-100]','Integer_ValidationScore[0-100]'])
print(data_cicd_old.shape)

(553860, 8)


  exec(code_obj, self.user_global_ns, self.user_ns)


In [205]:
data_cicd_old.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 553860 entries, 0 to 553859
Data columns (total 8 columns):
 #   Column                             Non-Null Count   Dtype 
---  ------                             --------------   ----- 
 0   Item                               553856 non-null  object
 1   Description                        399402 non-null  object
 2   establishment_type                 553860 non-null  object
 3   Confidence Score                   551349 non-null  object
 4   Agent Corrected CAT Name           553675 non-null  object
 5   Agent Corrected Integer            553675 non-null  object
 6   CAT NAME_ ValidationScore [0-100]  553860 non-null  int64 
 7   Integer_ValidationScore[0-100]     553860 non-null  int64 
dtypes: int64(2), object(6)
memory usage: 33.8+ MB


In [206]:
data_cicd_new=pd.read_csv('../data/TaxML Restaurant-CICD - Prod_Data (2).csv', usecols = ['Item','Description','establishment_type','Confidence Score','Agent Corrected CAT Name', 'Agent Corrected Integer','CAT NAME_ ValidationScore [0-100]','Integer_ValidationScore[0-100]'])
print(data_cicd_new.shape)

(410135, 8)


In [207]:
del data_cicd_combined

In [208]:
data_cicd_combined = pd.concat([data_cicd_old, data_cicd_new], join="outer")

In [209]:
print(data_cicd_combined.shape)

(963995, 8)


In [210]:
data_cicd_combined.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 963995 entries, 0 to 410134
Data columns (total 8 columns):
 #   Column                             Non-Null Count   Dtype 
---  ------                             --------------   ----- 
 0   Item                               963987 non-null  object
 1   Description                        711004 non-null  object
 2   establishment_type                 963995 non-null  object
 3   Confidence Score                   959435 non-null  object
 4   Agent Corrected CAT Name           755779 non-null  object
 5   Agent Corrected Integer            755017 non-null  object
 6   CAT NAME_ ValidationScore [0-100]  963995 non-null  int64 
 7   Integer_ValidationScore[0-100]     963995 non-null  int64 
dtypes: int64(2), object(6)
memory usage: 66.2+ MB


In [211]:
data_cicd_combined.columns

Index(['Item', 'Description', 'establishment_type', 'Confidence Score',
       'Agent Corrected CAT Name', 'Agent Corrected Integer',
       'CAT NAME_ ValidationScore [0-100]', 'Integer_ValidationScore[0-100]'],
      dtype='object')

In [212]:
data_cicd_combined['Agent Corrected CAT Name'].nunique()

122

In [213]:
data_cicd_combined = data_cicd_combined.dropna(axis=0, subset=['Agent Corrected CAT Name'])
data_cicd_combined = data_cicd_combined.drop(data_cicd_combined[data_cicd_combined['Agent Corrected CAT Name']=='#REF!'].index)
data_cicd_combined = data_cicd_combined.drop(data_cicd_combined[data_cicd_combined['Agent Corrected CAT Name']=='TEMP_COLD'].index)
print(data_cicd_combined.shape)

(601245, 8)


In [214]:
data_cicd_combined['Agent Corrected CAT Name'].nunique()

120

In [215]:
data_cicd_combined.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 601245 entries, 0 to 410133
Data columns (total 8 columns):
 #   Column                             Non-Null Count   Dtype 
---  ------                             --------------   ----- 
 0   Item                               601242 non-null  object
 1   Description                        439391 non-null  object
 2   establishment_type                 601245 non-null  object
 3   Confidence Score                   598459 non-null  object
 4   Agent Corrected CAT Name           601245 non-null  object
 5   Agent Corrected Integer            600520 non-null  object
 6   CAT NAME_ ValidationScore [0-100]  601245 non-null  int64 
 7   Integer_ValidationScore[0-100]     601245 non-null  int64 
dtypes: int64(2), object(6)
memory usage: 41.3+ MB


In [25]:
#cicd_categories_beforemapping=data_cicd_combined['Agent Corrected CAT Name'].unique()
#cicd_categories_beforemapping=pd.Series(cicd_categories_beforemapping)
#cicd_categories_beforemapping.to_csv('cicd_categories_beforemapping.csv')

ImportError: cannot import name 'ABCIndexClass' from 'pandas.core.dtypes.generic' (/Users/mdevar2/opt/anaconda3/envs/py365/lib/python3.7/site-packages/pandas/core/dtypes/generic.py)

In [216]:
del cat_dict, int_dict

NameError: name 'int_dict' is not defined

In [5]:
mapping_data=pd.read_csv('/Users/mdevar2/Downloads/data discripency/mapping_latest.csv')
cat_dict = dict(zip(mapping_data['Agent CAT Name'], mapping_data['Updated CAT Name']))
#data_cicd_combined["Agent Corrected CAT Name standarized"] = data_cicd_combined["Agent Corrected CAT Name"].replace(cat_dict)

In [6]:
mapping_data_1=pd.read_csv('')
mapping1 = dict(mapping_data_1[['Updated CAT Name', 'Updated Integer']].values)
#data_cicd_combined['Agent Corrected Integer standardized'] = data_cicd_combined['Agent Corrected CAT Name standarized'].map(mapping1)

In [219]:
data_cicd_combined['Agent Corrected CAT Name standarized'].nunique()

101

In [20]:
master_mapping_data = pd.read_csv("/Users/jghosh2/Documents/my-notebook/restro folder/MASTER_Tax_Category_Mapping_latest1.csv")

In [21]:
#master_mapping_data = pd.read_csv("/Users/jghosh2/Documents/my-notebook/restro folder//Users/jghosh2/Documents/my-notebook/restro folder/MASTER_Tax_Category_Mapping_latest1.csv")

In [22]:
master_mapping_data.columns

Index(['Agent CAT Name', 'Updated CAT Name', 'Updated Integer'], dtype='object')

In [23]:
#set_cicd_categories_aftermapping = set(data_cicd_combined['Agent Corrected CAT Name standarized'].unique())
set_master_mapping_data_categories = set(master_mapping_data['Updated CAT Name'].unique())

In [222]:
dif_categories = list(set_cicd_categories_aftermapping.difference(set_master_mapping_data_categories))
len(dif_categories)

0

In [223]:
dif_categories

[]

In [224]:
data_cicd_latest=data_cicd_combined[['Item','Description','establishment_type','Agent Corrected CAT Name standarized', 'Agent Corrected Integer standardized']]
data_cicd_latest['target']=data_cicd_latest['Agent Corrected CAT Name standarized'] + ":" + data_cicd_latest['Agent Corrected Integer standardized']
data_cicd_latest.drop(['Agent Corrected CAT Name standarized', 'Agent Corrected Integer standardized'],inplace=True,axis=1)
print(data_cicd_latest.shape)

(601245, 4)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  * 'all' : If all values are NA, drop that row or column.


In [225]:
data_cicd_latest.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 601245 entries, 0 to 410133
Data columns (total 4 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   Item                601242 non-null  object
 1   Description         439391 non-null  object
 2   establishment_type  601245 non-null  object
 3   target              601245 non-null  object
dtypes: object(4)
memory usage: 22.9+ MB


In [226]:
data_historical = pd.read_csv('../data/final_historical_data.csv', encoding='utf8',engine='python',usecols=['Item','Description','establishment_type','target'])
print(data_historical.shape)

(100820, 4)


In [227]:
data_historical.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100820 entries, 0 to 100819
Data columns (total 4 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   Item                100818 non-null  object
 1   Description         74981 non-null   object
 2   establishment_type  100820 non-null  object
 3   target              100820 non-null  object
dtypes: object(4)
memory usage: 3.1+ MB


In [228]:
#read input from historical data into dataframe
#data_df = pd.read_csv('../data/final_historical_data.csv', encoding='utf8',engine='python',usecols=['Item','Description','establishment_type','target'])
#choose sample data from entire data
data_historical = data_historical.sample(frac=1, random_state=42)
print(data_historical.shape)
#fill blanks with ''
data_historical = data_historical.fillna('')
print(data_historical.shape)
# combine the columns Item, Description and establishment_type into one column 'combined_text'

(100820, 4)
(100820, 4)


In [229]:
data_cicd_latest.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 601245 entries, 0 to 410133
Data columns (total 4 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   Item                601242 non-null  object
 1   Description         439391 non-null  object
 2   establishment_type  601245 non-null  object
 3   target              601245 non-null  object
dtypes: object(4)
memory usage: 22.9+ MB


In [231]:
data_historical[['Agent Corrected CAT Name','Agent Corrected int']]= data_historical.target.str.split(":",expand=True)

In [287]:
set_historical_categories_aftermapping = set(data_historical['Agent Corrected CAT Name'].unique())
#set_master_mapping_data_categories = set(master_mapping_data['CAT NAME,CAT_TEMP'].unique())
dif_categories = list(set_historical_categories_aftermapping.difference(set_master_mapping_data_categories))
len(dif_categories)

29

In [288]:
data_historical["Agent Corrected CAT Name standarized"] = data_historical["Agent Corrected CAT Name"].replace(cat_dict)

In [289]:
data_historical['Agent Corrected Integer standardized'] = data_historical['Agent Corrected CAT Name standarized'].map(mapping1)


In [290]:
set_historical_categories_aftermapping = set(data_historical['Agent Corrected CAT Name standarized'].unique())
#set_master_mapping_data_categories = set(master_mapping_data['CAT NAME,CAT_TEMP'].unique())
dif_categories = list(set_historical_categories_aftermapping.difference(set_master_mapping_data_categories))
len(dif_categories)

7

In [291]:
data_historical1 = data_historical[~data_historical['Agent Corrected CAT Name standarized'].isin(dif_categories)]

In [293]:
set_historical_categories_aftermapping1 = set(data_historical1['Agent Corrected CAT Name standarized'].unique())
#set_master_mapping_data_categories = set(master_mapping_data['CAT NAME,CAT_TEMP'].unique())
dif_categories = list(set_historical_categories_aftermapping1.difference(set_master_mapping_data_categories))
len(dif_categories)

0

In [241]:
data_historical1.shape

(100774, 8)

In [243]:
dif_categories

[]

In [294]:
del data_combined_final

In [295]:
data_combined_final = pd.concat([data_cicd_latest, data_historical1], ignore_index=True)

In [319]:
set_combined_categories_aftermapping1 = set(data_combined_final['Agent Corrected CAT Name standarized'].unique())
#set_master_mapping_data_categories = set(master_mapping_data['CAT NAME,CAT_TEMP'].unique())
dif_combined_categories = list(set_combined_categories_aftermapping1.difference(set_master_mapping_data_categories))
len(dif_combined_categories)

0

In [300]:
dif_combined_categories

[]

In [298]:
data_combined_final = data_combined_final.dropna(axis=0, subset=['Agent Corrected CAT Name standarized'])
#data_cicd = data_cicd.dropna(axis=0, subset=['Agent Corrected CAT Name'])

In [256]:
data_combined_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 702019 entries, 0 to 702018
Data columns (total 8 columns):
 #   Column                                Non-Null Count   Dtype 
---  ------                                --------------   ----- 
 0   Item                                  702016 non-null  object
 1   Description                           540165 non-null  object
 2   establishment_type                    702019 non-null  object
 3   target                                702019 non-null  object
 4   Agent Corrected CAT Name              100774 non-null  object
 5   Agent Corrected int                   100774 non-null  object
 6   Agent Corrected CAT Name standarized  100774 non-null  object
 7   Agent Corrected Integer standardized  100774 non-null  object
dtypes: object(8)
memory usage: 42.8+ MB


In [301]:
data_combined_final.shape

(100774, 8)

In [302]:
data_combined_final = data_combined_final.astype(str)

In [303]:
data_combined_final['combined_text'] = data_combined_final[['Item','Description','establishment_type']].apply(lambda x: ' '.join(x[x.notnull()]), axis = 1)

In [304]:
data_combined_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100774 entries, 601245 to 702018
Data columns (total 9 columns):
 #   Column                                Non-Null Count   Dtype 
---  ------                                --------------   ----- 
 0   Item                                  100774 non-null  object
 1   Description                           100774 non-null  object
 2   establishment_type                    100774 non-null  object
 3   target                                100774 non-null  object
 4   Agent Corrected CAT Name              100774 non-null  object
 5   Agent Corrected int                   100774 non-null  object
 6   Agent Corrected CAT Name standarized  100774 non-null  object
 7   Agent Corrected Integer standardized  100774 non-null  object
 8   combined_text                         100774 non-null  object
dtypes: object(9)
memory usage: 7.7+ MB


In [305]:
data_combined_final['processed_text']= data_combined_final['combined_text'].map(lambda s:preprocess_text(s)) 
print(data_combined_final.shape)

(100774, 10)


In [306]:

#data_combined_final['combined_text'] = data_combined_final[['Item','Description','establishment_type']].apply(lambda x: ' '.join(x[x.notnull()]), axis = 1)

data_combined_final.drop_duplicates(subset=['processed_text','target'],inplace=True)
print(data_combined_final.shape)
data_combined_final.dropna(subset=['target'],inplace=True)
data_combined_final = data_combined_final.reset_index(drop=True)
data_combined_final = data_combined_final[data_combined_final['target']!= '#REF!:#REF!']
print(data_combined_final.shape)


(92066, 10)
(92066, 10)


In [307]:
data_combined_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 92066 entries, 0 to 92065
Data columns (total 10 columns):
 #   Column                                Non-Null Count  Dtype 
---  ------                                --------------  ----- 
 0   Item                                  92066 non-null  object
 1   Description                           92066 non-null  object
 2   establishment_type                    92066 non-null  object
 3   target                                92066 non-null  object
 4   Agent Corrected CAT Name              92066 non-null  object
 5   Agent Corrected int                   92066 non-null  object
 6   Agent Corrected CAT Name standarized  92066 non-null  object
 7   Agent Corrected Integer standardized  92066 non-null  object
 8   combined_text                         92066 non-null  object
 9   processed_text                        92066 non-null  object
dtypes: object(10)
memory usage: 7.7+ MB


In [335]:
import pandas as pd
pd.__version__

'1.3.5'

In [3]:
data_combined_final.to_csv("data_combined_final.csv")

NameError: name 'data_combined_final' is not defined

In [5]:
data_combined_final = pd.read_csv("/Users/jghosh2/Documents/my-notebook/restro folder/data_combined_final.csv")

In [6]:
#combine historical with combined cicd

In [7]:
X = data_combined_final[['processed_text']]
y = data_combined_final['target']
# split the cicd data into train and test 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .20, random_state = 42)

<a id='Model Training'></a>

In [8]:
print('Training data size: {}'.format(len(X_train)))
print('Test data size: {}'.format(len(X_test)))

Training data size: 73652
Test data size: 18414


In [9]:
print('Number of unique labels : {}'.format(len(y_train.unique().tolist())))

Number of unique labels : 96


# Model Training

The Model Pipeline consists of 1. CountVectorizer, 2. Tfidf-Transformer 3. RandomForestClassifier 

In [14]:
#del rf, result

NameError: name 'rf' is not defined

In [10]:
#create the model pipeline
rf1 = Pipeline([('vect', CountVectorizer(token_pattern=u'(?ui)\\b\\w*[a-z]+\\w*\\b',stop_words='english',max_df=0.85)),
            ('tfidf', TfidfTransformer()),
            #('mnb', MultinomialNB(alpha= 0.05,fit_prior= False))])
            ('clf', RandomForestClassifier(n_jobs=-1, random_state=42,class_weight='balanced',max_depth=400))])




In [11]:
# perform model training
rf1.fit(X_train['processed_text'].values, y_train.values)

  if LooseVersion(joblib_version) < '0.12':
  if _joblib.__version__ >= LooseVersion('0.12'):


Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=0.85,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words='english', strip_accents=None,
                                 token_pattern='(?ui)\\b\\w*[a-z]+\\w*\\b',
                                 tokenizer=None,...
                 RandomForestClassifier(bootstrap=True, class_weight='balanced',
                                        criterion='gini', max_depth=400,
                                        max_features='auto',
                                        max_leaf_nodes=None,
                                        min_impurity_decrease

In [12]:
# create a result dataframe to store final results
result1=X_test
# model prediction
y_pred = rf1.predict(X_test['processed_text'].values)

result1['original_cat']= y_test.values
result1['predicted_cat'] = y_pred
result1['prediction_cat_confscore'] = np.round_(np.max(rf1.predict_proba(X_test['processed_text']), axis=1), decimals=2)

#
output = {'accuracy': accuracy_score(y_pred,y_test),'precision_score':precision_score(y_pred,y_test,average='weighted'),'recall_score':recall_score(y_pred,y_test,average='weighted')
,'f1_score':f1_score(y_pred,y_test,average='weighted')}

result1['confusion_matrix'] = str(output)

  if _joblib.__version__ >= LooseVersion('0.12'):
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
  if _joblib.__version__ >= LooseVersion('0.12'):
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'pre

In [17]:
len(rf1.named_steps['vect'].get_feature_names())

19078

In [18]:
print(rf1.named_steps['clf'].estimators_[0].tree_.max_depth)

400


In [19]:
depths = [tree.tree_.max_depth for tree in rf1.named_steps['clf'].estimators_]
print(f"Mean tree depth in the Random Forest: {np.round(np.mean(depths))}")

Mean tree depth in the Random Forest: 400.0


In [13]:
result1['confusion_matrix'][5:6].values

array(["{'accuracy': 0.7975453459324428, 'precision_score': 0.8138793467109812, 'recall_score': 0.7975453459324428, 'f1_score': 0.7848983695716372}"],
      dtype=object)

In [197]:
result['confusion_matrix'][5:6].values

array(["{'accuracy': 0.7541733509109103, 'precision_score': 0.7564808336603325, 'recall_score': 0.7541733509109103, 'f1_score': 0.74356504529723}"],
      dtype=object)

In [147]:
result['confusion_matrix'][5:6].values

array(["{'accuracy': 0.7541733509109103, 'precision_score': 0.7564808336603325, 'recall_score': 0.7541733509109103, 'f1_score': 0.74356504529723}"],
      dtype=object)

<a id='Model Saving'></a>

In [17]:
result['confusion_matrix'][5:6].values

array(["{'accuracy': 0.8072686733556299, 'precision_score': 0.8029800433559108, 'recall_score': 0.8072686733556299, 'f1_score': 0.7904798356764975}"],
      dtype=object)

In [16]:
array(["{'accuracy': 0.8214672148885522, 'precision_score': 0.8704898295343778, 'recall_score': 0.8214672148885522, 'f1_score': 0.8401291751124788}"],
      dtype=object)

NameError: name 'array' is not defined

In [27]:
result1.to_csv("result1.csv")

In [24]:
result1[['predicted_cat_name','predicted_int']]= result1.predicted_cat.str.split(":",expand=True)
result_categories1 = set(result1['predicted_cat_name'].unique())
dif_result_categories1 = list(result_categories1.difference(set_master_mapping_data_categories))
len(dif_result_categories1)

18

In [25]:
dif_result_categories1

['CAT_WATER,TRAIT_NONCARB,TRAIT_UNFLV_UNSWT,CONTAINER_BOTTLED,TEMP_HEATED',
 'CAT_COFFEE,CONTAINER_BOTTLED,TEMP_HEATED',
 'CAT_PREPARED_DRINK,TEMP_HEATED',
 'CAT_WATER,TRAIT_NONCARB,TRAIT_FLV_SWT,CONTAINER_BOTTLED,TEMP_HEATED',
 'CAT_CANDY,TRAIT_FLOUR,TEMP_HEATED',
 'CAT_WATER,TRAIT_CARB,TRAIT_FLV_SWT,CONTAINER_BOTTLED,TEMP_HEATED',
 'CAT_JUICE,TRAIT_PCT_100,TEMP_HEATED',
 'CAT_ICECREAM,TEMP_HEATED',
 'CAT_PREPACKAGED_FOOD,CAT_ICECREAM,TEMP_HEATED',
 'CAT_WATER,TRAIT_CARB,TRAIT_UNFLV_UNSWT,CONTAINER_BOTTLED,TEMP_HEATED',
 'CAT_SOFT_DRINK,CONTAINER_BOTTLED,TEMP_HEATED',
 'CAT_TEA,CONTAINER_BOTTLED,TEMP_HEATED',
 'CAT_MILK_COCOA,CONTAINER_BOTTLED,TEMP_HEATED',
 'CAT_ALCOHOL,TEMP_HEATED',
 'CAT_PREPACKAGED_FOOD,TEMP_HEATED',
 'CAT_ENERGY_DRINK,CONTAINER_BOTTLED,TEMP_HEATED',
 'CAT_PREPACKAGED_FOOD,CAT_SNACK,TEMP_HEATED',
 'CAT_DELI_PLATTER,TEMP_COLD']

In [332]:
set_master_mapping_data_categories.to_csv("set_master_mapping_data_categories.csv")

AttributeError: 'set' object has no attribute 'to_csv'

In [333]:
set_master_mapping_data_categories

{'CAT_ALCOHOL,TEMP_COLD',
 'CAT_ANTI_FREEZE',
 'CAT_BABY_FORMULA',
 'CAT_BABY_WIPES',
 'CAT_BAKERY_ITEM',
 'CAT_BAKERY_ITEM_GROCERY_STORE',
 'CAT_BANDAGES',
 'CAT_BATHING_SUITS',
 'CAT_BATTERIES',
 'CAT_BEER',
 'CAT_BREATH_MINTS',
 'CAT_CANDY,TEMP_COLD',
 'CAT_CANDY,TRAIT_FLOUR,TEMP_COLD',
 'CAT_CANDY_COATED_NUTS',
 'CAT_CANNABIS',
 'CAT_CHARCOAL_BRIQUETTES',
 'CAT_CHOCOLATE',
 'CAT_CIDER',
 'CAT_CLOTHING',
 'CAT_COFFEE,CONTAINER_BOTTLED,TEMP_COLD',
 'CAT_COMBOS_BUNDLES',
 'CAT_COMP_HARDWARE',
 'CAT_COMP_SOFTWARE',
 'CAT_CONDOMS',
 'CAT_CONFECTIONARY',
 'CAT_CONTACT_LENS_SOLUTION',
 'CAT_COSTUMES',
 'CAT_DELI_PLATTER,TEMP_UNHEATED',
 'CAT_DIAPERS',
 'CAT_DISPOSABLE_GLOVES',
 'CAT_ENERGY_DRINK,CONTAINER_BOTTLED,TEMP_COLD',
 'CAT_ENGINE_OIL',
 'CAT_FEMININE_HYGIENE_PRODUCTS',
 'CAT_FERTILIZER',
 'CAT_FIREWOOD',
 'CAT_FIRST_AID_KITS',
 'CAT_FLASHLIGHT',
 'CAT_FOOD_BY_WT_VOL,TEMP_UNHEATED',
 'CAT_FOOTWEAR',
 'CAT_FORTIFIED_WINE',
 'CAT_FRUIT_VEG_PLANTS',
 'CAT_GIFT_CARDS',
 'CAT_GLOVES',
 

In [331]:
dif_result_categories1

['CAT_ALCOHOL,TEMP_HEATED',
 'CAT_PREPACKAGED_FOOD,TEMP_HEATED',
 'CAT_JUICE,TRAIT_PCT_100,TEMP_HEATED',
 'CAT_COFFEE,CONTAINER_BOTTLED,TEMP_HEATED',
 'CAT_PREPARED_DRINK,TEMP_HEATED',
 'CAT_PREPACKAGED_FOOD,CAT_SNACK,TEMP_HEATED',
 'CAT_CANDY,TRAIT_FLOUR,TEMP_HEATED',
 'CAT_MILK_COCOA,CONTAINER_BOTTLED,TEMP_HEATED',
 'CAT_PREPACKAGED_FOOD,CAT_ICECREAM,TEMP_HEATED',
 'CAT_WATER,TRAIT_CARB,TRAIT_FLV_SWT,CONTAINER_BOTTLED,TEMP_HEATED',
 'CAT_ICECREAM,TEMP_HEATED',
 'CAT_TEA,CONTAINER_BOTTLED,TEMP_HEATED',
 'CAT_WATER,TRAIT_NONCARB,TRAIT_UNFLV_UNSWT,CONTAINER_BOTTLED,TEMP_HEATED',
 'CAT_DELI_PLATTER,TEMP_COLD',
 'CAT_SOFT_DRINK,CONTAINER_BOTTLED,TEMP_HEATED',
 'CAT_WATER,TRAIT_NONCARB,TRAIT_FLV_SWT,CONTAINER_BOTTLED,TEMP_HEATED',
 'CAT_WATER,TRAIT_CARB,TRAIT_UNFLV_UNSWT,CONTAINER_BOTTLED,TEMP_HEATED',
 'CAT_ENERGY_DRINK,CONTAINER_BOTTLED,TEMP_HEATED']

In [202]:
dif_result_categories

['CAT_ALCOHOL,TEMP_HEATED',
 'CAT_PREPACKAGED_FOOD,TEMP_HEATED',
 'CAT_JUICE,TRAIT_PCT_100,TEMP_HEATED',
 'CAT_PREPARED_DRINK,TEMP_HEATED',
 'CAT_PREPACKAGED_FOOD,CAT_SNACK,TEMP_HEATED',
 'TEMP_HEATED',
 'CAT_MILK_COCOA,CONTAINER_BOTTLED,TEMP_HEATED',
 'TEMP_COLD',
 'CAT_ICECREAM,TEMP_HEATED',
 'CAT_WATER,TRAIT_CARB,TRAIT_FLV_SWT,CONTAINER_BOTTLED,TEMP_HEATED',
 'CAT_TEA,CONTAINER_BOTTLED,TEMP_HEATED',
 'CAT_WATER,TRAIT_NONCARB,TRAIT_UNFLV_UNSWT,CONTAINER_BOTTLED,TEMP_HEATED',
 'CAT_SOFT_DRINK,CONTAINER_BOTTLED,TEMP_HEATED',
 'CAT_WATER,TRAIT_NONCARB,TRAIT_FLV_SWT,CONTAINER_BOTTLED,TEMP_HEATED',
 'CAT_WATER,TRAIT_CARB,TRAIT_UNFLV_UNSWT,CONTAINER_BOTTLED,TEMP_HEATED',
 'CAT_ENERGY_DRINK,CONTAINER_BOTTLED,TEMP_HEATED',
 'CAT_PREPACKAGED_FOOD,CAT_ICECREAM,TEMP_HEATED']

# Model Saving

In [148]:
import joblib
import datetime
# save the model to disk
filename_primary= 'finalized_model.sav'
joblib.dump(rf, open(filename_primary, 'wb'))

<a id='Validation and Results'></a>

# Validation and Results

In [None]:
#accuracy score of the model
accuracy = rf.score(X_test['processed_text'].values, y_test)
print("Accuracy = {}".format(accuracy))

In [None]:
from sklearn import metrics
import matplotlib.pyplot as plt

In [None]:
#classification report 
classification_report = metrics.classification_report(y_test_final, y_pred, output_dict=True)

In [None]:
display(pd.DataFrame(classification_report).transpose())

In [None]:
# check the misclassifications
misclassifications= result.loc[result['original_cat']!=result['predicted_cat']]

In [None]:
misclassifications

In [None]:
misclassifications.to_csv('../output/misclassifications.csv')

In [None]:
misclassifications.groupby(['establishment_type']).size()

In [19]:
train_size = 0.8
train_end = int(len(data_df)*train_size)
df_train = data_df[:train_end]
df_test = data_df[train_end:]
df_train = df_train[['Item','Description','establishment_type','target']]
df_test = df_test[['Item','Description','establishment_type','target']]
train_size_cicd=0.02
train_end_cicd = int(len(data_cicd_final)*train_size_cicd)
df2_train = data_cicd_final[:train_end_cicd]
df2_test = data_cicd_final[train_end_cicd:]
df2_train = df2_train[['Item','Description','establishment_type','target']]
df2_test = df2_test[['Item','Description','establishment_type','target']]
X_train_save = df_train.append(df2_train)
X_test_save = df_test.append(df2_test)
X_train_save['label'] = 'train'
X_test_save['label'] = 'test'
X_data = X_train_save.append(X_test_save)
X_data.to_csv('df_traintestdata.csv')

In [None]:
X_data