# <h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Import Requirements" data-toc-modified-id="Import-Requirements-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Import Requirements</a></span></li><li><span><a href="#Prepare Training Data" data-toc-modified-id="Prepare-Training-Data-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Prepare Training Data</a></span><ul class="toc-item"></ul></li><li><span><a href="#Model Training" data-toc-modified-id="Model Training-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Model Training</a></span></li><li><span><a href="#Model Saving" data-toc-modified-id="Model Saving-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Model Saving</a></span><ul class="toc-item"></ul></li><li><span><a href="#Validation and Results" data-toc-modified-id="Validation and Results-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Validation and Results</a></span><ul class="toc-item"></ul></div>

<a id='Import Requirements'></a>

# Import Requirements

In [1]:
import pandas as pd
import numpy as np
import json
import os
import re
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc, confusion_matrix
from sklearn.pipeline import Pipeline
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import wordnet
import string
import matplotlib.pyplot as plt

<a id='Prepare Training Data'></a>

# Prepare Training Data

Input data for training consists of both historical data and CICD data( Production run data for which manual agent validation has been done for the ML prediction)

In [2]:
def preprocess_text(message):

    #stopwords
    stpwrd = nltk.corpus.stopwords.words('english')
    #stpwrd.extend(new_stopwords)
    # 1. Init Lemmatizer
    lemmatizer = WordNetLemmatizer()
    #lowering and removing punctuation
    message = re.sub(r'[^\w\s]','', message.lower())
    #removing the numerical values and working only with text values
    message = re.sub('[^a-zA-Z]', " ", message )
    #removing the stopwords
    message = ' '.join([word for word in message.split() if word not in stpwrd and len(word)>2])
    #lemmatizing the text
    message =  " ".join([lemmatizer.lemmatize(w) for w in nltk.word_tokenize(message) if w not in string.punctuation])
    #print("message is : ",message)
    message=message.split()[0:15]
    message = " ".join([word for word in message])
    return message

In [3]:
#read input from cicd data into dataframe
data_cicd=pd.read_csv('../data/TaxML-CICD - Prod_Data (12).csv', usecols = ['Item','Description','establishment_type','Confidence Score','Agent Corrected CAT Name', 'Agent Corrected Integer','CAT NAME_ ValidationScore [0-100]','Integer_ValidationScore[0-100]'])
#low conf and correcr predicted data
data_cicd_low_conf=data_cicd[(data_cicd['Confidence Score']<0.6)]
data_cicd_low_conf=data_cicd_low_conf[(data_cicd_low_conf['CAT NAME_ ValidationScore [0-100]']==100) &(data_cicd_low_conf['Integer_ValidationScore[0-100]']==100)]
#misclassified data                                        
data_cicd_misclassification=data_cicd[(data_cicd['CAT NAME_ ValidationScore [0-100]']!=100) &(data_cicd['Integer_ValidationScore[0-100]']!=100)]
data_cicd_latest=data_cicd_low_conf.append(data_cicd_misclassification)
data_cicd_latest=data_cicd_latest[['Item','Description','establishment_type','Agent Corrected CAT Name', 'Agent Corrected Integer']]

ongoing_phase1_cicd=pd.read_csv('../data/Ongoing Training_Duplicate Data.csv', usecols = ['Item','Description','establishment_type','Agent Corrected CAT Name', 'Agent Corrected Integer'])

#remove empty rows from dataframe
data_cicd_latest.dropna(how='all',inplace=True)
ongoing_phase1_cicd.dropna(how='all',inplace=True)

#remove rows having empty 'Agent Corrected CAT Name', ''Agent Corrected Integer'
data_cicd_latest.dropna(subset=['Agent Corrected CAT Name', 'Agent Corrected Integer'],inplace=True)
ongoing_phase1_cicd.dropna(subset=['Agent Corrected CAT Name', 'Agent Corrected Integer'],inplace=True)
#removing all duplicate rows
data_cicd_latest=data_cicd_latest.drop_duplicates(subset=['Item','establishment_type','Agent Corrected CAT Name', 'Agent Corrected Integer'],keep='first')
print(data_cicd_latest.shape)
ongoing_phase1_cicd=ongoing_phase1_cicd.drop_duplicates(subset=['Item','establishment_type','Agent Corrected CAT Name', 'Agent Corrected Integer'],keep='first')
print(ongoing_phase1_cicd.shape)
#fresh data of cicd
data_cicd_without_duplicate=data_cicd_latest.drop_duplicates(subset=['Item','establishment_type'],keep=False)
print(data_cicd_without_duplicate.shape)

#append two dataframe
data_cicd_final=data_cicd_without_duplicate.append(ongoing_phase1_cicd)
#creat target string which will be used for prediction 
data_cicd_final['target']= data_cicd_final['Agent Corrected CAT Name'] + ":" + data_cicd_final['Agent Corrected Integer']
print(data_cicd_final.shape)
#data_cicd_duplicate=data_cicd[data_cicd.duplicated(subset=['Item','establishment_type'],keep=False)]
#data_cicd_duplicate.to_csv('data_cicd_duplicate.csv')
#print(data_cicd_duplicate.shape)

(35365, 5)
(1383, 5)
(32714, 5)
(34097, 6)


In [4]:
# combine the columns Item, Description and establishment_type into one column 'combined_text'
data_cicd_final['combined_text'] = data_cicd_final[['Item','establishment_type','Description']].apply(lambda x: ' '.join(x[x.notnull()]), axis = 1)

# apply data preprocessing steps on the prepared column
data_cicd_final['processed_text']= data_cicd_final['combined_text'].map(lambda s:preprocess_text(s)) 

data_cicd_final = data_cicd_final.reset_index(drop=True)
# prepare the target column by combining 'Agent Corrected CAT Name' and 'Agent Corrected Integer'

data_cicd_final=data_cicd_final.drop_duplicates(subset=['processed_text','target'],keep='first')
print(data_cicd_final.shape)

#remove rows having empty target column
data_cicd_final.dropna(subset=['target'],inplace=True)

data_cicd_final = data_cicd_final[data_cicd_final['target']!= '#REF!:#REF!']
data_cicd_final.to_csv('data_cicd_final.csv')
X_cicd= data_cicd_final[['Item','Description','establishment_type','processed_text']]
y_cicd= data_cicd_final['target']

# split the cicd data into train and test 
X_train_cicd, X_test_cicd, y_train_cicd, y_test_cicd = train_test_split(X_cicd, y_cicd,shuffle=True, test_size = .02, random_state = 42)



(31540, 8)


In [5]:
#read input from historical data into dataframe
data_df = pd.read_csv('/Users/jghosh2/Documents/my-notebook/Tax_ml_poc/data/historical_data_14_01_22.csv', encoding='utf8',engine='python',usecols=['Item','Description','establishment_type','target'])
#choose sample data from entire data
data_df = data_df.sample(frac=1, random_state=42)

#fill blanks with ''
data_df = data_df.fillna('')
# combine the columns Item, Description and establishment_type into one column 'combined_text'
data_df['combined_text'] = data_df[['Item','establishment_type','Description']].apply(lambda x: ' '.join(x[x.notnull()]), axis = 1)
# apply data preprocessing steps on the prepared column
data_df['processed_text'] = data_df['combined_text'].map(lambda s:preprocess_text(s)) 
print(data_df.shape)
data_df=data_df.drop_duplicates(subset=['processed_text','target'],keep='first')
print(data_df.shape)
data_df = data_df.reset_index(drop=True)

X = data_df[['Item','Description','establishment_type','processed_text']]
y = data_df['target']

# split the cicd data into train and test 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .20,shuffle=True, random_state = 42)


(204725, 6)
(160865, 6)


We will append the CICD data to the historical data to create the final train and test data.
Train set has 80% of all historical data and 90% of all cicd data.
Test set consists of 20% of historic data and 10% of all cicd data.

In [6]:
X_train_final = X_train.append(X_train_cicd)
X_test_final = X_test.append(X_test_cicd)
y_train_final = y_train.append(y_train_cicd)
y_test_final = y_test.append(y_test_cicd)

In [7]:
X_train_final.to_csv('final_xtrain.csv')

<a id='Model Training'></a>

In [8]:
print('Training data size: {}'.format(len(X_train_final)))
print('Test data size: {}'.format(len(X_test_final)))

Training data size: 159597
Test data size: 32804


In [9]:
print('Number of unique labels : {}'.format(len(y_train.unique().tolist())))

Number of unique labels : 350


In [10]:
category_count = data_df.groupby(['target'],sort=False).agg({'target':'count'})
category_count.rename(columns={'target':'count'},inplace=True)
category_count.sort_values('count',ascending=False)

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
CAT_LIQUOR:535,21161
CAT_WINE:534,15628
CAT_TPP:531,12477
"CAT_PREPARED_FOOD,TEMP_HEATED:101,1",11158
CAT_BEER:533,9966
...,...
"CAT_TPP_AIR_FRESHENER,TEMP_UNHEATED:773,1",1
"CAT_SPARKLING_WINE,TEMP_HEATED:716,1",1
"CAT_PREPACKAGED_FOOD_CAKES,TEMP_HEATED:718,1",1
"CAT_CHOCOLATE,TEMP_HEATED:706,1",1


In [11]:
category_count.to_csv('category_count.csv')

In [12]:
# Check on how the training data sets perform with varying split percentages and 1000 shuffles
def acc_check(X1,y1):
    t = [0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2]
    rf = Pipeline([('vect', CountVectorizer(strip_accents='ascii', token_pattern=u'(?ui)\\b\\w*[a-z]+\\w*\\b', stop_words='english', max_df=0.85)),
       ('tfidf', TfidfTransformer()),
       ('clf', RandomForestClassifier(oob_score=True, n_jobs=-1, random_state=42))])
    plt.figure()
    for s in t:
        scores = []
        for i in range(1,1000):
            X_train, X_test, y_train, y_test = train_test_split(X1, y1, test_size = 1-s,random_state=42)
            rf.fit(X_train, y_train)
            scores.append(rf.score(X_test, y_test))
        plt.plot(s, np.mean(scores), 'bo')
    plt.xlabel('Training set proportion (%)')
    plt.ylabel('accuracy');

print('ready')

ready


In [13]:
#c1 = acc_check(X['processed_text'],y)
#print('tax category',c1)

# Model Training

The Model Pipeline consists of 1. CountVectorizer, 2. Tfidf-Transformer 3. RandomForestClassifier 

In [14]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(strip_accents='ascii',token_pattern=u'(?ui)\\b\\w*[a-z]+\\w*\\b', stop_words='english', max_df=0.85)

X = vectorizer.fit_transform(X_train_final['processed_text'].values)

features = vectorizer.get_feature_names()

len(features)

50347

In [15]:
# create a result dataframe to store final results
result=X_test_final

#create the model pipeline
rf = Pipeline([('vect', CountVectorizer(strip_accents='ascii',token_pattern=u'(?ui)\\b\\w*[a-z]+\\w*\\b', stop_words='english')),
             #('pca',  PCA(n_components=2)),
            ('tfidf', TfidfTransformer()),
            #('auto', AutoSklearnClassifier(time_left_for_this_task=2*60, per_run_time_limit=30, n_jobs=8)),
            ('clf', RandomForestClassifier(class_weight='balanced',n_jobs=-1,random_state=42))])

In [16]:
# perform model training
rf.fit(X_train_final['processed_text'].values, y_train_final.values)



Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words='english', strip_accents='ascii',
                                 token_pattern='(?ui)\\b\\w*[a-z]+\\w*\\b',
                                 tokenizer=Non...
                 RandomForestClassifier(bootstrap=True, class_weight='balanced',
                                        criterion='gini', max_depth=None,
                                        max_features='auto',
                                        max_leaf_nodes=None,
                                        min_impurity_decreas

In [17]:
# model prediction
result=X_test_final
y_pred = rf.predict(X_test_final['processed_text'].values)

result['original_cat']= y_test_final.values
result['predicted_cat'] = y_pred
result['prediction_cat_confscore'] = rf.predict_proba(X_test_final['processed_text']).max()

#
output = {'accuracy': accuracy_score(y_pred,y_test_final),'precision_score':precision_score(y_pred,y_test_final,average='weighted'),'recall_score':recall_score(y_pred,y_test_final,average='weighted')
,'f1_score':f1_score(y_pred,y_test_final,average='weighted')}

result['confusion_matrix'] = str(output)

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [18]:
result['confusion_matrix'][5:6].values

array(["{'accuracy': 0.6906474820143885, 'precision_score': 0.7129271150817615, 'recall_score': 0.6906474820143885, 'f1_score': 0.6973517725442497}"],
      dtype=object)

In [19]:
import pickle
import joblib
import datetime
# save the model to disk
filename_primary= 'finalized_model.sav'
joblib.dump(rf, open(filename_primary, 'wb'))

In [20]:
rf.named_steps['clf']

RandomForestClassifier(bootstrap=True, class_weight='balanced',
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=10, n_jobs=-1, oob_score=False,
                       random_state=42, verbose=0, warm_start=False)

In [21]:
#Check the depth of the first tree in the Random Forest
print(rf.named_steps['clf'].estimators_[0].tree_.max_depth)

2569


In [22]:
#Let’s check the depth of all the trees in the Forest:
depths = [tree.tree_.max_depth for tree in rf.named_steps['clf'].estimators_]
print(len(depths))
print(f"Mean tree depth in the Random Forest: {np.round(np.mean(depths))}")


10
Mean tree depth in the Random Forest: 2564.0


In [23]:
import joblib

In [24]:
#Check the size of single tree in the disk after saving with joblib:

joblib.dump(rf.named_steps['clf'].estimators_[0], "first_tree_from_RF.joblib") 
print(f"Single tree size: {np.round(os.path.getsize('first_tree_from_RF.joblib') / 1024 / 1024, 2) } MB")

Single tree size: 379.02 MB


In [25]:
joblib.dump(rf.named_steps['clf'].estimators_, "RandomForest_100_trees.joblib") 
print(f"Random Forest size: {np.round(os.path.getsize('RandomForest_100_trees.joblib') / 1024 / 1024, 2) } MB")

Random Forest size: 3755.42 MB


In [26]:
from sklearn.metrics import log_loss

In [27]:
#accuracy score of the model
accuracy = rf.score(X_test_final['processed_text'].values, y_test_final)
print("Accuracy = {}".format(accuracy))

Accuracy = 0.6906474820143885


In [28]:
# y_predicted = rf.predict_proba(X_test_final['processed_text'].values)
# rf_loss = log_loss(y_test_final, y_predicted)
# print(rf_loss)


In [29]:
shallow_rf = Pipeline([('vect', CountVectorizer(strip_accents='ascii', token_pattern=u'(?ui)\\b\\w*[a-z]+\\w*\\b', stop_words='english', max_df=0.85)),
       ('tfidf', TfidfTransformer()),
       ('clf', RandomForestClassifier(n_jobs=-1, random_state=42,class_weight='balanced_subsample'))])

In [30]:
# shallow_rf = RandomForestClassifier(max_depth=6)
# shallow_rf.fit(X_train, y_train)
# perform model training
shallow_rf.fit(X_train_final['processed_text'].values, y_train_final.values)



Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=0.85,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words='english', strip_accents='ascii',
                                 token_pattern='(?ui)\\b\\w*[a-z]+\\w*\\b',
                                 tokenizer=No...
                 RandomForestClassifier(bootstrap=True,
                                        class_weight='balanced_subsample',
                                        criterion='gini', max_depth=None,
                                        max_features='auto',
                                        max_leaf_nodes=None,
          

In [31]:
#Check the depth of the first tree in the Random Forest
print(shallow_rf.named_steps['clf'].estimators_[0].tree_.max_depth)

2352


In [32]:
#Check the size of single tree in the disk after saving with joblib:

joblib.dump(shallow_rf.named_steps['clf'].estimators_[0], "first_tree_from_RF.joblib") 
print(f"Single tree size: {np.round(os.path.getsize('first_tree_from_RF.joblib') / 1024 / 1024, 2) } MB")

Single tree size: 374.66 MB


In [33]:
#Check the size of single tree in the disk after saving with joblib:

joblib.dump(shallow_rf.named_steps['clf'].estimators_, "all_tree_from_RF.joblib") 
print(f"Single tree size: {np.round(os.path.getsize('all_tree_from_RF.joblib') / 1024 / 1024, 2) } MB")

Single tree size: 3748.89 MB


In [34]:
#accuracy score of the model
accuracy = shallow_rf.score(X_test_final['processed_text'].values, y_test_final)
print("Accuracy = {}".format(accuracy))

Accuracy = 0.6861053530057311


In [35]:
y_pred = shallow_rf.predict(X_test_final['processed_text'].values)

In [36]:
import sklearn.metrics as metrics

In [37]:
classification_report = metrics.classification_report(y_test_final, y_pred, output_dict=True)

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [38]:
display(pd.DataFrame(classification_report).transpose())

Unnamed: 0,precision,recall,f1-score,support
"CAT_ALCOHOL,TEMP_COLD:109,1",0.234483,0.251852,0.242857,135.000000
"CAT_ALCOHOL,TEMP_HEATED:109,1",0.582569,0.496094,0.535865,1280.000000
"CAT_ALCOHOL,TEMP_UNHEATED:109,1",0.165605,0.091873,0.118182,283.000000
CAT_BABY_FORMULA:515,1.000000,1.000000,1.000000,3.000000
CAT_BABY_WIPES:513,0.555556,0.625000,0.588235,8.000000
...,...,...,...,...
"CAT_WINE,TEMP_UNHEATED:534,1",0.108108,0.042105,0.060606,95.000000
CAT_WINE:534,0.865049,0.865329,0.865189,3089.000000
accuracy,0.686105,0.686105,0.686105,0.686105
macro avg,0.445018,0.417342,0.417562,32804.000000


In [39]:
# shallow_rf2 = Pipeline([('vect', CountVectorizer(strip_accents='ascii', token_pattern=u'(?ui)\\b\\w*[a-z]+\\w*\\b', stop_words='english', max_df=0.85)),
#        ('tfidf', TfidfTransformer()),
#        ('clf', RandomForestClassifier(oob_score=True, n_jobs=-1, random_state=42, max_depth = 200, n_estimators=20 ))])

In [40]:
# shallow_rf2.fit(X_train_final['processed_text'].values, y_train_final.values)

In [41]:
# #Check the size of single tree in the disk after saving with joblib:

# joblib.dump(shallow_rf2.named_steps['clf'].estimators_[0], "first_tree_from_RF.joblib") 
# print(f"Single tree size: {np.round(os.path.getsize('first_tree_from_RF.joblib') / 1024 / 1024, 2) } MB")

In [42]:
# joblib.dump(rf.named_steps['clf'].estimators_, "RandomForest_100_trees.joblib") 
# print(f"Random Forest size: {np.round(os.path.getsize('RandomForest_100_trees.joblib') / 1024 / 1024, 2) } MB")

In [43]:
# #accuracy score of the model
# accuracy = shallow_rf2.score(X_test_final['processed_text'].values, y_test_final)
# print("Accuracy = {}".format(accuracy))

In [44]:
# model prediction
result=X_test_final
y_pred = shallow_rf.predict(X_test_final['processed_text'].values)

result['original_cat']= y_test_final
result['predicted_cat'] = y_pred

result['prediction_cat_confscore'] = shallow_rf.predict_proba(X_test_final['processed_text'].values).max()

#
output = {'accuracy': accuracy_score(y_pred,y_test_final),'precision_score':precision_score(y_pred,y_test_final,average='weighted'),'recall_score':recall_score(y_pred,y_test_final,average='weighted')
,'f1_score':f1_score(y_pred,y_test_final,average='weighted')}

result['confusion_matrix'] = str(output)

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [45]:
# import pickle
# import datetime
# # save the model to disk
# filename_primary= 'finalized_model.sav'
# pickle.dump(shallow_rf2, open(filename_primary, 'wb'))

<a id='Model Saving'></a>

# Model Saving

In [46]:
import pickle
import joblib
import datetime
# save the model to disk
filename_primary= 'finalized_model.sav'
joblib.dump(shallow_rf, open(filename_primary, 'wb'),3)

In [47]:
import joblib
x=joblib.load('finalized_model.sav')

In [48]:
y_pred = x.predict(X_test_final['processed_text'].values)

result['original_cat']= y_test_final
result['predicted_cat'] = y_pred

result['prediction_cat_confscore'] = x.predict_proba(X_test_final['processed_text'].values).max()

#
output = {'accuracy': accuracy_score(y_pred,y_test_final),'precision_score':precision_score(y_pred,y_test_final,average='weighted'),'recall_score':recall_score(y_pred,y_test_final,average='weighted')
,'f1_score':f1_score(y_pred,y_test_final,average='weighted')}

result['confusion_matrix'] = str(output)

In [49]:
result['confusion_matrix'] [5:6].values

array(["{'accuracy': 0.6861053530057311, 'precision_score': 0.7091731522419424, 'recall_score': 0.6861053530057311, 'f1_score': 0.6928526114294886}"],
      dtype=object)

In [50]:
array(["{'accuracy': 0.7656088560885609, 'precision_score': 0.795161345923433, 'recall_score': 0.7656088560885609, 'f1_score': 0.7773461118314546}"],
      dtype=object)

NameError: name 'array' is not defined

<a id='Validation and Results'></a>

# Validation and Results

In [None]:
#accuracy score of the model
accuracy = x.score(X_test_final['processed_text'].values, y_test_final)
print("Accuracy = {}".format(accuracy))

# Regression test

In [None]:
#accuracy score of the model of regression
accuracy = x.score(X_cicd['processed_text'].values, y_cicd)
print("Accuracy = {}".format(accuracy))

In [None]:
from sklearn import metrics


In [None]:
#classification report 
classification_report = metrics.classification_report(y_test_final, y_pred, output_dict=True)

In [None]:
display(pd.DataFrame(classification_report).transpose())

Saving the train and test data for reference 

In [None]:
train_size = 0.8
train_end = int(len(data_df)*train_size)
df_train = data_df[:train_end]
df_test = data_df[train_end:]
train_size_cicd=0.9
train_end_cicd = int(len(data_cicd)*train_size_cicd)
df2_train = data_cicd[:train_end_cicd]
df2_test = data_cicd[train_end_cicd:]
df2_train = df2_train[['Item','Description','establishment_type','combined_text','processed_text','target']]
df2_test = df2_test[['Item','Description','establishment_type','combined_text','processed_text','target']]
X_train_save = df_train.append(df2_train)
X_test_save = df_test.append(df2_test)
X_train_save['label'] = 'train'
X_test_save['label'] = 'test'
X_data = X_train_save.append(X_test_save)
X_data.to_csv('df_traintestdata.csv')

In [None]:
result['confusion_matrix'][5:6].values

In [None]:
array(["{'accuracy': 0.7351734317343174, 'precision_score': 0.7736218733293683, 'recall_score': 0.7351734317343174, 'f1_score': 0.749379108225673}"],
      dtype=object)

In [None]:
# check the misclassifications
misclassifications= result.loc[result['original_cat']!=result['predicted_cat']]

In [None]:
len(misclassifications)

In [None]:
misclassifications.to_csv('misclassifications.csv')

In [None]:
output = {'accuracy': accuracy_score(y_pred,y_test_final),'precision_score':precision_score(y_pred,y_test_final,average='weighted'),'recall_score':recall_score(y_pred,y_test_final,average='weighted')
,'f1_score':f1_score(y_pred,y_test_final,average='weighted')}

result['confusion_matrix'] = str(output)