# <h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Import Requirements" data-toc-modified-id="Import-Requirements-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Import Requirements</a></span></li><li><span><a href="#Prepare Training Data" data-toc-modified-id="Prepare-Training-Data-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Prepare Training Data</a></span><ul class="toc-item"></ul></li><li><span><a href="#Model Training" data-toc-modified-id="Model Training-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Model Training</a></span></li><li><span><a href="#Model Saving" data-toc-modified-id="Model Saving-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Model Saving</a></span><ul class="toc-item"></ul></li><li><span><a href="#Validation and Results" data-toc-modified-id="Validation and Results-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Validation and Results</a></span><ul class="toc-item"></ul></div>

<a id='Import Requirements'></a>

# Import Requirements

In [1]:
import pandas as pd
import numpy as np
import json
import os
import re
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc, confusion_matrix
from sklearn.pipeline import Pipeline
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import wordnet
import string
import matplotlib.pyplot as plt
from sklearn.svm import LinearSVC
import numpy as np
from sklearn.model_selection import GridSearchCV

<a id='Prepare Training Data'></a>

# Prepare Training Data

Input data for training consists of both historical data and CICD data( Production run data for which manual agent validation has been done for the ML prediction)

In [2]:
def preprocess_text(message):

    #stopwords
    stpwrd = nltk.corpus.stopwords.words('english')
    lemmatizer = WordNetLemmatizer()
    message=message.lower()
    message = re.sub(r'-',' ', message)
    #removing the numerical values and working only with text values
    message = re.sub('[^a-zA-Z]', " ", message)
    #lowering and removing punctuation
    message = re.sub(r'[^\w\s]',' ',message)
    #removing the stopwords
    message = ' '.join([word for word in message.split() if word not in stpwrd])
    #lemmatizing the text
    message =  " ".join([lemmatizer.lemmatize(w) for w in nltk.word_tokenize(message) if w not in string.punctuation])
    #removing hyperlinks
    message = re.sub(r'http\S+', ' ', message)
    return message

In [12]:
data_inca1=pd.read_csv('../data/retrain_data/INDIRECTTX-1954-03-15.csv', usecols = ['entity_uuid','product_name','product_description','establishment_type','CAT NAME,CAT_TEMP','Integers'])
data_inca2=pd.read_csv('../data/retrain_data/INDIRECTTX-1954-02-15.csv', usecols = ['entity_uuid','product_name','product_description','establishment_type','CAT NAME,CAT_TEMP','Integers'])
data_inca3=pd.read_csv('../data/retrain_data/2023-05-22 - Albertsons Liquor in Illinois - Menu Items & Existing Tax Categories.csv', usecols = ['entity_uuid','product_name','product_description','establishment_type','CAT NAME,CAT_TEMP','Integers'])
data_inca4=pd.read_csv('../data/retrain_data/INDIRECTTX-1954 - 2023-06-09.csv', usecols = ['entity_uuid','product_name','product_description','establishment_type','CAT NAME,CAT_TEMP','Integers'])
data_inca=pd.concat([data_inca1,data_inca2,data_inca3,data_inca4])
data_inca['target_new']=data_inca['CAT NAME,CAT_TEMP']+":"+ data_inca['Integers']
data_inca.drop(['CAT NAME,CAT_TEMP', 'Integers'],inplace=True,axis=1)
data_inca=data_inca.rename(columns={'product_name': 'Item', 'product_description': 'Description','entity_uuid':'UniqueUUID'})

In [13]:
data_inca.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 35207 entries, 0 to 21127
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   UniqueUUID          35207 non-null  object
 1   Item                35207 non-null  object
 2   Description         33530 non-null  object
 3   establishment_type  35207 non-null  object
 4   target_new          35207 non-null  object
dtypes: object(5)
memory usage: 1.6+ MB


In [9]:
#read input from historical data into dataframe
data_df = pd.read_csv('/Users/jghosh2/Documents/my-notebook/Tax_ml_poc/data/historical_data_24_11_22.csv', encoding='utf8',engine='python',usecols=['UniqueUUID','Item','Description','establishment_type','target_new'])

In [10]:
#read input from cicd data into dataframe
data_cicd=pd.read_csv('../data/retrain_data/TaxML-CICD - Prod_Data_latest.csv', usecols = ['UniqueUUID','Item','Description','establishment_type','Confidence Score','Agent Corrected CAT Name', 'Agent Corrected Integer','CAT NAME_ ValidationScore [0-100]','Integer_ValidationScore[0-100]'])
print(data_cicd.shape)
#misclassified data                                        
data_cicd_misclassification=data_cicd[(data_cicd['CAT NAME_ ValidationScore [0-100]']==0)| (data_cicd['Integer_ValidationScore[0-100]']==0)]
data_cicd_latest=data_cicd_misclassification[['UniqueUUID','Item','Description','establishment_type','Agent Corrected CAT Name', 'Agent Corrected Integer']]
data_cicd_latest['target_new']=data_cicd_latest['Agent Corrected CAT Name'] + ":" + data_cicd_latest['Agent Corrected Integer']
data_cicd_latest.drop(['Agent Corrected CAT Name', 'Agent Corrected Integer'],inplace=True,axis=1)
print(data_cicd_latest.info())
data_cicd_final = pd.concat([data_inca, data_cicd_latest], join="outer")
final_data=pd.concat([data_df, data_cicd_final], join="outer")
final_data.to_csv('training_data.csv',index=False)
# combine the columns Item, Description and establishment_type into one column 'combined_text'
data_cicd_final['combined_text'] = data_cicd_final[['Item','Description','establishment_type']].apply(lambda x: ' '.join(x[x.notnull()]), axis = 1)
# apply data preprocessing steps on the prepared column
data_cicd_final['processed_text']= data_cicd_final['combined_text'].map(lambda s:preprocess_text(s)) 

data_cicd_final = data_cicd_final.reset_index(drop=True)
# prepare the target column by combining 'Agent Corrected CAT Name' and 'Agent Corrected Integer'

data_cicd_final=data_cicd_final.drop_duplicates(subset=['processed_text','target_new'],keep='first')
print(data_cicd_final.shape)

#remove rows having empty target column
data_cicd_final.dropna(subset=['target_new'],inplace=True)


X_cicd= data_cicd_final[['Item','Description','establishment_type','processed_text']]
y_cicd= data_cicd_final['target_new']

# split the cicd data into train and test 
X_train_cicd, X_test_cicd, y_train_cicd, y_test_cicd = train_test_split(X_cicd, y_cicd,shuffle=True, test_size = .01, random_state = 42)



  exec(code_obj, self.user_global_ns, self.user_ns)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


(583371, 9)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 39928 entries, 11374 to 583282
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   UniqueUUID          39928 non-null  object
 1   Item                39928 non-null  object
 2   Description         31371 non-null  object
 3   establishment_type  39928 non-null  object
 4   target_new          39928 non-null  object
dtypes: object(5)
memory usage: 1.8+ MB
None
(21971, 7)


In [12]:
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 396190 entries, 0 to 396189
Data columns (total 5 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   UniqueUUID          396190 non-null  object
 1   Item                396187 non-null  object
 2   Description         271012 non-null  object
 3   establishment_type  396190 non-null  object
 4   target_new          396190 non-null  object
dtypes: object(5)
memory usage: 15.1+ MB


In [13]:
#mergedStuff_cicd= data_df.set_index('UniqueUUID').join(data_cicd_latest.set_index('UniqueUUID'))
#final_data=pd.concat([data_df, data_cicd_final], join="outer")

In [14]:
final_data

Unnamed: 0,UniqueUUID,Item,Description,establishment_type,target_new
0,\n,Captain Morgan White Rum.1.75L Bottle,,GROCERY,CAT_LIQUOR:535
1,\n,D’USSÉ® VSOP Cognac.375ml Bottle,,GROCERY,CAT_LIQUOR:535
2,\n,Jim Beam Honey Bourbon Whiskey.750ml Bottle,,GROCERY,CAT_LIQUOR:535
3,\n,Ardbeg Scotch Uigeadail. 750ml Bottle,,GROCERY,CAT_LIQUOR:535
4,\n,Jameson Irish Whiskey.1.75L Bottle Size,,GROCERY,CAT_LIQUOR:535
...,...,...,...,...,...
583278,e64e6ef6-a179-5c09-babe-565f6d743b80:f3589175-...,Planters · Cocktail Peanuts (12 oz),12 oz,GROCERY,CAT_PREPACKAGED_FOOD_SNACK_NUTS:747
583279,e64e6ef6-a179-5c09-babe-565f6d743b80:6d218339-...,Café Bustelo · Espresso Ground Coffee (6 oz),6 oz,GROCERY,CAT_PREPACKAGED_FOOD_INSTANT_COFFEE:733
583280,e64e6ef6-a179-5c09-babe-565f6d743b80:1fd4c419-...,Old El Paso · Vegetarian Refried Beans (16 oz),16 oz,GROCERY,CAT_PREPACKAGED_FOOD_CANNED_BEANS:719
583281,e64e6ef6-a179-5c09-babe-565f6d743b80:d6980d8d-...,The Greek Gods · Greek Style Nonfat Plain Yogu...,24 oz,GROCERY,"CAT_PREPACKAGED_FOOD,CAT_SNACK,TEMP_COLD:106,1..."


In [15]:
# combine the columns Item, Description and establishment_type into one column 'combined_text'
data_df['combined_text'] = data_df[['Item','Description','establishment_type']].apply(lambda x: ' '.join(x[x.notnull()]), axis = 1)
# apply data preprocessing steps on the prepared column
data_df['processed_text']= data_df['combined_text'].map(lambda s:preprocess_text(s)) 
print(data_df.shape)

(396190, 7)


In [16]:
data_df=data_df.drop_duplicates(subset=['processed_text','target_new'],keep='first')
print(data_df.shape)
#remove rows having empty target column
data_df.dropna(subset=['target_new'],inplace=True)

X= data_df[['Item','Description','establishment_type','processed_text']]
y= data_df['target_new']

# split the cicd data into train and test 
X_train, X_test, y_train, y_test = train_test_split(X, y,shuffle=True, test_size = .20, random_state = 42)
print(X_train.shape)

(343116, 7)
(274492, 4)


We will append the CICD data to the historical data to create the final train and test data.
Train set has 80% of all historical data and 90% of all cicd data.
Test set consists of 20% of historic data and 10% of all cicd data.

In [17]:
X_train_final = X_train.append(X_train_cicd)
X_test_final = X_test.append(X_test_cicd)
y_train_final = y_train.append(y_train_cicd)
y_test_final = y_test.append(y_test_cicd)

#X_train_final = X_train
#X_test_final = X_test
#y_train_final = y_train
#y_test_final = y_test

<a id='Model Training'></a>

In [18]:
print('Training data size: {}'.format(len(X_train_final)))
print('Test data size: {}'.format(len(X_test_final)))

Training data size: 296243
Test data size: 68844


In [19]:
print('Number of unique labels in train data: {}'.format(len(y_train_final.unique().tolist())))
print('Number of unique labels in test data: {}'.format(len(y_test_final.unique().tolist())))

Number of unique labels in train data: 206
Number of unique labels in test data: 197


In [20]:
category_count = data_df.groupby(['target_new'],sort=False).agg({'target_new':'count'})
category_count.rename(columns={'target_new':'count'},inplace=True)
category_count.sort_values('count',ascending=False)

Unnamed: 0_level_0,count
target_new,Unnamed: 1_level_1
CAT_LIQUOR:535,37090
"CAT_PREPARED_FOOD,TEMP_HEATED:101,1",32171
CAT_TPP:531,31809
CAT_WINE:534,22136
CAT_BEER:533,17534
...,...
CAT_TPP_CAMPING_EQUIPMENT:779,4
CAT_POSTAGE:527,2
CAT_PRESCRIPTION_DRUGS:520,2
CAT_OIL:778,1


In [21]:
category_count.to_csv('category_count.csv')

# Model Training

The Model Pipeline consists of 1. CountVectorizer, 2. Tfidf-Transformer 3. MultinomialNB 

In [22]:
#vectorizer = CountVectorizer(strip_accents='ascii',token_pattern=u'(?ui)\\b\\w*[a-z]+\\w*\\b', stop_words='english', max_df=0.85)
#X = vectorizer.fit_transform(X_train_final['processed_text'].values)
#features = vectorizer.get_feature_names()
#len(features)

In [24]:
import time
t_start=time.time()

In [25]:
# create a result dataframe to store final results
result=X_test_final

#create the model pipeline
rf = Pipeline([('vect', CountVectorizer(strip_accents='ascii',max_df=0.85)),
            ('tfidf', TfidfTransformer()),
            #('mnb', MultinomialNB(alpha= 0.05,fit_prior= False))])
            #('clf', RandomForestClassifier())])
            ('svc',LinearSVC())])

In [26]:
# perform model training
rf.fit(X_train_final['processed_text'].values, y_train_final.values)

  if LooseVersion(joblib_version) < '0.12':


Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=0.85,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents='ascii',
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('svc',
                 LinearSVC(C=1.0, class_weight=None, dual=True,
                           fit_intercept=True, intercept_scaling=1,
                 

In [29]:
t_end=time.time()
interval=t_end-t_start
time_minutes=time.strftime("%H:%M:%S", time.gmtime(interval)
print("Total model training time: {}".format(time_minutes))

SyntaxError: invalid syntax (3172894089.py, line 4)

In [30]:
# model prediction
result=X_test_final
y_pred = rf.predict(X_test_final['processed_text'].values)

result['original_cat']= y_test_final.values
result['predicted_cat'] = y_pred
#result['prediction_cat_confscore'] = np.round_(np.max(rf.predict_proba(X_test_final['processed_text']), axis=1), decimals=2)
result['prediction_cat_confscore'] = np.round_(1/(1+(np.max(rf.decision_function(X_test_final['processed_text'].values), axis=1))),decimals=2)

#
output = {'accuracy': accuracy_score(y_pred,y_test_final),'precision_score':precision_score(y_pred,y_test_final,average='weighted'),'recall_score':recall_score(y_pred,y_test_final,average='weighted')
,'f1_score':f1_score(y_pred,y_test_final,average='weighted')}

result['confusion_matrix'] = str(output)

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [31]:
result['confusion_matrix'] [5:6].values

array(["{'accuracy': 0.7910638545116495, 'precision_score': 0.8015364216905002, 'recall_score': 0.7910638545116495, 'f1_score': 0.7948336169385961}"],
      dtype=object)

In [None]:
#array(["{'accuracy': 0.7481380408209677, 'precision_score': 0.8104281909828657, 'recall_score': 0.7481380408209677, 'f1_score': 0.772676580987015}"],
      #dtype=object)
result

# Hyperparameter tuning

In [25]:
grid_params = {
  'mnb__alpha': [0.01,0.05,0.1,0.2,0.3],
}
clf = GridSearchCV(rf, grid_params,n_jobs=-1,verbose=1)
clf.fit(X_train_final['processed_text'].values, y_train_final.values)
print("Best Score: ", clf.best_score_)
print("Best Params: ", clf.best_params_)



Fitting 3 folds for each of 5 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:   45.5s finished
  if LooseVersion(joblib_version) < '0.12':


Best Score:  0.7174939692303207
Best Params:  {'mnb__alpha': 0.05}


In [None]:
# Gridsearch to determine the value of C
param_grid = {'svc__C':np.arange(0.01,100,10)}
linearSVC = GridSearchCV(rf,param_grid,cv=2,return_train_score=True)
linearSVC.fit(X_train_final['processed_text'].values, y_train_final.values)
print(linearSVC.best_params_)
#linearSVC.coef_
#linearSVC.intercept_

bestlinearSVC = linearSVC.best_estimator_
bestlinearSVC.fit(X_train,y_train)
bestlinearSVC.coef_ = bestlinearSVC.named_steps['SVC'].coef_
bestlinearSVC.score(X_train,y_train)



In [None]:
import sklearn.metrics as metrics

In [None]:
classification_report = metrics.classification_report(y_test_final, y_pred, output_dict=True)

In [None]:
display=pd.DataFrame(classification_report).transpose()

In [None]:
display.to_csv('classification_report.csv')

<a id='Model Saving'></a>

# Model Saving

In [32]:
import pickle
import joblib
import datetime
# save the model to disk
filename_primary= 'finalized_model.sav'
model_dir_taxml='/Users/jghosh2/Documents/my-notebook/Tax_ml_poc/model/latest'
# save the model to disk
filename_primary= 'finalized_model.sav'
model_path = os.path.join(model_dir_taxml, filename_primary) 
print(model_path)
pickle.dump(rf, open(model_path, 'wb'))

/Users/jghosh2/Documents/my-notebook/Tax_ml_poc/model/latest/finalized_model.sav


<a id='Validation and Results'></a>

# Validation and Results

In [34]:
#accuracy score of the model
import joblib
x=joblib.load(model_path)
accuracy = x.score(X_train_final['processed_text'].values, y_train_final)
print("Accuracy = {}".format(accuracy))

Accuracy = 0.8697386942476278


# Regression test

In [35]:
import joblib
x=joblib.load(model_path)

In [36]:
X_train_reg=X_train_final['processed_text'][:70000]

In [37]:
X_test_reg=X_test_final['processed_text'][:30000]

In [38]:
X_reg=pd.concat([X_train_reg,X_test_reg])

In [39]:
y_train_reg=y_train_final[:70000]

In [40]:
y_test_reg=y_test_final[:30000]

In [41]:
y_reg=pd.concat([y_train_reg,y_test_reg])

In [42]:
#accuracy score of the model of regression
accuracy = x.score(X_reg, y_reg)
print("Accuracy = {}".format(accuracy))

Accuracy = 0.84842


In [45]:
accuracy_df=pd.DataFrame()
accuracy_df['Accuracy']=[accuracy]

In [46]:
accuracy_df.to_csv('old_accuracy.csv',index=False)

In [47]:
old_accuracy=pd.read_csv('./old_accuracy.csv')

In [48]:
if old_accuracy['Accuracy'].item()>accuracy:
    print('No need to update model')
else:
     print('Need to update model')

Need to update model


# Saving the train and test data for reference 

In [49]:
train_size = 0.8
train_end = int(len(data_df)*train_size)
df_train = data_df[:train_end]
df_test = data_df[train_end:]
train_size_cicd=0.02
train_end_cicd = int(len(data_cicd_final)*train_size_cicd)
df2_train = data_cicd_final[:train_end_cicd]
df2_test = data_cicd_final[train_end_cicd:]
df2_train = df2_train[['Item','Description','establishment_type','combined_text','processed_text','target_new']]
df2_test = df2_test[['Item','Description','establishment_type','combined_text','processed_text','target_new']]
X_train_save = df_train.append(df2_train)
X_test_save = df_test.append(df2_test)
X_train_save['label'] = 'train'
X_test_save['label'] = 'test'
X_data = X_train_save.append(X_test_save)
X_data.to_csv('df_traintestdata.csv')

In [43]:
# check the misclassifications
misclassifications= result.loc[result['original_cat']!=result['predicted_cat']]

In [44]:
len(misclassifications)

16781

In [45]:
misclassifications.to_csv('misclassifications.csv')

# model tagging on new dataset

In [3]:
import joblib
x=joblib.load('finalized_model.sav')

In [4]:
#read input from historical data into dataframe
df_1= pd.read_csv('/Users/jghosh2/Documents/my-notebook/Tax_ml_poc/data/ml_tagged_data/before/2023-07-14 - Walmart Canada_7.csv',encoding='latin-1',engine='python')
#choose sample data from entire data
df_1 = df_1.sample(frac=1, random_state=42)
#fill blanks with ''
df_1 = df_1.fillna('')
# combine the columns Item, Description and establishment_type into one column 'combined_text'
df_1['combined_text'] = df_1[['product_name','product_description','establishment_type']].apply(lambda x: ' '.join(x[x.notnull()]), axis = 1)
# apply data preprocessing steps on the prepared column
df_1['processed_text'] = df_1['combined_text'].map(lambda s:preprocess_text(s)) 
print(df_1.shape)
df_1 = df_1.reset_index(drop=True)
y_pred = x.predict(df_1['processed_text'].values)
df_1['target'] = y_pred
df_1[['cat_name','cat_int']] = df_1['target'].str.split(':', expand=True)
df_1['prediction_cat_confscore'] = np.round_(1/(1+(np.max(x.decision_function(df_1['processed_text'].values), axis=1))),decimals=2)
#df_1['prediction_cat_confscore'] =np.round_(np.max(x.predict_proba(df_1['processed_text']), axis=1), decimals=2)
df_1.drop(['combined_text','processed_text','target'], inplace=True, axis=1)
df_1.to_csv('/Users/jghosh2/Documents/my-notebook/Tax_ml_poc/data/ml_tagged_data/after/2023-07-14 - Walmart Canada_after_tagging_7.csv')

(475000, 14)


AttributeError: 'LinearSVC' object has no attribute 'predict_proba'

In [7]:
#read input from historical data into dataframe
df_1= pd.read_csv('/Users/jghosh2/Documents/my-notebook/Tax_ml_poc/data/ml_tagged_data/before/2023-07-14 - Walmart Canada_8.csv',encoding='latin-1',engine='python')
#choose sample data from entire data
df_1 = df_1.sample(frac=1, random_state=42)
#fill blanks with ''
df_1 = df_1.fillna('')
# combine the columns Item, Description and establishment_type into one column 'combined_text'
df_1['combined_text'] = df_1[['product_name','product_description','establishment_type']].apply(lambda x: ' '.join(x[x.notnull()]), axis = 1)
# apply data preprocessing steps on the prepared column
df_1['processed_text'] = df_1['combined_text'].map(lambda s:preprocess_text(s)) 
print(df_1.shape)
df_1 = df_1.reset_index(drop=True)
y_pred = x.predict(df_1['processed_text'].values)
df_1['target'] = y_pred
df_1[['cat_name','cat_int']] = df_1['target'].str.split(':', expand=True)
#df_1['prediction_cat_confscore'] =np.round_(np.max(x.predict_proba(df_1['processed_text']), axis=1), decimals=2)
df_1['prediction_cat_confscore'] = np.round_(1/(1+(np.max(x.decision_function(df_1['processed_text'].values), axis=1))),decimals=2)
df_1.drop(['combined_text','processed_text','target'], inplace=True, axis=1)
df_1.to_csv('/Users/jghosh2/Documents/my-notebook/Tax_ml_poc/data/ml_tagged_data/after/2023-07-14 - Walmart Canada_after_tagging_8.csv')

(475000, 14)


In [8]:
#read input from historical data into dataframe
df_1= pd.read_csv('/Users/jghosh2/Documents/my-notebook/Tax_ml_poc/data/ml_tagged_data/before/2023-07-14 - Walmart Canada_9.csv',encoding='latin-1',engine='python')
#choose sample data from entire data
df_1 = df_1.sample(frac=1, random_state=42)
#fill blanks with ''
df_1 = df_1.fillna('')
# combine the columns Item, Description and establishment_type into one column 'combined_text'
df_1['combined_text'] = df_1[['product_name','product_description','establishment_type']].apply(lambda x: ' '.join(x[x.notnull()]), axis = 1)
# apply data preprocessing steps on the prepared column
df_1['processed_text'] = df_1['combined_text'].map(lambda s:preprocess_text(s)) 
print(df_1.shape)
df_1 = df_1.reset_index(drop=True)
y_pred = x.predict(df_1['processed_text'].values)
df_1['target'] = y_pred
df_1[['cat_name','cat_int']] = df_1['target'].str.split(':', expand=True)
#df_1['prediction_cat_confscore'] =np.round_(np.max(x.predict_proba(df_1['processed_text']), axis=1), decimals=2)
df_1['prediction_cat_confscore'] = np.round_(1/(1+(np.max(x.decision_function(df_1['processed_text'].values), axis=1))),decimals=2)
df_1.drop(['combined_text','processed_text','target'], inplace=True, axis=1)
df_1.to_csv('/Users/jghosh2/Documents/my-notebook/Tax_ml_poc/data/ml_tagged_data/after/2023-07-14 - Walmart Canada_after_tagging_9.csv')

(318749, 14)


In [15]:
#read input from historical data into dataframe
df_1= pd.read_csv('/Users/jghosh2/Documents/my-notebook/Tax_ml_poc/data/ml_tagged_data/before/2023-07-14 - Walmart Canada_6.csv',encoding='latin-1',engine='python')
#choose sample data from entire data
df_1 = df_1.sample(frac=1, random_state=42)
#fill blanks with ''
df_1 = df_1.fillna('')
# combine the columns Item, Description and establishment_type into one column 'combined_text'
df_1['combined_text'] = df_1[['product_name','product_description','establishment_type']].apply(lambda x: ' '.join(x[x.notnull()]), axis = 1)
# apply data preprocessing steps on the prepared column
df_1['processed_text'] = df_1['combined_text'].map(lambda s:preprocess_text(s)) 
print(df_1.shape)
df_1 = df_1.reset_index(drop=True)
y_pred = x.predict(df_1['processed_text'].values)
df_1['target'] = y_pred
df_1[['cat_name','cat_int']] = df_1['target'].str.split(':', expand=True)
df_1['prediction_cat_confscore'] =np.round_(np.max(x.predict_proba(df_1['processed_text']), axis=1), decimals=2)
df_1.drop(['combined_text','processed_text','target'], inplace=True, axis=1)
df_1.to_csv('/Users/jghosh2/Documents/my-notebook/Tax_ml_poc/data/ml_tagged_data/after/2023-07-14 - Walmart Canada_after_tagging_6.csv')

(475000, 14)


In [5]:
#read input from historical data into dataframe
df_2= pd.read_csv('/Users/jghosh2/Documents/my-notebook/Tax_ml_poc/data/ml_tagged_data/before/2023-01-16 - Weekly New GroCo Menu Items.csv', encoding='utf8',engine='python')
#choose sample data from entire data
df_2 = df_2.sample(frac=1, random_state=42)
#fill blanks with ''
df_2 = df_2.fillna('')
# combine the columns Item, Description and establishment_type into one column 'combined_text'
df_2['combined_text'] = df_2[['item_name','description','establishment_type']].apply(lambda x: ' '.join(x[x.notnull()]), axis = 1)
# apply data preprocessing steps on the prepared column
df_2['processed_text'] = df_2['combined_text'].map(lambda s:preprocess_text(s)) 
print(df_2.shape)
df_2 = df_2.reset_index(drop=True)
y_pred = x.predict(df_2['processed_text'].values)
df_2['target'] = y_pred
df_2[['cat_name','cat_int']] = df_2['target'].str.split(':', expand=True)
df_2.drop('target', inplace=True, axis=1)
df_2['prediction_cat_confscore'] =np.round_(np.max(x.predict_proba(df_2['processed_text']), axis=1), decimals=2)
df_2.to_csv('/Users/jghosh2/Documents/my-notebook/Tax_ml_poc/data/ml_tagged_data/after/2023-01-16 - Weekly New GroCo Menu Items_after_tagging.csv')
print(df_2.shape)

(132281, 9)
(132281, 12)


In [8]:
#read input from historical data into dataframe
df_3= pd.read_csv('/Users/jghosh2/Documents/my-notebook/Tax_ml_poc/data/ml_tagged_data/before/INDIRECTTX-2456 - Uber Eats Market (Jersey City) - INCA Tax Tagging - INCA Products.csv', encoding='latin-1',engine='python')
#choose sample data from entire data
df_3 = df_3.sample(frac=1, random_state=42)
#fill blanks with ''
df_3 = df_3.fillna('')
# combine the columns Item, Description and establishment_type into one column 'combined_text'
df_3['combined_text'] = df_3[['product_name','product_description','establishment_type']].apply(lambda x: ' '.join(x[x.notnull()]), axis = 1)
# apply data preprocessing steps on the prepared column
df_3['processed_text'] = df_3['combined_text'].map(lambda s:preprocess_text(s)) 
print(df_3.shape)
df_3 = df_3.reset_index(drop=True)
y_pred = x.predict(df_3['processed_text'].values)
df_3['target'] = y_pred
df_3[['cat_name','cat_int']] = df_3['target'].str.split(':', expand=True)
df_3.drop('target', inplace=True, axis=1)
df_3['prediction_cat_confscore'] =np.round_(np.max(x.predict_proba(df_3['processed_text']), axis=1), decimals=2)
df_3.drop('processed_text', inplace=True, axis=1)
df_3.drop('combined_text', inplace=True, axis=1)
df_3.to_csv('/Users/jghosh2/Documents/my-notebook/Tax_ml_poc/data/ml_tagged_data/after/INDIRECTTX-2456 - Uber Eats Market (Jersey City) - INCA Tax Tagging - INCA Products_after_tagging.csv')
print(df_3.shape)

(3198, 25)
(3198, 26)


In [3]:
#read input from historical data into dataframe
df_3= pd.read_csv('/Users/jghosh2/Documents/my-notebook/Tax_ml_poc/data/bf.csv', encoding='latin-1',engine='python')
#choose sample data from entire data
df_3 = df_3.sample(frac=1, random_state=42)
#fill blanks with ''
df_3 = df_3.fillna('')
# combine the columns Item, Description and establishment_type into one column 'combined_text'
df_3['combined_text'] = df_3[['Item','Description','establishment_type']].apply(lambda x: ' '.join(x[x.notnull()]), axis = 1)
# apply data preprocessing steps on the prepared column
df_3['processed_text'] = df_3['combined_text'].map(lambda s:preprocess_text(s)) 


In [5]:
df_3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2415 entries, 410 to 860
Data columns (total 21 columns):
 #   Column                               Non-Null Count  Dtype 
---  ------                               --------------  ----- 
 0   Unnamed: 0                           2415 non-null   int64 
 1   CICD Run Date                        2415 non-null   object
 2   Date (BOT Sent Details to COE Team)  2415 non-null   object
 3   UniqueUUID                           2415 non-null   object
 4   store_uuid                           2415 non-null   object
 5   item_uuid                            2415 non-null   object
 6   Item                                 2415 non-null   object
 7   Description                          2415 non-null   object
 8   establishment_type                   2415 non-null   object
 9   ML CAT Name                          2415 non-null   object
 10  ML Integer                           2415 non-null   object
 11  ML Confidence Score                  2415 

In [8]:
print(df_3.shape)
df_3 = df_3.reset_index(drop=True)
y_pred = x.predict(df_3['processed_text'].values)
df_3['target'] = y_pred
df_3[['ML CAT Name','ML Integer']] = df_3['target'].str.split(':', expand=True)
df_3.drop('target', inplace=True, axis=1)
df_3['ML Confidence Score'] =np.round_(np.max(x.predict_proba(df_3['processed_text']), axis=1), decimals=2)
df_3.drop('processed_text', inplace=True, axis=1)
df_3.drop('combined_text', inplace=True, axis=1)
df_3.to_csv('/Users/jghosh2/Documents/my-notebook/Tax_ml_poc/data/bf.csv')
print(df_3.shape)

(2415, 21)
(2415, 19)
