In [1]:
import pandas as pd
import numpy as np
import json
import os
import re
from tqdm.notebook import tqdm
tqdm.pandas()
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc, confusion_matrix
from sklearn.pipeline import Pipeline
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import wordnet
import string

In [2]:
def preprocess_text(message):

    new_stopwords=['ml','oz','pk','grocery','lb']
    stpwrd = nltk.corpus.stopwords.words('english')
    stpwrd.extend(new_stopwords)
    # 1. Init Lemmatizer
    lemmatizer = WordNetLemmatizer()
    #lowering and removing punctuation
    message = re.sub(r'[^\w\s]','', message.lower())
    #removing the numerical values and working only with text values
    message = re.sub('[^a-zA-Z]', " ", message )
    #removing the stopwords
    message = ' '.join([word for word in message.split() if word not in stpwrd and len(word)>1])
    #lemmatizing the text
    message =  " ".join([lemmatizer.lemmatize(w) for w in nltk.word_tokenize(message) if w not in string.punctuation])
    #print("message is : ",message)
    return message


In [3]:
df = pd.read_csv('../data/jina_final.csv', encoding='utf8',engine='python',usecols=['Item','Description','establishment_type','primary','secondary'])
df1=df.sample(frac=1, random_state=42)
df1 = df1.fillna('')
df1['input_str'] = df1[['Item','Description','establishment_type']].apply(lambda x: ' '.join(x[x.notnull()]), axis = 1)
df1['cleanText']=df1['input_str'].map(lambda s:preprocess_text(s)) 
df1 = df1.reset_index(drop=True)
X=df1[['Item','Description','establishment_type','cleanText']]
Y_primary=df1['primary']
Y_secondary=df1['secondary']

In [5]:
#Train test split with stratified sampling for evaluation
X_train, X_test, y_train_primary, y_test_primary = train_test_split(X,
                                                    Y_primary,
                                                    test_size = .20, 
                                                    random_state = 42
                                                    )
#Train test split with stratified sampling for evaluation
X_train, X_test,y_train_secondary, y_test_secondary = train_test_split(X,
                                                    Y_secondary,
                                                    test_size = .20, 
                                                    random_state = 42
                                                    )

####cicd part

In [6]:
df2_cicd =pd.read_csv('../code/TaxML-CICD - Prod_Data20-12-21_after_preprocess.csv',encoding='utf8',engine='python')

df2 = df2_cicd

In [7]:
len(df2)

1321

In [8]:
df2['input_str'] = df2[['Item','Description','establishment_type']].apply(lambda x: ' '.join(x[x.notnull()]), axis = 1)
df2['cleanText']=df2['input_str'].map(lambda s:preprocess_text(s))
X_cicd=df2[['Item','Description','establishment_type','cleanText']]

Y_primary_cicd=df2['primary']
Y_secondary_cicd=df2['secondary']
X_train_cicd, X_test_cicd, y_train_primary_cicd, y_test_primary_cicd = train_test_split(X_cicd,
                                                    Y_primary_cicd,
                                                    test_size = .20, 
                                                    random_state = 42
                                                    )
#Train test split with stratified sampling for evaluation
X_train_cicd, X_test_cicd,y_train_secondary_cicd, y_test_secondary_cicd = train_test_split(X_cicd,
                                                    Y_secondary_cicd,
                                                    test_size = .20, 
                                                    random_state = 42
                                                    )

In [9]:
X_train

Unnamed: 0,Item,Description,establishment_type,cleanText
74328,"Exotico Blanco, 750mL bottle (40% ABV)",,GROCERY,exotico blanco bottle abv
93071,Cortas Halva Original (16 oz),,GROCERY,cortas halva original
160229,"Meiomi Pinot Noir, 750mL wine (13.7% ABV)",,GROCERY,meiomi pinot noir wine abv
33442,Ito en Peach Veggie Shot,30.36 fluid ounces.,GROCERY,ito en peach veggie shot fluid ounce
129506,"Val Di Giulia Barbaresco, 750mL italian red wi...",,GROCERY,val di giulia barbaresco italian red wine abv
...,...,...,...,...
119879,Half & Half Quart,,GROCERY,half half quart
103694,Alpine Beer Company Duet Pale Ale,This West Coast pale ale made with Simcoe and ...,GROCERY,alpine beer company duet pale ale west coast p...
131932,Romance Flowers Medium Red,,FLOWERS,romance flower medium red flower
146867,Renal Care (32 oz),Our renal care recipe has low protein. It cont...,PET,renal care renal care recipe low protein conta...


In [13]:
X_train_final_1 = X_train.append(X_train_cicd)

In [14]:
len(X_train_final_1)

137012

In [15]:
len(X_test)

33989

In [16]:
len(X_test_cicd)

265

In [17]:
X_test

Unnamed: 0,Item,Description,establishment_type,cleanText
12184,Dungeons and Dragons 5E Candle Keep Mysteries,Great books hide their secrets well. An anthol...,GROCERY,dungeon dragon candle keep mystery great book ...
142293,Tam Fruit Company-Fresh Cut Celery Sticks10 oz,,GROCERY,tam fruit companyfresh cut celery stick
44634,Burnetts Vodka 1.75L 40% ABV,\N,GROCERY,burnett vodka abv
136489,"Llano ""Cab Sav"", 750mL wine (13.2% ABV)",,GROCERY,llano cab sav wine abv
44540,Emergen-C-Super Orange30 ct,,GROCERY,emergencsuper orange ct
...,...,...,...,...
98048,Hartley VSOP 750mL,Hartley VSOP 750mL,GROCERY,hartley vsop hartley vsop
64979,Host Defense My Community Extract Advanced Imm...,High-intensity immune support support your hea...,GROCERY,host defense community extract advanced immune...
133534,BREAD & BUTTER CABERNET SAUVIGNON 750ML,This Cabernet Sauvignon expresses captivating ...,GROCERY,bread butter cabernet sauvignon cabernet sauvi...
64876,1 quart whole milk,brand may vary from picture,GROCERY,quart whole milk brand may vary picture


In [18]:
X_test_cicd

Unnamed: 0,Item,Description,establishment_type,cleanText
727,Aquaphor Advanced Protection (1.75 oz),,GROCERY,aquaphor advanced protection
479,Hormel Natural Hardwood Smoke Original Bacon (...,,GROCERY,hormel natural hardwood smoke original bacon
240,Curly Parsley Bunch (1 lb),,GROCERY,curly parsley bunch
422,REESE'S Peanut Butter Cups (2.8 oz),,GROCERY,reeses peanut butter cup
49,Amita (1 lt),,GROCERY,amita lt
...,...,...,...,...
1182,Infants Tylenol Pain and Fever Grape Flavor (1...,,GROCERY,infant tylenol pain fever grape flavor fl
575,Skittles (2.17 oz),,GROCERY,skittle
953,"5 Hour Energy Extra Strength, (1.93 oz)",,GROCERY,hour energy extra strength
1098,Gold Bond Foot Powder Spray (7 oz),,GROCERY,gold bond foot powder spray


In [19]:
X_test_final_1 = X_test.append(X_test_cicd)

In [20]:
len(X_test_final_1)

34254

In [21]:
X_train_final=X_train['cleanText'].append(X_train_cicd['cleanText'])
y_train_final_primary=y_train_primary.append(y_train_primary_cicd)
y_train_final_secondary=y_train_secondary.append(y_train_secondary_cicd)
X_test_final=X_test['cleanText'].append(X_test_cicd['cleanText'])
y_test_final_primary=y_test_primary.append(y_test_primary_cicd)
y_test_final_secondary=y_test_secondary.append(y_test_secondary_cicd)

In [52]:
# df_train.to_csv('df_train_20-12-21.csv')
# df_test.to_csv('df_test_20-12-21.csv')

In [38]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 169945 entries, 0 to 169944
Data columns (total 7 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   Item                169945 non-null  object
 1   Description         169945 non-null  object
 2   establishment_type  169945 non-null  object
 3   primary             169945 non-null  object
 4   secondary           169945 non-null  object
 5   input_str           169945 non-null  object
 6   cleanText           169945 non-null  object
dtypes: object(7)
memory usage: 9.1+ MB


In [40]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1321 entries, 0 to 1320
Data columns (total 24 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Unnamed: 0                         1321 non-null   int64  
 1   Date                               1309 non-null   object 
 2   UniqueUUID                         1321 non-null   object 
 3   store_uuid                         1321 non-null   object 
 4   item_uuid                          1321 non-null   object 
 5   Item                               1321 non-null   object 
 6   Description                        127 non-null    object 
 7   establishment_type                 1321 non-null   object 
 8   CAT Name                           1321 non-null   object 
 9   Integer                            1321 non-null   object 
 10  Confidence Score                   1321 non-null   float64
 11  Agent Corrected CAT Name           1321 non-null   objec

In [41]:
train_size = 0.8
train_end = int(len(df1)*train_size)
df_train = df1[:train_end]
df_test = df1[train_end:]

train_end_cicd = int(len(df2)*train_size)
df2_train = df2[:train_end_cicd]
df2_test = df2[train_end_cicd:]

In [43]:
df2_train = df2_train[['Item','Description','establishment_type','primary','secondary','input_str','cleanText']]
df2_test = df2_test[['Item','Description','establishment_type','primary','secondary','input_str','cleanText']]

In [44]:
X_train_save = df_train.append(df2_train)
X_test_save = df_test.append(df2_test)

In [45]:
X_train_save.to_csv('df_train_20-12-21.csv')
X_test_save.to_csv('df_test_20-12-21.csv')

In [49]:
X_train_save['label'] = 'train'
X_test_save['label'] = 'test'
X_data = X_train_save.append(X_test_save)
X_data.to_csv('df_traintestdata_20-12-21.csv')

In [39]:
X_train_cicd.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1056 entries, 1171 to 1126
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Item                1056 non-null   object
 1   Description         104 non-null    object
 2   establishment_type  1056 non-null   object
 3   cleanText           1056 non-null   object
dtypes: object(4)
memory usage: 41.2+ KB


,item_name,description,establishment_type,primary,secondary,input_str,cleanText


In [53]:
type(y_test_final_primary)

pandas.core.series.Series

In [22]:
X_test

Unnamed: 0,Item,Description,establishment_type,cleanText
12184,Dungeons and Dragons 5E Candle Keep Mysteries,Great books hide their secrets well. An anthol...,GROCERY,dungeon dragon candle keep mystery great book ...
142293,Tam Fruit Company-Fresh Cut Celery Sticks10 oz,,GROCERY,tam fruit companyfresh cut celery stick
44634,Burnetts Vodka 1.75L 40% ABV,\N,GROCERY,burnett vodka abv
136489,"Llano ""Cab Sav"", 750mL wine (13.2% ABV)",,GROCERY,llano cab sav wine abv
44540,Emergen-C-Super Orange30 ct,,GROCERY,emergencsuper orange ct
...,...,...,...,...
98048,Hartley VSOP 750mL,Hartley VSOP 750mL,GROCERY,hartley vsop hartley vsop
64979,Host Defense My Community Extract Advanced Imm...,High-intensity immune support support your hea...,GROCERY,host defense community extract advanced immune...
133534,BREAD & BUTTER CABERNET SAUVIGNON 750ML,This Cabernet Sauvignon expresses captivating ...,GROCERY,bread butter cabernet sauvignon cabernet sauvi...
64876,1 quart whole milk,brand may vary from picture,GROCERY,quart whole milk brand may vary picture


In [23]:
type(X_test_final)

pandas.core.series.Series

In [24]:
#from sklearn.naive_bayes import MultinomialNB

dimention=['y_train_final_primary','y_train_final_secondary']

result=X_test_final_1
for i in dimention:
    if i =='y_train_final_primary':
        rf = Pipeline([('vect', CountVectorizer(strip_accents='ascii', token_pattern=u'(?ui)\\b\\w*[a-z]+\\w*\\b', lowercase=True, stop_words='english')),
               ('tfidf', TfidfTransformer(use_idf=True)),
               ('clf', RandomForestClassifier(oob_score=True,n_jobs=-1)),
              ])
        rf.fit(X_train_final, y_train_final_primary)
        y_pred= rf.predict(X_test_final)
        result['original_cat_primary']=y_test_final_primary
        result['prediction_cat_primary']=y_pred
        result['prediction_cat_primary_confscore']=rf.predict_proba(X_test_final).max()
        output={'accuracy':accuracy_score(y_pred,y_test_final_primary),'precision_score':precision_score(y_pred,y_test_final_primary,average='macro'),'recall_score':recall_score(y_pred,y_test_final_primary,average='macro')
,'f1_score':f1_score(y_pred,y_test_final_primary,average='macro')}
        result['confusion_matrix_primary']=str(output)
        
    else:
        rf1 = Pipeline([('vect', CountVectorizer(strip_accents='ascii', token_pattern=u'(?ui)\\b\\w*[a-z]+\\w*\\b', lowercase=True, stop_words='english')),
               ('tfidf', TfidfTransformer(use_idf=True)),
               ('clf', RandomForestClassifier(oob_score=True,n_jobs=-1)),
              ])
        rf1.fit(X_train_final, y_train_final_secondary)
        y_pred= rf1.predict(X_test_final)
        result['original_cat_secondary']=y_test_final_secondary
        result['prediction_cat_secondary']=y_pred
        result['prediction_cat_secondary_confscore']=rf1.predict_proba(X_test_final).max()
        output={'accuracy':accuracy_score(y_pred,y_test_final_secondary),'precision_score':precision_score(y_pred,y_test_final_secondary,average='macro'),'recall_score':recall_score(y_pred,y_test_final_secondary,average='macro')
,'f1_score':f1_score(y_pred,y_test_final_secondary,average='macro')}
        result['confusion_matrix_secondary']=str(output)



  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [25]:
y_pred= rf.predict(X_test_final)

In [26]:
result['original_cat_primary']=y_test_final_primary

In [27]:
misclassifications_primary= result.loc[result['original_cat_primary']!=result['prediction_cat_primary']]
misclassifications_secondary= result.loc[result['original_cat_secondary']!=result['prediction_cat_secondary']]
misclassification=pd.merge(misclassifications_primary,misclassifications_secondary,how='inner',on=['Item','Description','establishment_type','cleanText'])
misclassification.to_csv('misclassification_allprod.csv')

In [28]:
result

Unnamed: 0,Item,Description,establishment_type,cleanText,original_cat_primary,prediction_cat_primary,prediction_cat_primary_confscore,confusion_matrix_primary,original_cat_secondary,prediction_cat_secondary,prediction_cat_secondary_confscore,confusion_matrix_secondary
12184,Dungeons and Dragons 5E Candle Keep Mysteries,Great books hide their secrets well. An anthol...,GROCERY,dungeon dragon candle keep mystery great book ...,"CAT_TPP_TOYS,822","CAT_TPP_TOYS,822",1.0,"{'accuracy': 0.8135984118643078, 'precision_sc...","CAT_TPP_TOYS,822","CAT_TPP_TOYS,822",1.0,"{'accuracy': 0.8124890523734455, 'precision_sc..."
142293,Tam Fruit Company-Fresh Cut Celery Sticks10 oz,,GROCERY,tam fruit companyfresh cut celery stick,"CAT_FRUIT_VEG_PLANTS,552","CAT_PREPARED_DRINK,114",1.0,"{'accuracy': 0.8135984118643078, 'precision_sc...","CAT_FRUIT_VEG_PLANTS,552","CAT_FRUIT_VEG_PLANTS,552",1.0,"{'accuracy': 0.8124890523734455, 'precision_sc..."
44634,Burnetts Vodka 1.75L 40% ABV,\N,GROCERY,burnett vodka abv,"CAT_LIQUOR,535","CAT_LIQUOR,535",1.0,"{'accuracy': 0.8135984118643078, 'precision_sc...","CAT_LIQUOR,535","CAT_LIQUOR,535",1.0,"{'accuracy': 0.8124890523734455, 'precision_sc..."
136489,"Llano ""Cab Sav"", 750mL wine (13.2% ABV)",,GROCERY,llano cab sav wine abv,"CAT_WINE,534","CAT_WINE,534",1.0,"{'accuracy': 0.8135984118643078, 'precision_sc...","CAT_WINE,534","CAT_WINE,534",1.0,"{'accuracy': 0.8124890523734455, 'precision_sc..."
44540,Emergen-C-Super Orange30 ct,,GROCERY,emergencsuper orange ct,"CAT_SUPPLEMENTS,542","CAT_PREPACKAGED_FOOD,106",1.0,"{'accuracy': 0.8135984118643078, 'precision_sc...","CAT_SUPPLEMENTS,542","CAT_FRUIT_VEG_PLANTS,552",1.0,"{'accuracy': 0.8124890523734455, 'precision_sc..."
...,...,...,...,...,...,...,...,...,...,...,...,...
1182,Infants Tylenol Pain and Fever Grape Flavor (1...,,GROCERY,infant tylenol pain fever grape flavor fl,"CAT_OTC_MEDICATION_IBUPROFEN,768","CAT_OTC_MEDICATION,521",1.0,"{'accuracy': 0.8135984118643078, 'precision_sc...","CAT_OTC_MEDICATION_IBUPROFEN,768","CAT_OTC_MEDICATION,521",1.0,"{'accuracy': 0.8124890523734455, 'precision_sc..."
575,Skittles (2.17 oz),,GROCERY,skittle,"CAT_CONFECTIONARY,707","CAT_CONFECTIONARY,707",1.0,"{'accuracy': 0.8135984118643078, 'precision_sc...","CAT_CONFECTIONARY,707","CAT_CONFECTIONARY,707",1.0,"{'accuracy': 0.8124890523734455, 'precision_sc..."
953,"5 Hour Energy Extra Strength, (1.93 oz)",,GROCERY,hour energy extra strength,"CAT_SUPPLEMENTS,542","CAT_SUPPLEMENTS,542",1.0,"{'accuracy': 0.8135984118643078, 'precision_sc...","CAT_SUPPLEMENTS,542","CAT_SUPPLEMENTS,542",1.0,"{'accuracy': 0.8124890523734455, 'precision_sc..."
1098,Gold Bond Foot Powder Spray (7 oz),,GROCERY,gold bond foot powder spray,"CAT_MEDICATED_ITEMS,525","CAT_MEDICATED_ITEMS,525",1.0,"{'accuracy': 0.8135984118643078, 'precision_sc...","CAT_MEDICATED_ITEMS,525","CAT_MEDICATED_ITEMS,525",1.0,"{'accuracy': 0.8124890523734455, 'precision_sc..."


In [35]:
result.to_csv('allprod20-12-21.csv',index=0)

In [33]:
result.loc[0,'confusion_matrix_primary']

"{'accuracy': 0.8135984118643078, 'precision_score': 0.5990277161042454, 'recall_score': 0.6929501277252824, 'f1_score': 0.6245910565405649}"

In [34]:
result.loc[0,'confusion_matrix_secondary']

"{'accuracy': 0.8124890523734455, 'precision_score': 0.5975765642272369, 'recall_score': 0.692646681817416, 'f1_score': 0.6231752250014626}"

In [29]:
len(misclassifications_primary)

6385

In [30]:
misclassification

Unnamed: 0,Item,Description,establishment_type,cleanText,original_cat_primary_x,prediction_cat_primary_x,prediction_cat_primary_confscore_x,confusion_matrix_primary_x,original_cat_secondary_x,prediction_cat_secondary_x,prediction_cat_secondary_confscore_x,confusion_matrix_secondary_x,original_cat_primary_y,prediction_cat_primary_y,prediction_cat_primary_confscore_y,confusion_matrix_primary_y,original_cat_secondary_y,prediction_cat_secondary_y,prediction_cat_secondary_confscore_y,confusion_matrix_secondary_y
0,Emergen-C-Super Orange30 ct,,GROCERY,emergencsuper orange ct,"CAT_SUPPLEMENTS,542","CAT_PREPACKAGED_FOOD,106",1.0,"{'accuracy': 0.8135984118643078, 'precision_sc...","CAT_SUPPLEMENTS,542","CAT_FRUIT_VEG_PLANTS,552",1.0,"{'accuracy': 0.8124890523734455, 'precision_sc...","CAT_SUPPLEMENTS,542","CAT_PREPACKAGED_FOOD,106",1.0,"{'accuracy': 0.8135984118643078, 'precision_sc...","CAT_SUPPLEMENTS,542","CAT_FRUIT_VEG_PLANTS,552",1.0,"{'accuracy': 0.8124890523734455, 'precision_sc..."
1,Xoxox Pure Milk Bar,\N,GROCERY,xoxox pure milk bar,"CAT_CANDY,108","CAT_CONFECTIONARY,707",1.0,"{'accuracy': 0.8135984118643078, 'precision_sc...","CAT_CANDY,108","CAT_CONFECTIONARY,707",1.0,"{'accuracy': 0.8124890523734455, 'precision_sc...","CAT_CANDY,108","CAT_CONFECTIONARY,707",1.0,"{'accuracy': 0.8135984118643078, 'precision_sc...","CAT_CANDY,108","CAT_CONFECTIONARY,707",1.0,"{'accuracy': 0.8124890523734455, 'precision_sc..."
2,M&M's Peanut King Size (M&M's Arachide King Size),,GROCERY,mm peanut king size mm arachide king size,"CAT_CANDY_COATED_NUTS,704","CAT_CHOCOLATE,706",1.0,"{'accuracy': 0.8135984118643078, 'precision_sc...","CAT_CANDY_COATED_NUTS,704","CAT_CHOCOLATE,706",1.0,"{'accuracy': 0.8124890523734455, 'precision_sc...","CAT_CANDY_COATED_NUTS,704","CAT_CHOCOLATE,706",1.0,"{'accuracy': 0.8135984118643078, 'precision_sc...","CAT_CANDY_COATED_NUTS,704","CAT_CHOCOLATE,706",1.0,"{'accuracy': 0.8124890523734455, 'precision_sc..."
3,Oscar Coffee Ground Bag,ground columbian coffee beans (classic pour ov...,GROCERY,oscar coffee ground bag ground columbian coffe...,"CAT_PREPACKAGED_FOOD,106","CAT_PREPACKAGED_FOOD_INSTANT_COFFEE,733",1.0,"{'accuracy': 0.8135984118643078, 'precision_sc...","CAT_PREPACKAGED_FOOD,106","CAT_PREPACKAGED_FOOD_INSTANT_COFFEE,733",1.0,"{'accuracy': 0.8124890523734455, 'precision_sc...","CAT_PREPACKAGED_FOOD,106","CAT_PREPACKAGED_FOOD_INSTANT_COFFEE,733",1.0,"{'accuracy': 0.8135984118643078, 'precision_sc...","CAT_PREPACKAGED_FOOD,106","CAT_PREPACKAGED_FOOD_INSTANT_COFFEE,733",1.0,"{'accuracy': 0.8124890523734455, 'precision_sc..."
4,Lenny & Larry'S The Complete Cookie Chocolate...,4 Oz,GROCERY,lenny larrys complete cookie chocolate chip,"CAT_PREPARED_FOOD,101","CAT_PREPACKAGED_FOOD_SNACK_COOKIES,749",1.0,"{'accuracy': 0.8135984118643078, 'precision_sc...","CAT_PREPARED_FOOD,101","CAT_PREPACKAGED_FOOD_SNACK_COOKIES,749",1.0,"{'accuracy': 0.8124890523734455, 'precision_sc...","CAT_PREPARED_FOOD,101","CAT_PREPACKAGED_FOOD_SNACK_COOKIES,749",1.0,"{'accuracy': 0.8135984118643078, 'precision_sc...","CAT_PREPARED_FOOD,101","CAT_PREPACKAGED_FOOD_SNACK_COOKIES,749",1.0,"{'accuracy': 0.8124890523734455, 'precision_sc..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5045,"Beech-Nut Naturals Mango, Apple, & Avocado (4 oz)",,GROCERY,beechnut natural mango apple avocado,"CAT_PREPACKAGED_FOOD_CONDIMENTS,740","CAT_PREPACKAGED_FOOD,106",1.0,"{'accuracy': 0.8135984118643078, 'precision_sc...","CAT_PREPACKAGED_FOOD_CONDIMENTS,740","CAT_PREPACKAGED_FOOD_SNACK_NUTS,747",1.0,"{'accuracy': 0.8124890523734455, 'precision_sc...","CAT_PREPACKAGED_FOOD_CONDIMENTS,740","CAT_PREPACKAGED_FOOD,106",1.0,"{'accuracy': 0.8135984118643078, 'precision_sc...","CAT_PREPACKAGED_FOOD_CONDIMENTS,740","CAT_PREPACKAGED_FOOD_SNACK_NUTS,747",1.0,"{'accuracy': 0.8124890523734455, 'precision_sc..."
5046,Daikon Radish (Each),,GROCERY,daikon radish,"CAT_PREPACKAGED_FOOD,106","CAT_FRUIT_VEG_PLANTS,552",1.0,"{'accuracy': 0.8135984118643078, 'precision_sc...","CAT_PREPACKAGED_FOOD,106","CAT_FRUIT_VEG_PLANTS,552",1.0,"{'accuracy': 0.8124890523734455, 'precision_sc...","CAT_PREPACKAGED_FOOD,106","CAT_FRUIT_VEG_PLANTS,552",1.0,"{'accuracy': 0.8135984118643078, 'precision_sc...","CAT_PREPACKAGED_FOOD,106","CAT_FRUIT_VEG_PLANTS,552",1.0,"{'accuracy': 0.8124890523734455, 'precision_sc..."
5047,King Oscar Kipper Snack (3.54 oz),,GROCERY,king oscar kipper snack,"CAT_PREPACKAGED_FOOD,106","CAT_PREPACKAGED_FOOD_SNACK_CHIPS,746",1.0,"{'accuracy': 0.8135984118643078, 'precision_sc...","CAT_PREPACKAGED_FOOD,106","CAT_PREPACKAGED_FOOD_SNACK_NUTS,747",1.0,"{'accuracy': 0.8124890523734455, 'precision_sc...","CAT_PREPACKAGED_FOOD,106","CAT_PREPACKAGED_FOOD_SNACK_CHIPS,746",1.0,"{'accuracy': 0.8135984118643078, 'precision_sc...","CAT_PREPACKAGED_FOOD,106","CAT_PREPACKAGED_FOOD_SNACK_NUTS,747",1.0,"{'accuracy': 0.8124890523734455, 'precision_sc..."
5048,Infants Tylenol Pain and Fever Grape Flavor (1...,,GROCERY,infant tylenol pain fever grape flavor fl,"CAT_OTC_MEDICATION_IBUPROFEN,768","CAT_OTC_MEDICATION,521",1.0,"{'accuracy': 0.8135984118643078, 'precision_sc...","CAT_OTC_MEDICATION_IBUPROFEN,768","CAT_OTC_MEDICATION,521",1.0,"{'accuracy': 0.8124890523734455, 'precision_sc...","CAT_OTC_MEDICATION_IBUPROFEN,768","CAT_OTC_MEDICATION,521",1.0,"{'accuracy': 0.8135984118643078, 'precision_sc...","CAT_OTC_MEDICATION_IBUPROFEN,768","CAT_OTC_MEDICATION,521",1.0,"{'accuracy': 0.8124890523734455, 'precision_sc..."


In [31]:
df2

Unnamed: 0.1,Unnamed: 0,Date,UniqueUUID,store_uuid,item_uuid,Item,Description,establishment_type,CAT Name,Integer,...,Inetger_ValidationScore[0-100],primary_cat,secondary_cat,primary_int,secondary_int,primary_int_prediction,primary,secondary,input_str,cleanText
0,0,,2198c3e7-9b2f-441e-9e24-813d70f3bf26:9abcba67-...,2198c3e7-9b2f-441e-9e24-813d70f3bf26,9abcba67-3a79-41a5-987a-034cae6a0ff9,Pure Happiness,A sunny sunflower bouquet gets an autumnal spi...,FLOWERS,CAT_TPP,531,...,100,CAT_TPP,CAT_TPP,531,531,531,"CAT_TPP,531","CAT_TPP,531",Pure Happiness A sunny sunflower bouquet gets ...,pure happiness sunny sunflower bouquet get aut...
1,1,,2198c3e7-9b2f-441e-9e24-813d70f3bf26:03a01438-...,2198c3e7-9b2f-441e-9e24-813d70f3bf26,03a01438-c236-40bd-a6d1-9dff278d67e3,Silver Snow Bouquet,"Like a quiet walk through a snowy forest, this...",FLOWERS,CAT_TPP,531,...,100,CAT_TPP,CAT_TPP,531,531,531,"CAT_TPP,531","CAT_TPP,531",Silver Snow Bouquet Like a quiet walk through ...,silver snow bouquet like quiet walk snowy fore...
2,2,,2198c3e7-9b2f-441e-9e24-813d70f3bf26:14312ee6-...,2198c3e7-9b2f-441e-9e24-813d70f3bf26,14312ee6-2324-4f0b-81b7-f1e4cb7cf454,Beautiful in Blue,Brighten the home with the beauty of bright bl...,FLOWERS,CAT_TPP,531,...,100,CAT_TPP,CAT_TPP,531,531,531,"CAT_TPP,531","CAT_TPP,531",Beautiful in Blue Brighten the home with the b...,beautiful blue brighten home beauty bright blu...
3,3,,2198c3e7-9b2f-441e-9e24-813d70f3bf26:6cb0e71e-...,2198c3e7-9b2f-441e-9e24-813d70f3bf26,6cb0e71e-586d-4fd9-a71a-5dcd86e6f920,Blush Life Bouquet,Put a spring in their step with this beautiful...,FLOWERS,CAT_TPP,531,...,100,CAT_TPP,CAT_TPP,531,531,531,"CAT_TPP,531","CAT_TPP,531",Blush Life Bouquet Put a spring in their step ...,blush life bouquet put spring step beautifully...
4,4,,2198c3e7-9b2f-441e-9e24-813d70f3bf26:1dbd5b6a-...,2198c3e7-9b2f-441e-9e24-813d70f3bf26,1dbd5b6a-0e09-46e8-ad34-54b996c53d57,Red Tulips,Call ahead for this arrangement before orderin...,FLOWERS,CAT_TPP,531,...,100,CAT_TPP,CAT_TPP,531,531,531,"CAT_TPP,531","CAT_TPP,531",Red Tulips Call ahead for this arrangement bef...,red tulip call ahead arrangement ordering tuli...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1316,1316,17-12-2021 2:44 AM,e02292bd-c955-51d5-aeb2-3db665e411cf:d589b687-...,e02292bd-c955-51d5-aeb2-3db665e411cf,d589b687-2b75-4611-95ea-5fce6caf0fc2,Lotrimin Powder Spray (4.6 oz),,GROCERY,CAT_MEDICATED_ITEMS,525,...,100,CAT_MEDICATED_ITEMS,CAT_MEDICATED_ITEMS,525,525,525,"CAT_MEDICATED_ITEMS,525","CAT_MEDICATED_ITEMS,525",Lotrimin Powder Spray (4.6 oz) GROCERY,lotrimin powder spray
1317,1317,17-12-2021 2:44 AM,32d14e2c-98cc-540f-a8c5-f8cd835f61a4:e9917575-...,32d14e2c-98cc-540f-a8c5-f8cd835f61a4,e9917575-c6f5-4ec6-9748-75455b2490e6,Slim Jim Giant Stick,,GROCERY,"CAT_PREPACKAGED_FOOD,CAT_SNACK",106107,...,0,CAT_PREPACKAGED_FOOD,CAT_SNACK,106,107,106,"CAT_PREPACKAGED_FOOD,106","CAT_SNACK,107",Slim Jim Giant Stick GROCERY,slim jim giant stick
1318,1318,17-12-2021 2:44 AM,e02292bd-c955-51d5-aeb2-3db665e411cf:3806222d-...,e02292bd-c955-51d5-aeb2-3db665e411cf,3806222d-dddd-434a-b270-df75fa9cdcb3,Poise Ultra Thin Pads (18 ct),,GROCERY,CAT_FEMININE_HYGIENE_PRODUCTS,544,...,100,CAT_FEMININE_HYGIENE_PRODUCTS,CAT_FEMININE_HYGIENE_PRODUCTS,544,544,544,"CAT_FEMININE_HYGIENE_PRODUCTS,544","CAT_FEMININE_HYGIENE_PRODUCTS,544",Poise Ultra Thin Pads (18 ct) GROCERY,poise ultra thin pad ct
1319,1319,17-12-2021 2:44 AM,e02292bd-c955-51d5-aeb2-3db665e411cf:c0b1caeb-...,e02292bd-c955-51d5-aeb2-3db665e411cf,c0b1caeb-e813-449f-baef-a97064ca2723,Phillips Milk of Magnesia Original (12 fl oz),,GROCERY,CAT_OTC_MEDICATION_LAXATIVES,769,...,100,CAT_OTC_MEDICATION_LAXATIVES,CAT_OTC_MEDICATION_LAXATIVES,769,769,769,"CAT_OTC_MEDICATION_LAXATIVES,769","CAT_OTC_MEDICATION_LAXATIVES,769",Phillips Milk of Magnesia Original (12 fl oz) ...,phillips milk magnesia original fl


In [32]:
import pickle
# save the model to disk
filename_primary= 'finalized_model_rf_primary-20-12-21_allprod.sav'
pickle.dump(rf, open(filename_primary, 'wb'))
filename_secondary= 'finalized_model_rf_secondary-20-12-21_allprod.sav'
pickle.dump(rf1, open(filename_secondary, 'wb'))




In [64]:

# load the model from disk
loaded_model_primary= pickle.load(open('finalized_model_rf.sav', 'rb'))
loaded_model_primary.fit(df2['cleanText'].values, df2['primary'].values)
loaded_model_secondary= pickle.load(open('finalized_model_rf1.sav', 'rb'))
loaded_model_secondary.fit(df2['cleanText'].values, df2['secondary'].values)

  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])


Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words='english', strip_accents='ascii',
                                 token_pattern='(?ui)\\b\\w*[a-z]+\\w*\\b',
                                 tokenizer=Non...
                ('clf',
                 RandomForestClassifier(bootstrap=True, class_weight=None,
                                        criterion='gini', max_depth=None,
                                        max_features='auto',
                                        max_leaf_nodes=None,
                                        mi

In [None]:
#Train test split with stratified sampling for evaluation
X_train, x_val, y_train, y_val = train_test_split(X,
                                                    Y_primary,
                                                    test_size =.7
                                                    )

In [None]:
# load the model from disk
#loaded_model = pickle.load(open(filename, 'rb'))
result = rf.score(x_val['cleanText'], y_val)
print(result)

In [None]:
def preprocess_text_demo(item_name, description,establishment_type):

        message=item_name + " " +description+ " "+establishment_type
        sw =nltk.corpus.stopwords.words('english')   
        new_stopwords=['ml','oz','pk','grocery','lb']
        sw.extend(new_stopwords) 
        # 1. Init Lemmatizer
        lemmatizer = WordNetLemmatizer()
        #lowering and removing punctuation
        message = re.sub(r'[^\w\s]','', message.lower())
        #removing the numerical values and working only with text values
        message = re.sub('[^a-zA-Z]', " ", message )
        #removing the stopwords
        message = ' '.join([word for word in message.split() if word not in sw and len(word)>1])
        #lemmatizing the text
        message =  " ".join([lemmatizer.lemmatize(w) for w in nltk.word_tokenize(message) if w not in string.punctuation])
        #print("message is : ",message)
        return message
item_name='Jameson Irish Whiskey.1.75L Bottle Size'
description=''
establishment_type='GROCERY'

message=preprocess_text_demo(item_name, description,establishment_type)
message=[message]
predictions_primary= rf.predict(message)
#res=type(predictions_primary)
print(predictions_primary)
conf_score_primary= rf.predict_proba(message).max()
print(conf_score_primary)
predictions_secondary= rf1.predict(message)
print(predictions_secondary)
conf_score_secondary= rf1.predict_proba(message).max()
print(conf_score_secondary)
predictions_primary= rf.predict(message)
conf_score_primary= rf.predict_proba(message).max()
#print(predictions_primary)
predictions_secondary= rf1.predict(message)
conf_score_secondary= rf1.predict_proba(message).max()
#print(predictions_secondary)
if predictions_primary==predictions_secondary:
    categories=predictions_primary[0].split(',')[0]
    categories_integer=predictions_primary[0].split(',')[1]
else:
    categories = ','.join([predictions_primary[0].split(',')[0],predictions_secondary[0].split(',')[0]])
    categories_integer= ','.join([predictions_primary[0].split(',')[1],predictions_secondary[0].split(',')[1]])
conf_score=round((conf_score_primary+conf_score_secondary)/2,2)

#'conf_score':conf_score_primary,conf_score_secondary
#success_msg = [predictions_primary,predictions_secondary]
success_msg = {'cat_name':categories,'integer':categories_integer,'conf_score':conf_score}
print(success_msg)

In [None]:
model_dict = {'Random Forest': Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('rf', RandomForestClassifier()),
              ]),
             
             'naive bayas': Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', MultinomialNB()),
              ]),
              'logistic': Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('lg', LogisticRegression()),
              ])
             }
            

#Function to get the scores for each model in a df
def model_score_df(model_dict):   
    model_name, ac_score_list, p_score_list, r_score_list, f1_score_list = [], [], [], [], []
    for k,v in tqdm(model_dict.items()):   
        model_name.append(k)
        v.fit(X_train_final, y_train_final)
        y_pred = v.predict(X_test)
        ac_score_list.append(accuracy_score(y_test, y_pred))
        p_score_list.append(precision_score(y_test, y_pred, average='macro'))
        r_score_list.append(recall_score(y_test, y_pred, average='macro'))
        f1_score_list.append(f1_score(y_test, y_pred, average='macro'))
        model_comparison_df = pd.DataFrame([model_name, ac_score_list, p_score_list, r_score_list, f1_score_list]).T
        model_comparison_df.columns = ['model_name', 'accuracy_score', 'precision_score', 'recall_score', 'f1_score']
        model_comparison_df = model_comparison_df.sort_values(by='f1_score', ascending=False)
    return model_comparison_df


In [None]:
#################redundent code###################

In [None]:
#secondlevel= df.loc[df['primary']!=df['secondary']]
#secondlevel=secondlevel['secondary'].unique()
#mapping={}
#mapping_primary=pd.Series(df1.cat_name_primary.values,index=df1.Primary_Integer).to_dict()
#mapping_secondary=pd.Series(df1.cat_name_secondary.values,index=df1.Secondary_Integer).to_dict()
#df1['cat_multilevel_count'] = df1.groupby('cat_multilevel')['cat_multilevel'].transform('count')
#df1['cat_name_primary_count'] = df1.groupby('cat_name_primary')['cat_name_primary'].transform('count')
#df1['cat_name_secondary_count'] = df1.groupby('cat_name_secondary')['cat_name_secondary'].transform('count')
#df2_primary=df1[df1['cat_name_primary_count']<5].reset_index()
#df3_primary=df1[df1['cat_name_primary_count']>5].reset_index()
#X_primary=df3_primary[['item_name','description','establishment_type','cleanText']]
#Y_primary=df3_primary['cat_name_primary']
#df2_secondary=df1[df1['cat_name_secondary_count']<5].reset_index()
#df3_secondary=df1[df1['cat_name_secondary_count']>5].reset_index()
#X_secondary=df3_primary[['item_name','description','establishment_type','cleanText']]
#Y_secondary=df3_primary['cat_name_secondary']
#X_train_final_primary=pd.concat([X_train_primary['cleanText'], df2_primary['cleanText']])
#y_train_final_primary=pd.concat([y_train_primary, df2_primary['cat_name_primary']])
#X_train_final_secondary=pd.concat([X_train_secondary['cleanText'], df2_secondary['cleanText']])
#y_train_final_secondary=pd.concat([y_train_secondary, df2_secondary['cat_name_secondary']])
#X_train_final_secondary=X_train_final_secondary.values
#result['final_cat']=result['prediction_cat_primary']+ " ," +result['prediction_cat_secondary']
#result['final_cat'] =  result[['prediction_cat_primary', 'prediction_cat_secondary']].apply(lambda x: ','.join(str(x)), axis=0)
#result['final_cat_int']=result['prediction_cat_primary_integer']+','+result['prediction_cat_secondary_integer']
#result['final_cat']=result['final_cat'].apply(lambda x:','.join(list(set(x.split(',')))))
#result['final_accuracy']=accuracy_score(result['final_cat'],result['cat_multilevel'])
#result['final_cat'] = result['prediction_cat_primary'].astype(str) +"," +result['prediction_cat_secondary'].astype(str)

In [None]:
#Creating the features (tf-idf weights) for the processed text

#texts = df1['input_str'].astype('str')

tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2),max_df = .95)
LE = LabelEncoder()
#tfidf
tfv = TfidfVectorizer(strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',ngram_range=(1,2), use_idf=1,smooth_idf=1,sublinear_tf=1,max_df = .95,stop_words = 'english')

X_train_tfidf = tfidf_vectorizer.fit_transform(X_train) #features
X_test_tfidf= tfidf_vectorizer.fit_transform(X_test) #features

tfv.fit(list(X_train) + list(X_test))
xtrain_tfv =  tfv.transform(X_train) 
xvalid_tfv = tfv.transform(X_test)

#countvec
ctv = CountVectorizer(analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3), stop_words = 'english')
X_train_ctv = ctv.fit_transform(X_train) #features
X_test_ctv= ctv.fit_transform(X_test) #features



y_train_final=LE.fit_transform(y_train)
y_test_final=LE.fit_transform(y_test)

In [None]:

#Dimenionality reduction. Only using the 100 best features er category

lsa = TruncatedSVD(n_components=100, 
                   n_iter=10, 
                   random_state=3)

X_train_final = lsa.fit_transform(X_train_tfidf)
X_test_final = lsa.fit_transform(X_test_tfidf)

X_train_final_ctv= lsa.fit_transform(X_train_ctv)
X_test_final_ctv= lsa.fit_transform(X_test_ctv)

In [None]:
model_name, ac_score_list, p_score_list, r_score_list, f1_score_list = [], [], [], [], []
randomforestmodel=RandomForestClassifier(random_state=3)
#tfidf
#randomforestmodel.fit(xtrain_tfv, y_train_final)
#y_pred = randomforestmodel.predict(xvalid_tfv)
#countvec
#randomforestmodel.fit(X_train_final_ctv, y_train_final)
#y_pred = randomforestmodel.predict(X_test_final_ctv)
#svd
randomforestmodel.fit(X_train_final, y_train_final)
y_pred = randomforestmodel.predict(X_test_final)

ac_score_list.append(accuracy_score(y_test_final, y_pred))
p_score_list.append(precision_score(y_test_final, y_pred, average='macro'))
r_score_list.append(recall_score(y_test_final, y_pred, average='macro'))
f1_score_list.append(f1_score(y_test_final, y_pred, average='macro'))
model_comparison_df = pd.DataFrame([model_name, ac_score_list, p_score_list, r_score_list, f1_score_list]).T
model_comparison_df.columns = ['model_name', 'accuracy_score', 'precision_score', 'recall_score', 'f1_score']
model_comparison_df = model_comparison_df.sort_values(by='f1_score', ascending=False)
print(model_comparison_df)

In [None]:

#Preliminary model evaluation using default parameters
from sklearn.naive_bayes import MultinomialNB
#Creating a dict of the models
model_dict = {'Random Forest': RandomForestClassifier(random_state=3)}
            

#Function to get the scores for each model in a df
def model_score_df(model_dict):   
    model_name, ac_score_list, p_score_list, r_score_list, f1_score_list = [], [], [], [], []
    for k,v in tqdm(model_dict.items()):   
        model_name.append(k)
        v.fit(X_train_final, y_train_final)
        y_pred = v.predict(X_test_final)
        ac_score_list.append(accuracy_score(y_test_final, y_pred))
        p_score_list.append(precision_score(y_test_final, y_pred, average='macro'))
        r_score_list.append(recall_score(y_test_final, y_pred, average='macro'))
        f1_score_list.append(f1_score(y_test_final, y_pred, average='macro'))
        model_comparison_df = pd.DataFrame([model_name, ac_score_list, p_score_list, r_score_list, f1_score_list]).T
        model_comparison_df.columns = ['model_name', 'accuracy_score', 'precision_score', 'recall_score', 'f1_score']
        model_comparison_df = model_comparison_df.sort_values(by='f1_score', ascending=False)
    return model_comparison_df,v



In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
#clf = RandomForestClassifier()
from pprint import pprint
from time import time
import logging
rf=Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', RandomForestClassifier()),
              ])

parameters = {
    "vect__max_df": (0.5, 0.75, 1.0),
    # 'vect__max_features': (None, 5000, 10000, 50000),
    "vect__ngram_range": ((1, 1), (1, 2)),  # unigrams or bigramslf__
     "clf__bootstrap":[True, False],
     "clf__max_depth":[10, 50, 100,500, None],
     "clf__max_features":['auto', 'sqrt'],
     "clf__min_samples_leaf":[1,2,4],
     "clf__min_samples_split":[2,5,10],
     "clf__n_estimators":[400,600,800],
     "clf__random_state":[3]

}

RandomizedSearch = RandomizedSearchCV(rf,
                          parameters, 
                          cv=5,
                          verbose=1, 
                          n_jobs=-1)

t0 = time()
rf_best_model = RandomizedSearch.fit(X_train_final, y_train_final)
print("done in %0.3fs" % (time() - t0))
#print()
print("Best score: %0.3f" % rf_best_model.best_score_)
print("Best parameters set:")
best_parameters = rf_best_model.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))


In [None]:
rf_best_model = RandomForestClassifier(bootstrap = False,
                                       max_depth = 50,
                                       max_features = 'auto',
                                       min_samples_leaf = 1,
                                       n_estimators = 1400,
                                       random_state=3)