In [3]:
import pandas as pd
import numpy as np
import json
import os
import re
#from tqdm.notebook import tqdm
#tqdm.pandas()
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc, confusion_matrix
from sklearn.pipeline import Pipeline
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import wordnet
import string

In [4]:
def preprocess_text(message):

    new_stopwords=['ml','oz','pk','grocery','lb']
    stpwrd = nltk.corpus.stopwords.words('english')
    stpwrd.extend(new_stopwords)
    # 1. Init Lemmatizer
    lemmatizer = WordNetLemmatizer()
    #lowering and removing punctuation
    message = re.sub(r'[^\w\s]','', message.lower())
    #removing the numerical values and working only with text values
    message = re.sub('[^a-zA-Z]', " ", message )
    #removing the stopwords
    message = ' '.join([word for word in message.split() if word not in stpwrd and len(word)>1])
    #lemmatizing the text
    message =  " ".join([lemmatizer.lemmatize(w) for w in nltk.word_tokenize(message) if w not in string.punctuation])
    #print("message is : ",message)
    return message


In [3]:
df = pd.read_csv('jina_final.csv', encoding='utf8',engine='python',usecols=['Item','Description','establishment_type','primary','secondary'])
df1=df.sample(frac=1, random_state=42)
df1 = df1.fillna('')
df1['input_str'] = df1[['Item', 'Description','establishment_type']].apply(lambda x: ' '.join(x[x.notnull()]), axis = 1)
df1['cleanText']=df1['input_str'].map(lambda s:preprocess_text(s)) 
df1 = df1.reset_index(drop=True)
X=df1[['Item','Description','establishment_type','cleanText']]
Y_primary=df1['primary']
Y_secondary=df1['secondary']

In [4]:
#Train test split with stratified sampling for evaluation
X_train, X_test, y_train_primary, y_test_primary = train_test_split(X,
                                                    Y_primary,
                                                    test_size = .20, 
                                                    random_state = 42
                                                    )
#Train test split with stratified sampling for evaluation
X_train, X_test,y_train_secondary, y_test_secondary = train_test_split(X,
                                                    Y_secondary,
                                                    test_size = .20, 
                                                    random_state = 42
                                                    )

####cicd part

In [5]:
df2=pd.read_csv('TaxML-CICD - Prod_Data_after_preprocess.csv',encoding='utf8',engine='python',usecols=['Item','Description','establishment_type','primary','secondary'])
df2['input_str'] = df2[['Item','Description','establishment_type']].apply(lambda x: ' '.join(x[x.notnull()]), axis = 1)
df2['cleanText']=df2['input_str'].map(lambda s:preprocess_text(s))
X_cicd=df2[['Item','Description','establishment_type','cleanText']]
Y_primary_cicd=df2['primary']
Y_secondary_cicd=df2['secondary']
X_train_cicd, X_test_cicd, y_train_primary_cicd, y_test_primary_cicd = train_test_split(X_cicd,
                                                    Y_primary_cicd,
                                                    test_size = .20, 
                                                    random_state = 42
                                                    )
#Train test split with stratified sampling for evaluation
X_train_cicd, X_test_cicd,y_train_secondary_cicd, y_test_secondary_cicd = train_test_split(X_cicd,
                                                    Y_secondary_cicd,
                                                    test_size = .20, 
                                                    random_state = 42
                                                    )

In [20]:
X_train

Unnamed: 0,Item,Description,establishment_type,cleanText
74328,"Exotico Blanco, 750mL bottle (40% ABV)",,GROCERY,exotico blanco bottle abv
93071,Cortas Halva Original (16 oz),,GROCERY,cortas halva original
160229,"Meiomi Pinot Noir, 750mL wine (13.7% ABV)",,GROCERY,meiomi pinot noir wine abv
33442,Ito en Peach Veggie Shot,30.36 fluid ounces.,GROCERY,ito en peach veggie shot fluid ounce
129506,"Val Di Giulia Barbaresco, 750mL italian red wi...",,GROCERY,val di giulia barbaresco italian red wine abv
...,...,...,...,...
119879,Half & Half Quart,,GROCERY,half half quart
103694,Alpine Beer Company Duet Pale Ale,This West Coast pale ale made with Simcoe and ...,GROCERY,alpine beer company duet pale ale west coast p...
131932,Romance Flowers Medium Red,,FLOWERS,romance flower medium red flower
146867,Renal Care (32 oz),Our renal care recipe has low protein. It cont...,PET,renal care renal care recipe low protein conta...


In [6]:
X_train_final=X_train['cleanText'].append(X_train_cicd['cleanText'])
y_train_final_primary=y_train_primary.append(y_train_primary_cicd)
y_train_final_secondary=y_train_secondary.append(y_train_secondary_cicd)
X_test_final=X_test['cleanText'].append(X_test_cicd['cleanText'])
y_test_final_primary=y_test_primary.append(y_test_primary_cicd)
y_test_final_secondary=y_test_secondary.append(y_test_secondary_cicd)

In [7]:
train_size = 0.8
train_end = int(len(df1)*train_size)
df_train = df1[:train_end]
df_test = df1[train_end:]
df_train.to_csv('df_train.csv')
df_test.to_csv('df_test.csv')

In [9]:
type(y_test_final_primary)

pandas.core.series.Series

In [8]:
#from sklearn.naive_bayes import MultinomialNB

dimention=['y_train_final_primary','y_train_final_secondary']

result=X_test
for i in dimention:
    if i =='y_train_final_primary':
        rf = Pipeline([('vect', CountVectorizer(strip_accents='ascii', token_pattern=u'(?ui)\\b\\w*[a-z]+\\w*\\b', lowercase=True, stop_words='english')),
               ('tfidf', TfidfTransformer(use_idf=True)),
               ('clf', RandomForestClassifier(oob_score=True,n_jobs=-1)),
              ])
        rf.fit(X_train_final, y_train_final_primary)
        y_pred= rf.predict(X_test_final)
        result['original_cat_primary']=y_test_final_primary
        result['prediction_cat_primary']=y_pred
        result['prediction_cat_primary_confscore']=rf.predict_proba(X_test_final)
        output={'accuracy':accuracy_score(y_pred,y_test_final_primary),'precision_score':precision_score(y_pred,y_test_final_primary,average='macro'),'recall_score':recall_score(y_pred,y_test_final_primary,average='macro')
,'f1_score':f1_score(y_pred,y_test_final_primary,average='macro')}
        result['confusion_matrix_primary']=str(output)
        
    else:
        rf1 = Pipeline([('vect', CountVectorizer(strip_accents='ascii', token_pattern=u'(?ui)\\b\\w*[a-z]+\\w*\\b', lowercase=True, stop_words='english')),
               ('tfidf', TfidfTransformer(use_idf=True)),
               ('clf', RandomForestClassifier(oob_score=True,n_jobs=-1)),
              ])
        rf1.fit(X_train_final, y_train_final_secondary)
        y_pred= rf1.predict(X_test_final)
        result['original_cat_secondary']=y_test_final_secondary
        result['prediction_cat_secondary']=y_pred
        result['prediction_cat_secondary_confscore']=rf1.predict_proba(X_test_final) 
        output={'accuracy':accuracy_score(y_pred,y_test_final_secondary),'precision_score':precision_score(y_pred,y_test_final_secondary,average='macro'),'recall_score':recall_score(y_pred,y_test_final_secondary,average='macro')
,'f1_score':f1_score(y_pred,y_test_final_secondary,average='macro')}
        result['confusion_matrix_secondary']=str(output)



ValueError: Length of values (34135) does not match length of index (33989)

In [28]:
y_pred= rf.predict(X_test_final)

In [10]:
result['original_cat_primary']=y_test_final_primary

ValueError: cannot reindex from a duplicate axis

In [24]:
misclassifications_primary= result.loc[result['original_cat_primary']!=result['prediction_cat_primary']]
misclassifications_secondary= result.loc[result['original_cat_secondary']!=result['prediction_cat_secondary']]
misclassification=pd.merge(misclassifications_primary,misclassifications_secondary,how='inner',on=['item_name','description','establishment_type','cleanText'])
misclassification.to_csv('misclassification.csv')

In [6]:
df2

Unnamed: 0,Item,Description,establishment_type,primary,secondary,input_str,cleanText
0,Pure Happiness,A sunny sunflower bouquet gets an autumnal spi...,FLOWERS,"CAT_TPP,531","CAT_TPP,531",Pure Happiness A sunny sunflower bouquet gets ...,pure happiness sunny sunflower bouquet get aut...
1,Silver Snow Bouquet,"Like a quiet walk through a snowy forest, this...",FLOWERS,"CAT_TPP,531","CAT_TPP,531",Silver Snow Bouquet Like a quiet walk through ...,silver snow bouquet like quiet walk snowy fore...
2,Beautiful in Blue,Brighten the home with the beauty of bright bl...,FLOWERS,"CAT_TPP,531","CAT_TPP,531",Beautiful in Blue Brighten the home with the b...,beautiful blue brighten home beauty bright blu...
3,Blush Life Bouquet,Put a spring in their step with this beautiful...,FLOWERS,"CAT_TPP,531","CAT_TPP,531",Blush Life Bouquet Put a spring in their step ...,blush life bouquet put spring step beautifully...
4,Red Tulips,Call ahead for this arrangement before orderin...,FLOWERS,"CAT_TPP,531","CAT_TPP,531",Red Tulips Call ahead for this arrangement bef...,red tulip call ahead arrangement ordering tuli...
...,...,...,...,...,...,...,...
722,Misko (1 lb),,GROCERY,"CAT_PREPACKAGED_FOOD_PASTA,737","CAT_PREPACKAGED_FOOD_PASTA,737",Misko (1 lb) GROCERY,misko
723,Chunky Chocolates (2.4 oz),Chunks of chocolate with premium ingredients m...,GROCERY,"CAT_CONFECTIONARY,707","CAT_CONFECTIONARY,707",Chunky Chocolates (2.4 oz) Chunks of chocolate...,chunky chocolate chunk chocolate premium ingre...
724,"Essential Everyday Salsa, Restaurant Style, an...",,GROCERY,"CAT_PREPACKAGED_FOOD_CONDIMENTS,740","CAT_PREPACKAGED_FOOD_CONDIMENTS,740","Essential Everyday Salsa, Restaurant Style, an...",essential everyday salsa restaurant style medium
725,"Woodbridge Chardonnay, 1.5L white wine (13.5% ...",,GROCERY,"CAT_WINE,534","CAT_WINE,534","Woodbridge Chardonnay, 1.5L white wine (13.5% ...",woodbridge chardonnay white wine abv


In [9]:
import pickle
# save the model to disk
filename_primary= 'finalized_model_rf.sav'
pickle.dump(rf, open(filename_primary, 'wb'))
filename_secondary= 'finalized_model_rf1.sav'
pickle.dump(rf1, open(filename_secondary, 'wb'))




NameError: name 'rf' is not defined

In [12]:

# load the model from disk
loaded_model_primary= pickle.load(open('finalized_model_rf.sav', 'rb'))
loaded_model_primary.fit(df2['cleanText'].values, df2['primary'].values)
loaded_model_secondary= pickle.load(open('finalized_model_rf1.sav', 'rb'))
loaded_model_secondary.fit(df2['cleanText'].values, df2['secondary'].values)

Pipeline(steps=[('vect',
                 CountVectorizer(stop_words='english', strip_accents='ascii',
                                 token_pattern='(?ui)\\b\\w*[a-z]+\\w*\\b')),
                ('tfidf', TfidfTransformer()),
                ('clf', RandomForestClassifier(n_jobs=-1, oob_score=True))])

In [None]:
#Train test split with stratified sampling for evaluation
X_train, x_val, y_train, y_val = train_test_split(X,
                                                    Y_primary,
                                                    test_size =.7
                                                    )

In [None]:
# load the model from disk
#loaded_model = pickle.load(open(filename, 'rb'))
result = rf.score(x_val['cleanText'], y_val)
print(result)

In [None]:
def preprocess_text_demo(item_name, description,establishment_type):

        message=item_name + " " +description+ " "+establishment_type
        sw =nltk.corpus.stopwords.words('english')   
        new_stopwords=['ml','oz','pk','grocery','lb']
        sw.extend(new_stopwords) 
        # 1. Init Lemmatizer
        lemmatizer = WordNetLemmatizer()
        #lowering and removing punctuation
        message = re.sub(r'[^\w\s]','', message.lower())
        #removing the numerical values and working only with text values
        message = re.sub('[^a-zA-Z]', " ", message )
        #removing the stopwords
        message = ' '.join([word for word in message.split() if word not in sw and len(word)>1])
        #lemmatizing the text
        message =  " ".join([lemmatizer.lemmatize(w) for w in nltk.word_tokenize(message) if w not in string.punctuation])
        #print("message is : ",message)
        return message
item_name='Jameson Irish Whiskey.1.75L Bottle Size'
description=''
establishment_type='GROCERY'

message=preprocess_text_demo(item_name, description,establishment_type)
message=[message]
predictions_primary= rf.predict(message)
#res=type(predictions_primary)
print(predictions_primary)
conf_score_primary= rf.predict_proba(message).max()
print(conf_score_primary)
predictions_secondary= rf1.predict(message)
print(predictions_secondary)
conf_score_secondary= rf1.predict_proba(message).max()
print(conf_score_secondary)
predictions_primary= rf.predict(message)
conf_score_primary= rf.predict_proba(message).max()
#print(predictions_primary)
predictions_secondary= rf1.predict(message)
conf_score_secondary= rf1.predict_proba(message).max()
#print(predictions_secondary)
if predictions_primary==predictions_secondary:
    categories=predictions_primary[0].split(',')[0]
    categories_integer=predictions_primary[0].split(',')[1]
else:
    categories = ','.join([predictions_primary[0].split(',')[0],predictions_secondary[0].split(',')[0]])
    categories_integer= ','.join([predictions_primary[0].split(',')[1],predictions_secondary[0].split(',')[1]])
conf_score=round((conf_score_primary+conf_score_secondary)/2,2)

#'conf_score':conf_score_primary,conf_score_secondary
#success_msg = [predictions_primary,predictions_secondary]
success_msg = {'cat_name':categories,'integer':categories_integer,'conf_score':conf_score}
print(success_msg)

In [None]:
model_dict = {'Random Forest': Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('rf', RandomForestClassifier()),
              ]),
             
             'naive bayas': Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', MultinomialNB()),
              ]),
              'logistic': Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('lg', LogisticRegression()),
              ])
             }
            

#Function to get the scores for each model in a df
def model_score_df(model_dict):   
    model_name, ac_score_list, p_score_list, r_score_list, f1_score_list = [], [], [], [], []
    for k,v in tqdm(model_dict.items()):   
        model_name.append(k)
        v.fit(X_train_final, y_train_final)
        y_pred = v.predict(X_test)
        ac_score_list.append(accuracy_score(y_test, y_pred))
        p_score_list.append(precision_score(y_test, y_pred, average='macro'))
        r_score_list.append(recall_score(y_test, y_pred, average='macro'))
        f1_score_list.append(f1_score(y_test, y_pred, average='macro'))
        model_comparison_df = pd.DataFrame([model_name, ac_score_list, p_score_list, r_score_list, f1_score_list]).T
        model_comparison_df.columns = ['model_name', 'accuracy_score', 'precision_score', 'recall_score', 'f1_score']
        model_comparison_df = model_comparison_df.sort_values(by='f1_score', ascending=False)
    return model_comparison_df


In [None]:
#################redundent code###################

In [None]:
#secondlevel= df.loc[df['primary']!=df['secondary']]
#secondlevel=secondlevel['secondary'].unique()
#mapping={}
#mapping_primary=pd.Series(df1.cat_name_primary.values,index=df1.Primary_Integer).to_dict()
#mapping_secondary=pd.Series(df1.cat_name_secondary.values,index=df1.Secondary_Integer).to_dict()
#df1['cat_multilevel_count'] = df1.groupby('cat_multilevel')['cat_multilevel'].transform('count')
#df1['cat_name_primary_count'] = df1.groupby('cat_name_primary')['cat_name_primary'].transform('count')
#df1['cat_name_secondary_count'] = df1.groupby('cat_name_secondary')['cat_name_secondary'].transform('count')
#df2_primary=df1[df1['cat_name_primary_count']<5].reset_index()
#df3_primary=df1[df1['cat_name_primary_count']>5].reset_index()
#X_primary=df3_primary[['item_name','description','establishment_type','cleanText']]
#Y_primary=df3_primary['cat_name_primary']
#df2_secondary=df1[df1['cat_name_secondary_count']<5].reset_index()
#df3_secondary=df1[df1['cat_name_secondary_count']>5].reset_index()
#X_secondary=df3_primary[['item_name','description','establishment_type','cleanText']]
#Y_secondary=df3_primary['cat_name_secondary']
#X_train_final_primary=pd.concat([X_train_primary['cleanText'], df2_primary['cleanText']])
#y_train_final_primary=pd.concat([y_train_primary, df2_primary['cat_name_primary']])
#X_train_final_secondary=pd.concat([X_train_secondary['cleanText'], df2_secondary['cleanText']])
#y_train_final_secondary=pd.concat([y_train_secondary, df2_secondary['cat_name_secondary']])
#X_train_final_secondary=X_train_final_secondary.values
#result['final_cat']=result['prediction_cat_primary']+ " ," +result['prediction_cat_secondary']
#result['final_cat'] =  result[['prediction_cat_primary', 'prediction_cat_secondary']].apply(lambda x: ','.join(str(x)), axis=0)
#result['final_cat_int']=result['prediction_cat_primary_integer']+','+result['prediction_cat_secondary_integer']
#result['final_cat']=result['final_cat'].apply(lambda x:','.join(list(set(x.split(',')))))
#result['final_accuracy']=accuracy_score(result['final_cat'],result['cat_multilevel'])
#result['final_cat'] = result['prediction_cat_primary'].astype(str) +"," +result['prediction_cat_secondary'].astype(str)

In [None]:
#Creating the features (tf-idf weights) for the processed text

#texts = df1['input_str'].astype('str')

tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2),max_df = .95)
LE = LabelEncoder()
#tfidf
tfv = TfidfVectorizer(strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',ngram_range=(1,2), use_idf=1,smooth_idf=1,sublinear_tf=1,max_df = .95,stop_words = 'english')

X_train_tfidf = tfidf_vectorizer.fit_transform(X_train) #features
X_test_tfidf= tfidf_vectorizer.fit_transform(X_test) #features

tfv.fit(list(X_train) + list(X_test))
xtrain_tfv =  tfv.transform(X_train) 
xvalid_tfv = tfv.transform(X_test)

#countvec
ctv = CountVectorizer(analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3), stop_words = 'english')
X_train_ctv = ctv.fit_transform(X_train) #features
X_test_ctv= ctv.fit_transform(X_test) #features



y_train_final=LE.fit_transform(y_train)
y_test_final=LE.fit_transform(y_test)

In [None]:

#Dimenionality reduction. Only using the 100 best features er category

lsa = TruncatedSVD(n_components=100, 
                   n_iter=10, 
                   random_state=3)

X_train_final = lsa.fit_transform(X_train_tfidf)
X_test_final = lsa.fit_transform(X_test_tfidf)

X_train_final_ctv= lsa.fit_transform(X_train_ctv)
X_test_final_ctv= lsa.fit_transform(X_test_ctv)

In [None]:
model_name, ac_score_list, p_score_list, r_score_list, f1_score_list = [], [], [], [], []
randomforestmodel=RandomForestClassifier(random_state=3)
#tfidf
#randomforestmodel.fit(xtrain_tfv, y_train_final)
#y_pred = randomforestmodel.predict(xvalid_tfv)
#countvec
#randomforestmodel.fit(X_train_final_ctv, y_train_final)
#y_pred = randomforestmodel.predict(X_test_final_ctv)
#svd
randomforestmodel.fit(X_train_final, y_train_final)
y_pred = randomforestmodel.predict(X_test_final)

ac_score_list.append(accuracy_score(y_test_final, y_pred))
p_score_list.append(precision_score(y_test_final, y_pred, average='macro'))
r_score_list.append(recall_score(y_test_final, y_pred, average='macro'))
f1_score_list.append(f1_score(y_test_final, y_pred, average='macro'))
model_comparison_df = pd.DataFrame([model_name, ac_score_list, p_score_list, r_score_list, f1_score_list]).T
model_comparison_df.columns = ['model_name', 'accuracy_score', 'precision_score', 'recall_score', 'f1_score']
model_comparison_df = model_comparison_df.sort_values(by='f1_score', ascending=False)
print(model_comparison_df)

In [None]:

#Preliminary model evaluation using default parameters
from sklearn.naive_bayes import MultinomialNB
#Creating a dict of the models
model_dict = {'Random Forest': RandomForestClassifier(random_state=3)}
            

#Function to get the scores for each model in a df
def model_score_df(model_dict):   
    model_name, ac_score_list, p_score_list, r_score_list, f1_score_list = [], [], [], [], []
    for k,v in tqdm(model_dict.items()):   
        model_name.append(k)
        v.fit(X_train_final, y_train_final)
        y_pred = v.predict(X_test_final)
        ac_score_list.append(accuracy_score(y_test_final, y_pred))
        p_score_list.append(precision_score(y_test_final, y_pred, average='macro'))
        r_score_list.append(recall_score(y_test_final, y_pred, average='macro'))
        f1_score_list.append(f1_score(y_test_final, y_pred, average='macro'))
        model_comparison_df = pd.DataFrame([model_name, ac_score_list, p_score_list, r_score_list, f1_score_list]).T
        model_comparison_df.columns = ['model_name', 'accuracy_score', 'precision_score', 'recall_score', 'f1_score']
        model_comparison_df = model_comparison_df.sort_values(by='f1_score', ascending=False)
    return model_comparison_df,v



In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
#clf = RandomForestClassifier()
from pprint import pprint
from time import time
import logging
rf=Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', RandomForestClassifier()),
              ])

parameters = {
    "vect__max_df": (0.5, 0.75, 1.0),
    # 'vect__max_features': (None, 5000, 10000, 50000),
    "vect__ngram_range": ((1, 1), (1, 2)),  # unigrams or bigramslf__
     "clf__bootstrap":[True, False],
     "clf__max_depth":[10, 50, 100,500, None],
     "clf__max_features":['auto', 'sqrt'],
     "clf__min_samples_leaf":[1,2,4],
     "clf__min_samples_split":[2,5,10],
     "clf__n_estimators":[400,600,800],
     "clf__random_state":[3]

}

RandomizedSearch = RandomizedSearchCV(rf,
                          parameters, 
                          cv=5,
                          verbose=1, 
                          n_jobs=-1)

t0 = time()
rf_best_model = RandomizedSearch.fit(X_train_final, y_train_final)
print("done in %0.3fs" % (time() - t0))
#print()
print("Best score: %0.3f" % rf_best_model.best_score_)
print("Best parameters set:")
best_parameters = rf_best_model.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))


In [None]:
rf_best_model = RandomForestClassifier(bootstrap = False,
                                       max_depth = 50,
                                       max_features = 'auto',
                                       min_samples_leaf = 1,
                                       n_estimators = 1400,
                                       random_state=3)