In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# from gensim.models import Word2Vec, KeyedVectors

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation
from tqdm.notebook import tqdm

from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

from statistics import mean

In [2]:
def preprocess_corpus(text):
    #importing stop words like in, the, of so that these can be removed from texts
    #as these words dont help in determining the classes(Whether a sentence is toxic or not)
    #removing the numerical values and working only with text values
    text = re.sub('[^a-zA-Z]', " ", text)
    
    def remove_stops_digits(tokens):
        #Nested function that lowercases, removes stopwords and digits from a list of tokens
        return [token.lower() for token in tokens if token not in mystopwords and not token.isdigit()
               and token not in punctuation and len(token)>1]
    #This return statement below uses the above function and tokenizes output further. 
    return remove_stops_digits(word_tokenize(text))

#Preprocess both for training and test data
# train_texts_processed = preprocess_corpus(train_texts)
# test_texts_processed = preprocess_corpus(test_texts)

In [3]:
pip uninstall transformers

Note: you may need to restart the kernel to use updated packages.


In [4]:
mystopwords = set(stopwords.words("english"))

In [5]:
df_item = pd.read_csv('/Users/jghosh2/Documents/my-notebook/Tax_ml_poc/data/historical_data_14_01_22.csv')

In [6]:
df_item.shape

(204725, 7)

In [7]:
df_item = df_item[~df_item['target'].isnull()]

In [8]:
item_columns = ['Item', 'Description', 'establishment_type']

In [9]:
import re

In [10]:
df_item['text'] = df_item[item_columns[0]].astype('str') + " " + df_item[item_columns[1]].astype('str') + " " + df_item[item_columns[2]].astype('str')

In [11]:
df_item['processed_text'] = df_item['text'].apply(preprocess_corpus)

In [12]:
df_item.head()

Unnamed: 0.1,Unnamed: 0,Item,Description,establishment_type,CAT_Name,Integer,target,text,processed_text
0,2406,"1800 Lime Margarita, 1.75mL margarita (9.95% ABV)",,GROCERY,"CAT_ALCOHOL,TEMP_COLD",1091,"CAT_ALCOHOL,TEMP_COLD:109,1","1800 Lime Margarita, 1.75mL margarita (9.95% A...","[lime, margarita, ml, margarita, abv, nan, gro..."
1,2434,"1800 peach Margarita, 1.75mL margarita ( 9.95%...",,GROCERY,"CAT_ALCOHOL,TEMP_COLD",1091,"CAT_ALCOHOL,TEMP_COLD:109,1","1800 peach Margarita, 1.75mL margarita ( 9.95%...","[peach, margarita, ml, margarita, abv, nan, gr..."
2,2437,"1800 pineapple margarita, 1.75mL margarita ( 9...",,GROCERY,"CAT_ALCOHOL,TEMP_COLD",1091,"CAT_ALCOHOL,TEMP_COLD:109,1","1800 pineapple margarita, 1.75mL margarita ( 9...","[pineapple, margarita, ml, margarita, abv, nan..."
3,2566,"1800 Reserva Reposado, 1.75lt tequila (40.0% ABV)",,GROCERY,"CAT_ALCOHOL,TEMP_COLD",1091,"CAT_ALCOHOL,TEMP_COLD:109,1","1800 Reserva Reposado, 1.75lt tequila (40.0% A...","[reserva, reposado, lt, tequila, abv, nan, gro..."
4,2579,"1800 Reserva Silver, 1.75lt tequila (40.0% ABV)",,GROCERY,"CAT_ALCOHOL,TEMP_COLD",1091,"CAT_ALCOHOL,TEMP_COLD:109,1","1800 Reserva Silver, 1.75lt tequila (40.0% ABV...","[reserva, silver, lt, tequila, abv, nan, grocery]"


In [13]:
X_train, X_test = train_test_split(df_item, train_size = 0.8, random_state = 42 )

In [14]:
#pip install keras==2.3.1

In [15]:
import sklearn

In [16]:
print(sklearn.__version__)

0.21.1


In [17]:
sentences = X_train['processed_text'].tolist()

In [18]:
flat_list = [item for sublist in sentences for item in sublist]

In [19]:
tokens = flat_list

In [20]:
vocab, index = {}, 1  # start indexing from 1
vocab['<pad>'] = 0  # add a padding token
for token in tokens:
  if token not in vocab:
    vocab[token] = index
    index += 1
vocab_size = len(vocab)
# print(vocab)

In [21]:
vocab_size

58288

In [22]:
# the keras model/graph would look something like this:
#from keras import layers, optimizers, Model
from tensorflow.keras import optimizers,Model,layers
#from tensorflow.keras import models
# adjustable parameter that control the dimension of the word vectors
embed_size = 100

input_center = layers.Input((1,))
input_context = layers.Input((1,))

embedding = layers.Embedding(vocab_size, embed_size, input_length=1, name='embed_in')
center = embedding(input_center)  # shape [seq_len, # features (1), embed_size]
context = embedding(input_context)

center = layers.Reshape((embed_size,))(center)
context = layers.Reshape((embed_size,))(context)

dot_product = layers.dot([center, context], axes=1)
output = layers.Dense(1, activation='sigmoid')(dot_product)
model = Model(inputs=[input_center, input_context], outputs=output)
model.compile(loss='categorical_crossentropy', optimizer=optimizers.RMSprop(lr=0.01))
model.summary()


Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
embed_in (Embedding)            (None, 1, 100)       5828800     input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
reshape (Reshape)               (None, 100)          0           embed_in[0][0]               

  "The `lr` argument is deprecated, use `learning_rate` instead.")


In [23]:
# then we can feed in the skipgram and its label (whether the word pair is in or outside
# the context)
batch_center = [2354, 2354, 2354, 69, 69]
batch_context = [4288, 203, 69, 2535, 815]
batch_label = [0, 1, 1, 0, 1]
model.train_on_batch([batch_center, batch_context], batch_label)

AttributeError: 'int' object has no attribute 'shape'

In [24]:
from transformers import KerasWord2VecVectorizer
keras_word2vec_tr = KerasWord2VecVectorizer(embed_size=50, min_count=3, epochs=5000,
                                            negative_samples=2)
keras_word2vec_tr

KerasWord2VecVectorizer(batch_size=64, embed_size=50, epochs=5000,
                        learning_rate=0.05, min_count=3, negative_samples=2,
                        sort_vocab=True, use_sampling_table=True,
                        window_size=5)

In [25]:
# keras_w2v_xgb = Pipeline([
#     ('w2v', keras_word2vec_tr), 
#     ('xgb', xgb)
# ])

# keras_w2v_xgb.fit(X_train['processed_text'], X_train['item_category'])

In [26]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [27]:
# y_train_pred = keras_w2v_xgb.predict(X_train['processed_text'])
# print('Training set accuracy %s' % accuracy_score(X_train['item_category'], y_train_pred))
# confusion_matrix(X_train['item_category'], y_train_pred)

In [28]:
# y_test_pred = keras_w2v_xgb.predict(X_test['processed_text'])
# print('Test set accuracy %s' % accuracy_score(X_test['item_category'], y_test_pred))
# print(classification_report(X_test['item_category'], y_test_pred))


In [29]:
rf_clf = RandomForestClassifier(random_state=1,class_weight= 'balanced' )

In [30]:
keras_w2v_rf_clf = Pipeline([
    ('w2v', keras_word2vec_tr), 
    ('clf', rf_clf)
])

keras_w2v_rf_clf.fit(X_train['processed_text'], X_train['target'])

  rand_indexed_texts = np.random.choice(indexed_texts)
100%|██████████| 5000/5000 [36:09<00:00,  2.30it/s] 


Pipeline(memory=None,
         steps=[('w2v',
                 KerasWord2VecVectorizer(batch_size=64, embed_size=50,
                                         epochs=5000, learning_rate=0.05,
                                         min_count=3, negative_samples=2,
                                         sort_vocab=True,
                                         use_sampling_table=True,
                                         window_size=5)),
                ('clf',
                 RandomForestClassifier(bootstrap=True, class_weight='balanced',
                                        criterion='gini', max_depth=None,
                                        max_features='auto',
                                        max_leaf_nodes=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        min_samples_leaf=1, min_samples_split=2,
                                       

In [31]:
y_train_pred = keras_w2v_rf_clf.predict(X_train['processed_text'])
print('Training set accuracy %s' % accuracy_score(X_train['target'], y_train_pred))
# confusion_matrix(X_train['item_category'], y_train_pred)

Training set accuracy 0.9722859934057883


In [32]:
y_test_pred = keras_w2v_rf_clf.predict(X_test['processed_text'])
print('Test set accuracy %s' % accuracy_score(X_test['target'], y_test_pred))
print(classification_report(X_test['target'], y_test_pred))


Test set accuracy 0.6031017218219563


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


                                                                                        precision    recall  f1-score   support

                                                           CAT_ALCOHOL,TEMP_COLD:109,1       0.24      0.21      0.22       124
                                                         CAT_ALCOHOL,TEMP_HEATED:109,1       0.41      0.47      0.44      1476
                                                       CAT_ALCOHOL,TEMP_UNHEATED:109,1       0.14      0.11      0.12       298
                                                                   CAT_ANTI_FREEZE:774       0.00      0.00      0.00         2
                                                                  CAT_BABY_FORMULA:515       0.11      1.00      0.20         1
                                                                    CAT_BABY_WIPES:513       0.08      0.14      0.11         7
                                                       CAT_BAKERY_ITEM,TEMP_COLD:562,1       0.00      

In [33]:
import joblib
joblib.dump(keras_w2v_rf_clf, "items_word2vec_model_dsw_2.joblib")

TypeError: can't pickle weakref objects

In [None]:
test_df = pd.read_csv('test_set200.csv')

In [None]:
test_df.columns

In [None]:
test_df = test_df[['storefront_uuid', 'parent_chain_or_store_name',
       'merchant_type_analytics', 'segment', 'country_name', 'location_type',
       'menu_item', 'menu_items_subsection_name','y' ]]

In [None]:
test_df.head()

In [None]:
test_df['text'] = test_df['menu_item'].astype('str') + " " + test_df['menu_items_subsection_name'].astype('str') + " " + test_df['parent_chain_or_store_name'].astype('str')

In [None]:
test_df['processed_text'] = test_df['text'].apply(preprocess_corpus)

In [None]:
def merchant_categorization_rule(category_count_dict):
    
    total_count = sum(category_count_dict.values())
    # print(category_count_dict)
    
#     valuedf.loc[valuedf['category'] == 'GSE','counts'].values
    if category_count_dict['pharmacy'] >= 20 and category_count_dict['restaurant'] < 0.5 * total_count:
        category = 'pharmacy'
        
    elif category_count_dict['alcohol'] >= 0.51 * total_count:
        category = 'alcohol'
    
    elif category_count_dict['flowers'] >= 0.51 * total_count:
        category = 'flowers' 
        
    elif category_count_dict['pet supplies'] >= 0.51 * total_count:
        category = 'pet supplies'
        
    elif category_count_dict['gse'] >= 20:
        category = 'gse'
        
    elif category_count_dict['restaurant'] >= 0.51 * total_count:
        category = 'restaurant'
        
    else:
        category = 'retail'
    
    return category, total_count

In [None]:
def get_default_dict():
    dict_categories = {
        'pharmacy': 0,
        'restaurant': 0,
        'alcohol': 0,
        'flowers': 0,
        'pet supplies': 0,
        'gse': 0,
        'retail':0
        
    }
    
    return dict_categories

In [None]:
def predict_csv(df, item_model):
    
#     df.drop(["merchant_level_prediction", "merchant_prediction_confidence"], axis=1, inplace=True)
#     df_merch = df.drop_duplicates(subset=['storefront_uuid'])
#     df_merch['merchant_level_prediction'] = merchant_model.predict(df_merch['mer_preprocessed_text'])
#     df_merch['merchant_prediction_confidence'] = np.round_(np.max(merchant_model.predict_proba(df_merch['mer_preprocessed_text']), axis=1), decimals=4)
    

#     df = pd.concat([df, df_merch[['merchant_level_prediction','merchant_prediction_confidence']]], axis=1)
#     df['merchant_level_prediction'].fillna(method='ffill', inplace=True)
#     df['merchant_prediction_confidence'].fillna(method='ffill', inplace=True)
#     print('merchant_prediction_done')
    
    df['item_level_prediction'] = item_model.predict(df['processed_text'])
    df['item_prediction_confidence'] = np.round_(np.max(item_model.predict_proba(df['processed_text']), axis=1), decimals=4)
    print('item_level_prediction done')
    
    df_grps = df.groupby(['storefront_uuid'])
    print("no of groups: ", df_grps.ngroups)
    groups = []
    for name, grouped in df_grps:

#         print(grouped['item_level_prediction'].value_counts())
        category_count_dict = get_default_dict()
        items_pred_counts_df = grouped['item_level_prediction'].value_counts().rename_axis('category').reset_index(name='counts')

        for i,row in items_pred_counts_df.iterrows():
            category_count_dict[row[0]] = row[1]
        

        if len(items_pred_counts_df) > 0:
            merchant_category_pred, total_count = merchant_categorization_rule(category_count_dict)
       
            merchant_confidence_pred = round(sum(grouped['item_prediction_confidence'])/total_count,4)  
            grouped['merchant_prediction_from_items'] = merchant_category_pred
            grouped['merchant_prediction_from_items_confidence'] = merchant_confidence_pred
            
        groups.append(grouped)
        
    df_final = pd.concat(grp for grp in groups)
    df_final['final_merchant_prediction'] = df_final['merchant_prediction_from_items']
    df_final['final_merchant_prediction_confidence'] = df_final['merchant_prediction_from_items_confidence']
    df_final.drop(['text', 'processed_text'], axis=1, inplace=True)
 
    # df_final['timestamp'] = '2022-03-04' # need to be string, hive will use this as partition column
    return df_final

In [None]:
test_df_final = predict_csv(test_df, keras_w2v_rf_clf)

In [None]:
len(test_df_final)

In [None]:
test_df_final.head()

In [None]:
test_df_final['merchant_prediction_from_items'].value_counts()

In [None]:
accuracy_score(test_df_final['y'],test_df_final['merchant_prediction_from_items'])

In [None]:
import joblib

In [None]:
import joblib
joblib.dump(keras_w2v_rf_clf, "items_word2vec_model_dsw_2.joblib")

In [None]:
# model = joblib.load("items_word2vec_model.joblib")

In [None]:
item_names = df2_filtered_merch['menu_item'].unique().tolist()

In [None]:
len(item_names)

In [None]:
item_english_names = []
for item in item_names:
    if preprocess_corpus(str(item)) != []:
        item_english_names.append(item)

In [None]:
len(item_english_names)

In [None]:
df2_filtered_item = df2_filtered_merch[df2_filtered_merch['menu_item'].isin(item_english_names)]

In [None]:
df2_filtered_item.shape

In [None]:
df2_filtered_item['text'] = df2_filtered_item['menu_item'].astype('str') + " " + df2_filtered_item['menu_items_subsection_name'].astype('str') + " " + df2_filtered_item['parent_chain_or_store_name'].astype('str')

In [None]:
df2_filtered_item.head()

In [None]:
df2_filtered_item['processed_text'] = df2_filtered_item['text'].apply(preprocess_corpus)

In [None]:
df2_filtered_item.to_csv('smb_all_data_preprocessed.csv', index= False)

In [None]:
df_test_new = pd.read_csv('../data/test/test_samples_for_validation - test_samples_for_validation.csv')

In [None]:
df_test_new.columns

In [None]:
df_test_new = df_test_new[['storefront_uuid', 'parent_chain_or_store_name',
       'merchant_type_analytics', 'segment', 'country_name', 'location_type',
       'menu_item', 'menu_items_subsection_name','y' ]]

In [None]:
df_test_new.shape

In [None]:
df_test_new['text'] = df_test_new['menu_item'].astype('str') + " " + df_test_new['menu_items_subsection_name'].astype('str') + " " + df_test_new['parent_chain_or_store_name'].astype('str')

In [None]:
df_test_new['processed_text'] = df_test_new['text'].apply(preprocess_corpus)

In [None]:
df_test_new_final = predict_csv(df_test_new, model)

In [None]:
len(df_test_new_final)

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
from sklearn.metrics import f1_score

In [None]:
from sklearn.metrics import classification_report

In [None]:
accuracy_score(df_test_new_final['y'],df_test_new_final['merchant_prediction_from_items'])

In [None]:
f1_score(df_test_new_final['y'],df_test_new_final['merchant_prediction_from_items'], average='weighted')

In [None]:
class_report = classification_report(df_test_new_final['y'],df_test_new_final['merchant_prediction_from_items'])

In [None]:
print(class_report)

In [None]:
df_test_new_final.head()

In [None]:
# df2_filtered_item

In [None]:
df_test_new_set = df_test_new[0:200]

In [None]:
df_test_new_set.to_csv('test_set200.csv', index=False)

In [None]:
df2['merch_processed'] = df2['parent_chain_or_store_name'].apply(preprocess_corpus)

In [None]:
df2.columns

In [None]:
df2['text'] = df2['menu_item'].astype('str') + " " + df2['menu_items_subsection_name'].astype('str') + " " + df2['parent_chain_or_store_name'].astype('str')

In [None]:
df2['processed_text'] = df2['text'].apply(preprocess_corpus)

In [None]:
df2_final = predict_csv(df2, model)

In [None]:
accuracy_score(test_df_final['y'],test_df_final['merchant_prediction_from_items'])