In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import json
import ast
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
import xgboost as xgb
import catboost as cb
import gensim

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
user_data = pd.read_csv('~/Eshan/CodeLinux/TrendMind/data.csv')
pro_data = pd.read_excel('~/Eshan/CodeLinux/TrendMind/Zepto.xlsx', sheet_name=None)

In [6]:
# Drop irrelevant columns 
colDrop = [ "query_product_plt_clicks_60_days", "query_product_plt_ctr_60_days", "CTR_plt_30_days", "product_atcs_plt_30_days", "total_unique_orders_plt_30_days"]
user_d2 = user_data.drop(columns=colDrop)
user_d2.dropna(subset=['product_name', 'category_name', 'subcategory_name'], inplace=True)

In [7]:
# Get Non-numerical category / text columns
obj_cols = []
for col in user_d2.columns:
  # print(f'Col: {col} | type: {user_d2[col].dtype}')
  if user_d2[col].dtype == 'object':
    obj_cols.append(col)
# obj_cols
# 'search_term' -> queries,
# 'product_variant_id' -> 30353 uni ids,
# 'city_id' -> 12 uni ids,
# 'query_type'-> 2 [head, tail] | categorical ,
# 'predicted_category_name' -> 481 cate | json dict w top 2-3 pred cate, 
# 'predicted_subcategory_name' -> 1605 cate | json dict w 1-4 pred subcate ,
# 'product_name' -> 26276 uni | w brand name,
# 'brand_name' -> 1843 cate | 1-2 words,
# 'category_name' -> 38 cate, 
# 'subcategory_name'] -> 287 cate
text = ['search_term', 'product_name'] # 'brand_name']
cate = ['query_type', 'category_name', 'subcategory_name']
ids = ['product_variant_id', 'city_id']
# i = torch.randint(1,user_d2.shape[0], (1,))
# user_d2['subcategory_name'].value_counts().shape 

In [10]:
user_d3 = pd.get_dummies(user_d2, columns=['query_type'])
user_d3['product_variant_id'], product_variant_ids = pd.factorize(user_d3['product_variant_id'])
user_d3['city_id'], city_id = pd.factorize(user_d3['city_id'])
user_d3.drop(columns=['brand_name', 'predicted_category_name', 'predicted_subcategory_name'], inplace=True)

In [9]:
unique_cate = []
for i in user_d2['predicted_category_name']:
  unique_cate.extend(list(json.loads(i.replace("'", '"')).keys()))
  # unique_cate.extend(list(ast.literal_eval(i).keys()))
print(len(set(unique_cate)))
# Out: 34
unique_subcate = []
for i in user_d2['predicted_subcategory_name']:
  # print(i)
  try:
    unique_subcate.extend(list(json.loads(i.replace("'", '"')).keys()))
  # break 
  except:
    # print('Changed String')
    # print(ast.literal_eval(i))
    unique_cate.extend(list(ast.literal_eval(i).keys()))
print(len(set(unique_subcate)))
# Out: 248


34


248


In [11]:
# Number of categories 4m pro inventory -> 15 
# len(list(pro_data.keys()))
all_pro = []
for cate in pro_data.keys():
  # print(pro_data[cate])
  all_pro.extend([str(i).strip().lower() for i in pro_data[cate]['name']])
  # break
print(len(all_pro))
print(len(set(all_pro)))
# 1699 - unique products

3778
1699


In [12]:
print(len(user_d2['product_name'].unique()))
# i = torch.randint(0, len(all_pro) - 10, (1,)).item()
# all_pro[i:i+10]
print(len(set(user_d2['product_name']).intersection(set(all_pro))))
print(len(set(user_d2['product_name'].unique()).symmetric_difference(set(all_pro))))

26276
0
27975


In [14]:
# sentTrans = SentenceTransformer('all-MiniLM-L6-v2')
# search_term_emb = sentTrans.encode(user_d3['search_term'].tolist(), show_progress_bar=True)
# product_name_emb = sentTrans.encode(user_d3['product_name'].tolist(), show_progress_bar=True)
embed_model = gensim.models.KeyedVectors.load_word2vec_format('~/Eshan/CodeLinux/TrendMind/wiki-news-300d-1M-subword.vec', binary=False)
def embedTxt(txt: str) -> np.ndarray:
    tokens = txt.lower().split()
    try:
        vecs = [embed_model[token] for token in tokens]
    except:
        return np.zeros(embed_model.vector_size, dtype=float)
    return np.mean(vecs, axis=0)
search_term_emb = np.vstack([embedTxt(query) for query in user_d3['search_term']])
product_name_emb = np.vstack([embedTxt(query) for query in user_d3['product_name']]) 
fea = user_d3[list(set(user_d3.columns) - set(text) - set('query_product_similarity'))]

In [17]:
fea.shape 

(493970, 22)

In [18]:
test_size = int(fea.shape[0]*0.2)
# test_idx = torch.randint(0,fea.shape[0], (test_size,))
test_idx = np.random.choice(range(0, fea.shape[0]), size=test_size, replace=False)
train_idx = set(np.arange(0,fea.shape[0])) - set(test_idx)
print(fea.shape[0], len(train_idx), test_idx.shape)

493970 395176 (98794,)


In [19]:
# search_term_emb = pd.DataFrame(search_term_emb)
# product_name_emb = pd.DataFrame(product_name_emb, columns=np.arange(300,300+300))

# search_term_emb_train = search_term_emb.filter(items=train_idx, axis=0)
# search_term_emb_test = search_term_emb.filter(items=test_idx.tolist(), axis=0)
# product_name_emb_train = product_name_emb.filter(items=train_idx, axis=0)
# product_name_emb_test = product_name_emb.filter(items=test_idx.tolist(), axis=0)
# fea_train = fea.filter(items=train_idx, axis=0)
# fea_test = fea.filter(items=test_idx.tolist(), axis=0)

# fea_train = fea_train.reset_index(drop=True)
# fea_test = fea_test.reset_index(drop=True)
# search_term_emb_train = search_term_emb_train.reset_index(drop=True)
# search_term_emb_test = search_term_emb_test.reset_index(drop=True)
# product_name_emb_train = product_name_emb_train.reset_index(drop=True)
# product_name_emb_test = product_name_emb_test.reset_index(drop=True)
# X_train = pd.concat([search_term_emb_train, product_name_emb_train, fea_train], axis=1)
# del search_term_emb_train, product_name_emb_train , fea_train , fea, search_term_emb, product_name_emb
# import gc 
# gc.collect()
# X_test = pd.concat([search_term_emb_test, product_name_emb_test, fea_test], axis=1)
# del search_term_emb_test, product_name_emb_test , fea_test
# gc.collect()

# user_d3 = user_d3.reset_index(drop=True)
# y_train = user_d3['query_product_similarity'].filter(items=train_idx, axis=0)
# y_test = user_d3['query_product_similarity'].filter(items=test_idx, axis=0)

In [20]:
# X = np.hstack([search_term_emb, product_name_emb, fea])
# y = user_d3['query_product_similarity'].values

search_term_emb = pd.DataFrame(search_term_emb)
product_name_emb = pd.DataFrame(product_name_emb, columns=np.arange(300,300+300))
print(search_term_emb.shape , product_name_emb.shape , fea.shape)
# search_term_emb = search_term_emb.reset_index(drop=True)
# product_name_emb = product_name_emb.reset_index(drop=True)
fea = fea.reset_index(drop=True)
user_d3 = user_d3.reset_index(drop=True)
X = pd.concat([search_term_emb, product_name_emb, fea], axis=1)
y = user_d3['query_product_similarity']
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)
X_train = X.filter(items=train_idx, axis=0)
X_test = X.filter(items=test_idx.tolist(), axis=0)
y_train = y.filter(items=train_idx, axis=0)
y_test = y.filter(items=test_idx.tolist(), axis=0)
del X, y, user_d2, pro_data
import gc
gc.collect()
# print( X.shape, y.shape)

(493970, 300) (493970, 300) (493970, 22)


9823

In [13]:
# model = cb.CatBoostRegressor(
#     iterations=500,
#     learning_rate=0.1,
#     depth=6,
#     cat_features=['category_name','subcategory_name'],
#     early_stopping_rounds=30,
#     eval_metric='RMSE',
# )
# model.fit(X_train, y_train,
#          eval_set=(X_test, y_test),
#          # plot=True,
#          # plot_file='./cb_train_plot.png'
#          )
# model.save_model('./q_score_cbm',
#            format="cbm",
#            export_parameters=None,
#            pool=None)

In [21]:
user_d3

Unnamed: 0,search_term,product_variant_id,city_id,is_clicked,total_clicks,session_views,query_products_clicks_last_30_days,CTR_last_30_days,CTR_last_7_days,CTR_product_30_days,...,latest_margin,savings,savings_with_pass,ad_revenue,total_unique_orders,product_atcs_30_days,product_ctr_city_30_days,query_product_similarity,query_type_head,query_type_tail
0,akshayakalpa milk,0,0,0.0,0,0,0,0.000000,0.000000,0.000000,...,6.900000,0.040000,2.718400,0.000000,155,376,0.054913,0.384844,True,False
1,ice cubes,1,1,0.0,0,57,0,0.000000,0.000000,0.003239,...,42.010000,0.250000,5.009091,0.000000,14,41,0.003239,0.324977,True,False
2,protien,2,2,1.0,29,545,9,0.040179,0.000000,0.061085,...,10.243929,0.160000,3.239500,18.622949,1469,3335,0.061085,0.220603,True,False
3,comb,3,3,0.0,0,13,0,0.000000,0.000000,0.000000,...,21.433421,0.000000,5.118421,0.000000,14567,28849,0.120340,0.061622,True,False
4,nail polish remover,4,3,0.0,0,1,0,0.000000,0.000000,0.052714,...,13.487105,0.036957,4.915103,20.365139,1043,2446,0.052714,0.390254,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
493965,almonds,5265,10,0.0,0,16,0,0.000000,0.000000,0.009776,...,69.694583,0.207859,5.207621,0.000000,52,121,0.009776,0.352788,True,False
493966,papad,7590,5,1.0,75,6973,44,0.013686,0.005976,0.010622,...,16.018000,0.246165,4.036273,0.000000,84,250,0.010622,0.506673,True,False
493967,pad,11454,9,1.0,4,2200,3,0.004032,0.000000,0.001323,...,37.173909,0.175255,4.439781,39.240450,0,0,0.001323,0.445771,True,False
493968,ban,19668,7,0.0,0,3,0,0.000000,0.000000,0.003018,...,49.663636,0.314684,10.354217,0.000000,16,93,0.003018,0.099492,False,True


In [34]:
lmodel = cb.CatBoostRegressor(
    iterations=500,
    learning_rate=0.1,
    depth=6,
    cat_features=['category_name','subcategory_name'],
    early_stopping_rounds=30,
    eval_metric='RMSE',
)
lmodel.load_model('/home/eshan/Eshan/CodeLinux/TrendMind/q_score_cbm', format='cbm')

<catboost.core.CatBoostRegressor at 0x7d191cf428f0>

In [26]:
len(X_train['subcategory_name'].unique()), len(X_train['category_name'].unique())
len(set(X_train['subcategory_name'].unique()).intersection(set(X_test['subcategory_name'].unique()))), len(set(X_train['category_name'].unique()).intersection(set(X_test['category_name'].unique())))
len(X_test['subcategory_name'].unique()), len(X_test['category_name'].unique())

(287, 38)

In [51]:
test_pool = cb.Pool(X_test, cat_features=['category_name','subcategory_name'])
pred_test = lmodel.predict(test_pool)
(((pred_test - y_test)**2).sum()/y_test.shape[0])**0.5

0.002484495719736641

In [38]:
set(user_d3.columns) - set(text) - set('query_product_similarity')

{'CTR_last_30_days',
 'CTR_last_7_days',
 'CTR_product_30_days',
 'ad_revenue',
 'category_name',
 'city_id',
 'is_clicked',
 'latest_margin',
 'product_atcs_30_days',
 'product_ctr_city_30_days',
 'product_variant_id',
 'query_product_plt_clicks_30_days',
 'query_product_similarity',
 'query_products_clicks_last_30_days',
 'query_type_head',
 'query_type_tail',
 'savings',
 'savings_with_pass',
 'session_views',
 'subcategory_name',
 'total_clicks',
 'total_unique_orders'}

In [52]:
for col in user_d3.columns:
  print(f'Col : {col} | type: {user_d3[col].dtype}')

Col : search_term | type: object
Col : product_variant_id | type: int64
Col : city_id | type: int64
Col : is_clicked | type: float64
Col : total_clicks | type: int64
Col : session_views | type: int64
Col : query_products_clicks_last_30_days | type: int64
Col : CTR_last_30_days | type: float64
Col : CTR_last_7_days | type: float64
Col : CTR_product_30_days | type: float64
Col : query_product_plt_clicks_30_days | type: int64
Col : product_name | type: object
Col : category_name | type: object
Col : subcategory_name | type: object
Col : latest_margin | type: float64
Col : savings | type: float64
Col : savings_with_pass | type: float64
Col : ad_revenue | type: float64
Col : total_unique_orders | type: int64
Col : product_atcs_30_days | type: int64
Col : product_ctr_city_30_days | type: float64
Col : query_product_similarity | type: float64
Col : query_type_head | type: bool
Col : query_type_tail | type: bool


In [49]:
list_col = ['search_term', 'category_name', 'subcategory_name']
agg_dict = {f'{col}':'mean' for col in user_d3.columns if col not in list_col}
for lcol in list_col:
  agg_dict[f'{lcol}'] = list 
user_d3.groupby('product_name').agg(agg_dict).reset_index().to_sql('./pro_data.sql')

TypeError: agg function failed [how->mean,dtype->object]

In [50]:
user_d3.columns

Index(['search_term', 'product_variant_id', 'city_id', 'is_clicked',
       'total_clicks', 'session_views', 'query_products_clicks_last_30_days',
       'CTR_last_30_days', 'CTR_last_7_days', 'CTR_product_30_days',
       'query_product_plt_clicks_30_days', 'product_name', 'category_name',
       'subcategory_name', 'latest_margin', 'savings', 'savings_with_pass',
       'ad_revenue', 'total_unique_orders', 'product_atcs_30_days',
       'product_ctr_city_30_days', 'query_product_similarity',
       'query_type_head', 'query_type_tail'],
      dtype='object')

## Candidate Generation with DNN

In [10]:
user_d3

Unnamed: 0,search_term,product_variant_id,city_id,is_clicked,total_clicks,session_views,query_products_clicks_last_30_days,CTR_last_30_days,CTR_last_7_days,CTR_product_30_days,...,latest_margin,savings,savings_with_pass,ad_revenue,total_unique_orders,product_atcs_30_days,product_ctr_city_30_days,query_product_similarity,query_type_head,query_type_tail
0,akshayakalpa milk,0,0,0.0,0,0,0,0.000000,0.000000,0.000000,...,6.900000,0.040000,2.718400,0.000000,155,376,0.054913,0.384844,True,False
1,ice cubes,1,1,0.0,0,57,0,0.000000,0.000000,0.003239,...,42.010000,0.250000,5.009091,0.000000,14,41,0.003239,0.324977,True,False
2,protien,2,2,1.0,29,545,9,0.040179,0.000000,0.061085,...,10.243929,0.160000,3.239500,18.622949,1469,3335,0.061085,0.220603,True,False
3,comb,3,3,0.0,0,13,0,0.000000,0.000000,0.000000,...,21.433421,0.000000,5.118421,0.000000,14567,28849,0.120340,0.061622,True,False
4,nail polish remover,4,3,0.0,0,1,0,0.000000,0.000000,0.052714,...,13.487105,0.036957,4.915103,20.365139,1043,2446,0.052714,0.390254,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
504984,almonds,5265,10,0.0,0,16,0,0.000000,0.000000,0.009776,...,69.694583,0.207859,5.207621,0.000000,52,121,0.009776,0.352788,True,False
504985,papad,7590,5,1.0,75,6973,44,0.013686,0.005976,0.010622,...,16.018000,0.246165,4.036273,0.000000,84,250,0.010622,0.506673,True,False
504986,pad,11454,9,1.0,4,2200,3,0.004032,0.000000,0.001323,...,37.173909,0.175255,4.439781,39.240450,0,0,0.001323,0.445771,True,False
504987,ban,19668,7,0.0,0,3,0,0.000000,0.000000,0.003018,...,49.663636,0.314684,10.354217,0.000000,16,93,0.003018,0.099492,False,True


In [12]:
user_d3['search_term'].value_counts()

search_term
comb        5452
combo       2985
zepto       2201
munch       2137
munchies    2090
            ... 
gokul          2
volini         2
yam            1
aavin          1
caul           1
Name: count, Length: 1997, dtype: int64

In [18]:
search_pro_df = user_d3[['product_name', 'search_term']].groupby('search_term').agg(list)

In [19]:
# query_item_interaction_matrix = 
search_pro_df.map(lambda x : )

Unnamed: 0_level_0,product_name
search_term,Unnamed: 1_level_1
3 roses,"[Fackelmann Stainless Steel Sink Strainer, 6.3..."
5 star,[Asian Diamond Pet Fridge Water Bottle With St...
50 50,[Yoga Bar Breakfast Protein Bar - Blueberry Pi...
5star,[Cadbury Five Star Oreo Chocolate 42 gms Combo...
7up,"[UPF Healthy Eggs White, Conscious Food Organi..."
...,...
zep,"[Jamun Cooler, Vaseline Light Hydrate Serum In..."
zepto,"[Lakme Vit C Brilliance - Serum, Borosil Stain..."
zepto cafe,"[Organic Watermelon, Zep Cake Brownie Love, Am..."
zero,"[Paper Boat Zero Ginger Lemon Sparkling Water,..."


In [31]:
for i in search_pro_df.values:
    print(i)
    break

[list(['Fackelmann Stainless Steel Sink Strainer, 6.3 Cm | Long-Lasting Construction | Easy To Use & Clean', 'Plum Rice Water & Niacinamide 3% Toner', 'Coloressence Intense Liquid Lip Color Matte Finish - Rose Petal Llc 3', 'Citizen 3 Fold 22 Inch Auto Royal Blue Mono Silver Umbrella', 'Maybelline New York Fit Me Blush Rose 30 | Powder Blusher', 'Maharaja Whiteline Odacio Plus 550 Watt Juicer Mixer Grinder With 3 Versatile Jars - Black & Silver', 'Sanfe Spray Away Hair Removal Spray 3 Full Body Usage', 'Fabiano Racer 3 Star BEE Ceiling Fan', 'Vinod Pressure Cooker Inner Lid Europa 3 Litre', 'Havells Mixwell 500 W 3 Jar Mixer Grinder | 304 Ss Blades I High Speed 21000 Rpm Motor', 'Blue Heaven Jelly & Butter Hydrating Tinted Lip Balm - Dusty Rose', 'Tennis Ball Green (Pack Of 3)', 'Lakme Rose Powder With Sunscreen 01 Soft Pink', 'Prestige Omega Deluxe Granite Non-Stick Byk Cookware Set (Aluminum)|Black, 3 Piece', 'Paragon Kids Black School Shoes, Everyday Wear Ultra Comfortable, Lightwei

In [45]:
all_pro = []
for i in search_pro_df.values:
    all_pro.extend(i[0])
all_pro = set(all_pro)

In [42]:
pro2idx 

26276