In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import gc
from tqdm import tqdm
import pickle5 as pickle

In [5]:
ls data/

categories.pkl    old2new_dict.pkl           session_item_view.npz
df_features.pkl   PAB_cart.npz               session_order.npz
df_scores.pkl     PAB_view.npz               similarities_cart_add.npz
item_mapping.csv  PB_cart.npy                similarities_view.npz
new2old_dict.pkl  PB_view.npy                top50.txt
num_of_carts.pkl  processed.json
num_of_views.pkl  session_item_cart_add.npz


In [6]:

df = pd.read_pickle("data/df_features.pkl")

with open('data/num_of_carts.pkl', 'rb') as handle:
    num_of_carts = pickle.load(handle)

with open('data/num_of_views.pkl', 'rb') as handle:
    num_of_views = pickle.load(handle)

num_of_views = dict((str(k), v) for k, v in num_of_views.items())
num_of_carts = dict((str(k), v) for k, v in num_of_carts.items())

categories = pd.read_pickle("data/categories.pkl")
categories.head()
categories['itemid'] = categories['itemid'].astype(str)

categories_1 = dict(categories[['itemid','category_name_1_level_en']].values.tolist())
categories_2 = dict(categories[['itemid','category_name_2_level_en']].values.tolist())

del categories
gc.collect()

def get_prediction_and_score(x):
    if len(x)==2:
        prediction = x[0]
        score = x[1]
    else:
        prediction = x
        score = 0.0
    return prediction, score

def transform(df):
    exploded = df.explode('ovr_pred')
    exploded = exploded.reset_index()
    #exploded = exploded.drop(['index'], axis=1)
    #exploded.drop(['to_cart','view'], axis=1, inplace=True)
    pred_score = exploded['ovr_pred'].apply(get_prediction_and_score)
    
    exploded['prediction'] = [x[0] for x in pred_score]
    exploded['score'] = [x[1] for x in pred_score]
    exploded['target'] = exploded[['orders','prediction']].apply(lambda x: 1 if x[1] in x[0] else 0, axis=1)
    exploded['mean_score_for_prediction'] = exploded.groupby(['prediction'])['score'].transform('mean')
    
    exploded['num_of_cart'] = exploded['prediction'].map(num_of_carts)
    exploded['num_of_views'] = exploded['prediction'].map(num_of_views)
    exploded['num_of_cart'].fillna(0, inplace=True)
    exploded['num_of_views'].fillna(0, inplace=True)
    
    
    exploded['category_1'] = exploded['prediction'].map(categories_1)
    exploded['category_2'] = exploded['prediction'].map(categories_2)
    exploded['category_1'].fillna("None", inplace=True)
    exploded['category_2'].fillna("None", inplace=True)
    

    exploded['mean_ovr'].fillna(0, inplace=True)
    exploded['avg_w2vec'].fillna(0, inplace=True)
    exploded['avg_cosine'].fillna(0, inplace=True)
    
    #Были ли предикты для этой сесии от косинусной модели или от в2век
    exploded['is_popular'] = exploded['len_of_pred'] > 0
    
    exploded.drop(['ovr_pred','cos_pred','w2vec_pred','top_pred','orders'], axis=1, inplace=True)
    #exploded.fillna(0, inplace=True)
    return exploded


size = df.shape[0]
exp = transform(df)
exp.to_pickle("data/exploded_df.pkl")
del df
gc.collect()
exp.head()

Unnamed: 0,index,view,to_cart,avg_w2vec,avg_cosine,sum_w2vec,sum_cosine,len_of_w2vec,len_of_cos,len_of_pred,...,intersection_cosine,prediction,score,target,mean_score_for_prediction,num_of_cart,num_of_views,category_1,category_2,is_popular
0,0,[],"[29288920, 29288922, 19063915, 29288924, 16026...",0.741602,0.139342,37.08008,6.688423,50,48,98,...,0,152693522,0.803079,0,0.627441,3.0,2.0,Bytovaja himija,Sredstva dlja stirki,True
1,0,[],"[29288920, 29288922, 19063915, 29288924, 16026...",0.741602,0.139342,37.08008,6.688423,50,48,98,...,0,145642383,0.789692,0,0.66834,2.0,1.0,Bytovaja himija,Sredstva dlja stirki,True
2,0,[],"[29288920, 29288922, 19063915, 29288924, 16026...",0.741602,0.139342,37.08008,6.688423,50,48,98,...,0,165059968,0.776835,0,0.711385,4.0,3.0,Bytovaja himija,Sredstva dlja stirki,True
3,0,[],"[29288920, 29288922, 19063915, 29288924, 16026...",0.741602,0.139342,37.08008,6.688423,50,48,98,...,0,182754424,0.775425,0,0.726659,3.0,7.0,"Odezhda, obuv' i aksessuary",Uhod za odezhdoj i obuv'ju,True
4,0,[],"[29288920, 29288922, 19063915, 29288924, 16026...",0.741602,0.139342,37.08008,6.688423,50,48,98,...,0,185694412,0.772421,0,0.766464,6.0,1.0,,,True


In [12]:
cd data

/home/huvi/Documents/Python/ozon/prod2vec_all_in_one/data


In [15]:
ls

categories.pkl    num_of_views.pkl  session_item_cart_add.npz
df_features.pkl   old2new_dict.pkl  session_item_view.npz
df_scores.pkl     PAB_cart.npz      session_order.npz
exploded_df.pkl   PAB_view.npz      similarities_cart_add.npz
item_mapping.csv  PB_cart.npy       similarities_view.npz
new2old_dict.pkl  PB_view.npy       top50.txt
num_of_carts.pkl  processed.json


In [16]:
import scipy.sparse as sp
with open('old2new_dict.pkl', 'rb') as handle:
    old2new_dict = pickle.load(handle)
    
sim_view = sp.load_npz("similarities_view.npz")
sim_cart = sp.load_npz("similarities_cart_add.npz")

#Proba features martices and vecs
PAB_view = sp.load_npz("PAB_view.npz")
PAB_cart = sp.load_npz("PAB_cart.npz")

with open("PB_view.npy", 'rb') as f:
    PB_view = np.load(f).T
    
with open("PB_cart.npy", 'rb') as f:
    PB_cart = np.load(f).T

In [17]:
exp.shape

(5670112, 23)

In [6]:
#вероятность что товар А купили если В добавили в корзину.

In [18]:
%%time
order_view_sim = np.zeros(exp.shape[0])
order_cart_sim = np.zeros(exp.shape[0])
order_cart_sim_max = np.zeros(exp.shape[0])
order_view_sim_max = np.zeros(exp.shape[0])

P_view = np.zeros(exp.shape[0])
P_cart = np.zeros(exp.shape[0])

for i, row in tqdm(exp[['view', 'to_cart', 'prediction']].iterrows()):

    view = row[0]
    cart = row[1]
    prediction = row[2]
    new_ind = old2new_dict[prediction]
    #new_ind_prob = old2new_dict_prob[prediction]
    
    ovr_view = 0
    ovr_cart = 0
    max_view = 0
    max_cart = 0
    
    prob_view = 0
    prob_cart = 0
    
    
    for item in view:
        prob_view += PAB_view[new_ind, old2new_dict[item]] / PB_view[old2new_dict[item]]
        sim_view_ = sim_view[new_ind, old2new_dict[item]]
        ovr_view += sim_view_
        max_view = max(max_view, sim_view_)
    P_view[i] = prob_view 
    order_view_sim[i] = ovr_view
    order_view_sim_max[i] = max_view

    for item in cart:
        prob_cart += PAB_cart[new_ind, old2new_dict[item]] / PB_cart[old2new_dict[item]]
        sim_cart_ = sim_cart[new_ind, old2new_dict[item]]
        ovr_cart += sim_cart_
        max_cart = max(max_cart, sim_cart_)
    P_cart[i] = prob_cart
    order_cart_sim[i] = ovr_cart
    order_cart_sim_max[i] = max_cart

del sim_view, sim_cart, old2new_dict
gc.collect()


exp['order_view_sim'] = order_view_sim
exp['order_cart_sim'] = order_cart_sim
exp['order_view_sim_max'] = order_view_sim_max
exp['order_cart_sim_max'] = order_cart_sim_max
exp['p_view'] = P_view
exp['p_cart'] = P_cart

5670112it [1:04:37, 1462.48it/s]


CPU times: user 1h 4min 39s, sys: 23.3 s, total: 1h 5min 2s
Wall time: 1h 4min 37s


In [10]:
with open('order_view_sim.pkl', 'wb') as f:
    pickle.dump(order_view_sim, f)
with open('order_cart_sim.pkl', 'wb') as f:
    pickle.dump(order_cart_sim, f)

with open('order_view_sim_max.pkl', 'wb') as f:
    pickle.dump(order_view_sim_max, f)
with open('order_cart_sim_max.pkl', 'wb') as f:
    pickle.dump(order_cart_sim_max, f)

with open('p_view.pkl', 'wb') as f:
    pickle.dump(P_view, f)

with open('p_cart.pkl', 'wb') as f:
    pickle.dump(P_cart, f)

In [4]:
# with open('order_view_sim.pkl', 'rb') as f:
#     order_view_sim = pickle.load(f)
# with open('order_cart_sim.pkl', 'rb') as f:
#     order_cart_sim = pickle.load(f)

# with open('order_view_sim_max.pkl', 'rb') as f:
#     order_view_sim_max = pickle.load(f)
# with open('order_cart_sim_max.pkl', 'rb') as f:
#     order_cart_sim_max = pickle.load(f)

# with open('p_view.pkl', 'rb') as f:
#     P_view = pickle.load(f)

# with open('p_cart.pkl', 'rb') as f:
#     P_cart = pickle.load(f)

In [20]:
# exp['order_view_sim'] = order_view_sim
# exp['order_cart_sim'] = order_cart_sim
# exp['order_view_sim_max'] = order_view_sim_max
# exp['order_cart_sim_max'] = order_cart_sim_max
# exp['P_view'] = P_view
# exp['P_cart'] = P_cart

del order_view_sim, order_cart_sim, order_view_sim_max, order_cart_sim_max
del P_view, P_cart
gc.collect()

40

In [11]:
# exp.to_csv("tmp.csv")

# exp = pd.read_csv("tmp.csv")

In [19]:
exp.drop(["view", "to_cart"], axis=1, inplace=True)

In [21]:
size = 400000
indices = np.arange(size)
np.random.shuffle(indices)

train_ind = indices[:int(0.8 * size)]
test_ind = indices[int(0.8 * size):]

exp_train = exp.loc[exp['index'].isin(train_ind)]
exp_test = exp.loc[exp['index'].isin(test_ind)]

In [22]:
print(exp_train.shape, exp_test.shape)

(4542924, 27) (1127188, 27)


In [23]:
exp.shape

(5670112, 27)

In [24]:
#expanding mean
global_mean = exp_train['target'].mean()

cat1_target_mean = exp_train.groupby('category_1')['target'].mean()
cumsum = exp_train.groupby('category_1')['target'].cumsum() - exp_train['target']
cumcnt = exp_train.groupby('category_1').cumcount()
exp_train['cat1_encoded_feature'] = cumsum / cumcnt
exp_train['cat1_encoded_feature'].fillna(global_mean, inplace = True)

exp_test['cat1_encoded_feature'] = exp_test['category_1'].map(cat1_target_mean)
exp_test['cat1_encoded_feature'].fillna(global_mean, inplace = True)


cat2_target_mean = exp_train.groupby('category_2')['target'].mean()
cumsum = exp_train.groupby('category_2')['target'].cumsum() - exp_train['target']
cumcnt = exp_train.groupby('category_2').cumcount()
exp_train['cat2_encoded_feature'] = cumsum / cumcnt
exp_train['cat2_encoded_feature'].fillna(global_mean, inplace = True)
exp_train.drop(['category_2'], axis=1, inplace=True)

exp_test['cat2_encoded_feature'] = exp_test['category_2'].map(cat2_target_mean)
exp_test['cat2_encoded_feature'].fillna(global_mean, inplace = True)
exp_test.drop(['category_2'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exp_train['cat1_encoded_feature'] = cumsum / cumcnt
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exp_test['cat1_encoded_feature'] = exp_test['category_1'].map(cat1_target_mean)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] 

In [17]:
exp_train.to_pickle("data/exp_train.pkl")
exp_test.to_pickle("data/exp_test.pkl")

In [1]:
# import pandas as pd
# exp_train = pd.read_pickle("data/exp_train.pkl")
# exp_test = pd.read_pickle("data/exp_test.pkl")

In [25]:
exp = pd.concat([exp_train, exp_test])
exp = pd.get_dummies(exp, columns=['category_1'], drop_first=True, dtype=bool)
exp.shape
exp.head()

Unnamed: 0,index,avg_w2vec,avg_cosine,sum_w2vec,sum_cosine,len_of_w2vec,len_of_cos,len_of_pred,mean_ovr,sum_ovr,...,category_1_OZON Express,"category_1_Odezhda, obuv' i aksessuary",category_1_Produkty pitanija,category_1_Sportivnye tovary,category_1_Stroitel'stvo i remont,category_1_Tovary dlja vzroslyh,category_1_Tovary dlja zhivotnyh,category_1_Tsifrovye tovary,"category_1_Turizm, rybalka, ohota",category_1_Vse dlja igr
298,2,0.760917,0.399454,38.045855,19.972705,50,50,100,0.580186,58.01856,...,False,True,False,False,False,False,False,False,False,False
299,2,0.760917,0.399454,38.045855,19.972705,50,50,100,0.580186,58.01856,...,False,False,False,False,False,False,False,False,False,False
300,2,0.760917,0.399454,38.045855,19.972705,50,50,100,0.580186,58.01856,...,False,False,False,False,False,False,False,False,False,False
301,2,0.760917,0.399454,38.045855,19.972705,50,50,100,0.580186,58.01856,...,False,True,False,False,False,False,False,False,False,False
302,2,0.760917,0.399454,38.045855,19.972705,50,50,100,0.580186,58.01856,...,False,True,False,False,False,False,False,False,False,False


In [19]:
import numpy as np
train_ind = np.unique(exp_train['index'])
test_ind = np.unique(exp_test['index'])

In [20]:
import gc
del exp_train, exp_test
gc.collect()

80

In [26]:
exp_train = exp.loc[exp['index'].isin(train_ind)]
exp_test = exp.loc[exp['index'].isin(test_ind)]

In [27]:
print(exp_train.shape, exp_test.shape)

(4542924, 53) (1127188, 53)


In [23]:
exp_train.head()

Unnamed: 0,index,avg_w2vec,avg_cosine,sum_w2vec,sum_cosine,len_of_w2vec,len_of_cos,len_of_pred,mean_ovr,sum_ovr,...,category_1_OZON Express,"category_1_Odezhda, obuv' i aksessuary",category_1_Produkty pitanija,category_1_Sportivnye tovary,category_1_Stroitel'stvo i remont,category_1_Tovary dlja vzroslyh,category_1_Tovary dlja zhivotnyh,category_1_Tsifrovye tovary,"category_1_Turizm, rybalka, ohota",category_1_Vse dlja igr
0,0,0.0,0.24048,0.0,1.923842,0,8,8,0.24048,1.923842,...,False,False,False,False,False,False,False,False,False,False
1,0,0.0,0.24048,0.0,1.923842,0,8,8,0.24048,1.923842,...,False,False,False,False,False,False,False,False,False,False
2,0,0.0,0.24048,0.0,1.923842,0,8,8,0.24048,1.923842,...,False,False,False,False,False,False,False,False,False,False
3,0,0.0,0.24048,0.0,1.923842,0,8,8,0.24048,1.923842,...,False,False,False,False,False,False,False,False,False,False
4,0,0.0,0.24048,0.0,1.923842,0,8,8,0.24048,1.923842,...,False,False,False,False,False,False,False,False,False,False


In [29]:
exp_train.to_pickle("exp_train.pkl")
exp_test.to_pickle("exp_test.pkl")

In [32]:
np.corrcoef(exp_train['target'], exp_train['order_cart_sim'])

array([[1.       , 0.1660119],
       [0.1660119, 1.       ]])