In [1]:
import pickle

import json
import numpy as np
import pandas as pd
import scipy.sparse as sp
import scipy
import gc

with open('raw_data/sessions.json', 'r') as f:
    sessions = json.load(f)

session_ids = []
items = []
action_types = []
for i in sessions:
    session = sessions[str(i)]['session']
    for item_id, action_type in session.items():
        session_ids.append(i)
        items.append(item_id)
        action_types.append(action_type)
    for item_id in sessions[str(i)]['order']:
        session_ids.append(i)
        items.append(item_id)
        action_types.append('order')
        
df = pd.DataFrame({'session_id': session_ids, 'item_id': items, 'action_type': action_types})
del session_ids, items, action_types, sessions
df.head()

Unnamed: 0,session_id,item_id,action_type
0,0,184878281,view
1,0,167969574,view
2,0,166174833,order
3,1,179765848,view
4,1,193869023,view


In [2]:
uniq_items = df['item_id'].unique()
item_mapping = pd.DataFrame({'old': uniq_items, 'new': np.arange(uniq_items.shape[0])})

df['item_id'] = df['item_id'].map(item_mapping.set_index('old').new)
df['session_id'] = df['session_id'].astype('uint32')
df.head()

item_mapping['old'] = item_mapping['old'].astype('str')
item_mapping['new'] = item_mapping['new'].astype('int')
item_mapping.index.name = 'index'
#item_mapping.to_csv('item_mapping.csv', index=False)

shape = (df['session_id'].max() + 1, df['item_id'].max() + 1)

# из df возьмем только view
condition = (df['action_type'] == 'view')
session_item_view = sp.csr_matrix(
    (np.ones(np.sum(condition)), (df.loc[condition, 'session_id'], df.loc[condition, 'item_id'])),
    shape=shape
)

condition = (df['action_type'] == 'to_cart')
session_item_cart_add = sp.csr_matrix(
    (np.ones(np.sum(condition)), (df.loc[condition, 'session_id'], df.loc[condition, 'item_id'])),
    shape=shape
)

condition = (df['action_type'] == 'order')
session_order = sp.csr_matrix(
    (np.ones(np.sum(condition)), (df.loc[condition, 'session_id'], df.loc[condition, 'item_id'])),
    shape=shape
)


In [3]:
from collections import Counter

for act in ['view', 'order', 'to_cart']:
    un = df[df.action_type==act]['item_id'].values
    cntr = Counter(un)
    with open('data/num_of_'+act+'.pkl', 'wb') as handle:
        pickle.dump(cntr, handle)

In [4]:
vals = df[df.action_type=='order']['item_id'].values

cntr = Counter(vals)

top50 = [i[0] for i in cntr.most_common(60)][10:]

In [5]:
with open('data/top50.pkl', 'wb') as handle:
    pickle.dump(top50, handle)

In [6]:
processed_df = pd.DataFrame()
view_df = df[df['action_type']=='view'].groupby('session_id')['item_id'].agg(list)
cart_df = df[df['action_type']=='to_cart'].groupby('session_id')['item_id'].agg(list)
order_df = df[df['action_type']=='order'].groupby('session_id')['item_id'].agg(list)

processed_df['view'] = view_df
processed_df['cart'] = cart_df
processed_df['order'] = order_df

view_df.map(lambda x: [str(i) for i in x]).to_pickle("w2vec/train_df/view_df.pkl")
del view_df


cart_df.map(lambda x: [str(i) for i in x]).to_pickle("w2vec/train_df/cart_df.pkl")
del cart_df
del order_df

In [12]:
processed_df['cart'] = processed_df['cart'].fillna({i: [] for i in processed_df.index})
processed_df['view'] = processed_df['view'].fillna({i: [] for i in processed_df.index})
processed_df['order'] = processed_df['order'].fillna({i: [] for i in processed_df.index})

In [14]:
processed_df.to_json('data/processed.json')
del processed_df

In [7]:
scipy.sparse.save_npz('data/session_order.npz', session_order)
del session_order
scipy.sparse.save_npz('data/session_item_cart_add.npz', session_item_cart_add)
scipy.sparse.save_npz('data/session_item_view.npz', session_item_view)

In [8]:
item_mapping.head()

Unnamed: 0_level_0,old,new
index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,184878281,0
1,167969574,1
2,166174833,2
3,179765848,3
4,193869023,4


In [9]:
item_mapping = dict(zip(item_mapping['new'].values, item_mapping['old'].values))
reverse_mapping = dict(zip( item_mapping.values(), item_mapping.keys()))

del df
gc.collect()

80

In [10]:
from sklearn.metrics.pairwise import cosine_similarity

similarities_view = cosine_similarity(session_item_view.transpose(), dense_output=False)
similarities_cart_add = cosine_similarity(session_item_cart_add.transpose(), dense_output=False)

In [11]:
similarities_cart_add.setdiag(0)
sp.save_npz("similarities_cart_add.npz", similarities_cart_add)
similarities_view.setdiag(0)
sp.save_npz("similarities_view.npz", similarities_view)

  self._set_arrayXarray(i, j, x)


In [12]:
def get_top_k(item_id: int, k: int):
    new_item_id = reverse_mapping[item_id]
    flat_view = similarities_view[:, new_item_id].toarray()
    flat_cart = similarities_cart_add[:, new_item_id].toarray()
    top_views = np.argpartition(flat_view, kth=np.arange(-k,0), axis=0)[-k-1:-1]
    top_cart = np.argpartition(flat_cart, kth=np.arange(-k,0), axis=0)[-k-1:-1]
    a = []
    b = []
    for ind in top_views:
        a.append([item_mapping[ind[0]], similarities_view[ind[0],new_item_id]])
    for ind in top_cart:
        b.append([item_mapping[ind[0]], similarities_cart_add[ind[0],new_item_id]])
    return a, b

product_id = '169944532'
num_of_recommendations = 10

based_on_views, based_on_cart_adds = get_top_k(product_id, num_of_recommendations)

def fancy_print(inp, product_id):
    print(f"На товар https://www.ozon.ru/context/detail/id/{product_id} похоже:")
    print()
    for prod_id, sim in reversed(inp):
        print(f"https://www.ozon.ru/context/detail/id/{prod_id}", sim)

fancy_print(based_on_views, product_id)

На товар https://www.ozon.ru/context/detail/id/169944532 похоже:

https://www.ozon.ru/context/detail/id/169944533 0.1721989151177004
https://www.ozon.ru/context/detail/id/193371968 0.15811388300841897
https://www.ozon.ru/context/detail/id/178971727 0.15811388300841897
https://www.ozon.ru/context/detail/id/197898963 0.15811388300841897
https://www.ozon.ru/context/detail/id/155313449 0.15811388300841897
https://www.ozon.ru/context/detail/id/148781474 0.15811388300841897
https://www.ozon.ru/context/detail/id/193893809 0.15811388300841897
https://www.ozon.ru/context/detail/id/177719900 0.15811388300841897
https://www.ozon.ru/context/detail/id/193893799 0.15811388300841897
https://www.ozon.ru/context/detail/id/183198135 0.15811388300841897


In [13]:
fancy_print(based_on_cart_adds, product_id)

На товар https://www.ozon.ru/context/detail/id/169944532 похоже:

https://www.ozon.ru/context/detail/id/165274038 0.35355339059327373
https://www.ozon.ru/context/detail/id/161724606 0.35355339059327373
https://www.ozon.ru/context/detail/id/154788558 0.35355339059327373
https://www.ozon.ru/context/detail/id/184292790 0.35355339059327373
https://www.ozon.ru/context/detail/id/180556158 0.35355339059327373
https://www.ozon.ru/context/detail/id/190516574 0.24999999999999994
https://www.ozon.ru/context/detail/id/160282937 0.24999999999999994
https://www.ozon.ru/context/detail/id/154503461 0.17677669529663687
https://www.ozon.ru/context/detail/id/157682045 0.17677669529663687
https://www.ozon.ru/context/detail/id/162088977 0.15811388300841894


In [23]:
cd data

/home/huvi/Documents/Python/ozon/prod2vec_all_in_one/data


In [24]:
import pickle

with open("item_mapping.pkl", "wb") as handle:
    pickle.dump(item_mapping, handle)
with open("reverse_mapping.pkl", "wb") as handle:
    pickle.dump(reverse_mapping, handle)

In [25]:
ls

categories.pkl    new2old_dict.pkl  processed.json
df_features.pkl   num_of_carts.pkl  reverse_mapping.pkl
df_scores.pkl     num_of_views.pkl  session_item_cart_add.npz
exploded_df.pkl   old2new_dict.pkl  session_item_view.npz
exp_test.pkl      PAB_cart.npz      session_order.npz
exp_train.pkl     PAB_view.npz      similarities_cart_add.npz
item_mapping.csv  PB_cart.npy       similarities_view.npz
item_mapping.pkl  PB_view.npy       top50.txt
