In [1]:
from pandarallel import pandarallel
# Start pandarallel with 8 worker processes and a progress bar.
# initialize() is what provides DataFrame.parallel_apply, which get_predictions() relies on.
pandarallel.initialize(8, progress_bar=True)

import pandas as pd
import numpy as np
import gc
from tqdm import tqdm
from pys.models import CombinedProd2Vec, cosine_model, top
from typing import Tuple, List
import pickle
import random
import time

"""
processed.json - id, view_list, to_cart_list, order_list

"""

# Number of rows sampled (without replacement) by gen_sliced_dataset().
SIZE_OF_DATASET = 40000
# Passed as the second argument to both models' prediction calls
# (presumably the number of recommendations per session — TODO confirm in pys.models).
N = 50
# Passed as the third argument to both models' prediction calls
# (presumably the score aggregation mode, 'mean' or 'sum' — TODO confirm in pys.models).
HOW = 'mean'
# Seed both RNGs; np.random drives the row sampling in gen_sliced_dataset().
random.seed(15)
np.random.seed(15)

def gen_sliced_dataset()-> pd.DataFrame:
    """Load ``data/processed.json`` and return a random sample of its rows.

    ``SIZE_OF_DATASET`` distinct row positions are drawn with
    ``np.random.choice`` (so the result depends on the NumPy global seed).
    The sample is returned as an independent copy that keeps the original
    row labels.

    Returns:
        pd.DataFrame: the sampled rows.
    """
    full = pd.read_json('data/processed.json')
    picked_rows = np.random.choice(full.shape[0], size=SIZE_OF_DATASET, replace=False)
    sample = full.iloc[picked_rows].copy()

    # Drop the only reference to the full frame and collect right away so the
    # large source table does not stay resident alongside the sample.
    del full
    gc.collect()
    return sample


def get_bin(x):
    """Return 1 if the prediction is among the orders, else 0.

    Args:
        x: an indexable pair where ``x[0]`` is the container of ordered
            items and ``x[1]`` is the single predicted item.

    Returns:
        int: ``1`` on a hit, ``0`` otherwise.
    """
    ordered_items, predicted_item = x[0], x[1]
    hit = predicted_item in ordered_items
    return int(hit)


def get_models()->Tuple[List[str], CombinedProd2Vec, cosine_model]:
    """Construct the three recommenders used in this notebook.

    Returns:
        A 3-tuple ``(top50, p2vec, cos_model)``: the object returned by
        ``top()``, a ``CombinedProd2Vec`` built from the view and cart weight
        files, and a ``cosine_model`` built from the ``data`` directory.
    """
    # Popularity baseline ("top 50 from popular").
    # NOTE(review): the first element is annotated List[str], but
    # get_predictions() calls .get_prediction() on it — confirm the real type.
    popular = top()

    # Prod2Vec model combining the view and cart embeddings.
    weight_files = ['w2vec/weights/view.model', 'w2vec/weights/cart.model']
    prod2vec = CombinedProd2Vec(weight_files)

    # Cosine-similarity model.
    cosine = cosine_model('data')

    return popular, prod2vec, cosine


def paralled_prediction(row):
    """Run both session-based models on one row.

    Args:
        row: a two-element record whose first element is the list of viewed
            items and whose second is the list of items added to cart. In
            this notebook it is a pandas Series produced by
            ``df[['view', 'cart']].parallel_apply(..., axis=1)``; plain
            sequences are accepted too.

    Returns:
        pd.Series: two elements, the Prod2Vec prediction followed by the
        cosine-model prediction.
    """
    # Read positionally through .iloc when available: integer keys on a
    # label-indexed Series rely on a deprecated positional fallback.
    cells = row.iloc if hasattr(row, 'iloc') else row
    session = {'views': cells[0],
               'to_cart': cells[1]}
    # p2vec, cos_model, N and HOW are module-level globals set by the driver cell.
    p2vec_pred = p2vec.get_prediction_for_session(session, N, HOW)
    cos_pred = cos_model._predict(session, N, HOW)
    return pd.Series([p2vec_pred, cos_pred])
   

def get_predictions(df)->pd.DataFrame:
    """Add the three models' predictions to ``df`` and return it.

    Adds ``w2vec_pred`` and ``cos_pred`` (computed per row in parallel by
    ``paralled_prediction``) and ``top_pred`` (the same popularity prediction
    for every row). Prints the elapsed prediction time in seconds.

    Args:
        df: frame with ``view`` and ``cart`` columns. It is modified in place.

    Returns:
        pd.DataFrame: the same ``df`` object, with the three columns added.
    """
    print("Predicting...")
    tic = time.time()
    df[['w2vec_pred', 'cos_pred']] = df[['view','cart']].parallel_apply(paralled_prediction, axis=1)
    print(time.time() - tic)
    top50_prediction = top50.get_prediction()
    # Size the column from the frame itself rather than SIZE_OF_DATASET, so the
    # function also works on frames of any other length.
    df['top_pred'] = [top50_prediction] * len(df)
    return df


if __name__ == '__main__':
    # Sample SIZE_OF_DATASET rows from the processed sessions.
    print("Creating Dataset")
    df = gen_sliced_dataset()
    # paralled_prediction() and get_predictions() read top50, p2vec and
    # cos_model as globals, so these names must be bound at module level.
    print("Creating Model")
    top50, p2vec, cos_model = get_models()
    # Adds the w2vec_pred, cos_pred and top_pred columns to df.
    df = get_predictions(df)


INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
Creating Dataset
Creating Model
Predicting...


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=5000), Label(value='0 / 5000'))), …

2198.2027702331543


In [2]:
# Preview the sampled sessions together with the prediction columns added by get_predictions().
df.head()

Unnamed: 0,view,cart,order,w2vec_pred,cos_pred,top_pred
6738,"[54712, 54714, 54719, 54720, 1892, 54723, 5472...","[54710, 54711, 54713, 54715, 7521, 54716, 5471...","[54715, 54718, 54711, 54726, 54724, 54716, 547...","[(699536, 0.9678006172180176), (1178128, 0.965...","[(349038, 0.5773502691896258), (936262, 0.5), ...","[54, 292, 13701, 1000, 65, 9761, 258, 2397, 52..."
832109,[547459],[1813284],[1813284],"[(827982, 0.8512779474258423), (369892, 0.8434...","[(547461, 0.35355339059327373), (547460, 0.353...","[54, 292, 13701, 1000, 65, 9761, 258, 2397, 52..."
450005,"[256659, 159628]",[],[461866],"[(256653, 0.8684725761413574), (256663, 0.8125...","[(256653, 0.3889222341312986), (816691, 0.3563...","[54, 292, 13701, 1000, 65, 9761, 258, 2397, 52..."
579758,"[21910, 5478]","[346854, 14767, 3080, 43047, 5477]","[5477, 3080, 14767, 346854, 43047]","[(49398, 0.9022839069366455), (49365, 0.895142...","[(694721, 0.7071067811865475), (694717, 0.4999...","[54, 292, 13701, 1000, 65, 9761, 258, 2397, 52..."
66898,"[370917, 370918]",[],[370918],[],"[(846754, 0.7071067811865475), (1377304, 0.707...","[54, 292, 13701, 1000, 65, 9761, 258, 2397, 52..."


In [3]:
# NOTE(review): identical to the previous cell — candidate for removal.
df.head()

Unnamed: 0,view,cart,order,w2vec_pred,cos_pred,top_pred
6738,"[54712, 54714, 54719, 54720, 1892, 54723, 5472...","[54710, 54711, 54713, 54715, 7521, 54716, 5471...","[54715, 54718, 54711, 54726, 54724, 54716, 547...","[(699536, 0.9678006172180176), (1178128, 0.965...","[(349038, 0.5773502691896258), (936262, 0.5), ...","[54, 292, 13701, 1000, 65, 9761, 258, 2397, 52..."
832109,[547459],[1813284],[1813284],"[(827982, 0.8512779474258423), (369892, 0.8434...","[(547461, 0.35355339059327373), (547460, 0.353...","[54, 292, 13701, 1000, 65, 9761, 258, 2397, 52..."
450005,"[256659, 159628]",[],[461866],"[(256653, 0.8684725761413574), (256663, 0.8125...","[(256653, 0.3889222341312986), (816691, 0.3563...","[54, 292, 13701, 1000, 65, 9761, 258, 2397, 52..."
579758,"[21910, 5478]","[346854, 14767, 3080, 43047, 5477]","[5477, 3080, 14767, 346854, 43047]","[(49398, 0.9022839069366455), (49365, 0.895142...","[(694721, 0.7071067811865475), (694717, 0.4999...","[54, 292, 13701, 1000, 65, 9761, 258, 2397, 52..."
66898,"[370917, 370918]",[],[370918],[],"[(846754, 0.7071067811865475), (1377304, 0.707...","[54, 292, 13701, 1000, 65, 9761, 258, 2397, 52..."


In [4]:
# Replace the sampled row labels with a fresh 0..n-1 index. drop=True discards
# the old labels directly instead of materialising them as an 'index' column
# and dropping it in place, and it also works when the index has a name.
df = df.reset_index(drop=True)

In [10]:
# Re-check the frame after the index reset: row labels should now start at 0.
df.head()

Unnamed: 0,view,cart,order,w2vec_pred,cos_pred,top_pred
0,"[54712, 54714, 54719, 54720, 1892, 54723, 5472...","[54710, 54711, 54713, 54715, 7521, 54716, 5471...","[54715, 54718, 54711, 54726, 54724, 54716, 547...","[(699536, 0.9678006172180176), (1178128, 0.965...","[(349038, 0.5773502691896258), (936262, 0.5), ...","[54, 292, 13701, 1000, 65, 9761, 258, 2397, 52..."
1,[547459],[1813284],[1813284],"[(827982, 0.8512779474258423), (369892, 0.8434...","[(547461, 0.35355339059327373), (547460, 0.353...","[54, 292, 13701, 1000, 65, 9761, 258, 2397, 52..."
2,"[256659, 159628]",[],[461866],"[(256653, 0.8684725761413574), (256663, 0.8125...","[(256653, 0.3889222341312986), (816691, 0.3563...","[54, 292, 13701, 1000, 65, 9761, 258, 2397, 52..."
3,"[21910, 5478]","[346854, 14767, 3080, 43047, 5477]","[5477, 3080, 14767, 346854, 43047]","[(49398, 0.9022839069366455), (49365, 0.895142...","[(694721, 0.7071067811865475), (694717, 0.4999...","[54, 292, 13701, 1000, 65, 9761, 258, 2397, 52..."
4,"[370917, 370918]",[],[370918],[],"[(846754, 0.7071067811865475), (1377304, 0.707...","[54, 292, 13701, 1000, 65, 9761, 258, 2397, 52..."


In [5]:
# Persist the frame with predictions to disk (the prediction cell took ~2198 s;
# presumably later steps reload this file instead of recomputing — TODO confirm).
df.to_pickle("data/df_scores.pkl")

In [8]:
import scipy.sparse as sp
def mean(x):
    """Return the arithmetic mean of the values in ``x``.

    ``x`` must be a sized iterable of numbers; an empty ``x`` raises
    ``ZeroDivisionError``.
    """
    total = sum(x)
    count = len(x)
    return total / count


def get_scores(scores, how: str = 'mean'):
    """Aggregate repeated ``(item, score)`` pairs into one score per item.

    Args:
        scores: iterable of two-element ``(item, score)`` records. The same
            item may appear several times.
        how: ``'sum'`` adds an item's scores together; any other value
            (default ``'mean'``) averages them.

    Returns:
        list[tuple]: ``(item, aggregated_score)`` tuples sorted by score in
        descending order. Items with equal scores keep the order in which
        they first appeared in ``scores``.
    """
    grouped = dict()
    for item, score in scores:
        # setdefault replaces the former bare ``except:``, which hid every
        # error type behind the "first time this item is seen" path.
        grouped.setdefault(item, []).append(score)
    reducer = sum if how == 'sum' else mean
    aggregated = {item: reducer(item_scores) for item, item_scores in grouped.items()}
    return sorted(aggregated.items(), key=lambda pair: pair[1], reverse=True)



# Directory holding the id-mapping pickles and the sparse similarity matrices.
path = 'data'

# old2new maps an original item id to its index in the similarity matrices and
# new2old maps it back (this is how predict() uses them).
# NOTE(review): pickle.load executes arbitrary code from the file; only load
# these from a trusted location.
with open(path+'/old2new_dict.pkl', 'rb') as handle:
    old2new = pickle.load(handle)
with open(path+'/new2old_dict.pkl', 'rb') as handle:
    new2old = pickle.load(handle)
# Convert to CSC because predict() slices one column per item.
similarities_view = sp.load_npz(path+'/similarities_view.npz')
similarities_view = similarities_view.tocsc()

similarities_cart_add = sp.load_npz(path+'/similarities_cart_add.npz')
similarities_cart_add = similarities_cart_add.tocsc()



In [65]:
def _predict(session, topn, how='mean'):
    
    



def _top_neighbours(similarities, item_ids, topn):
    """Collect the strongest neighbours of each item in ``item_ids``.

    For every id, its column of ``similarities`` is looked up through
    ``old2new`` and the ``topn + 1`` largest stored values are kept (all of
    them when the column stores fewer). Presumably the extra slot leaves room
    for the item itself — TODO confirm.

    Returns:
        list: flat list of ``[original_item_id, similarity]`` pairs.
    """
    pairs = []
    for old_id in item_ids:
        col = similarities[:, old2new[old_id]]
        try:
            # Positions of the topn + 1 largest stored values (in no particular order).
            keep = np.argpartition(col.data, kth=-topn-1, axis=0)[-topn-1:]
        except ValueError:
            # kth is out of bounds: the column stores fewer than topn + 1 values, keep all.
            indices, values = col.indices, col.data
        else:
            indices, values = col.indices[keep], col.data[keep]
        pairs.extend([new2old[ind], val] for ind, val in zip(indices, values))
    return pairs


def predict(session, topn, how='mean'):
    """Recommend up to ``topn`` items for a session from item-item similarities.

    Args:
        session: dict with ``'views'`` and ``'to_cart'`` lists of original
            item ids. Every id must be a key of ``old2new`` (``KeyError``
            otherwise).
        topn: maximum number of recommendations to return.
        how: aggregation passed to ``get_scores`` (``'mean'`` or ``'sum'``).

    Returns:
        list[tuple]: at most ``topn`` ``(item_id, score)`` tuples, best first.
    """
    from_views = _top_neighbours(similarities_view, session['views'], topn)
    from_cart = _top_neighbours(similarities_cart_add, session['to_cart'], topn)
    # Cart candidates first, then view candidates (same order as before).
    candidates = from_cart + from_views
    # Sort by score. get_scores groups in encounter order, so this pre-sort
    # decides both the accumulation order and how ties are ordered.
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    return get_scores(candidates, how)[:topn]



In [69]:
# Take the second row of the frame as the input session for the timing cells below.
row = df.iloc[1]

In [71]:
# Select by column label: integer keys on a label-indexed Series rely on a
# deprecated positional fallback. 'view' and 'cart' are the first two columns.
sess = {'views':row['view'],
        'to_cart':row['cart']}

In [79]:
%%time
# Benchmark: 100 calls of predict() on the single session above with topn=50.
for i in range(100):
    predict(sess, 50)

CPU times: user 436 ms, sys: 3.45 ms, total: 440 ms
Wall time: 438 ms


In [78]:
%%time
# Same benchmark for _predict().
# NOTE(review): the definition of _predict in this notebook has no body, so
# this timing cannot be reproduced from the notebook as saved.
for i in range(100):
    _predict(sess, 50)

CPU times: user 334 ms, sys: 16 µs, total: 334 ms
Wall time: 332 ms
