In [1]:
import pandas as pd
import numpy as np
import gc
from tqdm import tqdm
from pys.models import CombinedProd2Vec, cosine_model, top
from typing import Tuple, List
import pickle
import random

"""
processed.json - id, view_list, to_cart_list, order_list

"""

# Number of rows that gen_sliced_dataset() samples from the processed data.
SIZE_OF_DATASET = 40000
# Seed the stdlib and NumPy generators so the row sample drawn with
# np.random.choice is the same on every run.
random.seed(15)
np.random.seed(15)

def gen_sliced_dataset()-> pd.DataFrame:
    """Load ``data/processed.json`` and return a random sample of its rows.

    ``SIZE_OF_DATASET`` distinct row positions are drawn with
    ``np.random.choice`` (no replacement), so the result depends on the
    current NumPy global seed.

    Returns:
        An independent copy of the selected rows; the original row labels
        are kept as the index.
    """
    full_frame = pd.read_json('data/processed.json')
    picked_rows = np.random.choice(
        full_frame.shape[0],
        size=SIZE_OF_DATASET,
        replace=False,
    )
    sample = full_frame.iloc[picked_rows].copy()
    # Drop the reference to the full frame and force a collection before
    # returning, so only the sampled copy stays alive.
    del full_frame
    gc.collect()
    return sample


def get_bin(x):
    """Return 1 if the prediction ``x[1]`` is contained in ``x[0]``, else 0.

    ``x`` is indexed positionally: item 0 is the container of orders and
    item 1 is the single prediction to look up in it.
    """
    purchased, candidate = x[0], x[1]
    return int(candidate in purchased)


def get_models()->Tuple[top, CombinedProd2Vec, cosine_model]:
    """Construct the three recommenders used in this notebook.

    Returns:
        A tuple ``(top50, p2vec, cos_model)``:

        * ``top50`` - the object returned by ``top()`` (the "top 50 from
          popular" baseline); callers obtain its list via
          ``get_prediction()``.
        * ``p2vec`` - a ``CombinedProd2Vec`` built from the view and cart
          weight files under ``w2vec/weights``.
        * ``cos_model`` - a ``cosine_model`` built with the ``'data'`` path.

    NOTE(review): the first element is annotated as ``top`` on the
    assumption that ``top`` is a class, like ``cosine_model`` - confirm.
    It was previously annotated as ``List[str]``, which does not match the
    object returned here.
    """
    # Top 50 from popular
    top50 = top()

    # Prod2Vec model combining the view-based and cart-based weights
    weight_paths = ['w2vec/weights/view.model', 'w2vec/weights/cart.model']
    p2vec = CombinedProd2Vec(weight_paths)

    # Cosine similarity model
    cos_model = cosine_model('data')

    return top50, p2vec, cos_model


def get_predictions(df)->pd.DataFrame:
    """Add Prod2Vec and cosine-similarity predictions to every row of ``df``.

    Rows are read positionally: column 0 is passed as the session's
    ``'views'`` and column 1 as its ``'to_cart'``. Both models are asked for
    the top ``n = 50`` items using the ``'mean'`` aggregation.

    Args:
        df: frame whose first two columns hold the viewed and carted items
            of each session.

    Returns:
        The same ``df`` object, modified in place with two new columns:
        ``'w2vec_pred'`` and ``'cos_pred'``.
    """
    # get_models() also builds the popularity model, which is not needed here.
    _, p2vec, cos_model = get_models()
    print("Predicting...")
    # top n
    n = 50
    how = 'mean'
    predictions_w2vec = []
    predictions_cosine = []
    for row in tqdm(df.values):
        session = {'views': row[0],
                   'to_cart': row[1]}
        predictions_w2vec.append(p2vec.get_prediction_for_session(session, n, how))
        predictions_cosine.append(cos_model._predict(session, n, how))
    df['w2vec_pred'] = predictions_w2vec
    df['cos_pred'] = predictions_cosine
    # Report completion before returning; this line used to sit after the
    # return statement and never ran.
    print("Done")
    return df


# Driver: sample the dataset, then attach the model predictions to it.
# The recorded output of this cell shows that the guarded branch ran.
if __name__ == '__main__':
    print("Creating Dataset")
    df = gen_sliced_dataset()
    print("Creating Model")
    # Loads the models and adds the 'w2vec_pred' / 'cos_pred' columns to df.
    df = get_predictions(df)


Creating Dataset
Creating Model


  0%|          | 2/40000 [00:00<53:43, 12.41it/s]

Predicting...


100%|██████████| 40000/40000 [43:21<00:00, 15.37it/s]  


In [2]:
# Attach the popularity baseline to every row. The list is repeated once per
# row of df, so the column always matches the frame's length.
# Every row references the same list object, so treat it as read-only.
top50 = top()
top50_prediction = top50.get_prediction()
df['top_pred'] = [top50_prediction] * len(df)

In [3]:
# Preview the first rows, including the three prediction columns.
df.head()

Unnamed: 0,view,to_cart,orders,w2vec_pred,cos_pred,top_pred
778664,[],"[29288920, 29288922, 19063915, 29288924, 16026...","[167019538, 160267269, 29288920, 29288924, 292...","[(152693522, 0.8030790090560913), (145642383, ...","[(29288924, 0.22376684056929183), (29288922, 0...","[182435597, 149074294, 29575310, 19148624, 145..."
905767,"[148605184, 173643354, 162816873, 177309999, 1...","[177310075, 173643351, 180754650, 148605082]",[177310075],"[(166486203, 0.9552826881408691), (178470646, ...","[(180754650, 0.5883883476483185), (173643351, ...","[182435597, 149074294, 29575310, 19148624, 145..."
92534,"[181696450, 141699526, 177619893, 149457302]","[159539301, 26893674]","[182435597, 26893674, 159539301]","[(151775169, 0.8174545764923096), (150824042, ...","[(180496130, 0.7071067811865475), (185459365, ...","[182435597, 149074294, 29575310, 19148624, 145..."
573892,"[179051119, 160731129]","[181556226, 181377603, 149563436, 34426519, 14...","[161762638, 166106016]","[(181377603, 0.9985669255256653), (143906047, ...","[(177387653, 0.5773502691896258), (178039205, ...","[182435597, 149074294, 29575310, 19148624, 145..."
456682,"[165187883, 166483280, 147617151, 190199068, 1...",[],[149074294],"[(168369533, 0.6856241822242737), (140320980, ...","[(150339708, 0.7071067811865475), (183122687, ...","[182435597, 149074294, 29575310, 19148624, 145..."


In [4]:
# Replace the sampled row labels with a fresh 0..n-1 index and discard the old
# labels in one step, with no temporary 'index' column and no in-place drop.
df = df.reset_index(drop=True)

In [5]:
# Preview again to check that the row labels now start at 0.
df.head()

Unnamed: 0,view,to_cart,orders,w2vec_pred,cos_pred,top_pred
0,[],"[29288920, 29288922, 19063915, 29288924, 16026...","[167019538, 160267269, 29288920, 29288924, 292...","[(152693522, 0.8030790090560913), (145642383, ...","[(29288924, 0.22376684056929183), (29288922, 0...","[182435597, 149074294, 29575310, 19148624, 145..."
1,"[148605184, 173643354, 162816873, 177309999, 1...","[177310075, 173643351, 180754650, 148605082]",[177310075],"[(166486203, 0.9552826881408691), (178470646, ...","[(180754650, 0.5883883476483185), (173643351, ...","[182435597, 149074294, 29575310, 19148624, 145..."
2,"[181696450, 141699526, 177619893, 149457302]","[159539301, 26893674]","[182435597, 26893674, 159539301]","[(151775169, 0.8174545764923096), (150824042, ...","[(180496130, 0.7071067811865475), (185459365, ...","[182435597, 149074294, 29575310, 19148624, 145..."
3,"[179051119, 160731129]","[181556226, 181377603, 149563436, 34426519, 14...","[161762638, 166106016]","[(181377603, 0.9985669255256653), (143906047, ...","[(177387653, 0.5773502691896258), (178039205, ...","[182435597, 149074294, 29575310, 19148624, 145..."
4,"[165187883, 166483280, 147617151, 190199068, 1...",[],[149074294],"[(168369533, 0.6856241822242737), (140320980, ...","[(150339708, 0.7071067811865475), (183122687, ...","[182435597, 149074294, 29575310, 19148624, 145..."


In [6]:
# Persist the frame with all prediction columns to disk.
# NOTE: pickle files should only be loaded back from trusted sources.
df.to_pickle("data/df_scores.pkl")