In [128]:
import ast
import json
import math
import re

import numpy as np
import pandas as pd

In [129]:
def read_data(path):
    """Load a JSON Lines file into a list of Python objects.

    Every non-blank line of the file is parsed as one independent JSON
    document with ``json.loads``. JSON ``null`` / ``true`` / ``false`` become
    ``None`` / ``True`` / ``False``; string values are left untouched (a value
    such as ``"nullable"`` is not rewritten).

    Parameters
    ----------
    path : str or path-like
        Location of the ``.jsonl`` file.

    Returns
    -------
    list
        One parsed object per non-blank line, in file order. An empty file
        yields an empty list.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON (a subclass of ``ValueError``).
    """
    # NOTE(review): the file is opened with the platform default encoding, as
    # before. JSON is normally UTF-8 - confirm and pass encoding='utf-8' if so.
    with open(path, 'r') as f:
        # Blank lines (e.g. a trailing newline at the end of the file) carry
        # no record and are skipped.
        return [json.loads(line) for line in f if line.strip()]

In [130]:
# Directory that holds the JSON Lines exports.
drive = 'data/'

# Load each export into its own DataFrame (products, sessions, users - in that order).
df_products, df_sessions, df_users = (
    pd.DataFrame(read_data(drive + file_name))
    for file_name in ('products.jsonl', 'sessions.jsonl', 'users.jsonl')
)

In [131]:
# Parse the raw timestamp column into pandas datetimes.
df_sessions = df_sessions.assign(
    timestamp=lambda frame: pd.to_datetime(frame['timestamp'])
)

In [132]:
# Discard events that do not reference a product.
df_sessions = df_sessions.dropna(subset=["product_id"])

In [133]:
# Work on an explicit copy: df_sessions is the result of a boolean filter, and
# writing into such a slice can trigger pandas' SettingWithCopyWarning.
df_sessions = df_sessions.copy()

print("Ilość wierszy z nienullowym user_id:", df_sessions[df_sessions["user_id"].notna()].shape[0])
print("Ilość wierszy z nullowym user_id:", df_sessions[df_sessions["user_id"].isna()].shape[0])

# Build a session_id -> user_id lookup from the rows where user_id is known.
# Only the two needed columns are converted (not the whole frame). If a
# session carried several different user_ids, the last one wins, as with the
# dict built previously.
# Assumption (to be confirmed against the data): all events of one session
# belong to the same user, so a known user_id can be propagated.
known_users = df_sessions[df_sessions["user_id"].notna()]
session_to_user_map = known_users.set_index('session_id')['user_id'].to_dict()

# Vectorized back-fill: only missing user_ids are replaced; sessions without
# any known user stay NaN. fillna also copes with None, unlike np.isnan.
df_sessions['user_id'] = df_sessions['user_id'].fillna(
    df_sessions['session_id'].map(session_to_user_map)
)

print("Ilość wierszy z nienullowym user_id po poprawce:", df_sessions[df_sessions["user_id"].notna()].shape[0])
print("Ilość wierszy z nullowym user_id po poprawce:", df_sessions[df_sessions["user_id"].isna()].shape[0])

# Rows whose user could not be recovered are dropped.
df_sessions = df_sessions[df_sessions["user_id"].notna()]

print("Ilość wierszy po usunieciu wadliwych danych:", df_sessions.shape[0])

Ilość wierszy z nienullowym user_id: 88413
Ilość wierszy z nullowym user_id: 4601
Ilość wierszy z nienullowym user_id po poprawce: 92980
Ilość wierszy z nullowym user_id po poprawce: 34
Ilość wierszy po usunieciu wadliwych danych: 92980


In [134]:
# Attach the product columns to every session event, matching on product_id,
# then store product_id as an integer.
df = (
    df_sessions
    .join(df_products.set_index('product_id'), on='product_id')
    .astype({'product_id': int})
)
df.shape

(92980, 10)

In [135]:
# Keep only product-view events, as an independent copy of those rows.
is_view = df['event_type'] == 'VIEW_PRODUCT'
df_viewed = df.loc[is_view].copy()
df_viewed.shape

(83919, 10)

In [136]:
# A product counts once per session: keep the first view of each
# (session, product) pair and drop the repeats.
view_key = ['session_id', 'product_id']
df_viewed = df_viewed.drop_duplicates(subset=view_key, keep='first')
df_viewed.shape

(77102, 10)

In [137]:
# Preview the first rows of the de-duplicated view events.
df_viewed.head()

Unnamed: 0,session_id,timestamp,user_id,product_id,event_type,offered_discount,purchase_id,product_name,category_path,price
0,100001,2021-01-08 11:35:40,102.0,1276,VIEW_PRODUCT,15,,Apple iPad mini 64GB,Komputery;Tablety i akcesoria;Tablety,1816.97
1,100001,2021-01-08 11:38:48,102.0,1277,VIEW_PRODUCT,15,,Apple iPad mini 64GB 4G,Komputery;Tablety i akcesoria;Tablety,2317.02
4,100002,2021-01-05 18:08:21,102.0,1283,VIEW_PRODUCT,0,,Okulary 3D PHILIPS PTA436/00,Sprzęt RTV;Video;Telewizory i akcesoria;Okular...,99.99
7,100003,2021-01-06 02:15:31,102.0,1075,VIEW_PRODUCT,0,,Ricoh SG3110DN,Komputery;Drukarki i skanery;Biurowe urządzeni...,1998.14
8,100004,2021-01-03 23:18:34,102.0,1017,VIEW_PRODUCT,10,,LCD Dell U2412M,Komputery;Monitory;Monitory LCD,399.0


In [138]:
def train_test_split(df, train_test_ratio):
    """Split events by session into a training frame and next-item test pairs.

    Sessions are taken in order of first appearance in ``df``; the first
    ``round(train_test_ratio * n_sessions)`` sessions form the training set
    and the remaining ones are test candidates. No shuffling is performed.

    A test session is kept only if

    * it has at least two rows (so that at least one input row remains once
      the last row is held out as the target), and
    * the ``category_path`` of its first row also occurs in the training set.

    Parameters
    ----------
    df : pandas.DataFrame
        Events with at least the columns ``session_id`` and ``category_path``.
        Row order within a session defines which row is "last".
    train_test_ratio : float
        Fraction of sessions assigned to the training set.

    Returns
    -------
    df_train : pandas.DataFrame
        All rows of the training sessions.
    df_test_x : pandas.DataFrame
        For every kept test session, all rows except the last one.
    df_test_y : pandas.DataFrame
        For every kept test session, its last row (one row per session).
        Both test frames are empty, with the columns of ``df``, when no test
        session qualifies.
    """
    df = df.copy()
    sessions = df['session_id'].unique()
    threshold = int(np.round(train_test_ratio * len(sessions)))
    train_sessions = sessions[:threshold]
    test_sessions = sessions[threshold:]

    df_train = df.loc[df['session_id'].isin(train_sessions)].copy()
    df_test = df.loc[df['session_id'].isin(test_sessions)].copy()

    # A set gives O(1) membership tests; missing categories are excluded so a
    # NaN category never counts as "seen during training".
    train_categories = set(df_train['category_path'].dropna().unique())

    list_test_x = []
    list_test_y = []
    # sort=False keeps the sessions in order of first appearance; one pass
    # over the frame replaces a full boolean scan per session.
    for _, df_session in df_test.groupby('session_id', sort=False):
        # The size check must be per session: a single-row session would give
        # an empty input frame and a target with nothing to predict it from.
        if len(df_session) < 2:
            continue
        # NOTE(review): the category of the FIRST row is checked, while the
        # evaluation later uses the category of the held-out last row -
        # confirm this is intended.
        if df_session.iloc[0]['category_path'] not in train_categories:
            continue
        list_test_x.append(df_session.iloc[:-1].copy())
        list_test_y.append(df_session.iloc[-1])

    if not list_test_x:
        # pd.concat([]) raises ValueError; return empty frames with the
        # expected columns instead.
        empty = df.iloc[0:0]
        return df_train, empty.copy(), empty.copy()

    return df_train, pd.concat(list_test_x), pd.DataFrame(list_test_y)

In [139]:
# Share of sessions that goes into the training set; the rest become test candidates.
TRAIN_TEST_RATIO = 0.8

df_train, df_test_x, df_test_y = train_test_split(df_viewed, TRAIN_TEST_RATIO)
df_train.head()

Unnamed: 0,session_id,timestamp,user_id,product_id,event_type,offered_discount,purchase_id,product_name,category_path,price
0,100001,2021-01-08 11:35:40,102.0,1276,VIEW_PRODUCT,15,,Apple iPad mini 64GB,Komputery;Tablety i akcesoria;Tablety,1816.97
1,100001,2021-01-08 11:38:48,102.0,1277,VIEW_PRODUCT,15,,Apple iPad mini 64GB 4G,Komputery;Tablety i akcesoria;Tablety,2317.02
4,100002,2021-01-05 18:08:21,102.0,1283,VIEW_PRODUCT,0,,Okulary 3D PHILIPS PTA436/00,Sprzęt RTV;Video;Telewizory i akcesoria;Okular...,99.99
7,100003,2021-01-06 02:15:31,102.0,1075,VIEW_PRODUCT,0,,Ricoh SG3110DN,Komputery;Drukarki i skanery;Biurowe urządzeni...,1998.14
8,100004,2021-01-03 23:18:34,102.0,1017,VIEW_PRODUCT,10,,LCD Dell U2412M,Komputery;Monitory;Monitory LCD,399.0


In [140]:
# Preview the test inputs returned by train_test_split.
df_test_x.head()

Unnamed: 0,session_id,timestamp,user_id,product_id,event_type,offered_discount,purchase_id,product_name,category_path,price
77636,108705,2021-01-03 16:25:49,262.0,1007,VIEW_PRODUCT,0,,Dead Space 3 (Xbox 360),Gry i konsole;Gry na konsole;Gry Xbox 360,89.99
77637,108705,2021-01-03 16:25:55,262.0,1019,VIEW_PRODUCT,0,,GTA 4 (Xbox 360),Gry i konsole;Gry na konsole;Gry Xbox 360,109.99
77638,108705,2021-01-03 16:28:57,262.0,1314,VIEW_PRODUCT,0,,Assassin&#39;s Creed (Xbox 360),Gry i konsole;Gry na konsole;Gry Xbox 360,49.99
77639,108705,2021-01-03 16:31:41,262.0,1029,VIEW_PRODUCT,0,,Tom Clancy&#39;s Rainbow Six Vegas 2 (Xbox 360),Gry i konsole;Gry na konsole;Gry Xbox 360,49.99
77640,108705,2021-01-03 16:32:01,262.0,1016,VIEW_PRODUCT,0,,Rayman Origins (Xbox 360),Gry i konsole;Gry na konsole;Gry Xbox 360,69.0


In [141]:
# Preview the test targets returned by train_test_split.
df_test_y.head()

Unnamed: 0,session_id,timestamp,user_id,product_id,event_type,offered_discount,purchase_id,product_name,category_path,price
77653,108705,2021-01-03 17:15:27,262.0,1279,VIEW_PRODUCT,0,,Assassin&#39;s Creed 3 (Xbox 360),Gry i konsole;Gry na konsole;Gry Xbox 360,49.99
77656,108706,2021-01-01 08:24:19,262.0,1318,VIEW_PRODUCT,0,,Plantronics Savi W710,Sprzęt RTV;Audio;Słuchawki,553.0
77659,108707,2021-01-03 08:33:09,262.0,1281,VIEW_PRODUCT,20,,Manta MM266,Sprzęt RTV;Przenośne audio i video;Odtwarzacze...,64.8
77668,108708,2021-01-04 16:32:11,262.0,1043,VIEW_PRODUCT,0,,Fight Night Champion (PS3),Gry i konsole;Gry na konsole;Gry PlayStation3,109.0
77672,108709,2021-01-07 15:52:10,262.0,1233,VIEW_PRODUCT,0,,Manta DVD064,Sprzęt RTV;Video;Odtwarzacze DVD,109.0


In [142]:
import models

In [143]:
# Instantiate the two models from the project-local `models` module;
# both are fitted and scored in the cells below.
baseline = models.BaselineModel()
prob_model = models.ParametrizedModel()

In [144]:
# Fit both models on the training sessions only.
# The value displayed under the cell is the return value of the last call.
baseline.fit(df_train)
prob_model.fit(df_train)

<models.ParametrizedModel at 0x28f03b129e8>

In [145]:
# Store product_id as an integer in both test frames.
df_test_x = df_test_x.astype({'product_id': int})
df_test_y = df_test_y.astype({'product_id': int})

In [152]:
def measure_accuracy(x, y, model, n_to_predict=4):
    """Return the share of sessions whose target is among the model's predictions.

    For every session in ``x`` the model is asked for ``n_to_predict``
    products; the session counts as a hit when the ``product_id`` of its row
    in ``y`` is contained in the returned predictions.

    Parameters
    ----------
    x : pandas.DataFrame
        Input events with the columns ``session_id`` and ``product_id``.
    y : pandas.DataFrame
        Exactly one target row per session, with the columns ``session_id``,
        ``product_id`` and ``category_path``. Sessions that do not occur in
        ``x`` are ignored.
    model : object
        Must provide ``predict(category, product_ids, n_to_predict)`` and
        return an iterable of product ids.
    n_to_predict : int, default 4
        Number of predictions requested per session.

    Returns
    -------
    float
        Fraction of sessions that were hits; NaN when ``x`` has no sessions.

    Raises
    ------
    ValueError
        If ``y`` holds more than one row for some session.
    KeyError
        If a session of ``x`` has no row in ``y``.
    """
    if not y['session_id'].is_unique:
        raise ValueError('y must contain exactly one row per session_id')
    # Index the targets once so each session is a direct lookup instead of a
    # full boolean scan of y.
    targets = y.set_index('session_id')

    hits = []
    # sort=False visits the sessions in order of first appearance.
    for session, x_session in x.groupby('session_id', sort=False):
        if session not in targets.index:
            raise KeyError('no target row in y for session_id %r' % (session,))
        target = targets.loc[session]
        preds = model.predict(target['category_path'], x_session['product_id'], n_to_predict)
        hits.append(target['product_id'] in list(preds))

    if not hits:
        # np.mean([]) would also give NaN, but with a RuntimeWarning.
        return float('nan')
    return np.mean(hits)

In [153]:
# Score both fitted models on the same held-out sessions (baseline first).
baseline_acc, prob_model_acc = (
    measure_accuracy(df_test_x, df_test_y, fitted_model)
    for fitted_model in (baseline, prob_model)
)

In [154]:
# Report both scores as percentages with four decimal places.
print('Baseline: %.4f%%' % (baseline_acc * 100))
print('Parametrized model: %.4f%%' % (prob_model_acc * 100))

Baseline: 67.5634%
Parametrized model: 69.4081%
