In [14]:
import ast
import re
import pandas as pd
import math
import numpy as np

In [15]:
def read_data(path):
    with open(path, 'r') as f:
        lines = f.readlines() 
        
    json_formatted = '['
    for line in lines:
        json_formatted += line + ','
    json_formatted = json_formatted[:-1] + ']' 
    json_formatted = re.sub('null', 'None', json_formatted)
    return ast.literal_eval(json_formatted)

In [16]:
drive = 'data/'

df_products = pd.DataFrame(read_data(drive + 'products.jsonl'))
df_sessions = pd.DataFrame(read_data(drive + 'sessions.jsonl'))
df_users = pd.DataFrame(read_data(drive + 'users.jsonl'))

In [17]:
df_sessions['timestamp'] = pd.to_datetime(df_sessions['timestamp'])

In [18]:
df_sessions = df_sessions[df_sessions["product_id"].notna()]

In [19]:
print("Ilość wierszy z nienullowym user_id:", df_sessions[df_sessions["user_id"].notna()].shape[0])
print("Ilość wierszy z nullowym user_id:", df_sessions[df_sessions["user_id"].isna()].shape[0])

# https://stackoverflow.com/questions/46718178/dataframe-columns-to-key-value-dictionary-pair
session_dict = df_sessions[df_sessions["user_id"].notna()].set_index('session_id').to_dict()
session_to_user_map = session_dict['user_id']

for index, row in df_sessions.iterrows():
    if row["session_id"] in session_to_user_map and np.isnan(row["user_id"]):
        df_sessions.at[index, 'user_id'] = session_to_user_map[row["session_id"]]

print("Ilość wierszy z nienullowym user_id po poprawce:", df_sessions[df_sessions["user_id"].notna()].shape[0])
print("Ilość wierszy z nullowym user_id po poprawce:", df_sessions[df_sessions["user_id"].isna()].shape[0])

df_sessions = df_sessions[df_sessions["user_id"].notna()]

print("Ilość wierszy po usunieciu wadliwych danych:", df_sessions.shape[0])

Ilość wierszy z nienullowym user_id: 88413
Ilość wierszy z nullowym user_id: 4601
Ilość wierszy z nienullowym user_id po poprawce: 92980
Ilość wierszy z nullowym user_id po poprawce: 34
Ilość wierszy po usunieciu wadliwych danych: 92980


In [20]:
df = df_products.set_index('product_id')
df = df_sessions.join(df, on='product_id')
df['product_id'] = df['product_id'].astype(int)
df.shape

(92980, 10)

In [21]:
df_viewed = df[df['event_type'] == 'VIEW_PRODUCT'].copy()
df_bought = df[df['event_type'] == 'BUY_PRODUCT'].copy()
assert(df_viewed.shape[0] + df_bought.shape[0] == df.shape[0])

In [22]:
df_viewed.head()

Unnamed: 0,session_id,timestamp,user_id,product_id,event_type,offered_discount,purchase_id,product_name,category_path,price
0,100001,2021-01-08 11:35:40,102.0,1276,VIEW_PRODUCT,15,,Apple iPad mini 64GB,Komputery;Tablety i akcesoria;Tablety,1816.97
1,100001,2021-01-08 11:38:48,102.0,1277,VIEW_PRODUCT,15,,Apple iPad mini 64GB 4G,Komputery;Tablety i akcesoria;Tablety,2317.02
2,100001,2021-01-08 11:40:32,102.0,1276,VIEW_PRODUCT,15,,Apple iPad mini 64GB,Komputery;Tablety i akcesoria;Tablety,1816.97
4,100002,2021-01-05 18:08:21,102.0,1283,VIEW_PRODUCT,0,,Okulary 3D PHILIPS PTA436/00,Sprzęt RTV;Video;Telewizory i akcesoria;Okular...,99.99
5,100002,2021-01-05 18:12:35,102.0,1283,VIEW_PRODUCT,0,,Okulary 3D PHILIPS PTA436/00,Sprzęt RTV;Video;Telewizory i akcesoria;Okular...,99.99


In [173]:
def train_test_split(df, train_test_ratio):
    df = df.copy()
    sessions = df['session_id'].unique()
    threshold = int(np.round(train_test_ratio * len(sessions)))
    train_sessions = sessions[:threshold]
    test_sessions = sessions[threshold:]
    
    df_train = df.loc[df['session_id'].isin(train_sessions)].copy()
    train_categories = df_train['category_path'].unique()
    df_test = df.loc[df['session_id'].isin(test_sessions)].copy()
    list_test_x = []
    list_test_y = []
    for session in test_sessions:
        df_temp = df_test.loc[df['session_id'] == session]
        if len(df_test) >= 2 and df_temp.iloc[0]['category_path'] in train_categories:
            x_test = df_temp.iloc[:-1].copy()
            y_test = df_temp.iloc[-1]
            list_test_x.append(x_test)
            list_test_y.append(y_test)
        
    return df_train, pd.concat(list_test_x), pd.DataFrame(list_test_y)

In [174]:
df_train, df_test_x, df_test_y = train_test_split(df_viewed, 0.8)
df_train.head()

Unnamed: 0,session_id,timestamp,user_id,product_id,event_type,offered_discount,purchase_id,product_name,category_path,price
0,100001,2021-01-08 11:35:40,102.0,1276,VIEW_PRODUCT,15,,Apple iPad mini 64GB,Komputery;Tablety i akcesoria;Tablety,1816.97
1,100001,2021-01-08 11:38:48,102.0,1277,VIEW_PRODUCT,15,,Apple iPad mini 64GB 4G,Komputery;Tablety i akcesoria;Tablety,2317.02
2,100001,2021-01-08 11:40:32,102.0,1276,VIEW_PRODUCT,15,,Apple iPad mini 64GB,Komputery;Tablety i akcesoria;Tablety,1816.97
4,100002,2021-01-05 18:08:21,102.0,1283,VIEW_PRODUCT,0,,Okulary 3D PHILIPS PTA436/00,Sprzęt RTV;Video;Telewizory i akcesoria;Okular...,99.99
5,100002,2021-01-05 18:12:35,102.0,1283,VIEW_PRODUCT,0,,Okulary 3D PHILIPS PTA436/00,Sprzęt RTV;Video;Telewizory i akcesoria;Okular...,99.99


In [175]:
df_test_x.head()

Unnamed: 0,session_id,timestamp,user_id,product_id,event_type,offered_discount,purchase_id,product_name,category_path,price
77636,108705,2021-01-03 16:25:49,262.0,1007,VIEW_PRODUCT,0,,Dead Space 3 (Xbox 360),Gry i konsole;Gry na konsole;Gry Xbox 360,89.99
77637,108705,2021-01-03 16:25:55,262.0,1019,VIEW_PRODUCT,0,,GTA 4 (Xbox 360),Gry i konsole;Gry na konsole;Gry Xbox 360,109.99
77638,108705,2021-01-03 16:28:57,262.0,1314,VIEW_PRODUCT,0,,Assassin&#39;s Creed (Xbox 360),Gry i konsole;Gry na konsole;Gry Xbox 360,49.99
77639,108705,2021-01-03 16:31:41,262.0,1029,VIEW_PRODUCT,0,,Tom Clancy&#39;s Rainbow Six Vegas 2 (Xbox 360),Gry i konsole;Gry na konsole;Gry Xbox 360,49.99
77640,108705,2021-01-03 16:32:01,262.0,1016,VIEW_PRODUCT,0,,Rayman Origins (Xbox 360),Gry i konsole;Gry na konsole;Gry Xbox 360,69.0


In [176]:
df_test_y.head()

Unnamed: 0,session_id,timestamp,user_id,product_id,event_type,offered_discount,purchase_id,product_name,category_path,price
77654,108705,2021-01-03 17:16:56,262.0,1008,VIEW_PRODUCT,0,,Tom Clancy&#39;s Rainbow Six Vegas (Xbox 360),Gry i konsole;Gry na konsole;Gry Xbox 360,49.99
77657,108706,2021-01-01 08:29:18,262.0,1318,VIEW_PRODUCT,0,,Plantronics Savi W710,Sprzęt RTV;Audio;Słuchawki,553.0
77660,108707,2021-01-03 08:34:43,262.0,1281,VIEW_PRODUCT,20,,Manta MM266,Sprzęt RTV;Przenośne audio i video;Odtwarzacze...,64.8
77669,108708,2021-01-04 16:33:26,262.0,1043,VIEW_PRODUCT,0,,Fight Night Champion (PS3),Gry i konsole;Gry na konsole;Gry PlayStation3,109.0
77673,108709,2021-01-07 15:53:50,262.0,1234,VIEW_PRODUCT,0,,Sony DVP-SR760,Sprzęt RTV;Video;Odtwarzacze DVD,193.0


In [177]:
import models

In [178]:
baseline = models.BaselineModel()
prob_model = models.ParametrizedModel()

In [179]:
baseline.fit(df_train)
prob_model.fit(df_train)

<models.ParametrizedModel at 0x17881b42b00>

In [187]:
def measure_accuracy(x, y, model, n_to_predict=10):
    x = x.copy()
    sessions = x['session_id'].unique()
    
    preds_acc = []
    for session in sessions:
        x_temp = x.loc[x['session_id'] == session]
        y_temp = y.loc[y['session_id'] == session].squeeze()
        product_id = y_temp['product_id']
        category = y_temp['category_path']
        print(session)
        preds = model.predict(category, x_temp['product_id'], n_to_predict)
        preds_acc.append(product_id in preds)
    
    return np.mean(preds_acc)

In [188]:
baseline_acc = measure_accuracy(df_test_x, df_test_y, baseline)
prob_model_acc = measure_accuracy(df_test_x, df_test_y, prob_model)

108705
108706
108707
108708
108709
108710
108711
108712
108713
108714
108716
108717
108718
108719
108720
108721
108722
108723
108724
108725
108726
108727
108728
108729
108730
108731
108732
108733
108735
108737
108738
108739
108740
108741
108742
108743
108744
108745
108746
108748
108749
108751
108752
108753
108754
108757
108758
108759
108760
108761
108762
108764
108765
108766
108767
108768
108769
108770
108771
108772
108774
108775
108776
108777
108778
108779
108780
108781
108782
108783
108784
108785
108786
108787
108788
108789
108790
108791
108792
108793
108794
108795
108796
108797
108798
108799
108800
108801
108805
108807
108808
108810
108811
108812
108814
108815
108816
108817
108818
108819
108820
108821
108822
108823
108824
108825
108827
108828
108829
108830
108831
108832
108833
108834
108835
108836
108837
108839
108840
108841
108842
108843
108844
108846
108847
108848
108849
108851
108852
108853
108854
108855
108857
108858
108859
108860
108862
108863
108864
108865
108866
108867
108869

110109
110110
110111
110112
110113
110114
110115
110117
110118
110119
110120
110121
110122
110123
110124
110125
110128
110130
110131
110132
110133
110134
110135
110136
110137
110138
110139
110140
110141
110142
110143
110145
110146
110148
110149
110150
110151
110152
110153
110154
110155
110156
110157
110158
110159
110161
110163
110166
110167
110168
110170
110171
110173
110174
110175
110177
110179
110180
110181
110182
110183
110184
110185
110187
110188
110190
110191
110192
110194
110196
110197
110198
110199
110202
110203
110204
110205
110206
110207
110208
110210
110211
110212
110213
110214
110215
110216
110217
110218
110219
110220
110221
110222
110223
110224
110227
110228
110229
110230
110231
110233
110234
110235
110236
110237
110238
110239
110240
110241
110242
110243
110244
110245
110246
110247
110248
110249
110250
110251
110252
110254
110255
110256
110257
110258
110259
110261
110262
110263
110264
110265
110266
110268
110270
110271
110272
110274
110275
110277
110278
110279
110280
110281

ValueError: Invalid weights: weights sum to zero

In [171]:
print('Baseline: %.4f%%' % (baseline_acc * 100))
print('Parametrized model: %.4f%%' % (prob_model_acc * 100))

Baseline: 0.0000%
Parametrized model: 0.0000%


In [195]:
df_test_x.loc[df_test_x['session_id'] == 110882]['product_id']

pandas.core.series.Series