In [1]:
import warnings
warnings.filterwarnings("ignore")

In [1]:
!pip install pytorch-tabnet

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import torch
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")

Using cuda device


In [3]:
from google.colab import drive
drive.mount('/content/drive')

#%cd /content/drive/MyDrive/recsys

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
import os
os.chdir("/content/drive/MyDrive/recsys")
!ls

boosting     notebooks		  recanet_model.py   res_tafeng.npy
checkpoints  preprocess		  res_dun.npy	     test_pretrain.zip
data	     __pycache__	  res_instacart.npy
metrics.py   recanet_datasets.py  res.npy


In [5]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='darkgrid', font_scale=1.2)
from sklearn.preprocessing import LabelEncoder

from IPython.display import clear_output
%matplotlib inline

import pandas as pd
import plotly.express as px
import numpy as np
from tqdm import tqdm

tqdm.pandas()

from sklearn.metrics import accuracy_score, roc_auc_score

from tqdm.notebook import tqdm

import os

from pytorch_tabnet.tab_model import TabNetClassifier, TabNetRegressor

In [6]:
from metrics import recall_k, ndcg_k, precision_k, hitrate_k, repeat_score_item, repeat_score_user
from boosting.create_dataset import Dataset

In [7]:
def calculate_metrics(path_test, result):
    """Print mean recall, NDCG, precision and hit-rate of `result` on the test baskets.

    Parameters
    ----------
    path_test : str
        Path to a CSV with `user_id` and `item_id` columns. All item ids of a
        user are collected into that user's ground-truth list.
    result : pandas.DataFrame
        Frame with a `user_id` column and an `item_id` column holding each
        user's recommendation list. If a user occurs in several rows, only the
        first row is used.

    Returns
    -------
    None
        The metrics are only printed, once per cut-off in ``[5, 10, 20, 'B']``.
        For ``'B'`` the cut-off is the length of the user's own ground-truth list.

    Raises
    ------
    KeyError
        If a user present in `result` has no rows in the test CSV.
    """
    test_baskets = pd.read_csv(path_test)
    user_test_baskets_df = test_baskets.groupby('user_id')['item_id'].apply(list).reset_index()
    user_test_baskets_dict = dict(zip(user_test_baskets_df['user_id'], user_test_baskets_df['item_id']))

    n_users = result.user_id.nunique()
    print('predictions ready', n_users)
    print('number of final test users:', n_users)

    # Build the user -> recommendation-list lookup once, in a single pass,
    # instead of boolean-masking the whole frame per user and per cut-off.
    # setdefault keeps the first row seen for a user.
    top_items_by_user = {}
    for user, items in zip(result.user_id, result.item_id):
        top_items_by_user.setdefault(user, items)

    metric_fns = (
        ('recall', recall_k),
        ('ndcg', ndcg_k),
        ('precision', precision_k),
        ('hitrate', hitrate_k),
    )

    for k in [5, 10, 20, 'B']:
        print(k)
        scores = {name: [] for name, _ in metric_fns}
        for user, top_items in top_items_by_user.items():
            truth = user_test_baskets_dict[user]
            # 'B' means "basket size": evaluate at the ground-truth length.
            cutoff = len(truth) if k == 'B' else k
            for name, metric in metric_fns:
                scores[name].append(metric(truth, top_items, cutoff))
        for name, _ in metric_fns:
            print(f'{name}:', np.mean(scores[name]))

In [8]:
# Prediction helper: turns per-(user, item) scores into one ranked list per test user.
def predict(df, th=0, test_user_ids=None):
    """Build a ranked recommendation list for every test user.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with `user_id`, `item_id` and `preds_scores` columns, one row
        per scored (user, item) pair. It is not modified.
    th : float, default 0
        Rows with ``preds_scores < th`` are discarded before ranking.
    test_user_ids : iterable, optional
        User ids that must appear in the output. When omitted, the ids are
        taken from the notebook-global ``dataset.test_cleaned.user_id``.

    Returns
    -------
    pandas.DataFrame
        Columns `user_id` and `item_id`, one row per test user. `item_id` is
        the list of that user's items ordered by descending score, or an
        empty list when the user has no row that passed the threshold.
    """
    if test_user_ids is None:
        # Backward-compatible default: fall back to the global `dataset`.
        test_user_ids = dataset.test_cleaned.user_id.unique()
    test_users = pd.DataFrame({'user_id': pd.Series(test_user_ids).unique()})

    # astype returns a new frame, so the caller's `df` keeps its dtypes.
    scored = df.astype({'user_id': int, 'item_id': int})
    scored = scored[scored.preds_scores >= th]
    scored = scored.sort_values(by='preds_scores', ascending=False)

    # groupby keeps the within-group row order, so each list stays score-sorted.
    ranked = scored.groupby('user_id').agg({'item_id': list}).reset_index()

    res = test_users.merge(ranked, how='left', on='user_id')
    # Users without any surviving row come out of the left merge as NaN.
    res['item_id'] = res['item_id'].apply(
        lambda items: items if isinstance(items, list) else []
    )
    return res

# TaFeng

In [8]:
dataset_name = 'tafeng'

In [9]:
path_train = f'data/{dataset_name}/baskets/train_baskets.csv'
path_test = f'data/{dataset_name}/baskets/test_baskets.csv'
path_val = f'data/{dataset_name}/baskets/valid_baskets.csv'

In [10]:
%%time
dataset = Dataset(path_train,path_val, path_test, dataset=dataset_name, history_len=50, basket_len=50)

Total users: 10205
Total items: 13521
CPU times: user 11.1 s, sys: 256 ms, total: 11.4 s
Wall time: 14.5 s


In [49]:
train = dataset.create_train_data()
val = dataset.create_val_test_data(mode='val')
test = dataset.create_val_test_data(mode='test')

boosting/data/tafeng/50_train.npz
Done!
boosting/data/tafeng/50_val.npz
Done!
boosting/data/tafeng/50_test.npz
Done!


In [50]:
val.shape, test.shape, train.shape

((195824, 55), (193585, 55), (2174103, 55))

In [51]:
# Keep only rows whose item also occurs in train. `.copy()` makes each result
# an independent frame, so the later `val[col] = ...` / `test[col] = ...`
# assignments do not act on a slice of another frame.
val = val[val.item_id.isin(train.item_id)].copy()
test = test[test.item_id.isin(train.item_id)].copy()

In [55]:
test_item_id = test.item_id.copy()

In [56]:
# Label-encode `item_id`, the one column treated as categorical.
# The encoder is fitted on train only and then applied to val and test.
categorical_columns = []
categorical_dims = {}
col = 'item_id'
if col in train.columns:
    print(col, train[col].nunique())
    l_enc = LabelEncoder()
    train[col] = l_enc.fit_transform(train[col].values)
    val[col] = l_enc.transform(val[col].values)
    test[col] = l_enc.transform(test[col].values)
    categorical_columns.append(col)
    # Number of distinct encoded classes; used later to build `cat_dims`.
    categorical_dims[col] = len(l_enc.classes_)

item_id 13511


In [57]:
# Columns excluded from the model inputs.
unused_feat = ['user_id', 'labels', 'basket_id']

# Model inputs, in the column order of `train`.
features = [col for col in train.columns if col not in unused_feat]

# Positions and cardinalities of the categorical features. Both lists filter
# on `categorical_columns`, so they always line up element for element.
cat_idxs = [i for i, f in enumerate(features) if f in categorical_columns]
cat_dims = [categorical_dims[f] for f in features if f in categorical_columns]

In [58]:
# Feature matrices drop the columns listed in `unused_feat`; targets come from
# the sparse `labels` column converted to a dense array.
X_tr = train.drop(columns=unused_feat).values
y_tr = train['labels'].sparse.to_dense().values

X_val = val.drop(columns=unused_feat).values
y_val = val['labels'].sparse.to_dense().values

X_test = test.drop(columns=unused_feat).values
y_test = test['labels'].sparse.to_dense().values

### tabnet

In [21]:
clf = TabNetClassifier(cat_idxs = cat_idxs, cat_dims = cat_dims)



In [23]:
clf.fit(
  X_tr, y_tr,
  eval_set=[(X_val, y_val)],
  eval_metric=['auc'], patience=2
)

epoch 0  | loss: 0.11346 | val_0_auc: 0.62003 |  0:01:42s
epoch 1  | loss: 0.11022 | val_0_auc: 0.62014 |  0:03:23s
epoch 2  | loss: 0.11017 | val_0_auc: 0.60658 |  0:05:03s
epoch 3  | loss: 0.10845 | val_0_auc: 0.68576 |  0:06:46s
epoch 4  | loss: 0.10574 | val_0_auc: 0.70928 |  0:08:26s
epoch 5  | loss: 0.10467 | val_0_auc: 0.7148  |  0:10:09s
epoch 6  | loss: 0.10428 | val_0_auc: 0.7182  |  0:11:52s
epoch 7  | loss: 0.10408 | val_0_auc: 0.7116  |  0:13:35s
epoch 8  | loss: 0.10384 | val_0_auc: 0.71333 |  0:15:16s

Early stopping occurred at epoch 8 with best_epoch = 6 and best_val_0_auc = 0.7182




KeyboardInterrupt: ignored

In [75]:
preds = clf.predict_proba(X_test)
np.save('res_tafeng.npy', preds)
preds = np.load('res_tafeng.npy')

print('ROC AUC =', roc_auc_score(y_test, preds[:,1]))

ROC AUC = 0.725745512111351


In [74]:
roc_auc_score(y_test, preds[:,1])

0.725745512111351

In [68]:
res = test[['user_id','item_id']].copy()
res['item_id'] = test_item_id
res['preds_scores'] = preds[:,1]
result = predict(res, th = 0)

  df['user_id'] = df['user_id'].astype(int)
  df['item_id'] = df['item_id'].astype(int)
  res = df.groupby('user_id').agg({'item_id': list}).reset_index()


In [72]:
calculate_metrics(path_test, result)

predictions ready 5007
number of final test users: 5007
5
recall: 0.08969938443682439
ndcg: 0.08701742664654163
precision: 0.08200519273017776
hitrate: 0.31356101457958857
10
recall: 0.130043317772278
ndcg: 0.07292767097475931
precision: 0.06417016177351707
hitrate: 0.41981226283203515
20
recall: 0.16752206467025027
ndcg: 0.055923430755062835
precision: 0.04440782903934492
hitrate: 0.49730377471539844
B
recall: 0.07865192293075303
ndcg: 0.08425987223411527
precision: 0.07865192293075303
hitrate: 0.34012382664270024


# Dunnhumby

In [9]:
dataset_name = 'dunnhumby_cj'

In [10]:
path_train = f'data/{dataset_name}/baskets/train_baskets.csv'
path_test = f'data/{dataset_name}/baskets/test_baskets.csv'
path_val = f'data/{dataset_name}/baskets/valid_baskets.csv'

In [11]:
%%time
dataset = Dataset(path_train,path_val, path_test, dataset=dataset_name, history_len=50, basket_len=50)

Total users: 2483
Total items: 36963
CPU times: user 39 s, sys: 1.07 s, total: 40.1 s
Wall time: 45.5 s


In [12]:
train = dataset.create_train_data()
val = dataset.create_val_test_data(mode='val')
test = dataset.create_val_test_data(mode='test')

boosting/data/dunnhumby_cj/50_train.npz
Done!
boosting/data/dunnhumby_cj/50_val.npz
Done!
boosting/data/dunnhumby_cj/50_test.npz
Done!


In [13]:
val.shape, test.shape, train.shape

((177804, 55), (185085, 55), (13974032, 55))

In [14]:
# Keep only rows whose item also occurs in train. `.copy()` makes each result
# an independent frame, so the later `val[col] = ...` / `test[col] = ...`
# assignments do not act on a slice of another frame.
val = val[val.item_id.isin(train.item_id)].copy()
test = test[test.item_id.isin(train.item_id)].copy()

In [15]:
test_item_id = test.item_id.copy()

In [16]:
# Label-encode `item_id`, the one column treated as categorical.
# The encoder is fitted on train only and then applied to val and test.
categorical_columns = []
categorical_dims = {}
col = 'item_id'
if col in train.columns:
    print(col, train[col].nunique())
    l_enc = LabelEncoder()
    train[col] = l_enc.fit_transform(train[col].values)
    val[col] = l_enc.transform(val[col].values)
    test[col] = l_enc.transform(test[col].values)
    categorical_columns.append(col)
    # Number of distinct encoded classes; used later to build `cat_dims`.
    categorical_dims[col] = len(l_enc.classes_)

item_id 30220


In [17]:
# Columns excluded from the model inputs.
unused_feat = ['user_id', 'labels', 'basket_id']

# Model inputs, in the column order of `train`.
features = [col for col in train.columns if col not in unused_feat]

# Positions and cardinalities of the categorical features. Both lists filter
# on `categorical_columns`, so they always line up element for element.
cat_idxs = [i for i, f in enumerate(features) if f in categorical_columns]
cat_dims = [categorical_dims[f] for f in features if f in categorical_columns]

In [18]:
# Feature matrices drop the columns listed in `unused_feat`; targets come from
# the sparse `labels` column converted to a dense array.
X_tr = train.drop(columns=unused_feat).values
y_tr = train['labels'].sparse.to_dense().values

X_val = val.drop(columns=unused_feat).values
y_val = val['labels'].sparse.to_dense().values

X_test = test.drop(columns=unused_feat).values
y_test = test['labels'].sparse.to_dense().values

### tabnet

In [19]:
clf = TabNetClassifier(cat_idxs = cat_idxs, cat_dims = cat_dims)



In [None]:
clf.fit(
  X_tr, y_tr,
  eval_set=[(X_val, y_val)],
  eval_metric=['auc'], patience=2
)

In [21]:
# Reuse the predictions cached on disk when they exist; otherwise compute them
# with the fitted model and save them under the same file name that is loaded.
preds_path = 'res_dun.npy'
if os.path.exists(preds_path):
    preds = np.load(preds_path)
else:
    preds = clf.predict_proba(X_test)
    np.save(preds_path, preds)

print('ROC AUC =', roc_auc_score(y_test, preds[:,1]))

ROC AUC = 0.7579812489107728


In [26]:
res = test[['user_id','item_id']].copy()
res['item_id'] = test_item_id
res['preds_scores'] = preds[:,1]
result = predict(res, th = 0)

In [27]:
calculate_metrics(path_test, result)

predictions ready 1224
number of final test users: 1224
5
recall: 0.1124631529155371
ndcg: 0.1796441446942197
precision: 0.16290849673202615
hitrate: 0.45751633986928103
10
recall: 0.15250764933408997
ndcg: 0.14795306840463301
precision: 0.12573529411764706
hitrate: 0.5392156862745098
20
recall: 0.199348220517364
ndcg: 0.11535300916274825
precision: 0.09068627450980392
hitrate: 0.6062091503267973
B
recall: 0.12176454971541834
ndcg: 0.14254480838769648
precision: 0.12176454971541834
hitrate: 0.4877450980392157


# Instacart

In [9]:
dataset_name = 'instacart'

In [10]:
path_train = f'data/{dataset_name}/baskets/train_baskets_sample30k.csv'
path_test = f'data/{dataset_name}/baskets/test_baskets_sample30k.csv'
path_val = f'data/{dataset_name}/baskets/valid_baskets_sample30k.csv'

In [11]:
%%time
dataset = Dataset(path_train,path_val, path_test, dataset=dataset_name, history_len=50, basket_len=50)

Total users: 26828
Total items: 29396
CPU times: user 53.3 s, sys: 3.07 s, total: 56.4 s
Wall time: 57.1 s


In [12]:
train = dataset.create_train_data()
val = dataset.create_val_test_data(mode='val')
test = dataset.create_val_test_data(mode='test')

boosting/data/instacart/50_train.npz
Done!
boosting/data/instacart/50_val.npz
Done!
boosting/data/instacart/50_test.npz
Done!


In [13]:
val.shape, test.shape, train.shape

((354470, 55), (348067, 55), (10528763, 55))

In [16]:
# Keep only rows whose item also occurs in train. `.copy()` makes each result
# an independent frame, so the later `val[col] = ...` / `test[col] = ...`
# assignments do not act on a slice of another frame.
val = val[val.item_id.isin(train.item_id)].copy()
test = test[test.item_id.isin(train.item_id)].copy()

In [17]:
test_item_id = test.item_id.copy()

In [18]:
# Label-encode `item_id`, the one column treated as categorical.
# The encoder is fitted on train only and then applied to val and test.
categorical_columns = []
categorical_dims = {}
col = 'item_id'
if col in train.columns:
    print(col, train[col].nunique())
    l_enc = LabelEncoder()
    train[col] = l_enc.fit_transform(train[col].values)
    val[col] = l_enc.transform(val[col].values)
    test[col] = l_enc.transform(test[col].values)
    categorical_columns.append(col)
    # Number of distinct encoded classes; used later to build `cat_dims`.
    categorical_dims[col] = len(l_enc.classes_)

item_id 27492


In [19]:
# Columns excluded from the model inputs.
unused_feat = ['user_id', 'labels', 'basket_id']

# Model inputs, in the column order of `train`.
features = [col for col in train.columns if col not in unused_feat]

# Positions and cardinalities of the categorical features. Both lists filter
# on `categorical_columns`, so they always line up element for element.
cat_idxs = [i for i, f in enumerate(features) if f in categorical_columns]
cat_dims = [categorical_dims[f] for f in features if f in categorical_columns]

In [20]:
# Feature matrices drop the columns listed in `unused_feat`; targets come from
# the sparse `labels` column converted to a dense array.
X_tr = train.drop(columns=unused_feat).values
y_tr = train['labels'].sparse.to_dense().values

X_val = val.drop(columns=unused_feat).values
y_val = val['labels'].sparse.to_dense().values

X_test = test.drop(columns=unused_feat).values
y_test = test['labels'].sparse.to_dense().values

### tabnet

In [34]:
clf = TabNetClassifier(cat_idxs = cat_idxs, cat_dims = cat_dims)

In [35]:
clf.fit(
  X_tr, y_tr,
  eval_set=[(X_val, y_val)],
  eval_metric=['auc'], patience=2
)

epoch 0  | loss: 0.35861 | val_0_auc: 0.80132 |  0:08:00s
epoch 1  | loss: 0.35427 | val_0_auc: 0.80374 |  0:15:57s
epoch 2  | loss: 0.3535  | val_0_auc: 0.80318 |  0:23:45s


KeyboardInterrupt: ignored

In [36]:
preds = clf.predict_proba(X_test)
np.save('res_instacart.npy', preds)
preds = np.load('res_instacart.npy')

print('ROC AUC =', roc_auc_score(y_test, preds[:,1]))

ROC AUC = 0.8036692208477078


In [37]:
res = test[['user_id','item_id']].copy()
res['item_id'] = test_item_id
res['preds_scores'] = preds[:,1]
result = predict(res, th = 0)

In [38]:
calculate_metrics(path_test, result)

predictions ready 13246
number of final test users: 13246
5
recall: 0.23718449460286695
ndcg: 0.4018971588227779
precision: 0.3690019628567115
hitrate: 0.7818964215612261
10
recall: 0.31528159457366345
ndcg: 0.3247469399934575
precision: 0.27455835723992145
hitrate: 0.8216065227238412
20
recall: 0.38119880679459833
ndcg: 0.2417657688536496
precision: 0.18197569077457343
hitrate: 0.8404046504605164
B
recall: 0.2993579204213944
ndcg: 0.34530760343838
precision: 0.2993579204213944
hitrate: 0.7845387286728069
