In [1]:
# Mount Google Drive at /content/gdrive/ so the project artifacts can be copied locally.
from google.colab import drive
drive.mount('/content/gdrive/', force_remount=True)

# Create the local working directories (-p: no error if they already exist).
!mkdir -p /data
!mkdir -p /data/cache/interactions

# Copy the project artifacts from Drive into /data under the names used by later cells
# (interactions.parquet, test_VALID.pkl and h3_to_chains.pkl are read below; model.pkl is
# only referenced by a commented-out load).
# NOTE(review): the source paths are relative ('gdrive/...') while the mount point is
# absolute ('/content/gdrive/'), so these copies rely on the working directory being /content.
!cp gdrive/MyDrive/MadeProject/ncf_orders.parquet /data/interactions.parquet
!cp gdrive/MyDrive/MadeProject/ncf_val_df.parquet /data/pred_interactions.parquet
!cp gdrive/MyDrive/MadeProject/epoch_250.pkl /data/model.pkl
!cp gdrive/MyDrive/MadeProject/test_VALID.pkl /data/test_VALID.pkl
!cp gdrive/MyDrive/MadeProject/h3_to_chains.pkl /data/h3_to_chains.pkl

Mounted at /content/gdrive/


In [2]:
# Install the third-party packages imported by the next cells (cornac, papermill, scrapbook).
# The setup follows this reference notebook:
# see: https://github.com/microsoft/recommenders/blob/main/examples/02_model_collaborative_filtering/cornac_bivae_deep_dive.ipynb
# NOTE(review): versions are not pinned; the recorded run resolved cornac 1.14.1,
# papermill 2.3.3 and scrapbook 0.5.0, so a fresh run may pick up different releases.
!pip install cornac
!pip install papermill
!pip install scrapbook

Collecting cornac
  Downloading cornac-1.14.1-cp37-cp37m-manylinux1_x86_64.whl (12.4 MB)
[K     |████████████████████████████████| 12.4 MB 6.1 MB/s 
Collecting powerlaw
  Downloading powerlaw-1.5-py3-none-any.whl (24 kB)
Installing collected packages: powerlaw, cornac
Successfully installed cornac-1.14.1 powerlaw-1.5
Collecting papermill
  Downloading papermill-2.3.3-py3-none-any.whl (36 kB)
Collecting tenacity
  Downloading tenacity-8.0.1-py3-none-any.whl (24 kB)
Collecting ansiwrap
  Downloading ansiwrap-0.8.4-py2.py3-none-any.whl (8.5 kB)
Collecting black
  Downloading black-21.12b0-py3-none-any.whl (156 kB)
[K     |████████████████████████████████| 156 kB 13.8 MB/s 
Collecting jupyter-client>=6.1.5
  Downloading jupyter_client-7.1.0-py3-none-any.whl (129 kB)
[K     |████████████████████████████████| 129 kB 68.9 MB/s 
Collecting textwrap3>=0.9.2
  Downloading textwrap3-0.9.2-py2.py3-none-any.whl (12 kB)
Collecting platformdirs>=2
  Downloading platformdirs-2.4.0-py3-none-any.whl 

Collecting scrapbook
  Downloading scrapbook-0.5.0-py3-none-any.whl (34 kB)
Installing collected packages: scrapbook
Successfully installed scrapbook-0.5.0


In [3]:
# Fetch the microsoft/recommenders sources (default branch; no commit or tag is pinned).
!git clone https://github.com/microsoft/recommenders
# Move the clone's top-level entries into the working directory.
!mv -v ./recommenders/* ./
# Lift the contents of the nested recommenders/recommenders directory up one level,
# presumably so that `import recommenders.<module>` resolves from the working directory.
!mv -v ./recommenders/recommenders/* ./recommenders/
# Remove the nested directory; rmdir only succeeds if it is empty at this point.
!rmdir ./recommenders/recommenders

Cloning into 'recommenders'...
remote: Enumerating objects: 31745, done.[K
remote: Counting objects: 100% (2911/2911), done.[K
remote: Compressing objects: 100% (1530/1530), done.[K
remote: Total 31745 (delta 1870), reused 2158 (delta 1309), pack-reused 28834[K
Receiving objects: 100% (31745/31745), 200.97 MiB | 31.52 MiB/s, done.
Resolving deltas: 100% (21048/21048), done.
renamed './recommenders/AUTHORS.md' -> './AUTHORS.md'
renamed './recommenders/CODE_OF_CONDUCT.md' -> './CODE_OF_CONDUCT.md'
renamed './recommenders/conda.md' -> './conda.md'
renamed './recommenders/contrib' -> './contrib'
renamed './recommenders/CONTRIBUTING.md' -> './CONTRIBUTING.md'
renamed './recommenders/docs' -> './docs'
renamed './recommenders/examples' -> './examples'
renamed './recommenders/GLOSSARY.md' -> './GLOSSARY.md'
renamed './recommenders/LICENSE' -> './LICENSE'
renamed './recommenders/MANIFEST.in' -> './MANIFEST.in'
renamed './recommenders/NEWS.md' -> './NEWS.md'
renamed './recommenders/pyproject

In [4]:
import sys
import os
import torch
import cornac
import pandas as pd
import papermill as pm
import scrapbook as sb
from recommenders.datasets.python_splitters import python_random_split
from recommenders.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k
from recommenders.models.cornac.cornac_utils import predict_ranking, predict
from recommenders.utils.timer import Timer
from recommenders.utils.constants import SEED

In [24]:
# Maximum number of items kept per prediction; read by predict_for_user.
TOP_K = 30

# Model parameters, forwarded to cornac.models.BiVAECF in the training cell.
LATENT_DIM = 64  # passed as k
ENCODER_DIMS = [256]  # passed as encoder_structure
ACT_FUNC = 'relu6'  # passed as act_fn
LIKELIHOOD = 'pois'  # passed as likelihood
NUM_EPOCHS = 200  # passed as n_epochs
BATCH_SIZE = 256  # passed as batch_size
LEARNING_RATE = 0.005  # passed as learning_rate
BETA_KL = 0.8  # passed as beta_kl

In [6]:
# Load the interaction table and rename its columns to the userID / itemID / rating
# names used by the rest of the notebook.
data = (
    pd.read_parquet('/data/interactions.parquet')
    .rename(columns={"user_id": "userID", "chain_id": "itemID", "weight": "rating"})
)
data.head()

Unnamed: 0_level_0,userID,itemID,rating
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,28276,1
1,0,28720,144447
2,0,31057,4860
3,0,32322,2966
4,0,35152,22


In [7]:
# Randomly split the interactions into train / test using a 0.95 ratio.
# NOTE(review): no seed is passed explicitly, so the split relies on the helper's default.
train, test = python_random_split(data, 0.95)

In [8]:
# Build the cornac dataset from the training rows, fed as plain tuples without the index.
train_set = cornac.data.Dataset.from_uir(
    train.itertuples(index=False),
    seed=SEED,
)

print(f'Number of users: {train_set.num_users}')
print(f'Number of items: {train_set.num_items}')

Number of users: 594507
Number of items: 7214


In [25]:
# Configure the BiVAECF model from the constants defined in the configuration cell.
bivae = cornac.models.BiVAECF(
    k=LATENT_DIM,
    encoder_structure=ENCODER_DIMS,
    act_fn=ACT_FUNC,
    likelihood=LIKELIHOOD,
    n_epochs=NUM_EPOCHS,
    batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    beta_kl=BETA_KL,
    seed=SEED,
    # Use the GPU whenever torch reports one as available.
    use_gpu=torch.cuda.is_available(),
    verbose=True
)
# Train on the full training set and report the elapsed time
# (the recorded run took about 11,700 seconds).
with Timer() as t:
    bivae.fit(train_set)
print("Took {} seconds for training.".format(t))
# Disabled alternatives kept for reference: load a previously saved model instead of
# training, or save the trained model and copy it back to Drive.
# bivae.load('/data/model.pkl')
# bivae.save('/data/')
# !cp /data/BiVAECF/* gdrive/MyDrive/MadeProject/

  0%|          | 0/200 [00:00<?, ?it/s]

Took 11697.6450 seconds for training.


In [10]:
# Mapping from h3 cell to the chains valid in it; used later to filter predictions.
# NOTE(review): read_pickle executes arbitrary code from the file; only load trusted pickles.
h3_to_chains = pd.read_pickle('/data/h3_to_chains.pkl')
# Positional index -> h3 key, in key iteration order, and its inverse.
r_h3_to_index = dict(enumerate(h3_to_chains.keys()))
h3_to_index = {h3: i for i, h3 in r_h3_to_index.items()}

In [11]:
# Load the validation interactions, keep the three columns needed, normalise the ids to
# int, and drop rows whose user or chain is absent from the training set's id maps.
val_df = (
    pd.read_pickle('/data/test_VALID.pkl')[['customer_id', 'h3', 'chain_id']]
    .rename(columns={"customer_id": "user_id"})
    .astype({'user_id': int, 'chain_id': int})
    .query('user_id in @train_set.uid_map')
    .query('chain_id in @train_set.iid_map')
)
# Collapse to one row per (user_id, h3) holding the set of chains seen for that pair.
val_df = (
    pd.pivot_table(
        val_df,
        values=['chain_id'],
        index=['user_id', 'h3'],
        aggfunc={'chain_id': set},
    )
    .reset_index()
    .rename(columns={"chain_id": "chains"})
)
val_df.head(100)

Unnamed: 0,user_id,h3,chains
0,0,89118108b43ffff,{28720}
1,0,89118134503ffff,{28720}
2,0,89118134513ffff,{28720}
3,0,89118134517ffff,{28720}
4,0,8911813456bffff,{28720}
...,...,...,...
95,0,891181a0657ffff,{28720}
96,0,891181a065bffff,{28720}
97,0,891181a068bffff,{28720}
98,0,891181a06c3ffff,{28720}


In [26]:
import numpy as np

def predict_for_user(model, uid_map, r_iid_map, h3_to_valid_items, user_id, h3, top_k=None):
    """Return the highest-scoring items for ``user_id`` that are valid in cell ``h3``.

    Parameters
    ----------
    model : object exposing ``score(user_idx)``, which yields one score per item index.
    uid_map : mapping from raw user id to the model's internal user index.
    r_iid_map : mapping from the model's internal item index back to the raw item id.
    h3_to_valid_items : mapping from h3 cell to a container of raw item ids allowed there.
    user_id : raw user id; must be a key of ``uid_map`` (a missing key raises ``KeyError``).
    h3 : h3 cell to predict for.
    top_k : maximum number of items to return. ``None`` (the default) falls back to the
        module-level ``TOP_K``, which is the cut-off this function always used before.

    Returns
    -------
    set
        Raw item ids of at most ``top_k`` items with a strictly positive score that are
        valid in ``h3``; an empty set when ``h3`` is unknown. The result is a set, so the
        ranking order of the selected items is not preserved.
    """
    import heapq  # local import so the function does not depend on another cell's imports

    if h3 not in h3_to_valid_items:
        return set()
    if top_k is None:
        top_k = TOP_K
    valid_items = h3_to_valid_items[h3]
    item_scores = model.score(uid_map[user_id])
    candidates = (
        (r_iid_map[item_idx], score)
        for item_idx, score in enumerate(item_scores)
        if score > 0 and r_iid_map[item_idx] in valid_items
    )
    # nlargest keeps only the best top_k pairs instead of sorting every candidate.
    best = heapq.nlargest(top_k, candidates, key=lambda pair: pair[1])
    return {item for item, _ in best}

def old_items(df, user_id):
    """Return the set of distinct ``itemID`` values in ``df`` on rows whose ``userID`` equals ``user_id``."""
    is_user_row = df['userID'] == user_id
    user_items = df.loc[is_user_row, 'itemID']
    return set(user_items.unique())

def metric(y_true, y_pred, y_old, at1=10, at2=30, average=True):
    """
    new_prec@at1 + new_prec@at2 + 1/2 * (prec@at1 + prec@at2)

    For every (true, predicted, old) triple:
      * prec@k     = |first k true items & first k predicted items| / k
      * new_prec@k = same, after removing the "old" items from the predictions

    The "first k" items are taken by slicing ``list(...)`` of each input, so they follow
    whatever iteration order the input has. An ``old`` entry that is neither a set nor a
    list is treated as empty.

    Returns a 3-tuple ``(total, new, all)``: the per-row means when ``average`` is true,
    otherwise the per-row lists.
    """
    def _precision_pair(truth, preds, seen, k):
        # Plain and "new items only" precision at cut-off k.
        truth_k = set(truth[:k])
        preds_k = set(preds[:k])
        plain = len(truth_k & preds_k) / k
        fresh = len((preds_k - seen) & truth_k) / k
        return plain, fresh

    totals, news, alls = [], [], []
    for truth, preds, seen in zip(y_true, y_pred, y_old):
        truth = list(truth)
        preds = list(preds)
        seen = set(seen) if isinstance(seen, (set, list)) else set()

        prec1, new_prec1 = _precision_pair(truth, preds, seen, at1)
        prec2, new_prec2 = _precision_pair(truth, preds, seen, at2)

        totals.append(new_prec1 + new_prec2 + 0.5 * (prec1 + prec2))
        news.append(new_prec1 + new_prec2)
        alls.append(prec1 + prec2)

    if not average:
        return totals, news, alls
    return np.mean(totals), np.mean(news), np.mean(alls)

In [27]:
%%time
uid_map = bivae.train_set.uid_map
iid_map = bivae.train_set.iid_map
r_iid_map = {v: k for k, v in iid_map.items()}
val_df['pred_chains'] = val_df.apply(
    lambda x: predict_for_user(bivae, uid_map, r_iid_map, h3_to_chains, x.user_id, x.h3),
    axis=1
)
val_df['old_chains'] = val_df.apply(
    lambda x: old_items(train, x.user_id),
    axis=1
)

CPU times: user 38min 7s, sys: 7.03 s, total: 38min 14s
Wall time: 38min 6s


In [28]:
# Score the predictions: (total, new-items-only, all-items) averaged over validation rows.
scores = metric(
    y_true=val_df['chains'],
    y_pred=val_df['pred_chains'],
    y_old=val_df['old_chains'],
)
print('total, new, all = ', scores)

total, new, all =  (0.04271075137793088, 0.011887986530427934, 0.06164552969500586)


### total, new, all =  (0.053971209243539216, 0.020153075695999743, 0.06763626709507893)

### total, new, all =  (0.055129147691043955, 0.01820479830813461, 0.07384869876581869)

### total, new, all =  (0.044247408083104726, 0.01291613058405208, 0.06266255499810527)

### total, new, all =  (0.04271075137793088, 0.011887986530427934, 0.06164552969500586)