In [1]:
# Reload edited project modules automatically before each cell runs,
# so changes under src/ take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Experiments with ItemKNN model

In [2]:
from pathlib import Path

import scipy.sparse
import numpy as np

import src.io as io
import src.evaluation as evaluation
import src.evaluation.splits
import src.evaluation.base

from src.algorithm.baseline.iknn import ItemKNN

## Datasets

In [3]:
# Available datasets: name -> (data directory, retarget flag).
# RETARGET is True for Frappe and False for the others.
DATASETS = {
    'Mobile_Frappe': (Path('../../../data/CARS/Mobile_Frappe/'), True),
    'Food_com': (Path('../../../data/CARS/Food_com/'), False),
    'TripAdvisor': (Path('../../../data/CARS/TripAdvisor/'), False),
}

# Change only this name to switch datasets; the directory and the retarget
# flag are looked up together, so they cannot get out of sync.
DATASET = 'Food_com'

DATA_DIR, RETARGET = DATASETS[DATASET]

In [4]:
# Fixed experiment settings -- these normally stay the same across datasets.

# Interactions file inside the selected dataset directory.
INTERACTIONS = DATA_DIR.joinpath('interactions.csv')

# SEED is passed to the train/validation split, SEED2 to the k-fold evaluation.
SEED, SEED2 = 123456, 78910

# Identifiers passed as user_id / item_id to the interaction parser.
USER_ID, ITEM_ID = 'user', 'item'

## Define algorithm and hyperparameter ranges

In [5]:
# Model class under study; it is handed to the grid search and later
# instantiated with the best hyperparameters.
ALG = ItemKNN

# Candidate values per hyperparameter.
K = [1, 10, 20, 50, 100, 200, 500, 1000, None]
NORMALIZE = [True, False]

# Search grid, keyed by the hyperparameter names used when constructing ALG.
HYPERPARAMS = dict(k=K, normalize=NORMALIZE)
HYPERPARAMS

{'k': [1, 10, 20, 50, 100, 200, 500, 1000, None], 'normalize': [True, False]}

## Parse data

In [6]:
# Load the interactions file, then show the parsed frame as the cell output.
data = io.parse_interactions_with_context(
    INTERACTIONS,
    item_id=ITEM_ID,
    user_id=USER_ID,
)
data.df

Unnamed: 0,userId,itemId,season,weekday
0,19211,6280,1,1
1,17684,6280,2,2
2,12110,6280,3,3
3,7184,6280,3,1
4,17837,6280,2,4
...,...,...,...,...
388357,895,7799,3,7
388358,4100,7799,4,2
388359,2940,7799,1,4
388360,757,7799,2,2


## Make train/validation split for hyperparameter tuning

In [7]:
# Training data plus the two validation parts consumed by the grid search,
# produced with the fixed SEED.
(Xtrain, Xval_in, Xval_out) = evaluation.splits.leave_one_out_split_non_context(
    data,
    seed=SEED,
)

## Perform grid search on the validation set

In [8]:
%%time
score, best_hyperparams = evaluation.base.gridsearch(ALG, Xtrain, Xval_in, Xval_out, HYPERPARAMS, retarget=RETARGET)
f"Best score of {score} achieved with {best_hyperparams}."

  0%|          | 0/18 [00:00<?, ?it/s]

Training model ItemKNN with hyperparameters {'k': 1, 'normalize': True}
density of model 6.628662335940607e-05
Evaluating with 22178 users
MRR@5 0.006
MRR@20 0.007
Average Recall@5 0.009
Average Recall@20 0.012
Training model ItemKNN with hyperparameters {'k': 10, 'normalize': True}
density of model 0.0006628662335940607
Evaluating with 22178 users
MRR@5 0.011
MRR@20 0.012
Average Recall@5 0.015
Average Recall@20 0.027
Training model ItemKNN with hyperparameters {'k': 20, 'normalize': True}
density of model 0.0013257324671881213
Evaluating with 22178 users
MRR@5 0.011
MRR@20 0.012
Average Recall@5 0.016
Average Recall@20 0.029
Training model ItemKNN with hyperparameters {'k': 50, 'normalize': True}
density of model 0.0033141422295635387
Evaluating with 22178 users
MRR@5 0.011
MRR@20 0.012
Average Recall@5 0.016
Average Recall@20 0.032
Training model ItemKNN with hyperparameters {'k': 100, 'normalize': True}
density of model 0.006619470262755675
Evaluating with 22178 users
MRR@5 0.01
MR

"Best score of 0.012342715308944149 achieved with {'k': 20, 'normalize': True}."

## Evaluate the model with the optimal hyperparameters using k-fold cross-validation

In [9]:
%%time
alg = ALG(**best_hyperparams)
results = evaluation.base.kFoldsEval(alg, data, nr_folds=5, seed=SEED2, retarget=RETARGET)

density of model 0.0013257324671881213
Evaluating with 22178 users
density of model 0.0013257324671881213
Evaluating with 22178 users
density of model 0.0013257324671881213
Evaluating with 22178 users
density of model 0.0013257324671881213
Evaluating with 22178 users
density of model 0.0013257324671881213
Evaluating with 22178 users
MRR@5 0.01 (0.001)
MRR@20 0.012 (0.001)
Average Recall@5 0.016 (0.001)
Average Recall@20 0.03 (0.001)
CPU times: user 30.8 s, sys: 10.5 s, total: 41.3 s
Wall time: 41.3 s
