In [1]:
# Load IPython's autoreload extension and switch it to mode 2, so edits to
# imported modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Experiments with ItemKNN model

In [2]:
from pathlib import Path

import scipy.sparse
import numpy as np

import src.io as io
import src.evaluation as evaluation
import src.evaluation.splits
import src.evaluation.base

from src.algorithm.baseline.iknn import ItemKNN

## Datasets

In [3]:
# Dataset selection: keep exactly one (DATA_DIR, RETARGET) pair active.
# RETARGET must be True for Frappe and False for the other datasets.

# -- Mobile_Frappe (active) --
DATA_DIR = Path('../../../data/CARS/Mobile_Frappe/')
RETARGET = True

# -- Food_com --
# DATA_DIR = Path('../../../data/CARS/Food_com/')
# RETARGET = False

# -- TripAdvisor --
# DATA_DIR = Path('../../../data/CARS/TripAdvisor/')
# RETARGET = False

In [4]:
# Fixed settings shared by every dataset; these normally stay untouched.

# Interaction log inside the selected dataset directory.
INTERACTIONS = DATA_DIR.joinpath('interactions.csv')

# Identifiers handed to the parser as user_id / item_id.
USER_ID, ITEM_ID = 'user', 'item'

# Two separate seeds: SEED feeds the tuning split, SEED2 the k-fold evaluation.
SEED, SEED2 = 123456, 78910

## Define algorithm and hyperparameter ranges

In [5]:
# Model class to tune, plus the candidate values tried for each hyperparameter.
ALG = ItemKNN

# Candidate values for `k`; None is presumably "no limit" -- TODO confirm in ItemKNN.
K = [1, 10, 20, 50, 100, 200, 500, 1000, None]
# Candidate values for `normalize`.
NORMALIZE = [True, False]

# Grid handed to the grid search: hyperparameter name -> list of candidates.
HYPERPARAMS = dict(k=K, normalize=NORMALIZE)
HYPERPARAMS

{'k': [1, 10, 20, 50, 100, 200, 500, 1000, None], 'normalize': [True, False]}

## Parse data

In [6]:
# Parse the interaction log with the project's reader, passing the configured
# item/user identifiers, then show the parsed frame as the cell output.
data = io.parse_interactions_with_context(
    INTERACTIONS,
    item_id=ITEM_ID,
    user_id=USER_ID,
)
data.df

Unnamed: 0,userId,itemId,daytime,weather,weekday
0,0,0,1,1,1
1,1,1,2,2,2
2,2,2,3,2,3
3,3,3,4,0,4
4,4,4,5,1,4
...,...,...,...,...,...
95997,109,0,3,1,1
95998,37,16,4,2,1
95999,180,33,2,2,1
96000,445,751,3,1,1


## Make train/validation split for hyperparameter tuning

In [7]:
# Split the parsed data with the project's leave-one-out helper, seeded with SEED.
# The three results are consumed by the grid search as train / validation-in / validation-out.
Xtrain, Xval_in, Xval_out = evaluation.splits.leave_one_out_split_non_context(
    data,
    seed=SEED,
)

## Perform grid search on the validation set

In [8]:
%%time
score, best_hyperparams = evaluation.base.gridsearch(ALG, Xtrain, Xval_in, Xval_out, HYPERPARAMS, retarget=RETARGET)
f"Best score of {score} achieved with {best_hyperparams}."

  0%|          | 0/18 [00:00<?, ?it/s]

Training model ItemKNN with hyperparameters {'k': 1, 'normalize': True}
density of model 0.00024411921665179684
Evaluating with 816 users
MRR@5 0.062
MRR@20 0.066
Average Recall@5 0.109
Average Recall@20 0.175
Training model ItemKNN with hyperparameters {'k': 10, 'normalize': True}
density of model 0.002369717460548027
Evaluating with 816 users
MRR@5 0.077
MRR@20 0.103
Average Recall@5 0.162
Average Recall@20 0.4
Training model ItemKNN with hyperparameters {'k': 20, 'normalize': True}
density of model 0.004537277490192115
Evaluating with 816 users
MRR@5 0.075
MRR@20 0.101
Average Recall@5 0.162
Average Recall@20 0.424
Training model ItemKNN with hyperparameters {'k': 50, 'normalize': True}
density of model 0.009561639616395192
Evaluating with 816 users
MRR@5 0.076
MRR@20 0.105
Average Recall@5 0.157
Average Recall@20 0.428
Training model ItemKNN with hyperparameters {'k': 100, 'normalize': True}
density of model 0.01507721575839339
Evaluating with 816 users
MRR@5 0.078
MRR@20 0.106
Ave

"Best score of 0.10632453837513822 achieved with {'k': 100, 'normalize': True}."

## Evaluate the model with the optimal hyperparameters using k-fold cross-validation

In [9]:
%%time
alg = ALG(**best_hyperparams)
results = evaluation.base.kFoldsEval(alg, data, nr_folds=5, seed=SEED2, retarget=RETARGET)

density of model 0.015061426953336308
Evaluating with 816 users
density of model 0.015054321991060621
Evaluating with 816 users
density of model 0.015065981416333543
Evaluating with 816 users
density of model 0.015056568859472591
Evaluating with 816 users
density of model 0.01505760120441863
Evaluating with 816 users
MRR@5 0.071 (0.006)
MRR@20 0.101 (0.009)
Average Recall@5 0.155 (0.007)
Average Recall@20 0.451 (0.02)
CPU times: user 1.04 s, sys: 236 ms, total: 1.28 s
Wall time: 1.27 s
