In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up automatically before each cell is executed.
%load_ext autoreload
%autoreload 2

# Experiments with ItemKNN model

In [2]:
from pathlib import Path

import scipy.sparse
import numpy as np

import src.io as io
import src.evaluation as evaluation
import src.evaluation.splits
import src.evaluation.base

from src.algorithm.baseline.iknn import ItemKNN

## Datasets

In [3]:
## Dataset selection.
## Each entry pairs a data directory with the matching RETARGET flag so the two
## cannot be toggled out of sync (RETARGET is True for Frappe, False for the others).
DATASETS = {
    'Mobile_Frappe': (Path('../../../data/CARS/Mobile_Frappe/'), True),
    'Food_com': (Path('../../../data/CARS/Food_com/'), False),
    'TripAdvisor': (Path('../../../data/CARS/TripAdvisor/'), False),
}

## Change this key to switch datasets; an unknown key raises KeyError right away.
DATASET = 'TripAdvisor'

DATA_DIR, RETARGET = DATASETS[DATASET]

In [4]:
# Fixed settings -- these normally stay the same whichever dataset is selected.

# Interactions file located inside the selected dataset directory.
INTERACTIONS = DATA_DIR.joinpath('interactions.csv')

# SEED is used for the train/validation split, SEED2 for the k-fold evaluation.
SEED, SEED2 = 123456, 78910

# Identifier names handed to the interaction parser as `user_id` / `item_id`.
USER_ID = 'user'
ITEM_ID = 'item'

## Define algorithm and hyperparameter ranges

In [5]:
# Model class under test; later cells instantiate it with keyword hyperparameters.
ALG = ItemKNN

# Candidate values for each hyperparameter.
K = [1, 10, 20, 50, 100, 200, 500, 1000, None]
NORMALIZE = [True, False]

# Search grid keyed by hyperparameter name; displayed as the cell output.
HYPERPARAMS = dict(k=K, normalize=NORMALIZE)
HYPERPARAMS

{'k': [1, 10, 20, 50, 100, 200, 500, 1000, None], 'normalize': [True, False]}

## Parse data

In [6]:
# Parse the interactions file using the configured user/item identifier names.
data = io.parse_interactions_with_context(
    INTERACTIONS,
    user_id=USER_ID,
    item_id=ITEM_ID,
)
# Display the parsed interactions table as the cell output.
data.df

Unnamed: 0,userId,itemId,TripType,UserState
0,1899,1865,1,1
1,466,721,1,1
2,1992,451,2,1
3,466,1016,1,1
4,1441,414,3,1
...,...,...,...,...
12831,936,341,3,78
12832,1545,1936,3,78
12833,399,1556,2,79
12834,918,1380,5,79


## Make train/validation split for hyperparameter tuning

In [7]:
# Seeded leave-one-out split of the parsed data into the training set and the
# validation input/output sets used for hyperparameter tuning.
(Xtrain, Xval_in, Xval_out) = evaluation.splits.leave_one_out_split_non_context(
    data,
    seed=SEED,
)

## Perform grid search on the validation set

In [8]:
%%time
score, best_hyperparams = evaluation.base.gridsearch(ALG, Xtrain, Xval_in, Xval_out, HYPERPARAMS, retarget=RETARGET)
f"Best score of {score} achieved with {best_hyperparams}."

  0%|          | 0/18 [00:00<?, ?it/s]

Training model ItemKNN with hyperparameters {'k': 1, 'normalize': True}
density of model 0.0004271372217348988
Evaluating with 2362 users
MRR@5 0.003
MRR@20 0.004
Average Recall@5 0.006
Average Recall@20 0.01
Training model ItemKNN with hyperparameters {'k': 10, 'normalize': True}
density of model 0.003299315749281195
Evaluating with 2362 users
MRR@5 0.004
MRR@20 0.005
Average Recall@5 0.008
Average Recall@20 0.022
Training model ItemKNN with hyperparameters {'k': 20, 'normalize': True}
density of model 0.005144499893671821
Evaluating with 2362 users
MRR@5 0.003
MRR@20 0.005
Average Recall@5 0.009
Average Recall@20 0.029
Training model ItemKNN with hyperparameters {'k': 50, 'normalize': True}
density of model 0.007851864675954485
Evaluating with 2362 users
MRR@5 0.004
MRR@20 0.006
Average Recall@5 0.011
Average Recall@20 0.036
Training model ItemKNN with hyperparameters {'k': 100, 'normalize': True}
density of model 0.009150507790540989
Evaluating with 2362 users
MRR@5 0.004
MRR@20 0.0

"Best score of 0.008447168406440341 achieved with {'k': 200, 'normalize': False}."

## Evaluate the model with the optimal hyperparameters using k-fold cross-validation

In [9]:
%%time
alg = ALG(**best_hyperparams)
results = evaluation.base.kFoldsEval(alg, data, nr_folds=5, seed=SEED2, retarget=RETARGET)

density of model 0.009422967413707435
Evaluating with 2362 users
density of model 0.00941648027982252
Evaluating with 2362 users
density of model 0.009408371362466375
Evaluating with 2362 users
density of model 0.009427832764121123
Evaluating with 2362 users
density of model 0.00942945454759235
Evaluating with 2362 users
MRR@5 0.006 (0.001)
MRR@20 0.008 (0.001)
Average Recall@5 0.011 (0.001)
Average Recall@20 0.04 (0.002)
CPU times: user 1.1 s, sys: 198 ms, total: 1.29 s
Wall time: 1.28 s
