<a href="https://colab.research.google.com/github/Lausti98/bsc-recsys/blob/main/bsc_initial_dataset_selection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install recpack

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## Load dataset

In [None]:
from recpack.datasets import MovieLens100K
from recpack.preprocessing.filters import MinRating, MinUsersPerItem, MinItemsPerUser
from recpack.scenarios import WeakGeneralization

ml_loader = MovieLens100K(path='datasets/', filename='ml-1k.csv', use_default_filters=False)

# The loader's default filters are switched off above, so register our own.
# They are added in the order listed here.
for custom_filter in (
    # Consider ratings 2 or higher as interactions
    MinRating(2, ml_loader.RATING_IX),
    # Keep users with at least 5 interactions
    MinItemsPerUser(5, ml_loader.ITEM_IX, ml_loader.USER_IX),
    # Keep items with at least 30 interactions
    MinUsersPerItem(30, ml_loader.ITEM_IX, ml_loader.USER_IX),
):
    ml_loader.add_filter(custom_filter)

# Load the interactions with the filters above applied.
data = ml_loader.load()

  0%|          | 0/85975 [00:00<?, ?it/s]

  0%|          | 0/85975 [00:00<?, ?it/s]

In [None]:
# Summary of the loaded interaction data, one value per line:
# active item count, active user count, matrix properties and density.
for attribute in ('num_active_items', 'num_active_users', 'properties', 'density'):
    print(getattr(data, attribute))

763
943
InteractionMatrix.InteractionMatrixProperties(num_users=943, num_items=763, has_timestamps=True)
0.11949120858807881


In [None]:
# Split data into train, test and validation set. Random split not accounting for time data.
# A fixed seed keeps the random split identical across kernel restarts, so the
# metrics reported at the end of the notebook can be reproduced.
SPLIT_SEED = 42
scenario = WeakGeneralization(0.75, validation=True, seed=SPLIT_SEED)
scenario.split(data)

0it [00:00, ?it/s]

0it [00:00, ?it/s]

In [None]:
from recpack.pipelines import PipelineBuilder

# The builder collects the data, algorithms and metrics for the experiment;
# the pipeline itself is only created later via builder.build().
builder = PipelineBuilder()

# Use the splits held by `scenario` as the pipeline's data.
builder.set_data_from_scenario(scenario)

## Popularity baseline model

In [None]:
# Popularity baseline: no real parameters to optimise, so it is added without
# a hyperparameter grid and runs with its default settings.
builder.add_algorithm('Popularity')

## Collaborative filtering 

In [None]:
# Item-based nearest-neighbours recommendation
# (Recommender Systems: An Introduction, section 2.2).
# The grid pairs each value of K with each similarity measure.
itemknn_grid = dict(
    K=[100, 200, 500, 700, 900, 1000],
    similarity=['cosine', 'conditional_probability'],
)
builder.add_algorithm('ItemKNN', grid=itemknn_grid)



In [None]:
# SLIM is added without a grid, so only its default hyperparameters are evaluated.
builder.add_algorithm('SLIM')
# Further candidate algorithms, currently disabled. Uncomment a block to
# include that algorithm (with the listed grid) in the pipeline.
#builder.add_algorithm('ItemPNN', grid={
#    'K': [100, 200, 500],
#    'similarity': ['cosine'],
#})
#builder.add_algorithm('NMFItemToItem', grid={
#    'num_components': [100, 200, 500],
#    'seed': [1],
#})
#builder.add_algorithm('SVDItemToItem', grid={
#    'num_components': [100, 200, 500],
#    'seed': [1],
#})
#builder.add_algorithm('Prod2Vec', grid={
#    'num_components': [100, 200, 500],
#    'seed': [1],
#})


In [None]:
# Item-based nearest neighbours that accounts for time-series data (currently disabled)
#builder.add_algorithm('TARSItemKNN', grid={
#    'K': [100, 200, 500],
#    'similarity': ['cosine', 'conditional_probability'],
#})

## Hybrid algorithm

In [None]:
# Unified item- and user-based nearest-neighbours recommendation
# (Recommender Systems: An Introduction, section 5.2.1).
# Presumably Ku / Ki are the user- and item-neighbourhood sizes; verify against the RecPack docs.
kunn_grid = dict(
    Ku=[100, 200, 500],
    Ki=[100, 200, 500],
)
builder.add_algorithm('KUNN', grid=kunn_grid)

## Add metrics
The NDCG metric is the Normalized Discounted Cumulative Gain. It scores the algorithms not by their precision (the share of correct predictions) but by the gain of their predictions. Gain is the proximity of a prediction to the real value.

See article https://machinelearninginterview.com/topics/machine-learning/ndcg-evaluation-metric-for-recommender-systems/ for more information
$$NDCG_{@K}= \frac{DCG_{@K}}{IDCG_{@K}}$$

$$DCG_{@K}=\sum_{i=1}^{K} \frac{G_i}{\log_2 (i+1)}$$

$$IDCG_{@K}=\sum_{i=1}^{K^{ideal}} \frac{G^{ideal}_i}{\log_2 (i+1)}$$

In [None]:
# Hyperparameters are selected on NDCG@10.
builder.set_optimisation_metric('NDCGK', K=10)

# Metrics to report, each with the list of cut-offs K it is registered with.
for metric_name, cutoffs in (
    ('NDCGK', [10, 20, 50]),
    ('CoverageK', [10, 20]),
    ('RecallK', [10, 20, 50]),
    ('PrecisionK', [10, 20, 50]),
):
    builder.add_metric(metric_name, K=cutoffs)

## Run pipeline

In [None]:
# Create the pipeline from everything registered on the builder.
pipeline = builder.build()
# Fit and evaluate the registered algorithms; progress and fit times are logged below.
pipeline.run()
# Last expression of the cell, so the metrics table is shown as the cell output.
pipeline.get_metrics()

  0%|          | 0/4 [00:00<?, ?it/s]

2023-03-02 18:43:46,245 - base - recpack - INFO - Fitting Popularity complete - Took 0.0378s


INFO:recpack:Fitting Popularity complete - Took 0.0378s


2023-03-02 18:43:47,723 - base - recpack - INFO - Fitting ItemKNN complete - Took 0.319s


INFO:recpack:Fitting ItemKNN complete - Took 0.319s
  self._set_arrayXarray(i, j, x)


2023-03-02 18:43:48,230 - base - recpack - INFO - Fitting ItemKNN complete - Took 0.272s


INFO:recpack:Fitting ItemKNN complete - Took 0.272s


2023-03-02 18:43:48,913 - base - recpack - INFO - Fitting ItemKNN complete - Took 0.492s


INFO:recpack:Fitting ItemKNN complete - Took 0.492s


2023-03-02 18:43:49,646 - base - recpack - INFO - Fitting ItemKNN complete - Took 0.276s


INFO:recpack:Fitting ItemKNN complete - Took 0.276s


2023-03-02 18:43:50,445 - base - recpack - INFO - Fitting ItemKNN complete - Took 0.64s


INFO:recpack:Fitting ItemKNN complete - Took 0.64s


2023-03-02 18:43:51,751 - base - recpack - INFO - Fitting ItemKNN complete - Took 1.05s


INFO:recpack:Fitting ItemKNN complete - Took 1.05s


2023-03-02 18:43:53,989 - base - recpack - INFO - Fitting ItemKNN complete - Took 1.71s


INFO:recpack:Fitting ItemKNN complete - Took 1.71s


2023-03-02 18:43:56,511 - base - recpack - INFO - Fitting ItemKNN complete - Took 2.07s


INFO:recpack:Fitting ItemKNN complete - Took 2.07s


2023-03-02 18:43:58,164 - base - recpack - INFO - Fitting ItemKNN complete - Took 1.23s


INFO:recpack:Fitting ItemKNN complete - Took 1.23s


2023-03-02 18:43:59,793 - base - recpack - INFO - Fitting ItemKNN complete - Took 1.36s


INFO:recpack:Fitting ItemKNN complete - Took 1.36s


2023-03-02 18:44:02,046 - base - recpack - INFO - Fitting ItemKNN complete - Took 1.8s


INFO:recpack:Fitting ItemKNN complete - Took 1.8s


2023-03-02 18:44:03,716 - base - recpack - INFO - Fitting ItemKNN complete - Took 1.25s


INFO:recpack:Fitting ItemKNN complete - Took 1.25s


2023-03-02 18:44:04,939 - base - recpack - INFO - Fitting ItemKNN complete - Took 0.946s


INFO:recpack:Fitting ItemKNN complete - Took 0.946s


2023-03-02 18:44:17,831 - base - recpack - INFO - Fitting SLIM complete - Took 12.0s


INFO:recpack:Fitting SLIM complete - Took 12.0s


2023-03-02 18:44:18,744 - base - recpack - INFO - Fitting KUNN complete - Took 0.139s


INFO:recpack:Fitting KUNN complete - Took 0.139s


2023-03-02 18:44:19,514 - base - recpack - INFO - Fitting KUNN complete - Took 0.129s


INFO:recpack:Fitting KUNN complete - Took 0.129s


2023-03-02 18:44:20,398 - base - recpack - INFO - Fitting KUNN complete - Took 0.136s


INFO:recpack:Fitting KUNN complete - Took 0.136s


2023-03-02 18:44:21,906 - base - recpack - INFO - Fitting KUNN complete - Took 0.247s


INFO:recpack:Fitting KUNN complete - Took 0.247s


2023-03-02 18:44:22,756 - base - recpack - INFO - Fitting KUNN complete - Took 0.218s


INFO:recpack:Fitting KUNN complete - Took 0.218s


2023-03-02 18:44:23,736 - base - recpack - INFO - Fitting KUNN complete - Took 0.216s


INFO:recpack:Fitting KUNN complete - Took 0.216s


2023-03-02 18:44:25,643 - base - recpack - INFO - Fitting KUNN complete - Took 0.622s


INFO:recpack:Fitting KUNN complete - Took 0.622s


2023-03-02 18:44:27,623 - base - recpack - INFO - Fitting KUNN complete - Took 0.932s


INFO:recpack:Fitting KUNN complete - Took 0.932s


2023-03-02 18:44:29,331 - base - recpack - INFO - Fitting KUNN complete - Took 0.6s


INFO:recpack:Fitting KUNN complete - Took 0.6s


2023-03-02 18:44:31,292 - base - recpack - INFO - Fitting KUNN complete - Took 0.639s


INFO:recpack:Fitting KUNN complete - Took 0.639s


Unnamed: 0,NDCGK_10,NDCGK_20,NDCGK_50,CoverageK_10,CoverageK_20,RecallK_10,RecallK_20,RecallK_50,PrecisionK_10,PrecisionK_20,PrecisionK_50
Popularity(K=200),0.235652,0.231848,0.261325,0.057667,0.099607,0.117798,0.179125,0.308064,0.210817,0.173807,0.132874
"ItemKNN(K=700,normalize_X=False,normalize_sim=False,pop_discount=None,similarity=cosine)",0.387507,0.381751,0.403314,0.228047,0.310616,0.202175,0.301821,0.458827,0.325663,0.259385,0.174062
"SLIM(fit_intercept=True,ignore_neg_weights=True,l1_reg=0.0005,l2_reg=5e-05)",0.39641,0.404468,0.44515,0.363041,0.477064,0.217518,0.335153,0.523204,0.329799,0.27386,0.190498
"KUNN(Ki=500,Ku=100)",0.438982,0.439078,0.475721,0.366972,0.492792,0.228623,0.3558,0.559559,0.370626,0.301273,0.204793
