<a href="https://colab.research.google.com/github/Lausti98/bsc-recsys/blob/main/bsc_amazon_data_initial.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Amazon Reviews datasets

The notebook performs recommenations for amazon datasets: 
* Amazon Fashion 
* Amazon Prime Pantry
* Amazon Software 

There are many more datasets available (https://cseweb.ucsd.edu/~jmcauley/datasets/amazon_v2/). The three specific dataset categories are chosen specifically because of the sizes of the datasets (approx 500K reviews)

In [1]:
!pip install recpack

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting recpack
  Downloading recpack-0.3.5-py3-none-any.whl (241 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/241.2 KB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m235.5/241.2 KB[0m [31m8.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m241.2/241.2 KB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
Collecting hyperopt==0.2.*,>=0.2.7
  Downloading hyperopt-0.2.7-py2.py3-none-any.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m41.0 MB/s[0m eta [36m0:00:00[0m
Collecting PyYAML==5.*,>=5.4.1
  Downloading PyYAML-5.4.1-cp39-cp39-manylinux1_x86_64.whl (630 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m630.1/630.1 KB[0m [31m40.3 MB/s[0m eta [36m0:00:00[0m
Collecting py4j
  Downloading

In [2]:
# Imports 
from pathlib import Path
import pandas as pd
import re
from sklearn import preprocessing
from recpack.preprocessing.preprocessors import DataFramePreprocessor
from recpack.preprocessing.filters import MinItemsPerUser, MinUsersPerItem
from recpack.scenarios import WeakGeneralization
from recpack.pipelines import PipelineBuilder


## Load & process data

In [5]:
fpath = 'amazon_fashion_rating_only.csv'

df = pd.read_csv(fpath)

In [6]:
proc = DataFramePreprocessor(item_ix='product_id', user_ix='user_id')
proc.add_filter(MinItemsPerUser(5, item_ix='product_id', user_ix='user_id'))
proc.add_filter(MinUsersPerItem(5, item_ix='product_id', user_ix='user_id'))

# Assuming you have loaded a dataframe called df
interaction_matrix = proc.process(df)

  0%|          | 0/6404 [00:00<?, ?it/s]

  0%|          | 0/6404 [00:00<?, ?it/s]

## Split data into train, test and validation

In [7]:
# Split data into train, test and validation set. Random split not accounting for time data.
scenario = WeakGeneralization(0.75, validation=True)
scenario.split(interaction_matrix)

0it [00:00, ?it/s]

0it [00:00, ?it/s]

In [12]:
from recpack.algorithms import ItemKNN
model = ItemKNN(K=10)
model.fit(scenario.full_training_data)

2023-03-09 18:58:48,813 - base - recpack - INFO - Fitting ItemKNN complete - Took 0.03s


INFO:recpack:Fitting ItemKNN complete - Took 0.03s


In [19]:
X_test = scenario.test_data_in.items_in([7])
print(X_test)
pred = model.predict(X_test)
print(pred)

<recpack.matrix.interaction_matrix.InteractionMatrix object at 0x7fa81cfe94f0>
  (5, 1)	0.13363062095621217
  (5, 13)	0.9999999999999997
  (47, 1)	0.13363062095621217
  (47, 13)	0.9999999999999997


## Build pipeline

In [None]:
builder = PipelineBuilder()
builder.set_data_from_scenario(scenario)

In [None]:
builder.add_algorithm('Popularity') # No real parameters to optimise

In [None]:
# ITEM BASED Nearest Neighbors Recommendation Recommender Systems: An Introduction 2.2
builder.add_algorithm('ItemKNN', grid={
    'K': [10, 30, 50, 100, 200, 500],
    'similarity': ['cosine'],# 'conditional_probability'],
})




In [None]:
builder.set_optimisation_metric('NDCGK', K=10)
builder.add_metric('NDCGK', K=[10, 20, 50])
builder.add_metric('CoverageK', K=[10, 20])
builder.add_metric('RecallK', K=[10, 20, 50])
builder.add_metric('PrecisionK', K=[10, 20, 50])

In [None]:
pipeline = builder.build()
pipeline.run()
pipeline.get_metrics()

  0%|          | 0/2 [00:00<?, ?it/s]

2023-03-06 15:11:38,488 - base - recpack - INFO - Fitting Popularity complete - Took 0.00471s


INFO:recpack:Fitting Popularity complete - Took 0.00471s


2023-03-06 15:11:39,336 - base - recpack - INFO - Fitting ItemKNN complete - Took 0.0246s


INFO:recpack:Fitting ItemKNN complete - Took 0.0246s


2023-03-06 15:11:39,425 - base - recpack - INFO - Fitting ItemKNN complete - Took 0.0294s


INFO:recpack:Fitting ItemKNN complete - Took 0.0294s


2023-03-06 15:11:39,519 - base - recpack - INFO - Fitting ItemKNN complete - Took 0.0335s


INFO:recpack:Fitting ItemKNN complete - Took 0.0335s


2023-03-06 15:11:39,611 - base - recpack - INFO - Fitting ItemKNN complete - Took 0.0301s


INFO:recpack:Fitting ItemKNN complete - Took 0.0301s


2023-03-06 15:11:39,702 - base - recpack - INFO - Fitting ItemKNN complete - Took 0.0303s


INFO:recpack:Fitting ItemKNN complete - Took 0.0303s


2023-03-06 15:11:39,788 - base - recpack - INFO - Fitting ItemKNN complete - Took 0.0291s


INFO:recpack:Fitting ItemKNN complete - Took 0.0291s


2023-03-06 15:11:39,880 - base - recpack - INFO - Fitting ItemKNN complete - Took 0.0328s


INFO:recpack:Fitting ItemKNN complete - Took 0.0328s


Unnamed: 0,NDCGK_10,NDCGK_20,NDCGK_50,CoverageK_10,CoverageK_20,RecallK_10,RecallK_20,RecallK_50,PrecisionK_10,PrecisionK_20,PrecisionK_50
Popularity(K=200),0.07082,0.093838,0.127294,0.042493,0.073654,0.14485,0.234263,0.401693,0.015594,0.012589,0.008755
"ItemKNN(K=50,normalize_X=False,normalize_sim=False,pop_discount=None,similarity=cosine)",0.24966,0.273309,0.303567,0.988669,0.994334,0.388317,0.479208,0.626729,0.041631,0.026109,0.01382
