In [1]:
from models.sansa import SANSA

# Hyper-parameters for the approximate-inverse step, kept in their own dict.
# NOTE(review): the training log names this method "umr"; the exact meaning of
# the individual umr_* knobs is defined in models.sansa - confirm there.
sansa_ainv_params = {
    "umr_scans": 3,
    "umr_finetune_steps": 10,
    "umr_loss_threshold": 1e-4,
}

# Full SANSA configuration. The training log echoes these values as
# "L2=20.0, target density=0.005000%, LDL^T method=icf".
sansa_config = {
    "l2": 20.0,
    "target_density": 5e-5,
    "ainv_params": sansa_ainv_params,
    "ldlt_method": "icf",
}

# Build the (untrained) model from the configuration.
sansa = SANSA.from_config(sansa_config)


In [2]:
# Load amazon books data

from datasets.amazonbook import Amazonbook

# Dataset configuration for Amazon Book.
# NOTE(review): with rewrite=False the recorded run loaded an already processed
# parquet file; presumably rewrite=True forces re-processing - confirm in
# datasets.amazonbook.
amazonbooks_data_config = dict(name="amazonbook", rewrite=False)

# Instantiate the dataset wrapper from the configuration.
amazonbooks_data = Amazonbook.from_config(amazonbooks_data_config)

2024-04-29 14:08:22,913 : [1/3] DATASET : Loading processed dataset datasets/data/amazonbook/dataset.parquet.


In [3]:
# Split configuration with a fixed seed for reproducibility.
# NOTE(review): with val_target_proportion=0.0 the recorded log reports the
# validation split with the same size as the train split - confirm this is
# intended.
amazon_split_config = dict(seed=42, val_target_proportion=0.0)

# create_splits returns a pair: the three splits and the time it took.
amazon_splits, amazon_split_time = amazonbooks_data.create_splits(amazon_split_config)
amazon_train, amazon_val, amazon_test = amazon_splits

2024-04-29 14:09:15,848 : [1/3] DATASET : Dataframe lengths | train_df: 2380730, val_df: 2380730, test_df: 2984108
2024-04-29 14:10:02,009 : [1/3] DATASET : Removing users [50736, 52234, 41589, 13647] from test inputs.
2024-04-29 14:10:02,639 : [1/3] DATASET : Splits information:
2024-04-29 14:10:02,640 : [1/3] DATASET : Train split info | n_users = 52643, n_items = 91599, n_ratings = 2380730, sparsity = 99.95%
2024-04-29 14:10:02,641 : [1/3] DATASET : Validation split info | n_users = 52643, n_items = 91599, n_ratings = 2380730, sparsity = 99.95%
2024-04-29 14:10:02,642 : [1/3] DATASET : Test split info | n_users = 52639, n_items = 91599, n_ratings = 2380661, sparsity = 99.95%
2024-04-29 14:10:02,644 : [1/3] DATASET : Execution of create_splits took at 97.698 seconds.


In [4]:
# Train SANSA on the training split produced by create_splits.
# The recorded run logged an intermediate matrix A of about 3964 MB
# (nnz: 330335853), so this cell is memory-hungry on the full dataset.
sansa.train(amazon_train)

2024-04-29 14:10:30,034 : [2/3] TRAINING : Train user-item matrix info | n_users = 52643, n_items = 91599, n_ratings = 2380730, sparsity = 99.95%
2024-04-29 14:10:30,037 : [2/3] TRAINING : Item-item matrix info | shape = (91599,91599)
2024-04-29 14:10:30,038 : [2/3] TRAINING : Training SANSA with L2=20.0, target density=0.005000%, LDL^T method=icf, approx. inverse method=umr...
2024-04-29 14:10:30,039 : [2/3] TRAINING : Loading item-user matrix...
2024-04-29 14:10:30,179 : [2/3] TRAINING : Constructing weights:
2024-04-29 14:10:39,884 : [2/3] TRAINING : Constructing A...
2024-04-29 14:10:42,083 : [2/3] TRAINING : A info | nnz: 330335853, size: 3964.4 MB
2024-04-29 14:10:56,577 : [2/3] TRAINING : Computing incomplete LL^T decomposition...
2024-04-29 14:11:27,482 : [2/3] TRAINING : L info | nnz: 419506, size: 5.400 MB, density: 0.005000%
2024-04-29 14:11:27,484 : [2/3] TRAINING : Scaling columns and creating D (LL^T -> L'DL'^T)
2024-04-29 14:11:27,498 : [2/3] TRAINING : Execution of ldlt

In [5]:
import pandas as pd

# Evaluate on novelty: build the model inputs for every test user and fetch
# their top-20 recommendations.

# All users known to the test split's encoder.
users = list(amazon_test.user_encoder.classes_)

# Items each user has rated (model input) and their held-out target items.
users_rated = amazon_test.get_rated_items(users)
targets = amazon_test.get_target_items(users)

# user_id -> list of target item_ids.
target_ids_dict = (
    targets.groupby("user_id", group_keys=True)["item_id"]
    .apply(list)
    .to_dict()
)

# Re-index users to consecutive integers 0..n-1 in the order they appear in
# target_ids_dict.
keys = list(target_ids_dict.keys())
users_to_arange = {user: i for i, user in enumerate(keys)}

# Silence the chained-assignment warning only for this one assignment.
# option_context restores whatever value the option had before (instead of
# hard-resetting it to "warn") and does so even if .map raises.
# NOTE(review): users that are missing from target_ids_dict are mapped to NaN
# by .map - confirm get_rated_items never returns such users.
with pd.option_context("mode.chained_assignment", None):
    users_rated["user_id"] = users_rated["user_id"].map(users_to_arange)

# Top-20 item ids and scores per user; the 20 here must match the list length
# passed to the novelty metric later in the notebook.
top_maxk_ids, top_maxk_scores = sansa.recommend(users_rated, 20)

2024-04-27 17:36:45,462 : [3/3] EVALUATION : Execution of _matmat took at 0.076 seconds.
2024-04-27 17:36:45,804 : [3/3] EVALUATION : Execution of _matmat took at 0.340 seconds.
2024-04-27 17:36:48,201 : [3/3] EVALUATION : Execution of _predict took at 2.814 seconds.


In [6]:
# Create dictionary of item occurrences for novelty metric
training_csr_matrix = amazon_train.get_csr_matrix()

# Column sums flattened to a 1-D array with one entry per item column,
# including items whose sum is zero.
# NOTE(review): this sums the stored values; it equals the number of
# interactions only if the matrix is binary - confirm.
item_occurrences = training_csr_matrix.sum(axis=0).A1

# Ids of the items that actually occur in the training data.
item_ids = item_occurrences.nonzero()[0]

# Create a dictionary of item IDs and their occurrences.
# The counts are indexed with item_ids so each id is paired with its own
# count; zipping the ids against the full array would shift every pair after
# the first zero-occurrence item.
item_occurrences_dict = dict(zip(item_ids, item_occurrences[item_ids]))

In [7]:
import recmetrics

# Novelty of the recommendation lists; the call returns a pair that is
# unpacked into an overall value and a second, per-list result.
amazon_book_novelty, amazon_book_novelty_topn = recmetrics.novelty(
    top_maxk_ids,           # recommended item ids per user
    item_occurrences_dict,  # item id -> occurrences in the training data
    len(users),             # number of users
    20,                     # length of each recommendation list
)



In [8]:
# Display the overall novelty value (first element returned by recmetrics.novelty).
amazon_book_novelty

9.840958832420865