In [1]:
from models.sansa import SANSA

sansa_config = {
    "l2": 2.5,
    "target_density": 0.0005,
    "ainv_params": {
        "umr_scans": 4,
        "umr_finetune_steps": 10,
        "umr_loss_threshold": 1e-4,
    },
    "ldlt_method": "icf",
    "ldlt_params": {},
}
     
sansa = SANSA.from_config(sansa_config)


In [2]:
# Load msd data. If there is a parquet file, it will not call read_raw_data

from datasets.msd import MSD

msd_data_config = {
    "name": "msd",
    "rewrite": False,
}

msd_dataset = MSD.from_config(msd_data_config)

2024-04-29 12:40:05,340 : [1/3] DATASET : Creating new dataset msd:
2024-04-29 12:40:05,341 : [1/3] DATASET : Loading raw dataset files from datasets/data/msd/ ...
2024-04-29 12:44:26,918 : [1/3] DATASET : Unifying dataset format...
2024-04-29 12:44:42,576 : [1/3] DATASET : Preprocessing dataset...
2024-04-29 12:46:23,406 : [1/3] DATASET : Saving processed dataset datasets/data/msd/dataset.parquet...


In [14]:
msd_split_config = {
    "n_val_users": 40000,
    "n_test_users": 40000,
    "seed": 42,
    "target_proportion": 0.2,
    "targets_newest": False,
}


(msd_train, msd_val, msd_test), msd_split_time = msd_dataset.create_splits(msd_split_config)

2024-04-29 12:57:14,136 : [1/3] DATASET : Dataframe lengths | train_df: 28917900, val_df: 2352979, test_df: 2362570
2024-04-29 12:58:33,809 : [1/3] DATASET : Splits information:
2024-04-29 12:58:33,813 : [1/3] DATASET : Train split info | n_users = 491355, n_items = 41140, n_ratings = 28917900, sparsity = 99.86%
2024-04-29 12:58:33,815 : [1/3] DATASET : Validation split info | n_users = 40000, n_items = 41140, n_ratings = 1897726, sparsity = 99.88%
2024-04-29 12:58:33,816 : [1/3] DATASET : Test split info | n_users = 40000, n_items = 41140, n_ratings = 1905499, sparsity = 99.88%
2024-04-29 12:58:33,817 : [1/3] DATASET : Execution of create_splits took at 98.694 seconds.


In [16]:
# Train and test use the same songs, so we don't have to get ALL song data
# Get unencoded item ids
item_ids = msd_dataset.item_decoder.inverse_transform(msd_train.item_encoder.classes_)

# maps song id to item id
mapping_dict = {}
for item_id, encoded_id in zip(item_ids, msd_train.item_encoder.classes_):
    mapping_dict[item_id] = encoded_id


In [17]:
# Get song to track mappings
raw_item_ids_mappings = "datasets/metadata/msd/taste_profile_song_to_tracks.txt"

# maps song id to track id
song_to_track = {}
no_mapping_cnt = 0
with open(raw_item_ids_mappings) as f:
    for line in f:
        line = line.strip()
        line = line.split()

        song_id = line[0]
        # songs can map to multiple tracks
        track_ids = line[1:]

        if len(track_ids) > 0:
            song_to_track[song_id] = track_ids[0]
        else:
            song_to_track[song_id] = 'DNE'
            no_mapping_cnt += 1

print("Num of songs that could be mapped to a track: " + str(len(song_to_track)))
print("Num of songs that could not be mapped to a track: " + str(no_mapping_cnt))


# Get genres of all songs
raw_genre_data = "datasets/metadata/msd/msd_tagtraum_cd2.txt"

track_id_genre = {}
no_mapping_cnt = 0
with open(raw_genre_data) as f:
    for line in f:
        line = line.strip()
        line = line.split()

        track_id = line[0]
        # tracks can have multiple genres
        track_genres = line[1:]

        if len(track_ids) > 0:
            track_id_genre[track_id] = track_genres[0]
        else:
            track_id_genre[track_id] = 'No genre'
            no_mapping_cnt += 1

print("Num of songs that have genres: " + str(len(track_id_genre)))
print("Num of songs that do not have genres: " + str(no_mapping_cnt))



Num of songs that could be mapped to a track: 386213
Num of songs that could not be mapped to a track: 2380
Num of songs that have genres: 280831
Num of songs that do not have genres: 0


In [19]:
train_item_ids_to_track_ids = {}

for item_id in item_ids:
    if item_id in song_to_track:
        if song_to_track[item_id] in track_id_genre:
            train_item_ids_to_track_ids[mapping_dict[item_id]] = track_id_genre[song_to_track[item_id]]
        else:
            train_item_ids_to_track_ids[mapping_dict[item_id]] = "none"
    else:
        train_item_ids_to_track_ids[mapping_dict[item_id]] = "none"


In [27]:
import pandas as pd
item_genres_df = pd.DataFrame.from_dict(train_item_ids_to_track_ids, orient='index', columns=['genre'])
item_genres_df.head()

Unnamed: 0,genre
0,Pop
1,none
2,none
3,none
4,Rock


In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(strip_accents='ascii')  # Initialize a TF-IDF vectorizer
tfidf_matrix = vectorizer.fit_transform(item_genres_df['genre'])

feature_names = vectorizer.get_feature_names_out()  # Get the vocabulary 

tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), 
                        columns=feature_names, 
                        index=item_genres_df.index)

print(tfidf_df)

       blues  country  electronic  folk  jazz  latin  metal  new  none  pop  \
0        0.0      0.0         0.0   0.0   0.0    0.0    0.0  0.0   0.0  1.0   
1        0.0      0.0         0.0   0.0   0.0    0.0    0.0  0.0   1.0  0.0   
2        0.0      0.0         0.0   0.0   0.0    0.0    0.0  0.0   1.0  0.0   
3        0.0      0.0         0.0   0.0   0.0    0.0    0.0  0.0   1.0  0.0   
4        0.0      0.0         0.0   0.0   0.0    0.0    0.0  0.0   0.0  0.0   
...      ...      ...         ...   ...   ...    ...    ...  ...   ...  ...   
41135    0.0      0.0         0.0   0.0   0.0    0.0    0.0  1.0   0.0  0.0   
41136    0.0      0.0         0.0   0.0   0.0    0.0    1.0  0.0   0.0  0.0   
41137    0.0      0.0         1.0   0.0   0.0    0.0    0.0  0.0   0.0  0.0   
41138    0.0      0.0         0.0   0.0   0.0    0.0    1.0  0.0   0.0  0.0   
41139    0.0      0.0         0.0   0.0   0.0    0.0    0.0  0.0   0.0  0.0   

       punk  rap  reggae  rnb  rock  world  
0     

In [22]:
# Train Sansa
sansa.train(msd_train)

2024-04-29 13:00:14,212 : [2/3] TRAINING : Train user-item matrix info | n_users = 491355, n_items = 41140, n_ratings = 28917900, sparsity = 99.86%
2024-04-29 13:00:14,214 : [2/3] TRAINING : Item-item matrix info | shape = (41140,41140)
2024-04-29 13:00:14,216 : [2/3] TRAINING : Training SANSA with L2=2.5, target density=0.050000%, LDL^T method=icf, approx. inverse method=umr...
2024-04-29 13:00:14,217 : [2/3] TRAINING : Loading item-user matrix...
2024-04-29 13:00:16,554 : [2/3] TRAINING : Constructing weights:
2024-04-29 13:00:31,814 : [2/3] TRAINING : Constructing A...
2024-04-29 13:00:42,219 : [2/3] TRAINING : A info | nnz: 716070712, size: 8593.0 MB
2024-04-29 13:01:22,550 : [2/3] TRAINING : Computing incomplete LL^T decomposition...
2024-04-29 13:03:50,199 : [2/3] TRAINING : L info | nnz: 846061, size: 10.317 MB, density: 0.049989%
2024-04-29 13:03:50,200 : [2/3] TRAINING : Scaling columns and creating D (LL^T -> L'DL'^T)
2024-04-29 13:03:50,220 : [2/3] TRAINING : Execution of ld

In [23]:
# Evaluate
# Get all users
users = list(msd_test.user_encoder.classes_)
# Get rated items of users
users_rated = msd_test.get_rated_items(users)
targets = msd_test.get_target_items(users)
target_ids_dict = (
    targets.groupby("user_id", group_keys=True)["item_id"]
    .apply(list)
    .to_dict()
)
keys = list(target_ids_dict.keys())
users_to_arange = {user: i for i, user in enumerate(keys)}
pd.options.mode.chained_assignment = None  # suppress irrelevant warning
users_rated["user_id"] = users_rated["user_id"].map(users_to_arange)
pd.options.mode.chained_assignment = "warn"
top_maxk_ids, top_maxk_scores = sansa.recommend(users_rated, 20)

2024-04-29 13:07:22,594 : [3/3] EVALUATION : Execution of _matmat took at 0.128 seconds.
2024-04-29 13:07:27,022 : [3/3] EVALUATION : Execution of _matmat took at 4.426 seconds.
2024-04-29 13:07:37,990 : [3/3] EVALUATION : Execution of _predict took at 15.523 seconds.


In [24]:
import recmetrics

top_maxk_ids_list = top_maxk_ids.tolist()
msd_ils = recmetrics.intra_list_similarity(top_maxk_ids_list, tfidf_df)

print(msd_ils)

0.37234763157894735
