In [1]:
# Load msd data. If there is a parquet file, it will not call read_raw_data

from datasets.msd import MSD

msd_data_config = {
    "name": "msd",
    "rewrite": False,
}

msd_dataset = MSD.from_config(msd_data_config)

2024-04-29 13:10:17,635 : [1/3] DATASET : Creating new dataset msd:
2024-04-29 13:10:17,636 : [1/3] DATASET : Loading raw dataset files from datasets/data/msd/ ...
2024-04-29 13:14:19,877 : [1/3] DATASET : Unifying dataset format...
2024-04-29 13:14:35,285 : [1/3] DATASET : Preprocessing dataset...
2024-04-29 13:16:14,031 : [1/3] DATASET : Saving processed dataset datasets/data/msd/dataset.parquet...


In [15]:
msd_split_config = {
    "n_val_users": 40000,
    "n_test_users": 40000,
    "seed": 42,
    "target_proportion": 0.2,
    "targets_newest": False,
}


(msd_train, msd_val, msd_test), msd_split_time = msd_dataset.create_splits(msd_split_config)

2024-04-29 13:28:35,456 : [1/3] DATASET : Dataframe lengths | train_df: 28917900, val_df: 2352979, test_df: 2362570
2024-04-29 13:29:54,524 : [1/3] DATASET : Splits information:
2024-04-29 13:29:54,529 : [1/3] DATASET : Train split info | n_users = 491355, n_items = 41140, n_ratings = 28917900, sparsity = 99.86%
2024-04-29 13:29:54,530 : [1/3] DATASET : Validation split info | n_users = 40000, n_items = 41140, n_ratings = 1897726, sparsity = 99.88%
2024-04-29 13:29:54,531 : [1/3] DATASET : Test split info | n_users = 40000, n_items = 41140, n_ratings = 1905499, sparsity = 99.88%
2024-04-29 13:29:54,532 : [1/3] DATASET : Execution of create_splits took at 98.026 seconds.


In [16]:
# If you run into AttributeError: 'NoneType' object has no attribute 'inverse_transform' delete the parquet file 
# under datasets/data/msd/

# Train and test use the same songs, so we don't have to get ALL song data
# Get unencoded item ids
item_ids = msd_dataset.item_decoder.inverse_transform(msd_train.item_encoder.classes_)

# maps song id to item id
mapping_dict = {}
for item_id, encoded_id in zip(item_ids, msd_train.item_encoder.classes_):
    mapping_dict[item_id] = encoded_id


In [17]:
# Get song to track mappings
raw_item_ids_mappings = "datasets/metadata/msd/taste_profile_song_to_tracks.txt"

# maps song id to track id
song_to_track = {}
no_mapping_cnt = 0
with open(raw_item_ids_mappings) as f:
    for line in f:
        line = line.strip()
        line = line.split()

        song_id = line[0]
        # songs can map to multiple tracks
        track_ids = line[1:]

        if len(track_ids) > 0:
            song_to_track[song_id] = track_ids[0]
        else:
            song_to_track[song_id] = 'DNE'
            no_mapping_cnt += 1

print("Num of songs that could be mapped to a track: " + str(len(song_to_track)))
print("Num of songs that could not be mapped to a track: " + str(no_mapping_cnt))


# Get genres of all songs
raw_genre_data = "datasets/metadata/msd/msd_tagtraum_cd2.txt"

track_id_genre = {}
no_mapping_cnt = 0
with open(raw_genre_data) as f:
    for line in f:
        line = line.strip()
        line = line.split()

        track_id = line[0]
        # tracks can have multiple genres
        track_genres = line[1:]

        if len(track_ids) > 0:
            track_id_genre[track_id] = track_genres[0]
        else:
            track_id_genre[track_id] = 'No genre'
            no_mapping_cnt += 1

print("Num of songs that have genres: " + str(len(track_id_genre)))
print("Num of songs that do not have genres: " + str(no_mapping_cnt))



Num of songs that could be mapped to a track: 386213
Num of songs that could not be mapped to a track: 2380
Num of songs that have genres: 280831
Num of songs that do not have genres: 0


In [18]:
train_item_ids_to_track_ids = {}

for item_id in item_ids:
    if item_id in song_to_track:
        if song_to_track[item_id] in track_id_genre:
            train_item_ids_to_track_ids[mapping_dict[item_id]] = track_id_genre[song_to_track[item_id]]
        else:
            train_item_ids_to_track_ids[mapping_dict[item_id]] = "none"
    else:
        train_item_ids_to_track_ids[mapping_dict[item_id]] = "none"


In [19]:
import pandas as pd
item_genres_df = pd.DataFrame.from_dict(train_item_ids_to_track_ids, orient='index', columns=['genre'])
item_genres_df.head()

Unnamed: 0,genre
0,Pop
1,none
2,none
3,none
4,Rock


In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(strip_accents='ascii')  # Initialize a TF-IDF vectorizer
tfidf_matrix = vectorizer.fit_transform(item_genres_df['genre'])

feature_names = vectorizer.get_feature_names_out()  # Get the vocabulary 

tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), 
                        columns=feature_names, 
                        index=item_genres_df.index)

print(tfidf_df)

       blues  country  electronic  folk  jazz  latin  metal  new  none  pop  \
0        0.0      0.0         0.0   0.0   0.0    0.0    0.0  0.0   0.0  1.0   
1        0.0      0.0         0.0   0.0   0.0    0.0    0.0  0.0   1.0  0.0   
2        0.0      0.0         0.0   0.0   0.0    0.0    0.0  0.0   1.0  0.0   
3        0.0      0.0         0.0   0.0   0.0    0.0    0.0  0.0   1.0  0.0   
4        0.0      0.0         0.0   0.0   0.0    0.0    0.0  0.0   0.0  0.0   
...      ...      ...         ...   ...   ...    ...    ...  ...   ...  ...   
41135    0.0      0.0         0.0   0.0   0.0    0.0    0.0  1.0   0.0  0.0   
41136    0.0      0.0         0.0   0.0   0.0    0.0    1.0  0.0   0.0  0.0   
41137    0.0      0.0         1.0   0.0   0.0    0.0    0.0  0.0   0.0  0.0   
41138    0.0      0.0         0.0   0.0   0.0    0.0    1.0  0.0   0.0  0.0   
41139    0.0      0.0         0.0   0.0   0.0    0.0    0.0  0.0   0.0  0.0   

       punk  rap  reggae  rnb  rock  world  
0     

In [21]:
from models.randomrec import RANDOM_REC

# No need to train, just predict
ran_pred = RANDOM_REC(msd_test.data, 20)

In [22]:
import recmetrics
import numpy as np


ran_pred_np = np.array(ran_pred)
ran_pred_np = ran_pred_np.tolist()

msd_ils = recmetrics.intra_list_similarity(ran_pred_np, tfidf_df)

print(msd_ils)

0.3563305263157895
