In [None]:
# !jupyter kernelspec list
# !conda list

### Import

In [2]:
import os
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity

### Data

In [3]:
data_path = "../data/raw/"
os.listdir(data_path)

['mxm_dataset_train.txt',
 'p02_msd_tagtraum_cd2.cls',
 'p02_unique_tracks.txt',
 'train_triplets.txt']

Echo Nest Taste Profile Subset (triplets)

In [4]:
interactions_df = pd.read_csv(data_path + 'train_triplets.txt',
                      sep='\t',
                      header=None,
                      names=['user_id', 'song_id', 'play_count']
)

interactions_df

Unnamed: 0,user_id,song_id,play_count
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAPDEY12A81C210A9,1
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFNSP12AF72A0E22,1
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFOVM12A58A7D494,1
...,...,...,...
48373581,b7815dbb206eb2831ce0fe040d0aa537e2e800f7,SOUHHHH12AF729E4AF,2
48373582,b7815dbb206eb2831ce0fe040d0aa537e2e800f7,SOUJVIT12A8C1451C1,1
48373583,b7815dbb206eb2831ce0fe040d0aa537e2e800f7,SOUSMXX12AB0185C24,1
48373584,b7815dbb206eb2831ce0fe040d0aa537e2e800f7,SOWYSKH12AF72A303A,3


MusiXmatch Dataset (lyrics)

In [5]:
with open(data_path + 'mxm_dataset_train.txt', encoding="utf-8") as f:
  lines = f.readlines()

# for i, line in enumerate(lines[:20]):
#   print(i, line.strip())


# Find the line with %
vocab_line = [l for l in lines if l.startswith('%')][0]
vocab = vocab_line[1:].strip().split(',')   # remove '%' and split
print("Vocabulary size:", len(vocab))
print("First 10 words:", vocab[:10])


records = []
for line in lines:
    if line.startswith('#') or line.startswith('%'):
        continue  # skip comments and vocab line

    parts = line.strip().split(',')
    track_id = parts[0]
    mxm_track_id = parts[1]
    word_counts = parts[2:]

    bow = {}
    for wc in word_counts:
        idx, count = wc.split(':')
        bow[vocab[int(idx)-1]] = int(count)  # word indices start at 1

    records.append((track_id, mxm_track_id, bow))

Vocabulary size: 5000
First 10 words: ['i', 'the', 'you', 'to', 'and', 'a', 'me', 'it', 'not', 'in']


In [6]:
lyrics_df = pd.DataFrame(records, columns=['track_id', 'mxm_track_id', 'bow'])
lyrics_df

Unnamed: 0,track_id,mxm_track_id,bow
0,TRAAAAV128F421A322,4623710,"{'i': 6, 'the': 4, 'you': 2, 'to': 2, 'and': 5..."
1,TRAAABD128F429CF47,6477168,"{'i': 10, 'you': 17, 'to': 8, 'and': 2, 'a': 2..."
2,TRAAAED128E0783FAB,2516445,"{'i': 28, 'the': 15, 'you': 2, 'to': 12, 'and'..."
3,TRAAAEF128F4273421,3759847,"{'i': 5, 'the': 4, 'you': 3, 'to': 2, 'and': 1..."
4,TRAAAEW128F42930C0,3783760,"{'i': 4, 'to': 5, 'and': 7, 'a': 2, 'me': 4, '..."
...,...,...,...
210514,TRZZZWS128F429CF87,3080645,"{'a': 1, 'no': 9, 'que': 7, 'de': 1, 'y': 4, '..."
210515,TRZZZXA128F428ED56,2344272,"{'i': 1, 'the': 13, 'you': 6, 'to': 5, 'and': ..."
210516,TRZZZXV128F4289747,1417347,"{'i': 13, 'the': 3, 'you': 17, 'to': 5, 'and':..."
210517,TRZZZYV128F92E996D,6849828,"{'i': 10, 'the': 6, 'you': 20, 'and': 2, 'me':..."


Tagtraum Genre Annotations

In [7]:
genres_df = pd.read_csv(
    data_path + 'p02_msd_tagtraum_cd2.cls',
    sep='\t',
    comment='#',
    header=None,
    names=['track_id', 'majority_genre', 'minority_genre']
)

genres_df

Unnamed: 0,track_id,majority_genre,minority_genre
0,TRAAAAK128F9318786,Rock,
1,TRAAAAW128F429D538,Rap,
2,TRAAABD128F429CF47,Rock,RnB
3,TRAAADJ128F4287B47,Rock,
4,TRAAADZ128F9348C2E,Latin,
...,...,...,...
280826,TRZZZRJ128F42819AF,Rock,
280827,TRZZZUK128F92E3C60,Folk,
280828,TRZZZYV128F92E996D,New Age,RnB
280829,TRZZZZD128F4236844,Rock,


Track ↔ Song Mapping

In [8]:
tracks_df = pd.read_csv(data_path + 'p02_unique_tracks.txt',
                               sep='<SEP>',
                               header=None,
                               engine="python", # needed because <SEP> is more than 1 character
                               names=["track_id", "song_id", "artist_name", "track_title"]
)

tracks_df

Unnamed: 0,track_id,song_id,artist_name,track_title
0,TRMMMYQ128F932D901,SOQMMHC12AB0180CB8,Faster Pussy cat,Silent Night
1,TRMMMKD128F425225D,SOVFVAK12A8C1350D9,Karkkiautomaatti,Tanssi vaan
2,TRMMMRX128F93187D9,SOGTUKN12AB017F4F1,Hudson Mohawke,No One Could Ever
3,TRMMMCH128F425532C,SOBNYVR12A8C13558C,Yerba Brava,Si Vos Querés
4,TRMMMWA128F426B589,SOHSBXH12A8C13B0DF,Der Mystic,Tangle Of Aspens
...,...,...,...,...
999995,TRYYYUS12903CD2DF0,SOTXAME12AB018F136,Kiko Navarro,O Samba Da Vida
999996,TRYYYJO128F426DA37,SOXQYIQ12A8C137FBB,Kuldeep Manak,Jago Chhadeo
999997,TRYYYMG128F4260ECA,SOHODZI12A8C137BB3,Gabriel Le Mar,Novemba
999998,TRYYYDJ128F9310A21,SOLXGOR12A81C21EB7,Elude,Faraday


Sampling

In [9]:
def save_datasets(interactions_df, tracks_df, genres_df, lyrics_df):
    interactions_df.to_csv('../data/processed/interactions_df.csv', index=False)
    tracks_df.to_csv('../data/processed/tracks_df.csv', index=False)
    genres_df.to_csv('../data/processed/genres.csv', index=False)
    lyrics_df.to_csv('../data/processed/lyrics_df.csv', index=False)

In [10]:
# save_datasets(interactions_df, tracks_df, genres_df, lyrics_df)

In [11]:
def create_samples(interactions_df, tracks_df, genres_df, lyrics_df, frac=0.01, random_state=42):
    # 1. Sample interactions
    s_interactions = interactions_df.sample(frac=frac, random_state=random_state)

    # 2. Get song_ids that appear in sampled interactions
    sampled_song_ids = s_interactions['song_id'].unique()

    # 3. Filter related tracks, genres, lyrics
    s_tracks = tracks_df[tracks_df['song_id'].isin(sampled_song_ids)]
    sampled_track_ids = s_tracks['track_id'].unique()
    
    s_genres = genres_df[genres_df['track_id'].isin(sampled_track_ids)]
    s_lyrics = lyrics_df[lyrics_df['track_id'].isin(sampled_track_ids)]

    return s_interactions, s_tracks, s_genres, s_lyrics

In [12]:
s_interactions, s_tracks, s_genres, s_lyrics = create_samples(
    interactions_df, tracks_df, genres_df, lyrics_df, frac=0.01
)

In [13]:
def save_samples(s_interactions, s_tracks, s_genres, s_lyrics, out_dir="../data/samples"):
    os.makedirs(out_dir, exist_ok=True)

    s_interactions.to_csv(f"{out_dir}/interactions_sample.csv", index=False)
    s_tracks.to_csv(f"{out_dir}/tracks_sample.csv", index=False)
    s_genres.to_csv(f"{out_dir}/genres_sample.csv", index=False)
    s_lyrics.to_csv(f"{out_dir}/lyrics_sample.csv", index=False)

In [14]:
save_samples(s_interactions, s_tracks, s_genres, s_lyrics)

### Non-Personalized

#### Top 250 Tracks

In [18]:
interactions_df

Unnamed: 0,user_id,song_id,play_count
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAPDEY12A81C210A9,1
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFNSP12AF72A0E22,1
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFOVM12A58A7D494,1
...,...,...,...
48373581,b7815dbb206eb2831ce0fe040d0aa537e2e800f7,SOUHHHH12AF729E4AF,2
48373582,b7815dbb206eb2831ce0fe040d0aa537e2e800f7,SOUJVIT12A8C1451C1,1
48373583,b7815dbb206eb2831ce0fe040d0aa537e2e800f7,SOUSMXX12AB0185C24,1
48373584,b7815dbb206eb2831ce0fe040d0aa537e2e800f7,SOWYSKH12AF72A303A,3


In [19]:
interactions_df["song_id"].nunique()

384546

In [20]:
# 1. Aggregate play counts per song
track_playcounts = (
    interactions_df
    .groupby('song_id')['play_count']
    .sum()
    .reset_index()
)

In [21]:
track_playcounts

Unnamed: 0,song_id,play_count
0,SOAAADD12AB018A9DD,24
1,SOAAADE12A6D4F80CC,12
2,SOAAADF12A8C13DF62,9
3,SOAAADZ12A8C1334FB,12
4,SOAAAFI12A6D4F9C66,188
...,...,...
384541,SOZZZRJ12AB0187A75,16
384542,SOZZZRV12A8C1361F1,75
384543,SOZZZSR12AB01854CD,5
384544,SOZZZWD12A6D4F6624,3


In [22]:
# 2. Sort descending
track_playcounts = track_playcounts.sort_values(
    by="play_count", ascending=False
)

In [23]:
track_playcounts

Unnamed: 0,song_id,play_count
25043,SOBONKR12A58A7A7E0,726885
12936,SOAUWYT12A81C206F1,648239
287415,SOSXLTC12AF72A7F54,527893
90798,SOFRQTD12A81C233C0,425463
67917,SOEGIYH12A6D4FC0E3,389880
...,...,...
39627,SOCMFLA12A6D4FA417,1
156718,SOJYBHV12A6D4F9A92,1
39625,SOCMFIS12A58291D8C,1
109970,SOGXJKU12A6D4F6D9F,1


In [24]:
# 3. Take Top 250
top_250 = track_playcounts.head(250)

In [25]:
top_250

Unnamed: 0,song_id,play_count
25043,SOBONKR12A58A7A7E0,726885
12936,SOAUWYT12A81C206F1,648239
287415,SOSXLTC12AF72A7F54,527893
90798,SOFRQTD12A81C233C0,425463
67917,SOEGIYH12A6D4FC0E3,389880
...,...,...
13135,SOAVFLR12A8C138576,35253
224272,SOOLYZQ12A6D4FA5B7,35245
242485,SOPSYOY12A8C142E0B,35191
30161,SOBWSGV12AB018B5E0,35074


In [26]:
# 4. Merge with metadata (tracks_df)
top_250 = (
    top_250.merge(tracks_df, on="song_id", how="left")
    [["artist_name", "track_title", "play_count"]]
)

In [27]:
top_250

Unnamed: 0,artist_name,track_title,play_count
0,Dwight Yoakam,You're The One,726885
1,Björk,Undo,648239
2,Kings Of Leon,Revelry,527893
3,Harmonia,Sehr kosmisch,425463
4,Barry Tuckwell/Academy of St Martin-in-the-Fie...,Horn Concerto No. 4 in E flat K495: II. Romanc...,389880
...,...,...,...
262,Triple Six Mafia,Now I'm High_ Really High,35253
263,The Red Jumpsuit Apparatus,Face Down (Album Version),35245
264,Linkin Park,New Divide (Album Version),35191
265,Selena Gomez & The Scene,Naturally,35074


In [28]:
# 5. Add ranking index
top_250.index = top_250.index + 1
top_250.index.name = "rank"

In [29]:
top_250

Unnamed: 0_level_0,artist_name,track_title,play_count
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Dwight Yoakam,You're The One,726885
2,Björk,Undo,648239
3,Kings Of Leon,Revelry,527893
4,Harmonia,Sehr kosmisch,425463
5,Barry Tuckwell/Academy of St Martin-in-the-Fie...,Horn Concerto No. 4 in E flat K495: II. Romanc...,389880
...,...,...,...
263,Triple Six Mafia,Now I'm High_ Really High,35253
264,The Red Jumpsuit Apparatus,Face Down (Album Version),35245
265,Linkin Park,New Divide (Album Version),35191
266,Selena Gomez & The Scene,Naturally,35074


#### Top 100 tracks by genre

In [30]:
interactions_df

Unnamed: 0,user_id,song_id,play_count
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAPDEY12A81C210A9,1
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFNSP12AF72A0E22,1
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFOVM12A58A7D494,1
...,...,...,...
48373581,b7815dbb206eb2831ce0fe040d0aa537e2e800f7,SOUHHHH12AF729E4AF,2
48373582,b7815dbb206eb2831ce0fe040d0aa537e2e800f7,SOUJVIT12A8C1451C1,1
48373583,b7815dbb206eb2831ce0fe040d0aa537e2e800f7,SOUSMXX12AB0185C24,1
48373584,b7815dbb206eb2831ce0fe040d0aa537e2e800f7,SOWYSKH12AF72A303A,3


In [31]:
genres_df

Unnamed: 0,track_id,majority_genre,minority_genre
0,TRAAAAK128F9318786,Rock,
1,TRAAAAW128F429D538,Rap,
2,TRAAABD128F429CF47,Rock,RnB
3,TRAAADJ128F4287B47,Rock,
4,TRAAADZ128F9348C2E,Latin,
...,...,...,...
280826,TRZZZRJ128F42819AF,Rock,
280827,TRZZZUK128F92E3C60,Folk,
280828,TRZZZYV128F92E996D,New Age,RnB
280829,TRZZZZD128F4236844,Rock,


In [32]:
genres_df["majority_genre"].nunique()

15

In [33]:
genres_df["majority_genre"].unique()

array(['Rock', 'Rap', 'Latin', 'Jazz', 'Electronic', 'Punk', 'Pop',
       'New Age', 'Metal', 'RnB', 'Country', 'Reggae', 'Folk', 'Blues',
       'World'], dtype=object)

In [34]:
tracks_df

Unnamed: 0,track_id,song_id,artist_name,track_title
0,TRMMMYQ128F932D901,SOQMMHC12AB0180CB8,Faster Pussy cat,Silent Night
1,TRMMMKD128F425225D,SOVFVAK12A8C1350D9,Karkkiautomaatti,Tanssi vaan
2,TRMMMRX128F93187D9,SOGTUKN12AB017F4F1,Hudson Mohawke,No One Could Ever
3,TRMMMCH128F425532C,SOBNYVR12A8C13558C,Yerba Brava,Si Vos Querés
4,TRMMMWA128F426B589,SOHSBXH12A8C13B0DF,Der Mystic,Tangle Of Aspens
...,...,...,...,...
999995,TRYYYUS12903CD2DF0,SOTXAME12AB018F136,Kiko Navarro,O Samba Da Vida
999996,TRYYYJO128F426DA37,SOXQYIQ12A8C137FBB,Kuldeep Manak,Jago Chhadeo
999997,TRYYYMG128F4260ECA,SOHODZI12A8C137BB3,Gabriel Le Mar,Novemba
999998,TRYYYDJ128F9310A21,SOLXGOR12A81C21EB7,Elude,Faraday


In [35]:
genre = "Rock"

In [36]:
# 1. Merge interactions with track metadata
merged = (
    interactions_df
    .merge(tracks_df, on="song_id", how="left")
    .merge(genres_df[["track_id", "majority_genre"]], on="track_id", how="left")
)

In [37]:
merged

Unnamed: 0,user_id,song_id,play_count,track_id,artist_name,track_title,majority_genre
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1,TRIQAUQ128F42435AD,Jack Johnson,The Cove,Rock
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAPDEY12A81C210A9,1,TRIRLYL128F42539D1,Billy Preston,Nothing from Nothing,RnB
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2,TRMHBXZ128F4238406,Paco De Lucia,Entre Dos Aguas,Electronic
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFNSP12AF72A0E22,1,TRYQMNI128F147C1C7,Josh Rouse,Under Cold Blue Stars,Rock
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFOVM12A58A7D494,1,TRAHZNE128F9341B86,The Dead 60s,Riot Radio (Soundtrack Version),
...,...,...,...,...,...,...,...
49664523,b7815dbb206eb2831ce0fe040d0aa537e2e800f7,SOUHHHH12AF729E4AF,2,TRKUAEO128F933ABFC,Eminem / Obie Trice / Stat Quo / Bobby Creekwa...,We're Back,
49664524,b7815dbb206eb2831ce0fe040d0aa537e2e800f7,SOUJVIT12A8C1451C1,1,TRRNFHH128F92D262D,Rise Against,Savior,Rock
49664525,b7815dbb206eb2831ce0fe040d0aa537e2e800f7,SOUSMXX12AB0185C24,1,TRSLDDC12903CC36E7,Usher featuring will.i.am,OMG,
49664526,b7815dbb206eb2831ce0fe040d0aa537e2e800f7,SOWYSKH12AF72A303A,3,TRNJQAM128F14557AF,matchbox twenty,Downfall (Album Version),Rock


In [38]:
# 2. Filter by genre
genre_df = merged[merged["majority_genre"] == genre]

In [40]:
genre_df

Unnamed: 0,user_id,song_id,play_count,track_id,artist_name,track_title,majority_genre
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1,TRIQAUQ128F42435AD,Jack Johnson,The Cove,Rock
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFNSP12AF72A0E22,1,TRYQMNI128F147C1C7,Josh Rouse,Under Cold Blue Stars,Rock
13,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SODACBL12A8C13C273,1,TRKRHYM128F42934A9,Foo Fighters,Learn To Fly,Rock
14,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SODDNQT12A6D4F5F7E,5,TRPTWGR128F1452734,Héroes del Silencio,Apuesta Por El Rock 'N' Roll,Rock
17,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOEGVZY12A58A7857E,1,TRKEXHB128F147C1C4,Josh Rouse,Nothing Gives Me Pleasure,Rock
...,...,...,...,...,...,...,...
49664512,b7815dbb206eb2831ce0fe040d0aa537e2e800f7,SORPVUD12A67020454,1,TRBTVVD128F146D742,Red Hot Chili Peppers,Otherside (Album Version),Rock
49664513,b7815dbb206eb2831ce0fe040d0aa537e2e800f7,SOSCPOI12A8C139F02,1,TROBUUZ128F4263002,Finger Eleven,Paralyzer,Rock
49664520,b7815dbb206eb2831ce0fe040d0aa537e2e800f7,SOTNMFD12A58A7789E,1,TRLLUPN128F4257E65,RAUNCHY,I Get What I See,Rock
49664524,b7815dbb206eb2831ce0fe040d0aa537e2e800f7,SOUJVIT12A8C1451C1,1,TRRNFHH128F92D262D,Rise Against,Savior,Rock


In [41]:
# 3. Aggregate play counts per track
track_playcounts = (
    genre_df
    .groupby(['track_id', 'artist_name', 'track_title'])['play_count']
    .sum()
    .reset_index()
)

In [42]:
track_playcounts

Unnamed: 0,track_id,artist_name,track_title,play_count
0,TRAAAAK128F9318786,Adelitas Way,Scream,515
1,TRAAABD128F429CF47,The Box Tops,Soul Deep,72
2,TRAAAUR128F428B1FA,International Noise Conspiracy,Smash It Up,375
3,TRAABFH128F92C812E,The Donkeys,Excelsior Lady,410
4,TRAABIG128F9356C56,Poe,Walk the Walk,410
...,...,...,...,...
75933,TRZZZCB128F4249252,Winger,Spell I'm Under,185
75934,TRZZZCL128F428BB80,I Am Ghost,The Ship of Pills and Needed Things,32
75935,TRZZZHL128F9329CFB,Ayreon,Day five: Voices,281
75936,TRZZZRJ128F42819AF,Belle & Sebastian,Lord Anthony,1212


In [43]:
# 4. Sort and take top k
top_100_by_genre = (
    track_playcounts
    .sort_values("play_count", ascending=False)
    .head(100)
    .reset_index(drop=True)
)

In [44]:
top_100_by_genre

Unnamed: 0,track_id,artist_name,track_title,play_count
0,TRGXQES128F42BA5EB,Björk,Undo,648239
1,TRONYHY128F92C9D11,Kings Of Leon,Revelry,527893
2,TRDMBIJ128F4290431,Harmonia,Sehr kosmisch,425463
3,TROAQBZ128F9326213,OneRepublic,Secrets,292642
4,TRIXAZF128F421EE64,Tub Ring,Invalid,268353
...,...,...,...,...
95,TRFXWSD128F93173BF,Metric,Gold Guns Girls,28148
96,TRTRVEP128F428F617,Pearl Jam,Encore Break,27579
97,TRJEITS128F92E2FEC,Daughtry,No Surprise,27187
98,TRVYICQ128F4252493,Eric Clapton,Tears In Heaven,26999


In [45]:
# 5. Add ranking index
top_100_by_genre.index = top_100_by_genre.index + 1
top_100_by_genre.index.name = "rank"

In [46]:
top_100_by_genre

Unnamed: 0_level_0,track_id,artist_name,track_title,play_count
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,TRGXQES128F42BA5EB,Björk,Undo,648239
2,TRONYHY128F92C9D11,Kings Of Leon,Revelry,527893
3,TRDMBIJ128F4290431,Harmonia,Sehr kosmisch,425463
4,TROAQBZ128F9326213,OneRepublic,Secrets,292642
5,TRIXAZF128F421EE64,Tub Ring,Invalid,268353
...,...,...,...,...
96,TRFXWSD128F93173BF,Metric,Gold Guns Girls,28148
97,TRTRVEP128F428F617,Pearl Jam,Encore Break,27579
98,TRJEITS128F92E2FEC,Daughtry,No Surprise,27187
99,TRVYICQ128F4252493,Eric Clapton,Tears In Heaven,26999


### Content-based

#### Baseline

In [50]:
keyword = "love"
k = 50
threshold = 10

In [51]:
mask = lyrics_df['bow'].apply(lambda bow: bow.get(keyword, 0) >= threshold)
keyword_tracks = lyrics_df[mask][["track_id"]]
keyword_tracks

Unnamed: 0,track_id
2,TRAAAED128E0783FAB
45,TRAADNL128F14519DF
70,TRAAFTE128F429545F
78,TRAAGHM128EF35CF8E
101,TRAAHOA128F425A4F7
...,...
210343,TRZZLCS128F92C6124
210389,TRZZOWY128F42BA84E
210447,TRZZTYS128EF347EAB
210450,TRZZUNB128F4263020


In [52]:
merged = (
    keyword_tracks
    .merge(tracks_df, on="track_id", how="left")
    .merge(interactions_df, on="song_id", how="left")
)

In [53]:
merged

Unnamed: 0,track_id,song_id,artist_name,track_title,user_id,play_count
0,TRAAAED128E0783FAB,SOXZYWX12A6310ED0C,Jamie Cullum,It's About Time,cf7bd4b5b398b3e150cf262d79147312a69b96ac,9.0
1,TRAAAED128E0783FAB,SOXZYWX12A6310ED0C,Jamie Cullum,It's About Time,043d81932e75d5749ed5758d6420506e7bc457a5,3.0
2,TRAAAED128E0783FAB,SOXZYWX12A6310ED0C,Jamie Cullum,It's About Time,515e1ab04c00859de983cacf35f150f2ddb37dde,1.0
3,TRAAAED128E0783FAB,SOXZYWX12A6310ED0C,Jamie Cullum,It's About Time,e49ac0612b9444abf3d513e54b1cd77f6fe5ae4b,1.0
4,TRAAAED128E0783FAB,SOXZYWX12A6310ED0C,Jamie Cullum,It's About Time,a2c8271ed491e9fd6ecb5a8760940362e3a54e3a,1.0
...,...,...,...,...,...,...
767302,TRZZWZE128F92D2FCA,SOZMITG12A6D4F862C,Atomic Kitten,I Want Your Love,101c5333c580dc2d936ec6025632138478cf1917,2.0
767303,TRZZWZE128F92D2FCA,SOZMITG12A6D4F862C,Atomic Kitten,I Want Your Love,de011fdc5ed1544fb33d70cbc33443f64cfcec67,2.0
767304,TRZZWZE128F92D2FCA,SOZMITG12A6D4F862C,Atomic Kitten,I Want Your Love,47df968e42b74c06b11fd936425b6ac9d60c73e5,3.0
767305,TRZZWZE128F92D2FCA,SOZMITG12A6D4F862C,Atomic Kitten,I Want Your Love,d494b27dc831e58f4573e52e35be27ca209d3d31,1.0


In [54]:
track_playcounts = (
    merged.groupby(["artist_name", "track_title"])["play_count"]
    .sum()
    .reset_index()
)

In [55]:
track_playcounts

Unnamed: 0,artist_name,track_title,play_count
0,'Til Tuesday,Have Mercy,1.0
1,'Til Tuesday,How Can You Give Up,0.0
2,-123min.,I'm In You,0.0
3,100 Proof Aged in Soul,One Mans Leftovers (Is Another Mans Feast),0.0
4,10000 Maniacs,Love Among The Ruins,19.0
...,...,...,...
6292,tobyMac,Made To Love,3358.0
6293,tobyMac,No Ordinary Love,0.0
6294,yoomiii,For Your Love,0.0
6295,yoomiii,You Are My First Love,0.0


In [56]:
top_k_by_keyword = (
    track_playcounts
    .sort_values(by="play_count", ascending=False)
    .head(k)
    .reset_index(drop=True)
)

In [57]:
top_k_by_keyword

Unnamed: 0,artist_name,track_title,play_count
0,Bill Withers,Make Love To Your Mind,146978.0
1,John Mayer,Half Of My Heart,65966.0
2,Eminem / Dina Rae,Superman,45328.0
3,Guns N' Roses,Don't Cry (Original),40480.0
4,UB40,Red Red Wine (Edit),23263.0
5,Black Eyed Peas,My Humps,23151.0
6,Leona Lewis,Bleeding Love,22195.0
7,John Legend,Save Room,21535.0
8,Modern Lovers,Modern world,20584.0
9,Train,If It's Love,20410.0


In [58]:
top_k_by_keyword.index = top_k_by_keyword.index + 1
top_k_by_keyword.index.name = "rank"

In [59]:
top_k_by_keyword

Unnamed: 0_level_0,artist_name,track_title,play_count
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Bill Withers,Make Love To Your Mind,146978.0
2,John Mayer,Half Of My Heart,65966.0
3,Eminem / Dina Rae,Superman,45328.0
4,Guns N' Roses,Don't Cry (Original),40480.0
5,UB40,Red Red Wine (Edit),23263.0
6,Black Eyed Peas,My Humps,23151.0
7,Leona Lewis,Bleeding Love,22195.0
8,John Legend,Save Room,21535.0
9,Modern Lovers,Modern world,20584.0
10,Train,If It's Love,20410.0


In [None]:
def collection_baseline(self, keyword : str, threshold: int = 10, k: int = 50):
        """Return Top-k tracks containing a keyword in lyrics (baseline approach)."""
        mask = self.lyrics_df["bow"].apply(
            lambda bow: bow.get(keyword, 0) >= threshold
        )
        keyword_tracks = self.lyrics_df[mask]["track_id"]

        merged = (
            keyword_tracks
            .merge(self.tracks_df, on="track_id", how="left")
            .merge(self.interactions_df, on="song_id", how="left")
        )

        track_playcounts = (
            merged.groupby(["track_id", "artist_name", "track_title"])["play_count"]
            .sum()
            .reset_index()
        )

        top_k_by_keyword = (
            track_playcounts
            .sort_values("play_count", ascending=False)
            .head(k)
            .reset_index(drop=True)
        )

        top_k_by_keyword.index = top_k_by_keyword.index + 1
        top_k_by_keyword.index.name = "rank"
        return top_k_by_keyword

#### Word2Vec

In [60]:
keyword = "war"
topn = 10
threshold = 5
k=50

In [14]:
# Convert bow dict -> tokenized list of words (repeated by count)
tokenized_lyrics = [
    [word for word, cnt in bow.items() for _ in range(cnt)]
    for bow in lyrics_df["bow"]
]

In [15]:
# Train Word2Vec
w2v_model = Word2Vec(
    sentences=tokenized_lyrics,
    vector_size=100,  # embedding dimension
    window=5,         # context window
    min_count=3,      # ignore rare words
    workers=4         # parallel threads
)

In [18]:
w2v_model.save("../models/w2v_model.model")

In [16]:
print(f"Vocabulary size: {len(w2v_model.wv)}")

Vocabulary size: 5000


In [64]:
w2v_model.wv[keyword]

array([-2.4399631e+00, -2.1838227e-01,  2.3256813e-01, -1.4110947e+00,
       -7.5513744e-01,  7.2375399e-01,  2.0627668e+00,  1.4107111e-01,
        1.1620554e+00, -4.7452730e-01,  9.8016590e-02, -2.3852174e+00,
       -6.9416815e-01,  2.4296064e+00,  2.1802330e+00, -2.2763824e-01,
        1.7865597e+00,  2.1202281e+00, -2.0419614e+00, -1.2962694e+00,
       -7.5995333e-02,  2.7706535e+00,  2.0680652e+00,  1.1941788e+00,
        4.9496859e-01, -3.1857800e-01, -1.4684677e-01,  7.3162176e-02,
       -8.7768936e-01,  2.2521665e+00, -2.5843450e-01, -1.4699937e+00,
       -1.5265332e+00, -2.6333852e+00,  1.7318844e+00, -1.8156046e+00,
        1.0374765e+00,  9.5318824e-01, -1.6023140e+00,  1.3253373e+00,
        2.2741173e-01,  7.2449034e-01,  2.0673473e-01,  6.7951322e-01,
       -2.4532282e+00, -3.0076137e-01, -1.4420097e-01,  8.2897544e-01,
        7.2011524e-01,  1.9744678e+00, -3.3931341e+00, -2.3582728e+00,
        8.3403528e-01, -1.4923202e+00, -1.0170158e+00, -1.5662582e+00,
      

In [109]:
# w2v_model.wv.most_similar

In [65]:
w2v_model.wv.most_similar(keyword, topn=topn)

[('hell', 0.8144029378890991),
 ('death', 0.7717035412788391),
 ('hate', 0.7479702830314636),
 ('seen', 0.7477748990058899),
 ('truth', 0.7442511916160583),
 ('under', 0.7430849075317383),
 ('wind', 0.7294747829437256),
 ('set', 0.7286722660064697),
 ('ground', 0.7263931632041931),
 ('land', 0.7251535058021545)]

In [67]:
similar_words = [w for w, _ in w2v_model.wv.most_similar(keyword, topn=topn)]

In [68]:
similar_words

['hell',
 'death',
 'hate',
 'seen',
 'truth',
 'under',
 'wind',
 'set',
 'ground',
 'land']

In [69]:
all_keywords = [keyword] + similar_words

In [70]:
all_keywords

['war',
 'hell',
 'death',
 'hate',
 'seen',
 'truth',
 'under',
 'wind',
 'set',
 'ground',
 'land']

In [71]:
mask = lyrics_df["bow"].apply(
    lambda bow: any(bow.get(w, 0) >= threshold for w in all_keywords)
)

In [72]:
keyword_tracks = lyrics_df[mask][["track_id"]]
keyword_tracks

Unnamed: 0,track_id
13,TRAAAZF12903CCCF6B
21,TRAABOA128F933684A
56,TRAAEJH128E0785506
110,TRAAHZP12903CA25F4
170,TRAAMCQ128F4259A2F
...,...
210416,TRZZQSK128F92EF0B9
210446,TRZZTUV128F426B6EB
210473,TRZZWTN128F9352EC6
210493,TRZZYLO12903CAC06C


In [79]:
merged = (
    keyword_tracks
        .merge(tracks_df, on="track_id", how="left")
        .merge(interactions_df, on="song_id", how="left")
)

In [80]:
merged

Unnamed: 0,track_id,song_id,artist_name,track_title,user_id,play_count
0,TRAAAZF12903CCCF6B,SOUCVHW12AB018E830,Matthew Wilder,Break My Stride,,
1,TRAABOA128F933684A,SONHGLD12AB0188D47,Anthony B,Our Father,b58ad35665d625169bfe75ba3f97dffed518edac,8.0
2,TRAABOA128F933684A,SONHGLD12AB0188D47,Anthony B,Our Father,02756dc4251d3a9f4e28a94cf1a56a46a5b4865a,1.0
3,TRAAEJH128E0785506,SOFBGBL12A67020D9F,Hank Williams Jr.,Tuesday's Gone (Remastered Album Version),b322da50dc02b89bbb347dc3ee475f4fa19f4c62,2.0
4,TRAAEJH128E0785506,SOFBGBL12A67020D9F,Hank Williams Jr.,Tuesday's Gone (Remastered Album Version),aa5f5df9d3c41fc2d03e55110cd12e2007d3811a,4.0
...,...,...,...,...,...,...
631542,TRZZZCB128F4249252,SOKOXRU12A8C131E09,Winger,Spell I'm Under,a93f9fb3fcfb3d7182e4f97848c4f291b98f47b8,2.0
631543,TRZZZCB128F4249252,SOKOXRU12A8C131E09,Winger,Spell I'm Under,730fa2e10d61c8d0e86d0ed7addecb7dc5dd1021,1.0
631544,TRZZZCB128F4249252,SOKOXRU12A8C131E09,Winger,Spell I'm Under,ba149996a348302ed135b4166e716be174aed211,1.0
631545,TRZZZCB128F4249252,SOKOXRU12A8C131E09,Winger,Spell I'm Under,278254ed302a6e26bf07f1eb23f138d26758fd96,1.0


In [81]:
track_playcounts = (
    merged.groupby(["track_id", "artist_name", "track_title"])["play_count"]
        .sum()
        .reset_index()
)

In [82]:
track_playcounts

Unnamed: 0,track_id,artist_name,track_title,play_count
0,TRAAAZF12903CCCF6B,Matthew Wilder,Break My Stride,0.0
1,TRAABOA128F933684A,Anthony B,Our Father,9.0
2,TRAAEJH128E0785506,Hank Williams Jr.,Tuesday's Gone (Remastered Album Version),38.0
3,TRAAHZP12903CA25F4,Organized Konfusion,Hate,43.0
4,TRAAMCQ128F4259A2F,Primal Scream,Pills,3424.0
...,...,...,...,...
4391,TRZZQSK128F92EF0B9,Disciple,Love Hate (On And On),129.0
4392,TRZZTUV128F426B6EB,Weeping Willows,Echoes Of Your Breath,3.0
4393,TRZZWTN128F9352EC6,Ektomorf,Rat War,51.0
4394,TRZZYLO12903CAC06C,Dallas Holm,I've Never Seen The Righteous Forsaken,0.0


In [83]:
top_k_by_keyword_word2vec = (
    track_playcounts
    .sort_values(by="play_count", ascending=False)
    .head(k)
    .reset_index(drop=True)
)

In [84]:
top_k_by_keyword_word2vec

Unnamed: 0,track_id,artist_name,track_title,play_count
0,TRRNFHH128F92D262D,Rise Against,Savior,74654.0
1,TRMEQQX12903CCD9D5,Sean Kingston and Justin Bieber,Eenie Meenie,66998.0
2,TRKDYZS12903CDB570,3OH!3,STARSTRUKK [FEATURINGKATYPERRY] (Explicit Bonu...,26230.0
3,TRTWBNZ128F92F3426,Vanessa Williams,Colors Of The Wind,26001.0
4,TRYFXPG128E078ECBD,Dr. Dre / Eminem,Forgot About Dre,24502.0
5,TRLAKFT128F427FF44,Rage Against The Machine,Killing In The Name,24485.0
6,TRRRMKC128E0792990,Guns N' Roses,Civil War,22893.0
7,TRDTYHN128E079504D,Twista feat. Kayne West & Jamie Foxx,Slow Jamz (Feat. Kanye West & Jamie Foxx) (Edi...,21025.0
8,TRUUXLZ128F932BA01,Zac Brown Band,Chicken Fried (Album),20088.0
9,TREEFIY128F425B9AE,Three Days Grace,I Hate Everything About You,19959.0


In [85]:
top_k_by_keyword_word2vec.index = top_k_by_keyword_word2vec.index + 1
top_k_by_keyword_word2vec.index.name = "rank"

In [86]:
top_k_by_keyword_word2vec

Unnamed: 0_level_0,track_id,artist_name,track_title,play_count
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,TRRNFHH128F92D262D,Rise Against,Savior,74654.0
2,TRMEQQX12903CCD9D5,Sean Kingston and Justin Bieber,Eenie Meenie,66998.0
3,TRKDYZS12903CDB570,3OH!3,STARSTRUKK [FEATURINGKATYPERRY] (Explicit Bonu...,26230.0
4,TRTWBNZ128F92F3426,Vanessa Williams,Colors Of The Wind,26001.0
5,TRYFXPG128E078ECBD,Dr. Dre / Eminem,Forgot About Dre,24502.0
6,TRLAKFT128F427FF44,Rage Against The Machine,Killing In The Name,24485.0
7,TRRRMKC128E0792990,Guns N' Roses,Civil War,22893.0
8,TRDTYHN128E079504D,Twista feat. Kayne West & Jamie Foxx,Slow Jamz (Feat. Kanye West & Jamie Foxx) (Edi...,21025.0
9,TRUUXLZ128F932BA01,Zac Brown Band,Chicken Fried (Album),20088.0
10,TREEFIY128F425B9AE,Three Days Grace,I Hate Everything About You,19959.0


#### Classifier

Convert Bag-of-Words (BoW) dictionaries into plain text strings

In [15]:
keyword = "Rock"
label_by_genre = False

In [17]:
lyrics_df = lyrics_df.merge(
    genres_df[['track_id', 'majority_genre']], 
    on='track_id', 
    how='left'
)

In [19]:
# Create a binary target column for the keyword
y = (lyrics_df["majority_genre"] == keyword).astype(int)
y.value_counts()

majority_genre
0    155597
1     54922
Name: count, dtype: int64

In [22]:
from sklearn.feature_extraction import DictVectorizer

vec = DictVectorizer(sparse=True)
X = vec.fit_transform(lyrics_df["bow"])

In [23]:
X

<210519x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 16845943 stored elements in Compressed Sparse Row format>

In [11]:
# texts = lyrics_df["bow"].apply(
#     lambda bow: " ".join([w for w, c in bow.items() for _ in range(c)])
# )

In [12]:
# texts

0         i i i i i i the the the the you you to to and ...
1         i i i i i i i i i i you you you you you you yo...
2         i i i i i i i i i i i i i i i i i i i i i i i ...
3         i i i i i the the the the you you you to to an...
4         i i i i to to to to to and and and and and and...
                                ...                        
210514    a no no no no no no no no no que que que que q...
210515    i the the the the the the the the the the the ...
210516    i i i i i i i i i i i i i the the the you you ...
210517    i i i i i i i i i i the the the the the the yo...
210518    i i i i the the the the the the the the the th...
Name: bow, Length: 210519, dtype: object

In [13]:
# set(texts[0].split()).__len__()

68

In [25]:
# lyrics_df[lyrics_df['majority_genre'] == "Rock"]

Convert text → numeric vectors that classifiers can process.

In [None]:
 # # Prepare features
# texts = self.lyrics_df["bow"].apply(
#     lambda bow: " ".join([w for w, c in bow.items() for _ in range(c)])
# )

# # Create a binary target column for the keyword
# y = (self.lyrics_df["majority_genre"] == keyword).astype(int)  # 1 if track is the keyword genre, else 0

# # Fit vectorizer
# if vectorizer_type == "count":
#     vectorizer = CountVectorizer(
#         max_features=5000, ngram_range=(1,2), stop_words="english"
#     )
# elif vectorizer_type == "tfidf":
#     vectorizer = TfidfVectorizer(
#         max_features=5000, ngram_range=(1,2), stop_words="english"
#     )
# else:
#     raise ValueError("vectorizer_type must be 'count' or 'tfidf'")

# X = vectorizer.fit_transform(texts)

In [22]:
# vectorizer = CountVectorizer(
#     max_features=5000, # limit vocab size
#     ngram_range=(1,2), # unigrams + bigrams
#     stop_words="english"
# )

In [23]:
# X = vectorizer.fit_transform(texts) # learn vocabulary and transform to feature matrix

In [44]:
# X

In [26]:
# clf = LogisticRegression(
#     max_iter=500,
#     solver='liblinear',
#     random_state=42
# )

clf = LogisticRegression(
    max_iter=1000, solver='saga', 
    class_weight='balanced', random_state=42
)

# clf.fit(X, y)

In [None]:
from sklearn.model_selection import cross_val_predict

probs = cross_val_predict(clf, X, y, cv=5, method='predict_proba')[:, 1]

In [26]:
# probs = clf.predict_proba(X)[:, 1]

In [49]:
probs

array([0.50437715, 0.52943388, 0.40209707, ..., 0.5797232 , 0.66933396,
       0.50214123])

In [91]:
len(probs)

210519

In [92]:
len(probs[probs > 0.5])

106189

In [89]:
scored = pd.DataFrame({
            "track_id": lyrics_df["track_id"].values,
            "score": probs
        })

In [90]:
scored

Unnamed: 0,track_id,score
0,TRAAAAV128F421A322,0.504377
1,TRAAABD128F429CF47,0.529434
2,TRAAAED128E0783FAB,0.402097
3,TRAAAEF128F4273421,0.372638
4,TRAAAEW128F42930C0,0.646538
...,...,...
210514,TRZZZWS128F429CF87,0.312078
210515,TRZZZXA128F428ED56,0.570487
210516,TRZZZXV128F4289747,0.579723
210517,TRZZZYV128F92E996D,0.669334


In [86]:
song_plays = interactions_df.groupby("song_id")["play_count"].sum().reset_index()

In [87]:
song_plays

Unnamed: 0,song_id,play_count
0,SOAAADD12AB018A9DD,24
1,SOAAADE12A6D4F80CC,12
2,SOAAADF12A8C13DF62,9
3,SOAAADZ12A8C1334FB,12
4,SOAAAFI12A6D4F9C66,188
...,...,...
384541,SOZZZRJ12AB0187A75,16
384542,SOZZZRV12A8C1361F1,75
384543,SOZZZSR12AB01854CD,5
384544,SOZZZWD12A6D4F6624,3


In [93]:
tracks_and_plays = tracks_df.merge(song_plays, on="song_id", how="left")
tracks_and_plays["play_count"] = tracks_and_plays["play_count"].fillna(0)
tracks_and_plays

Unnamed: 0,track_id,song_id,artist_name,track_title,play_count
0,TRMMMYQ128F932D901,SOQMMHC12AB0180CB8,Faster Pussy cat,Silent Night,8.0
1,TRMMMKD128F425225D,SOVFVAK12A8C1350D9,Karkkiautomaatti,Tanssi vaan,0.0
2,TRMMMRX128F93187D9,SOGTUKN12AB017F4F1,Hudson Mohawke,No One Could Ever,0.0
3,TRMMMCH128F425532C,SOBNYVR12A8C13558C,Yerba Brava,Si Vos Querés,3.0
4,TRMMMWA128F426B589,SOHSBXH12A8C13B0DF,Der Mystic,Tangle Of Aspens,0.0
...,...,...,...,...,...
999995,TRYYYUS12903CD2DF0,SOTXAME12AB018F136,Kiko Navarro,O Samba Da Vida,0.0
999996,TRYYYJO128F426DA37,SOXQYIQ12A8C137FBB,Kuldeep Manak,Jago Chhadeo,0.0
999997,TRYYYMG128F4260ECA,SOHODZI12A8C137BB3,Gabriel Le Mar,Novemba,0.0
999998,TRYYYDJ128F9310A21,SOLXGOR12A81C21EB7,Elude,Faraday,0.0


In [94]:
merged = scored.merge(tracks_and_plays, on="track_id", how="left")
merged

Unnamed: 0,track_id,score,song_id,artist_name,track_title,play_count
0,TRAAAAV128F421A322,0.504377,SOQPWCR12A6D4FB2A3,Western Addiction,A Poor Recipe For Civic Cohesion,3.0
1,TRAAABD128F429CF47,0.529434,SOCIWDW12A8C13D406,The Box Tops,Soul Deep,72.0
2,TRAAAED128E0783FAB,0.402097,SOXZYWX12A6310ED0C,Jamie Cullum,It's About Time,315.0
3,TRAAAEF128F4273421,0.372638,SONHOTT12A8C13493C,Adam Ant,Something Girls,0.0
4,TRAAAEW128F42930C0,0.646538,SODZYPO12A8C13A91E,Broken Spindles,Burn My Body (Album Version),11.0
...,...,...,...,...,...,...
210514,TRZZZWS128F429CF87,0.312078,SOTVTUY12A8C140E48,Los Prisioneros,Que No Destrocen Tu Vida,0.0
210515,TRZZZXA128F428ED56,0.570487,SOHGAEN12A8C13E7B7,The God Awfuls,No Angels,1.0
210516,TRZZZXV128F4289747,0.579723,SOSSBEC12A8C139F5F,BlackHawk,Stepping Stones,0.0
210517,TRZZZYV128F92E996D,0.669334,SOSMTHR12A8C138B9B,TLC,Dear Lie,51.0


In [95]:
from sklearn.preprocessing import MinMaxScaler

# log scale plays
merged["play_count_log"] = np.log1p(merged["play_count"])

scaler = MinMaxScaler()
merged[["score_norm", "plays_norm"]] = scaler.fit_transform(
            merged[["score", "play_count_log"]].fillna(0)
    )

merged

Unnamed: 0,track_id,score,song_id,artist_name,track_title,play_count,play_count_log,score_norm,plays_norm
0,TRAAAAV128F421A322,0.504377,SOQPWCR12A6D4FB2A3,Western Addiction,A Poor Recipe For Civic Cohesion,3.0,1.386294,0.504377,0.102715
1,TRAAABD128F429CF47,0.529434,SOCIWDW12A8C13D406,The Box Tops,Soul Deep,72.0,4.290459,0.529434,0.317894
2,TRAAAED128E0783FAB,0.402097,SOXZYWX12A6310ED0C,Jamie Cullum,It's About Time,315.0,5.755742,0.402097,0.426461
3,TRAAAEF128F4273421,0.372638,SONHOTT12A8C13493C,Adam Ant,Something Girls,0.0,0.000000,0.372638,0.000000
4,TRAAAEW128F42930C0,0.646538,SODZYPO12A8C13A91E,Broken Spindles,Burn My Body (Album Version),11.0,2.484907,0.646538,0.184115
...,...,...,...,...,...,...,...,...,...
210514,TRZZZWS128F429CF87,0.312078,SOTVTUY12A8C140E48,Los Prisioneros,Que No Destrocen Tu Vida,0.0,0.000000,0.312078,0.000000
210515,TRZZZXA128F428ED56,0.570487,SOHGAEN12A8C13E7B7,The God Awfuls,No Angels,1.0,0.693147,0.570487,0.051357
210516,TRZZZXV128F4289747,0.579723,SOSSBEC12A8C139F5F,BlackHawk,Stepping Stones,0.0,0.000000,0.579723,0.000000
210517,TRZZZYV128F92E996D,0.669334,SOSMTHR12A8C138B9B,TLC,Dear Lie,51.0,3.951244,0.669334,0.292760


In [96]:
alpha = 0.6
merged["final_score"] = alpha * merged["score_norm"] + (1 - alpha) * merged["plays_norm"]
merged

Unnamed: 0,track_id,score,song_id,artist_name,track_title,play_count,play_count_log,score_norm,plays_norm,final_score
0,TRAAAAV128F421A322,0.504377,SOQPWCR12A6D4FB2A3,Western Addiction,A Poor Recipe For Civic Cohesion,3.0,1.386294,0.504377,0.102715,0.343712
1,TRAAABD128F429CF47,0.529434,SOCIWDW12A8C13D406,The Box Tops,Soul Deep,72.0,4.290459,0.529434,0.317894,0.444818
2,TRAAAED128E0783FAB,0.402097,SOXZYWX12A6310ED0C,Jamie Cullum,It's About Time,315.0,5.755742,0.402097,0.426461,0.411843
3,TRAAAEF128F4273421,0.372638,SONHOTT12A8C13493C,Adam Ant,Something Girls,0.0,0.000000,0.372638,0.000000,0.223583
4,TRAAAEW128F42930C0,0.646538,SODZYPO12A8C13A91E,Broken Spindles,Burn My Body (Album Version),11.0,2.484907,0.646538,0.184115,0.461569
...,...,...,...,...,...,...,...,...,...,...
210514,TRZZZWS128F429CF87,0.312078,SOTVTUY12A8C140E48,Los Prisioneros,Que No Destrocen Tu Vida,0.0,0.000000,0.312078,0.000000,0.187247
210515,TRZZZXA128F428ED56,0.570487,SOHGAEN12A8C13E7B7,The God Awfuls,No Angels,1.0,0.693147,0.570487,0.051357,0.362835
210516,TRZZZXV128F4289747,0.579723,SOSSBEC12A8C139F5F,BlackHawk,Stepping Stones,0.0,0.000000,0.579723,0.000000,0.347834
210517,TRZZZYV128F92E996D,0.669334,SOSMTHR12A8C138B9B,TLC,Dear Lie,51.0,3.951244,0.669334,0.292760,0.518704


In [106]:
top_tracks = merged.sort_values("final_score", ascending=False).head(50).reset_index(drop=True)
top_tracks.index = topk.index + 1
top_tracks.index.name = "rank"
top_tracks

Unnamed: 0_level_0,track_id,score,song_id,artist_name,track_title,play_count,play_count_log,score_norm,plays_norm,final_score
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2,TRZJNOD128F4264131,0.949083,SOQGVCS12AF72A078D,Guns N' Roses,Paradise City,60787.0,11.015148,0.949083,0.816147,0.895908
3,TRPDJBX128F42ACF9B,0.98003,SOIROON12A6701E0B8,Soundgarden,Black Hole Sun,24942.0,10.124349,0.98003,0.750145,0.888076
4,TRVLPDN128E07889D1,0.948367,SOOSADY12A6701F119,Daft Punk,Technologic,14572.0,9.586926,0.948367,0.710325,0.85315
5,TRYMLMM12903D05FD3,0.996028,SOHTSUQ12A58A7AFCE,Men Without Hats,Safety Dance,4861.0,8.489205,0.996028,0.628992,0.849214
6,TRSRVAK128F4282ECA,0.942475,SOKOTZG12A6D4F9519,Blue October,Into The Ocean,10536.0,9.262648,0.942475,0.686299,0.840004
7,TRPFYYL128F92F7144,0.837466,SONNSYV12A8C146BEC,Modest Mouse,Float On,85079.0,11.351347,0.837466,0.841057,0.838902
8,TRBIVWU128F92CA9D2,0.961571,SOFZVOT12A8C1408E9,Foo Fighters,Skin And Bones,4810.0,8.47866,0.961571,0.628211,0.828227
9,TREJAUV128E0792A2A,0.918118,SONSHLY12A6701EB77,Bloodhound Gang,Lift Your Head Up High (And Blow Your Brains Out),11537.0,9.353401,0.918118,0.693023,0.82808
10,TRDTYHN128E079504D,0.885288,SOKZZGT12A67ADA4C3,Twista feat. Kayne West & Jamie Foxx,Slow Jamz (Feat. Kanye West & Jamie Foxx) (Edi...,21025.0,9.953515,0.885288,0.737487,0.826168
11,TRWSDEV128F93119BF,0.80981,SOMGIYR12AB0187973,Panic At The Disco,Behind The Sea [Live In Chicago],89974.0,11.407287,0.80981,0.845202,0.823966


In [101]:
# merged["play_count"].isna().sum()

In [102]:
# merged["play_count"] = merged["play_count"].fillna(0)

If the same track appears multiple times, sum scores and play counts.

In [103]:
# merged['final_score'] = merged['score'] * merged['play_count']
# top_tracks = merged.sort_values('final_score', ascending=False).head(50)
# top_tracks.index = top_tracks.index + 1
# top_tracks.index.name = 'rank'
# top_tracks[['artist_name','track_title','play_count']]
# # top_tracks

In [190]:
# ranked = (merged
#             .groupby(["track_id", "artist_name", "track_title"])[["play_count", "score"]]
#             .sum()
#         )
# ranked["final_score"] = ranked["score"] * ranked["play_count"]
# ranked = ranked.sort_values("final_score", ascending=False).head(k)
# ranked.index = ranked.index + 1
# ranked.index.name = "rank"
# ranked

### Collaborative

In [11]:
from sklearn.model_selection import train_test_split

class MusicRecommender:
    def fit(self, *args, **kwargs):
        raise NotImplementedError

    def recommend(self, *args, **kwargs):
        raise NotImplementedError

class Collaborative(MusicRecommender):
    def __init__(self, interactions_df, tracks_df):
        self.interactions_df = interactions_df
        self.tracks_df = tracks_df
        self.train_df = None
        self.test_df = None

    def train_test_split(self, test_size=0.2, random_state=42):
        self.train_df, self.test_df = train_test_split(
            self.interactions_df, test_size=test_size, random_state=random_state
        )

#### People similar to you listening (User-User)

In [247]:
class UserBasedRecommender(Collaborative):
    def __init__(self, interactions_df, tracks_df, top_k_neighbors=50):
        super().__init__(interactions_df, tracks_df)
        self.top_k_neighbors = top_k_neighbors
        self.user_map = None
        self.song_map = None
        self.user_item_sparse = None
        self.user_ids = None
        self.user_sim = None
    
    def fit(self, top_k_neighbors=50):
        # 1. Map user_id and song_id to indices
        self.user_ids = self.train_df['user_id'].unique()
        self.song_ids = self.train_df['song_id'].unique()
        self.user_map = {user: idx for idx, user in enumerate(self.user_ids)}
        self.song_map = {song: idx for idx, song in enumerate(self.song_ids)}
        
        # 2. Build sparse user-item matrix
        rows = self.train_df['user_id'].map(self.user_map).to_numpy()
        cols = self.train_df['song_id'].map(self.song_map).to_numpy()
        data = self.train_df['play_count'].to_numpy()
        self.user_item_sparse = csr_matrix(
            (data, (rows, cols)), shape=(len(self.user_ids), len(self.song_ids))
        )

        # 3. Compute latent user vectors via SVD
        from sklearn.decomposition import TruncatedSVD
        from sklearn.preprocessing import normalize
        svd = TruncatedSVD(n_components=128)
        user_latent = svd.fit_transform(self.user_item_sparse)  # shape: (num_users, 128)

        # 4. Normalize for cosine similarity (dot product = cosine)
        user_latent = normalize(user_latent.astype(np.float32))

        # 5. Build IVF index
        dim = user_latent.shape[1]
        nlist = 4096  # number of clusters
        quantizer = faiss.IndexFlatIP(dim)
        index = faiss.IndexIVFFlat(quantizer, dim, nlist, faiss.METRIC_INNER_PRODUCT) 
        
        # Train the index on a subset
        np.random.seed(42)
        train_sample = user_latent[np.random.choice(user_latent.shape[0], size=100000, replace=False)]
        index.train(train_sample)

        # Add all users to the index
        index.add(user_latent)

        # How many clusters to search? (tradeoff: higher = more accurate, slower)
        index.nprobe = 20

        # 6. Query all users at once
        D, I = index.search(user_latent, self.top_k_neighbors + 1)  # +1 = includes self

        # Store neighbors
        self.user_sim = {}
        for i, user_id in enumerate(self.user_ids):
            neighbors = [(self.user_ids[j], float(d)) 
                         for j, d in zip(I[i], D[i]) if j != i]
            self.user_sim[user_id] = dict(neighbors[:self.top_k_neighbors])

    def recommend_for_user(self, user_id, k=10):
        """Return top-k recommended tracks for a given user based on co-listening"""
        if user_id not in self.user_map:
            return pd.DataFrame(columns=['rank', 'artist_name', 'track_title'])

        # 1. Get neighbors and similarities
        neighbors = self.user_sim.get(user_id, {})
        if not neighbors:
            return pd.DataFrame(columns=['rank', 'artist_name', 'track_title'])

        neighbor_ids, sim_scores = zip(*neighbors.items())
        neighbor_indices = [self.user_map[u] for u in neighbor_ids]

        # 2. Get neighbor interaction matrix
        neighbor_matrix = self.user_item_sparse[neighbor_indices]

        # 3. Weighted sum of neighbor play counts
        sim_scores = np.array(sim_scores).reshape(-1, 1)
        weighted_ratings = neighbor_matrix.T.dot(sim_scores).flatten()

        # 4. Remove already listened items
        user_index = self.user_map[user_id]
        listened = self.user_item_sparse[user_index].toarray().flatten() > 0
        weighted_ratings[listened] = 0

        # 5. Get top-k recommendations
        top_idx = np.argpartition(-weighted_ratings, k)[:k]
        top_scores = weighted_ratings[top_idx]

        recommended_song_ids = [self.song_ids[i] for i in top_idx]
        recs = pd.DataFrame({
            'song_id': recommended_song_ids,
            'score': top_scores
        }).sort_values(by="score", ascending=False).reset_index(drop=True)

        # Optional: merge with track metadata for readability
        recs = recs.merge(self.tracks_df, on="song_id", how="left")
        recs.insert(0, "rank", range(1, len(recs) + 1))
        return recs.head(k)

    def precision_at_k(self, k=10, users=None):
        """
        Compute average Precision@k across users in the test set.

        Precision@k = (# of recommended items @k that are relevant) / k

        Args:
            k (int): number of recommendations to evaluate
            users (list, optional): specific user_ids to evaluate. 
                                    If None, evaluates all test users.

        Returns:
            float: mean precision@k
        """
        if users is None:
            users = self.test_df['user_id'].unique()

        precisions = []
        for user_id in users:
            # 1. Get ground truth (relevant items from test interactions)
            actual_tracks = set(self.test_df[self.test_df['user_id'] == user_id]['song_id'])
            if not actual_tracks:
                continue

            # 2. Get recommended items
            recs = self.recommend_for_user(user_id, k)
            if recs.empty:
                continue
            rec_tracks = set(recs['song_id'])

            # 3. Compute precision@k for this user
            hit_count = len(actual_tracks & rec_tracks)
            precisions.append(hit_count / k)

        return float(np.mean(precisions)) if precisions else 0.0

In [249]:
ub_rec = UserBasedRecommender(interactions_df, tracks_df, top_k_neighbors=50)

In [250]:
ub_rec.train_test_split()

In [251]:
unique_train = set(ub_rec.train_df['user_id'])
unique_test = set(ub_rec.test_df['user_id'])
print(len(unique_train))
print(len(unique_test))

1019318
998596


In [252]:
ub_rec.fit()

In [253]:
import random

user_id = random.choice(ub_rec.user_ids)
print(user_id)

cd32f789c5167e17ca9fb907c86f6e6dfbcf33f0


In [254]:
ub_rec.recommend_for_user(user_id)

Unnamed: 0,rank,song_id,score,track_id,artist_name,track_title
0,1,SOSDUAW12A8C13BA7D,55.499139,TRIMECG128F92D0147,Deadmau5,Arguru
1,2,SOBRJNF12A8C13B63D,29.205863,TRONTHY128F4295577,Smile Empty Soul,Bottom of a Bottle (Explicit Album Version)
2,3,SOXCBEF12A8C146AFE,29.175663,TRFMZCV128F92D2365,Leggo Beast,Itchy Feet
3,4,SOUMYZD12A6D4FAB31,29.102162,TRXTARM128F42B91EE,Animal Collective,It's You
4,5,SOXBFGD12A8C13EAFD,28.029742,TRCMQDO128F42ABBD4,Malk De Koijn;Blæs Boogie;Geo G_ Tony Blacksmi...,Vi Tager Fuglen På Dig
5,6,SOIMCDE12A6D4F8383,27.396887,TRORLTF128F146DE1B,Daft Punk,Around The World (Radio Edit)
6,7,SOJJUKR12AB018509A,26.996297,TRDATPO12903CCDF40,Litfiba,Sole Nero
7,8,SOXQWUS12A58A7B236,26.810068,TRXFUVA128F4248174,Julie London,You'd Be So Nice To Come Home To
8,9,SOLTJPQ12A6D4F8D0F,26.021537,TRJEQAM128F92F8A26,Walter Wanderley,Lobo Bobo
9,10,SOIMLDL12A8C14283D,23.463515,TRALIAX128F92F4855,Ghostland Observatory,Stranger Lover


In [263]:
actual_tracks = set(ub_rec.test_df[ub_rec.test_df['user_id'] == user_id]['song_id'])
recommended_tracks = set(recs['song_id'])

hits = actual_tracks & recommended_tracks
precision_at_k = len(hits) / 10  # 10 is k
print("Actual tracks:", actual_tracks)
print("Recommended tracks:", recommended_tracks)
print("Hits:", hits)
print("Precision@10:", precision_at_k)

Actual tracks: {'SOZVUCT12A8C1424BE', 'SOBZXQL12A58A78057', 'SOCVTLJ12A6310F0FD', 'SOGPBAW12A6D4F9F22', 'SOQLVCG12A6310EE05', 'SOQGSUC12A8C13B66D', 'SODRUCE12A8C141123', 'SOOWVNN12A8C140775', 'SOWYYUQ12A6701D68D', 'SOFUUEH12AB0185F34', 'SOIZLKI12A6D4F7B61', 'SOXTBGF12A6D4FB49C', 'SOAXGDH12A8C13F8A1', 'SOOKEEB12A6D4FA78D', 'SOZVCRW12A67ADA0B7', 'SOOLYZQ12A6D4FA5B7', 'SOCICBL12A8C13D663', 'SOCKSGZ12A58A7CA4B', 'SOWOMMY127F8096DF9', 'SOYITRT12A6D4FA789', 'SODVVPM12A8C139BEE', 'SOUJVIT12A8C1451C1', 'SOSJDQJ12A8C13D4A9', 'SOEBAJL12AB01818D8'}
Recommended tracks: {'SOJSFGS12A8C13CE8E', 'SOAOREB12A6D4F7E4F', 'SOGAZFR12A6D4F9BD0', 'SOEOFAZ12A6701C62E', 'SOYHHOX12A6D4FA542', 'SOZLHEM12AB017E3DB', 'SOSHGDV12AB0180B58', 'SOTZIEU12A8C13B668', 'SOGAPBK12A6D4F8AB1', 'SOBXPDH12A6701C688'}
Hits: set()
Precision@10: 0.0


#### People who listen to this track usually listen (Item-Item)

In [159]:
class ItemBasedRecommender(Collaborative):
    def __init__(self, interactions_df, tracks_df, top_k_neighbors=50):
        super().__init__(interactions_df, tracks_df)
        self.top_k_neighbors = top_k_neighbors
        self.song_map = None
        self.user_map = None
        self.user_item_sparse = None
        self.song_ids = None
        self.item_sim = None  # stores top-K similar items
    
    def fit(self):
        # 1. Map user_id and song_id to indices
        self.user_ids = self.train_df['user_id'].unique()
        self.song_ids = self.train_df['song_id'].unique()
        self.user_map = {user: idx for idx, user in enumerate(self.user_ids)}
        self.song_map = {song: idx for idx, song in enumerate(self.song_ids)}

        # 2. Build sparse user-item matrix
        rows = self.train_df['user_id'].map(self.user_map).to_numpy()
        cols = self.train_df['song_id'].map(self.song_map).to_numpy()
        data = self.train_df['play_count'].to_numpy()
        self.user_item_sparse = csr_matrix(
            (data, (rows, cols)), shape=(len(self.user_ids), len(self.song_ids))
        )

        # 3. Compute item-item similarities (cosine similarity)
        # Transpose: items x users
        from sklearn.preprocessing import normalize
        item_matrix = self.user_item_sparse.T  # shape: (num_songs, num_users)
        item_matrix = normalize(item_matrix.astype(np.float32))
        # Cosine similarity
        self.item_sim = cosine_similarity(item_matrix, dense_output=False)  # sparse matrix
        self.item_sim.setdiag(0)

    def recommend_for_track(self, track_id, k=10):
        """Return top-k recommended tracks similar to a given track"""
        if track_id not in self.song_map:
            return pd.DataFrame(columns=['rank', 'artist_name', 'track_title'])

        track_index = self.song_map[track_id]
        sim_scores = self.item_sim[track_index].toarray().flatten()  # similarity with all songs

        # 1. Get top-k similar tracks
        top_idx = np.argpartition(-sim_scores, k)[:k]
        top_scores = sim_scores[top_idx]
        recommended_song_ids = [self.song_ids[i] for i in top_idx]

        recs = pd.DataFrame({
            'song_id': recommended_song_ids,
            'score': top_scores
        }).sort_values(by="score", ascending=False).reset_index(drop=True)

        # 2. Merge with track metadata
        recs = recs.merge(self.tracks_df, on="song_id", how="left")
        recs.insert(0, "rank", range(1, len(recs) + 1))
        return recs.head(k)

In [160]:
ib_rec = ItemBasedRecommender(interactions_df, tracks_df)

In [161]:
ib_rec.train_test_split(test_size=0.5)

In [162]:
ib_rec.fit()

In [165]:
import random 
track_id = random.choice(list(ib_rec.train_df['song_id']))
track_id

'SOQKFHB12AB0186B13'

In [166]:
ib_rec.recommend_for_track(track_id)

Unnamed: 0,rank,song_id,score,track_id,artist_name,track_title
0,1,SOTNGEQ12AB0179B8F,0.399809,TRTKPKT128F92FB837,Arcana,Eclipse Of The Soul
1,2,SOKBXZF12A6D4F5414,0.300389,TRYNOMO128EF35A43C,Magnet,Where Happiness Lives
2,3,SOODYTD12A58A7BD2F,0.275107,TRYPKUA128F42178F0,My Bitter End,Finding Level Ground
3,4,SOHVJWP12A8C13976F,0.272907,TRRYMPC128F4262232,Leftover Crack,Homeo Apathy
4,5,SOSADVA12AB0186B23,0.267069,TROFMIE12903CD70E0,William Orbit,Triple Concerto
5,6,SOXDWYB12A58A7BC9D,0.243892,TRTOYRM128F92EDA94,Alesana,Not A Single Word About This
6,7,SOHPIUC12AB018046E,0.238307,TRLZSFE128F933D87E,R. Kelly,Be My #2
7,8,SOBCFDU12A8C13E47C,0.211671,TRVRJWK128F429B12C,Alesana,A Siren's Soliloquy
8,9,SORFJFS12B0B80C789,0.207533,TRWHSCA12903CDB268,Felt,Sunlight Bathed the Golden Glow
9,10,SOILGKH12A6D4F546D,0.202098,TRNDCGL128EF358FBA,Dear Whoever,Battlefield Radio


##### test recommend_for_track

In [36]:
ib_rec.user_item_sparse

<1019244x357062 sparse matrix of type '<class 'numpy.int64'>'
	with 24186793 stored elements in Compressed Sparse Row format>

In [37]:
from sklearn.preprocessing import normalize

item_matrix = ib_rec.user_item_sparse.T  # shape: (num_songs, num_users)
item_matrix = normalize(item_matrix.astype(np.float32))

In [38]:
item_sim = cosine_similarity(item_matrix, dense_output=False)

In [41]:
item_sim.setdiag(0)

In [142]:
ib_rec.train_df['song_id']

15079346    SOAIVZL12A8C139DFB
35029287    SOFOGTV12A6D4F9FBE
22471140    SOHYPJR12A8C137A94
18589855    SOGUGWW12A8C136955
23368024    SOKOFXH12A6D4F9B32
                   ...        
48140618    SOZFGBY12A6D4FCEBB
26735830    SOFTXZQ12A8C13C67D
35788921    SOEYVHS12AB0181D31
13315092    SORCNJV12A58A7A6EF
21081788    SODJWHY12A8C142CCE
Name: song_id, Length: 24186793, dtype: object

In [150]:
import random 

track_id = random.choice(ib_rec.train_df['song_id'])

In [151]:
track_id

'SOGILMU12A67ADAD85'

In [152]:
ib_rec.tracks_df[ib_rec.tracks_df['song_id'] == track_id]

Unnamed: 0,track_id,song_id,artist_name,track_title
568953,TRTVMNT128F931217F,SOGILMU12A67ADAD85,Nightwish,Sacrament Of Wilderness


In [153]:
track_id in ib_rec.song_map

True

In [154]:
track_index = ib_rec.song_map[track_id]
sim_scores = item_sim[track_index].toarray().flatten()

In [155]:
k=10
top_idx = np.argpartition(-sim_scores, k)[:k]
top_scores = sim_scores[top_idx]
recommended_song_ids = [ib_rec.song_ids[i] for i in top_idx]

In [None]:
recs = pd.DataFrame({
            'song_id': recommended_song_ids,
            'score': top_scores
        }).sort_values(by="score", ascending=False).reset_index(drop=True)

In [157]:
recs = recs.merge(ib_rec.tracks_df, on="song_id", how="left")

In [158]:
recs

Unnamed: 0,song_id,score,track_id,artist_name,track_title
0,SOGJHGC12A6D4F7941,0.297099,TRWWXGV128F14548CE,Souad Massi,Bladi
1,SOMUCSI12A8C142D4F,0.284077,TRRFGDX128F92DF23B,Robin Guthrie and Harold Budd,Inside_ A Golden Echo
2,SONNLCP12AB01844FE,0.259537,TRZWGWX128F934C38E,Amon Amarth,The Dragons' Flight Across The Waves
3,SOUPIGP12AB018547C,0.224672,TRBUFNV128F9314BEA,The Wes Montgomery Trio,Missile Blues
4,SONUMIB12A8C13EBEC,0.219602,TRDLLJO128F93216EA,Nightwish,Dead Boy's Poem
5,SOFLUME12A6D4FD15F,0.208615,TRXBLME128F424330F,Die Mannequin,Saved By Strangers
6,SOPYBWC12A8C1334BD,0.199924,TRSLJKA128F424C84B,Wes Montgomery Trio,Dangerous
7,SONANSH12A8C1334C6,0.199924,TRSFFCY128F424C84D,Wes Montgomery Trio,Yesterday's Child
8,SOJYHQO12AC90717C0,0.1914,TRWZLWL12903CF3A15,Tommy Sands,Goin' Steady
9,SOGNEMJ12A8C135503,0.182505,TRTEZJM128F424C981,Wes Montgomery Trio,The Way You Look Tonight


##### test precision_recall_at_k

In [167]:
def precision_recall_at_k(self, k=10, tracks=None):
        """
        Compute average Precision@k and Recall@k across tracks in the test set.

        Args:
            k (int): number of recommendations to evaluate.
            tracks (list, optional): specific track_ids to evaluate. 
                                     If None, evaluates all tracks in test set.

        Returns:
            dict: {'precision_at_k': float, 'recall_at_k': float}
        """
        if tracks is None:
            tracks = self.test_df['song_id'].unique()

        precisions = []
        recalls = []

        for track_id in tracks:
            # 1. Ground truth: users who listened to this track in test set
            actual_users = set(self.test_df[self.test_df['song_id'] == track_id]['user_id'])
            if not actual_users:
                continue

            # 2. Recommended tracks
            recs = self.recommend_for_track(track_id, k)
            if recs.empty:
                continue
            rec_tracks = set(recs['song_id'])

            # 3. Compute hits
            # A hit: a recommended track that was listened to by the same users as the original track
            hits = 0
            for rec_track in rec_tracks:
                users_who_listened = set(self.test_df[self.test_df['song_id'] == rec_track]['user_id'])
                hits += len(actual_users & users_who_listened)

            # Precision@k: fraction of recommended tracks relevant to users
            precisions.append(hits / k)
            # Recall@k: fraction of actual users covered
            recalls.append(hits / len(actual_users))

        return {
            'precision_at_k': float(np.mean(precisions)) if precisions else 0.0,
            'recall_at_k': float(np.mean(recalls)) if recalls else 0.0
        }

In [169]:
# precision_recall_at_k(ib_rec)

In [174]:
actual_tracks = set(ib_rec.test_df[ib_rec.test_df['song_id'] == track_id]['user_id'])
recommended_tracks = set(recs['song_id'])

hits = actual_tracks & recommended_tracks
precision_at_k = len(hits) / 10  # 10 is k
# print("Actual tracks:", actual_tracks)
# print("Recommended tracks:", recommended_tracks)
print("Hits:", hits)
print("Precision@10:", precision_at_k)

Recommended tracks: {'SONUMIB12A8C13EBEC', 'SONNLCP12AB01844FE', 'SOMUCSI12A8C142D4F', 'SOPYBWC12A8C1334BD', 'SONANSH12A8C1334C6', 'SOUPIGP12AB018547C', 'SOFLUME12A6D4FD15F', 'SOGNEMJ12A8C135503', 'SOGJHGC12A6D4F7941', 'SOJYHQO12AC90717C0'}
Hits: set()
Precision@10: 0.0
