### Import

In [95]:
# !jupyter kernelspec list
# !conda list

In [104]:
import os
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

### Datasets

In [6]:
data_path = "../data/raw/"
os.listdir(data_path)

['attachments.txt',
 'mxm_dataset_train.txt',
 'p02_msd_tagtraum_cd2.cls',
 'p02_unique_tracks.txt',
 'train_triplets.txt']

Echo Nest Taste Profile Subset (triplets)

In [7]:
interactions_df = pd.read_csv(data_path + 'train_triplets.txt',
                      sep='\t',
                      header=None,
                      names=['user_id', 'song_id', 'play_count']
)

interactions_df

Unnamed: 0,user_id,song_id,play_count
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAPDEY12A81C210A9,1
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFNSP12AF72A0E22,1
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFOVM12A58A7D494,1
...,...,...,...
48373581,b7815dbb206eb2831ce0fe040d0aa537e2e800f7,SOUHHHH12AF729E4AF,2
48373582,b7815dbb206eb2831ce0fe040d0aa537e2e800f7,SOUJVIT12A8C1451C1,1
48373583,b7815dbb206eb2831ce0fe040d0aa537e2e800f7,SOUSMXX12AB0185C24,1
48373584,b7815dbb206eb2831ce0fe040d0aa537e2e800f7,SOWYSKH12AF72A303A,3


MusiXmatch Dataset (lyrics)

In [8]:
with open(data_path + 'mxm_dataset_train.txt', encoding="utf-8") as f:
  lines = f.readlines()

# for i, line in enumerate(lines[:20]):
#   print(i, line.strip())


# Find the line with %
vocab_line = [l for l in lines if l.startswith('%')][0]
vocab = vocab_line[1:].strip().split(',')   # remove '%' and split
print("Vocabulary size:", len(vocab))
print("First 10 words:", vocab[:10])


records = []
for line in lines:
    if line.startswith('#') or line.startswith('%'):
        continue  # skip comments and vocab line

    parts = line.strip().split(',')
    track_id = parts[0]
    mxm_track_id = parts[1]
    word_counts = parts[2:]

    bow = {}
    for wc in word_counts:
        idx, count = wc.split(':')
        bow[vocab[int(idx)-1]] = int(count)  # word indices start at 1

    records.append((track_id, mxm_track_id, bow))

Vocabulary size: 5000
First 10 words: ['i', 'the', 'you', 'to', 'and', 'a', 'me', 'it', 'not', 'in']


In [9]:
lyrics_df = pd.DataFrame(records, columns=['track_id', 'mxm_track_id', 'bow'])
lyrics_df

Unnamed: 0,track_id,mxm_track_id,bow
0,TRAAAAV128F421A322,4623710,"{'i': 6, 'the': 4, 'you': 2, 'to': 2, 'and': 5..."
1,TRAAABD128F429CF47,6477168,"{'i': 10, 'you': 17, 'to': 8, 'and': 2, 'a': 2..."
2,TRAAAED128E0783FAB,2516445,"{'i': 28, 'the': 15, 'you': 2, 'to': 12, 'and'..."
3,TRAAAEF128F4273421,3759847,"{'i': 5, 'the': 4, 'you': 3, 'to': 2, 'and': 1..."
4,TRAAAEW128F42930C0,3783760,"{'i': 4, 'to': 5, 'and': 7, 'a': 2, 'me': 4, '..."
...,...,...,...
210514,TRZZZWS128F429CF87,3080645,"{'a': 1, 'no': 9, 'que': 7, 'de': 1, 'y': 4, '..."
210515,TRZZZXA128F428ED56,2344272,"{'i': 1, 'the': 13, 'you': 6, 'to': 5, 'and': ..."
210516,TRZZZXV128F4289747,1417347,"{'i': 13, 'the': 3, 'you': 17, 'to': 5, 'and':..."
210517,TRZZZYV128F92E996D,6849828,"{'i': 10, 'the': 6, 'you': 20, 'and': 2, 'me':..."


Tagtraum Genre Annotations

In [10]:
genres_df = pd.read_csv(
    data_path + 'p02_msd_tagtraum_cd2.cls',
    sep='\t',
    comment='#',
    header=None,
    names=['track_id', 'majority_genre', 'minority_genre']
)

genres_df

Unnamed: 0,track_id,majority_genre,minority_genre
0,TRAAAAK128F9318786,Rock,
1,TRAAAAW128F429D538,Rap,
2,TRAAABD128F429CF47,Rock,RnB
3,TRAAADJ128F4287B47,Rock,
4,TRAAADZ128F9348C2E,Latin,
...,...,...,...
280826,TRZZZRJ128F42819AF,Rock,
280827,TRZZZUK128F92E3C60,Folk,
280828,TRZZZYV128F92E996D,New Age,RnB
280829,TRZZZZD128F4236844,Rock,


Track ↔ Song Mapping

In [11]:
tracks_df = pd.read_csv(data_path + 'p02_unique_tracks.txt',
                               sep='<SEP>',
                               header=None,
                               engine="python", # needed because <SEP> is more than 1 character
                               names=["track_id", "song_id", "artist_name", "track_title"]
)

tracks_df

Unnamed: 0,track_id,song_id,artist_name,track_title
0,TRMMMYQ128F932D901,SOQMMHC12AB0180CB8,Faster Pussy cat,Silent Night
1,TRMMMKD128F425225D,SOVFVAK12A8C1350D9,Karkkiautomaatti,Tanssi vaan
2,TRMMMRX128F93187D9,SOGTUKN12AB017F4F1,Hudson Mohawke,No One Could Ever
3,TRMMMCH128F425532C,SOBNYVR12A8C13558C,Yerba Brava,Si Vos Querés
4,TRMMMWA128F426B589,SOHSBXH12A8C13B0DF,Der Mystic,Tangle Of Aspens
...,...,...,...,...
999995,TRYYYUS12903CD2DF0,SOTXAME12AB018F136,Kiko Navarro,O Samba Da Vida
999996,TRYYYJO128F426DA37,SOXQYIQ12A8C137FBB,Kuldeep Manak,Jago Chhadeo
999997,TRYYYMG128F4260ECA,SOHODZI12A8C137BB3,Gabriel Le Mar,Novemba
999998,TRYYYDJ128F9310A21,SOLXGOR12A81C21EB7,Elude,Faraday


### Top 250 Tracks

In [21]:
interactions_df

Unnamed: 0,user_id,song_id,play_count
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAPDEY12A81C210A9,1
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFNSP12AF72A0E22,1
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFOVM12A58A7D494,1
...,...,...,...
48373581,b7815dbb206eb2831ce0fe040d0aa537e2e800f7,SOUHHHH12AF729E4AF,2
48373582,b7815dbb206eb2831ce0fe040d0aa537e2e800f7,SOUJVIT12A8C1451C1,1
48373583,b7815dbb206eb2831ce0fe040d0aa537e2e800f7,SOUSMXX12AB0185C24,1
48373584,b7815dbb206eb2831ce0fe040d0aa537e2e800f7,SOWYSKH12AF72A303A,3


In [22]:
interactions_df["song_id"].nunique()

384546

In [23]:
# 1. Aggregate play counts per song
track_playcounts = (
    interactions_df
    .groupby('song_id')['play_count']
    .sum()
    .reset_index()
)

In [24]:
track_playcounts

Unnamed: 0,song_id,play_count
0,SOAAADD12AB018A9DD,24
1,SOAAADE12A6D4F80CC,12
2,SOAAADF12A8C13DF62,9
3,SOAAADZ12A8C1334FB,12
4,SOAAAFI12A6D4F9C66,188
...,...,...
384541,SOZZZRJ12AB0187A75,16
384542,SOZZZRV12A8C1361F1,75
384543,SOZZZSR12AB01854CD,5
384544,SOZZZWD12A6D4F6624,3


In [25]:
# 2. Sort descending
track_playcounts = track_playcounts.sort_values(
    by="play_count", ascending=False
)

In [26]:
track_playcounts

Unnamed: 0,song_id,play_count
25043,SOBONKR12A58A7A7E0,726885
12936,SOAUWYT12A81C206F1,648239
287415,SOSXLTC12AF72A7F54,527893
90798,SOFRQTD12A81C233C0,425463
67917,SOEGIYH12A6D4FC0E3,389880
...,...,...
39627,SOCMFLA12A6D4FA417,1
156718,SOJYBHV12A6D4F9A92,1
39625,SOCMFIS12A58291D8C,1
109970,SOGXJKU12A6D4F6D9F,1


In [27]:
# 3. Take Top 250
top_250 = track_playcounts.head(250)

In [28]:
top_250

Unnamed: 0,song_id,play_count
25043,SOBONKR12A58A7A7E0,726885
12936,SOAUWYT12A81C206F1,648239
287415,SOSXLTC12AF72A7F54,527893
90798,SOFRQTD12A81C233C0,425463
67917,SOEGIYH12A6D4FC0E3,389880
...,...,...
13135,SOAVFLR12A8C138576,35253
224272,SOOLYZQ12A6D4FA5B7,35245
242485,SOPSYOY12A8C142E0B,35191
30161,SOBWSGV12AB018B5E0,35074


In [29]:
# 4. Merge with metadata (tracks_df)
top_250 = (
    top_250.merge(tracks_df, on="song_id", how="left")
    [["artist_name", "track_title", "play_count"]]
)

In [30]:
top_250

Unnamed: 0,artist_name,track_title,play_count
0,Dwight Yoakam,You're The One,726885
1,Björk,Undo,648239
2,Kings Of Leon,Revelry,527893
3,Harmonia,Sehr kosmisch,425463
4,Barry Tuckwell/Academy of St Martin-in-the-Fie...,Horn Concerto No. 4 in E flat K495: II. Romanc...,389880
...,...,...,...
262,Triple Six Mafia,Now I'm High_ Really High,35253
263,The Red Jumpsuit Apparatus,Face Down (Album Version),35245
264,Linkin Park,New Divide (Album Version),35191
265,Selena Gomez & The Scene,Naturally,35074


In [31]:
# 5. Add ranking index
top_250.index = top_250.index + 1
top_250.index.name = "rank"

In [32]:
top_250

Unnamed: 0_level_0,artist_name,track_title,play_count
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Dwight Yoakam,You're The One,726885
2,Björk,Undo,648239
3,Kings Of Leon,Revelry,527893
4,Harmonia,Sehr kosmisch,425463
5,Barry Tuckwell/Academy of St Martin-in-the-Fie...,Horn Concerto No. 4 in E flat K495: II. Romanc...,389880
...,...,...,...
263,Triple Six Mafia,Now I'm High_ Really High,35253
264,The Red Jumpsuit Apparatus,Face Down (Album Version),35245
265,Linkin Park,New Divide (Album Version),35191
266,Selena Gomez & The Scene,Naturally,35074


### Top 100 tracks by genre

In [33]:
interactions_df

Unnamed: 0,user_id,song_id,play_count
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAPDEY12A81C210A9,1
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFNSP12AF72A0E22,1
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFOVM12A58A7D494,1
...,...,...,...
48373581,b7815dbb206eb2831ce0fe040d0aa537e2e800f7,SOUHHHH12AF729E4AF,2
48373582,b7815dbb206eb2831ce0fe040d0aa537e2e800f7,SOUJVIT12A8C1451C1,1
48373583,b7815dbb206eb2831ce0fe040d0aa537e2e800f7,SOUSMXX12AB0185C24,1
48373584,b7815dbb206eb2831ce0fe040d0aa537e2e800f7,SOWYSKH12AF72A303A,3


In [34]:
genres_df

Unnamed: 0,track_id,majority_genre,minority_genre
0,TRAAAAK128F9318786,Rock,
1,TRAAAAW128F429D538,Rap,
2,TRAAABD128F429CF47,Rock,RnB
3,TRAAADJ128F4287B47,Rock,
4,TRAAADZ128F9348C2E,Latin,
...,...,...,...
280826,TRZZZRJ128F42819AF,Rock,
280827,TRZZZUK128F92E3C60,Folk,
280828,TRZZZYV128F92E996D,New Age,RnB
280829,TRZZZZD128F4236844,Rock,


In [35]:
genres_df["majority_genre"].nunique()

15

In [51]:
genres_df["majority_genre"].unique()

array(['Rock', 'Rap', 'Latin', 'Jazz', 'Electronic', 'Punk', 'Pop',
       'New Age', 'Metal', 'RnB', 'Country', 'Reggae', 'Folk', 'Blues',
       'World'], dtype=object)

In [36]:
tracks_df

Unnamed: 0,track_id,song_id,artist_name,track_title
0,TRMMMYQ128F932D901,SOQMMHC12AB0180CB8,Faster Pussy cat,Silent Night
1,TRMMMKD128F425225D,SOVFVAK12A8C1350D9,Karkkiautomaatti,Tanssi vaan
2,TRMMMRX128F93187D9,SOGTUKN12AB017F4F1,Hudson Mohawke,No One Could Ever
3,TRMMMCH128F425532C,SOBNYVR12A8C13558C,Yerba Brava,Si Vos Querés
4,TRMMMWA128F426B589,SOHSBXH12A8C13B0DF,Der Mystic,Tangle Of Aspens
...,...,...,...,...
999995,TRYYYUS12903CD2DF0,SOTXAME12AB018F136,Kiko Navarro,O Samba Da Vida
999996,TRYYYJO128F426DA37,SOXQYIQ12A8C137FBB,Kuldeep Manak,Jago Chhadeo
999997,TRYYYMG128F4260ECA,SOHODZI12A8C137BB3,Gabriel Le Mar,Novemba
999998,TRYYYDJ128F9310A21,SOLXGOR12A81C21EB7,Elude,Faraday


In [37]:
genre = "Rock"

In [38]:
# 1. Merge interactions with track metadata
merged = (
    interactions_df
    .merge(tracks_df, on="song_id", how="left")
    .merge(genres_df[["track_id", "majority_genre"]], on="track_id", how="left")
)

In [39]:
merged

Unnamed: 0,user_id,song_id,play_count,track_id,artist_name,track_title,majority_genre
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1,TRIQAUQ128F42435AD,Jack Johnson,The Cove,Rock
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAPDEY12A81C210A9,1,TRIRLYL128F42539D1,Billy Preston,Nothing from Nothing,RnB
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2,TRMHBXZ128F4238406,Paco De Lucia,Entre Dos Aguas,Electronic
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFNSP12AF72A0E22,1,TRYQMNI128F147C1C7,Josh Rouse,Under Cold Blue Stars,Rock
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFOVM12A58A7D494,1,TRAHZNE128F9341B86,The Dead 60s,Riot Radio (Soundtrack Version),
...,...,...,...,...,...,...,...
49664523,b7815dbb206eb2831ce0fe040d0aa537e2e800f7,SOUHHHH12AF729E4AF,2,TRKUAEO128F933ABFC,Eminem / Obie Trice / Stat Quo / Bobby Creekwa...,We're Back,
49664524,b7815dbb206eb2831ce0fe040d0aa537e2e800f7,SOUJVIT12A8C1451C1,1,TRRNFHH128F92D262D,Rise Against,Savior,Rock
49664525,b7815dbb206eb2831ce0fe040d0aa537e2e800f7,SOUSMXX12AB0185C24,1,TRSLDDC12903CC36E7,Usher featuring will.i.am,OMG,
49664526,b7815dbb206eb2831ce0fe040d0aa537e2e800f7,SOWYSKH12AF72A303A,3,TRNJQAM128F14557AF,matchbox twenty,Downfall (Album Version),Rock


In [40]:
# 2. Filter by genre
genre_df = merged[merged["majority_genre"] == genre]

In [41]:
genre_df

Unnamed: 0,user_id,song_id,play_count,track_id,artist_name,track_title,majority_genre
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1,TRIQAUQ128F42435AD,Jack Johnson,The Cove,Rock
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFNSP12AF72A0E22,1,TRYQMNI128F147C1C7,Josh Rouse,Under Cold Blue Stars,Rock
13,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SODACBL12A8C13C273,1,TRKRHYM128F42934A9,Foo Fighters,Learn To Fly,Rock
14,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SODDNQT12A6D4F5F7E,5,TRPTWGR128F1452734,Héroes del Silencio,Apuesta Por El Rock 'N' Roll,Rock
17,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOEGVZY12A58A7857E,1,TRKEXHB128F147C1C4,Josh Rouse,Nothing Gives Me Pleasure,Rock
...,...,...,...,...,...,...,...
49664512,b7815dbb206eb2831ce0fe040d0aa537e2e800f7,SORPVUD12A67020454,1,TRBTVVD128F146D742,Red Hot Chili Peppers,Otherside (Album Version),Rock
49664513,b7815dbb206eb2831ce0fe040d0aa537e2e800f7,SOSCPOI12A8C139F02,1,TROBUUZ128F4263002,Finger Eleven,Paralyzer,Rock
49664520,b7815dbb206eb2831ce0fe040d0aa537e2e800f7,SOTNMFD12A58A7789E,1,TRLLUPN128F4257E65,RAUNCHY,I Get What I See,Rock
49664524,b7815dbb206eb2831ce0fe040d0aa537e2e800f7,SOUJVIT12A8C1451C1,1,TRRNFHH128F92D262D,Rise Against,Savior,Rock


In [42]:
# 3. Aggregate play counts per track
track_playcounts = (
    genre_df
    .groupby(['track_id', 'artist_name', 'track_title'])['play_count']
    .sum()
    .reset_index()
)

In [43]:
track_playcounts

Unnamed: 0,track_id,artist_name,track_title,play_count
0,TRAAAAK128F9318786,Adelitas Way,Scream,515
1,TRAAABD128F429CF47,The Box Tops,Soul Deep,72
2,TRAAAUR128F428B1FA,International Noise Conspiracy,Smash It Up,375
3,TRAABFH128F92C812E,The Donkeys,Excelsior Lady,410
4,TRAABIG128F9356C56,Poe,Walk the Walk,410
...,...,...,...,...
75933,TRZZZCB128F4249252,Winger,Spell I'm Under,185
75934,TRZZZCL128F428BB80,I Am Ghost,The Ship of Pills and Needed Things,32
75935,TRZZZHL128F9329CFB,Ayreon,Day five: Voices,281
75936,TRZZZRJ128F42819AF,Belle & Sebastian,Lord Anthony,1212


In [44]:
# 4. Sort and take top k
top_100_by_genre = (
    track_playcounts
    .sort_values("play_count", ascending=False)
    .head(100)
    .reset_index(drop=True)
)

In [45]:
top_100_by_genre

Unnamed: 0,track_id,artist_name,track_title,play_count
0,TRGXQES128F42BA5EB,Björk,Undo,648239
1,TRONYHY128F92C9D11,Kings Of Leon,Revelry,527893
2,TRDMBIJ128F4290431,Harmonia,Sehr kosmisch,425463
3,TROAQBZ128F9326213,OneRepublic,Secrets,292642
4,TRIXAZF128F421EE64,Tub Ring,Invalid,268353
...,...,...,...,...
95,TRFXWSD128F93173BF,Metric,Gold Guns Girls,28148
96,TRTRVEP128F428F617,Pearl Jam,Encore Break,27579
97,TRJEITS128F92E2FEC,Daughtry,No Surprise,27187
98,TRVYICQ128F4252493,Eric Clapton,Tears In Heaven,26999


In [46]:
# 5. Add ranking index
top_100_by_genre.index = top_100_by_genre.index + 1
top_100_by_genre.index.name = "rank"

In [47]:
top_100_by_genre

Unnamed: 0_level_0,track_id,artist_name,track_title,play_count
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,TRGXQES128F42BA5EB,Björk,Undo,648239
2,TRONYHY128F92C9D11,Kings Of Leon,Revelry,527893
3,TRDMBIJ128F4290431,Harmonia,Sehr kosmisch,425463
4,TROAQBZ128F9326213,OneRepublic,Secrets,292642
5,TRIXAZF128F421EE64,Tub Ring,Invalid,268353
...,...,...,...,...
96,TRFXWSD128F93173BF,Metric,Gold Guns Girls,28148
97,TRTRVEP128F428F617,Pearl Jam,Encore Break,27579
98,TRJEITS128F92E2FEC,Daughtry,No Surprise,27187
99,TRVYICQ128F4252493,Eric Clapton,Tears In Heaven,26999


### Collections

In [9]:
lyrics_df

Unnamed: 0,track_id,mxm_track_id,bow
0,TRAAAAV128F421A322,4623710,"{'i': 6, 'the': 4, 'you': 2, 'to': 2, 'and': 5..."
1,TRAAABD128F429CF47,6477168,"{'i': 10, 'you': 17, 'to': 8, 'and': 2, 'a': 2..."
2,TRAAAED128E0783FAB,2516445,"{'i': 28, 'the': 15, 'you': 2, 'to': 12, 'and'..."
3,TRAAAEF128F4273421,3759847,"{'i': 5, 'the': 4, 'you': 3, 'to': 2, 'and': 1..."
4,TRAAAEW128F42930C0,3783760,"{'i': 4, 'to': 5, 'and': 7, 'a': 2, 'me': 4, '..."
...,...,...,...
210514,TRZZZWS128F429CF87,3080645,"{'a': 1, 'no': 9, 'que': 7, 'de': 1, 'y': 4, '..."
210515,TRZZZXA128F428ED56,2344272,"{'i': 1, 'the': 13, 'you': 6, 'to': 5, 'and': ..."
210516,TRZZZXV128F4289747,1417347,"{'i': 13, 'the': 3, 'you': 17, 'to': 5, 'and':..."
210517,TRZZZYV128F92E996D,6849828,"{'i': 10, 'the': 6, 'you': 20, 'and': 2, 'me':..."


In [10]:
interactions_df

Unnamed: 0,user_id,song_id,play_count
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAPDEY12A81C210A9,1
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFNSP12AF72A0E22,1
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFOVM12A58A7D494,1
...,...,...,...
48373581,b7815dbb206eb2831ce0fe040d0aa537e2e800f7,SOUHHHH12AF729E4AF,2
48373582,b7815dbb206eb2831ce0fe040d0aa537e2e800f7,SOUJVIT12A8C1451C1,1
48373583,b7815dbb206eb2831ce0fe040d0aa537e2e800f7,SOUSMXX12AB0185C24,1
48373584,b7815dbb206eb2831ce0fe040d0aa537e2e800f7,SOWYSKH12AF72A303A,3


In [11]:
tracks_df

Unnamed: 0,track_id,song_id,artist_name,track_title
0,TRMMMYQ128F932D901,SOQMMHC12AB0180CB8,Faster Pussy cat,Silent Night
1,TRMMMKD128F425225D,SOVFVAK12A8C1350D9,Karkkiautomaatti,Tanssi vaan
2,TRMMMRX128F93187D9,SOGTUKN12AB017F4F1,Hudson Mohawke,No One Could Ever
3,TRMMMCH128F425532C,SOBNYVR12A8C13558C,Yerba Brava,Si Vos Querés
4,TRMMMWA128F426B589,SOHSBXH12A8C13B0DF,Der Mystic,Tangle Of Aspens
...,...,...,...,...
999995,TRYYYUS12903CD2DF0,SOTXAME12AB018F136,Kiko Navarro,O Samba Da Vida
999996,TRYYYJO128F426DA37,SOXQYIQ12A8C137FBB,Kuldeep Manak,Jago Chhadeo
999997,TRYYYMG128F4260ECA,SOHODZI12A8C137BB3,Gabriel Le Mar,Novemba
999998,TRYYYDJ128F9310A21,SOLXGOR12A81C21EB7,Elude,Faraday


#### Baseline

In [59]:
keyword = "love"
k = 50
threshold = 10

In [61]:
mask = lyrics_df['bow'].apply(lambda bow: bow.get(keyword, 0) >= threshold)
keyword_tracks = lyrics_df[mask][["track_id"]]
keyword_tracks

Unnamed: 0,track_id
2,TRAAAED128E0783FAB
45,TRAADNL128F14519DF
70,TRAAFTE128F429545F
78,TRAAGHM128EF35CF8E
101,TRAAHOA128F425A4F7
...,...
210343,TRZZLCS128F92C6124
210389,TRZZOWY128F42BA84E
210447,TRZZTYS128EF347EAB
210450,TRZZUNB128F4263020


In [62]:
merged = (
    keyword_tracks
    .merge(tracks_df, on="track_id", how="left")
    .merge(interactions_df, on="song_id", how="left")
)

In [63]:
merged

Unnamed: 0,track_id,song_id,artist_name,track_title,user_id,play_count
0,TRAAAED128E0783FAB,SOXZYWX12A6310ED0C,Jamie Cullum,It's About Time,cf7bd4b5b398b3e150cf262d79147312a69b96ac,9.0
1,TRAAAED128E0783FAB,SOXZYWX12A6310ED0C,Jamie Cullum,It's About Time,043d81932e75d5749ed5758d6420506e7bc457a5,3.0
2,TRAAAED128E0783FAB,SOXZYWX12A6310ED0C,Jamie Cullum,It's About Time,515e1ab04c00859de983cacf35f150f2ddb37dde,1.0
3,TRAAAED128E0783FAB,SOXZYWX12A6310ED0C,Jamie Cullum,It's About Time,e49ac0612b9444abf3d513e54b1cd77f6fe5ae4b,1.0
4,TRAAAED128E0783FAB,SOXZYWX12A6310ED0C,Jamie Cullum,It's About Time,a2c8271ed491e9fd6ecb5a8760940362e3a54e3a,1.0
...,...,...,...,...,...,...
767302,TRZZWZE128F92D2FCA,SOZMITG12A6D4F862C,Atomic Kitten,I Want Your Love,101c5333c580dc2d936ec6025632138478cf1917,2.0
767303,TRZZWZE128F92D2FCA,SOZMITG12A6D4F862C,Atomic Kitten,I Want Your Love,de011fdc5ed1544fb33d70cbc33443f64cfcec67,2.0
767304,TRZZWZE128F92D2FCA,SOZMITG12A6D4F862C,Atomic Kitten,I Want Your Love,47df968e42b74c06b11fd936425b6ac9d60c73e5,3.0
767305,TRZZWZE128F92D2FCA,SOZMITG12A6D4F862C,Atomic Kitten,I Want Your Love,d494b27dc831e58f4573e52e35be27ca209d3d31,1.0


In [64]:
track_playcounts = (
    merged.groupby(["artist_name", "track_title"])["play_count"]
    .sum()
    .reset_index()
)

In [66]:
track_playcounts

Unnamed: 0,artist_name,track_title,play_count
0,'Til Tuesday,Have Mercy,1.0
1,'Til Tuesday,How Can You Give Up,0.0
2,-123min.,I'm In You,0.0
3,100 Proof Aged in Soul,One Mans Leftovers (Is Another Mans Feast),0.0
4,10000 Maniacs,Love Among The Ruins,19.0
...,...,...,...
6292,tobyMac,Made To Love,3358.0
6293,tobyMac,No Ordinary Love,0.0
6294,yoomiii,For Your Love,0.0
6295,yoomiii,You Are My First Love,0.0


In [73]:
top_k_by_keyword = (
    track_playcounts
    .sort_values(by="play_count", ascending=False)
    .head(k)
    .reset_index(drop=True)
)

In [74]:
top_k_by_keyword

Unnamed: 0,artist_name,track_title,play_count
0,Bill Withers,Make Love To Your Mind,146978.0
1,John Mayer,Half Of My Heart,65966.0
2,Eminem / Dina Rae,Superman,45328.0
3,Guns N' Roses,Don't Cry (Original),40480.0
4,UB40,Red Red Wine (Edit),23263.0
5,Black Eyed Peas,My Humps,23151.0
6,Leona Lewis,Bleeding Love,22195.0
7,John Legend,Save Room,21535.0
8,Modern Lovers,Modern world,20584.0
9,Train,If It's Love,20410.0


In [75]:
top_k_by_keyword.index = top_k_by_keyword.index + 1
top_k_by_keyword.index.name = "rank"

In [76]:
top_k_by_keyword

Unnamed: 0_level_0,artist_name,track_title,play_count
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Bill Withers,Make Love To Your Mind,146978.0
2,John Mayer,Half Of My Heart,65966.0
3,Eminem / Dina Rae,Superman,45328.0
4,Guns N' Roses,Don't Cry (Original),40480.0
5,UB40,Red Red Wine (Edit),23263.0
6,Black Eyed Peas,My Humps,23151.0
7,Leona Lewis,Bleeding Love,22195.0
8,John Legend,Save Room,21535.0
9,Modern Lovers,Modern world,20584.0
10,Train,If It's Love,20410.0


In [None]:
def collection_baseline(self, keyword : str, threshold: int = 10, k: int = 50):
        """Return Top-k tracks containing a keyword in lyrics (baseline approach)."""
        mask = self.lyrics_df["bow"].apply(
            lambda bow: bow.get(keyword, 0) >= threshold
        )
        keyword_tracks = self.lyrics_df[mask]["track_id"]

        merged = (
            keyword_tracks
            .merge(self.tracks_df, on="track_id", how="left")
            .merge(self.interactions_df, on="song_id", how="left")
        )

        track_playcounts = (
            merged.groupby(["track_id", "artist_name", "track_title"])["play_count"]
            .sum()
            .reset_index()
        )

        top_k_by_keyword = (
            track_playcounts
            .sort_values("play_count", ascending=False)
            .head(k)
            .reset_index(drop=True)
        )

        top_k_by_keyword.index = top_k_by_keyword.index + 1
        top_k_by_keyword.index.name = "rank"
        return top_k_by_keyword

#### Word2Vec

In [46]:
keyword = "war"
topn = 10
threshold = 5
k=50

In [12]:
# Convert bow dict -> tokenized list of words (repeated by count)
tokenized_lyrics = [
    [word for word, cnt in bow.items() for _ in range(cnt)]
    for bow in lyrics_df["bow"]
]

In [14]:
# Train Word2Vec
w2v_model = Word2Vec(
    sentences=tokenized_lyrics,
    vector_size=100,  # embedding dimension
    window=5,         # context window
    min_count=3,      # ignore rare words
    workers=4         # parallel threads
)

In [50]:
print(f"Vocabulary size: {len(w2v_model.wv)}")

Vocabulary size: 5000


In [51]:
w2v_model.wv[keyword]

array([-1.9798435e+00,  3.1476986e+00,  5.3034997e-01, -1.0796342e+00,
        3.7239361e-01,  7.3948860e-01,  1.3342622e-01, -4.1996357e-01,
       -1.9665905e+00, -2.8688446e-01, -5.1528299e-03, -1.2821839e+00,
       -7.0479410e-03,  1.7230855e+00,  3.4181881e+00, -1.6156145e+00,
        2.3591237e+00,  2.0879304e+00, -1.3787403e+00,  3.1870937e-01,
        2.2438571e-01,  3.4610486e+00, -2.8742275e-03,  2.1761127e+00,
       -2.8703025e-01, -1.3574673e+00,  8.8820100e-01, -7.4275053e-01,
       -9.1267848e-01, -3.4452534e-01,  5.0873394e+00, -1.6656071e+00,
       -4.5301394e+00, -1.4835590e+00,  2.1878200e+00, -3.4149518e+00,
       -3.8787407e-01, -1.2699935e-01, -2.0061107e+00, -1.0173209e+00,
        5.0611252e-01,  7.7255172e-01, -6.8966287e-01,  4.6053609e-01,
       -1.3743182e+00,  4.0155035e-01, -4.2169224e-02, -2.8938228e-01,
       -2.1008928e+00,  1.2439790e+00,  4.0316841e-01, -1.0409189e+00,
       -7.2549731e-01, -9.7066534e-01, -1.3312080e+00, -2.4182953e-01,
      

In [52]:
w2v_model.wv.most_similar(keyword, topn=topn)

[('hell', 0.7817014455795288),
 ('death', 0.7717195153236389),
 ('truth', 0.7684311866760254),
 ('hate', 0.7483299970626831),
 ('seen', 0.7400242686271667),
 ('wind', 0.7344577312469482),
 ('under', 0.7321529388427734),
 ('land', 0.7301612496376038),
 ('arm', 0.7274069786071777),
 ('set', 0.7272908687591553)]

In [53]:
similar_words = [w for w, _ in w2v_model.wv.most_similar(keyword, topn=topn)]

In [54]:
similar_words

['hell',
 'death',
 'truth',
 'hate',
 'seen',
 'wind',
 'under',
 'land',
 'arm',
 'set']

In [55]:
all_keywords = [keyword] + similar_words

In [56]:
all_keywords

['war',
 'hell',
 'death',
 'truth',
 'hate',
 'seen',
 'wind',
 'under',
 'land',
 'arm',
 'set']

In [57]:
mask = lyrics_df["bow"].apply(
    lambda bow: any(bow.get(w, 0) >= threshold for w in all_keywords)
)

In [58]:
keyword_tracks = lyrics_df[mask][["track_id"]]
keyword_tracks

Unnamed: 0,track_id
21,TRAABOA128F933684A
56,TRAAEJH128E0785506
110,TRAAHZP12903CA25F4
170,TRAAMCQ128F4259A2F
244,TRAARXD12903D0CF24
...,...
210416,TRZZQSK128F92EF0B9
210446,TRZZTUV128F426B6EB
210473,TRZZWTN128F9352EC6
210493,TRZZYLO12903CAC06C


In [59]:
merged = (
    keyword_tracks
        .merge(tracks_df, on="track_id", how="left")
        .merge(interactions_df, on="song_id", how="left")
)

In [60]:
merged

Unnamed: 0,track_id,song_id,artist_name,track_title,user_id,play_count
0,TRAABOA128F933684A,SONHGLD12AB0188D47,Anthony B,Our Father,b58ad35665d625169bfe75ba3f97dffed518edac,8.0
1,TRAABOA128F933684A,SONHGLD12AB0188D47,Anthony B,Our Father,02756dc4251d3a9f4e28a94cf1a56a46a5b4865a,1.0
2,TRAAEJH128E0785506,SOFBGBL12A67020D9F,Hank Williams Jr.,Tuesday's Gone (Remastered Album Version),b322da50dc02b89bbb347dc3ee475f4fa19f4c62,2.0
3,TRAAEJH128E0785506,SOFBGBL12A67020D9F,Hank Williams Jr.,Tuesday's Gone (Remastered Album Version),aa5f5df9d3c41fc2d03e55110cd12e2007d3811a,4.0
4,TRAAEJH128E0785506,SOFBGBL12A67020D9F,Hank Williams Jr.,Tuesday's Gone (Remastered Album Version),29f9386dc001a437d233f2e7902020ffc1213e11,1.0
...,...,...,...,...,...,...
656967,TRZZZCB128F4249252,SOKOXRU12A8C131E09,Winger,Spell I'm Under,a93f9fb3fcfb3d7182e4f97848c4f291b98f47b8,2.0
656968,TRZZZCB128F4249252,SOKOXRU12A8C131E09,Winger,Spell I'm Under,730fa2e10d61c8d0e86d0ed7addecb7dc5dd1021,1.0
656969,TRZZZCB128F4249252,SOKOXRU12A8C131E09,Winger,Spell I'm Under,ba149996a348302ed135b4166e716be174aed211,1.0
656970,TRZZZCB128F4249252,SOKOXRU12A8C131E09,Winger,Spell I'm Under,278254ed302a6e26bf07f1eb23f138d26758fd96,1.0


In [61]:
track_playcounts = (
    merged.groupby(["track_id", "artist_name", "track_title"])["play_count"]
        .sum()
        .reset_index()
)

In [62]:
track_playcounts

Unnamed: 0,track_id,artist_name,track_title,play_count
0,TRAABOA128F933684A,Anthony B,Our Father,9.0
1,TRAAEJH128E0785506,Hank Williams Jr.,Tuesday's Gone (Remastered Album Version),38.0
2,TRAAHZP12903CA25F4,Organized Konfusion,Hate,43.0
3,TRAAMCQ128F4259A2F,Primal Scream,Pills,3424.0
4,TRAARXD12903D0CF24,Black Label Society,Born To Booze,40.0
...,...,...,...,...
4502,TRZZQSK128F92EF0B9,Disciple,Love Hate (On And On),129.0
4503,TRZZTUV128F426B6EB,Weeping Willows,Echoes Of Your Breath,3.0
4504,TRZZWTN128F9352EC6,Ektomorf,Rat War,51.0
4505,TRZZYLO12903CAC06C,Dallas Holm,I've Never Seen The Righteous Forsaken,0.0


In [70]:
top_k_by_keyword_word2vec = (
    track_playcounts
    .sort_values(by="play_count", ascending=False)
    .head(k)
    .reset_index(drop=True)
)

In [71]:
top_k_by_keyword_word2vec

Unnamed: 0,track_id,artist_name,track_title,play_count
0,TRRNFHH128F92D262D,Rise Against,Savior,74654.0
1,TRMEQQX12903CCD9D5,Sean Kingston and Justin Bieber,Eenie Meenie,66998.0
2,TRKDYZS12903CDB570,3OH!3,STARSTRUKK [FEATURINGKATYPERRY] (Explicit Bonu...,26230.0
3,TRTWBNZ128F92F3426,Vanessa Williams,Colors Of The Wind,26001.0
4,TRYFXPG128E078ECBD,Dr. Dre / Eminem,Forgot About Dre,24502.0
5,TRLAKFT128F427FF44,Rage Against The Machine,Killing In The Name,24485.0
6,TRRRMKC128E0792990,Guns N' Roses,Civil War,22893.0
7,TRDTYHN128E079504D,Twista feat. Kayne West & Jamie Foxx,Slow Jamz (Feat. Kanye West & Jamie Foxx) (Edi...,21025.0
8,TRUUXLZ128F932BA01,Zac Brown Band,Chicken Fried (Album),20088.0
9,TREEFIY128F425B9AE,Three Days Grace,I Hate Everything About You,19959.0


In [72]:
top_k_by_keyword_word2vec.index = top_k_by_keyword_word2vec.index + 1
top_k_by_keyword_word2vec.index.name = "rank"

In [73]:
top_k_by_keyword_word2vec

Unnamed: 0_level_0,track_id,artist_name,track_title,play_count
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,TRRNFHH128F92D262D,Rise Against,Savior,74654.0
2,TRMEQQX12903CCD9D5,Sean Kingston and Justin Bieber,Eenie Meenie,66998.0
3,TRKDYZS12903CDB570,3OH!3,STARSTRUKK [FEATURINGKATYPERRY] (Explicit Bonu...,26230.0
4,TRTWBNZ128F92F3426,Vanessa Williams,Colors Of The Wind,26001.0
5,TRYFXPG128E078ECBD,Dr. Dre / Eminem,Forgot About Dre,24502.0
6,TRLAKFT128F427FF44,Rage Against The Machine,Killing In The Name,24485.0
7,TRRRMKC128E0792990,Guns N' Roses,Civil War,22893.0
8,TRDTYHN128E079504D,Twista feat. Kayne West & Jamie Foxx,Slow Jamz (Feat. Kanye West & Jamie Foxx) (Edi...,21025.0
9,TRUUXLZ128F932BA01,Zac Brown Band,Chicken Fried (Album),20088.0
10,TREEFIY128F425B9AE,Three Days Grace,I Hate Everything About You,19959.0


### Classifier

Convert Bag-of-Words (BoW) dictionaries into plain text strings

In [77]:
texts = lyrics_df["bow"].apply(
    lambda bow: " ".join([w for w, c in bow.items() for _ in range(c)])
)

In [78]:
texts

0         i i i i i i the the the the you you to to and ...
1         i i i i i i i i i i you you you you you you yo...
2         i i i i i i i i i i i i i i i i i i i i i i i ...
3         i i i i i the the the the you you you to to an...
4         i i i i to to to to to and and and and and and...
                                ...                        
210514    a no no no no no no no no no que que que que q...
210515    i the the the the the the the the the the the ...
210516    i i i i i i i i i i i i i the the the you you ...
210517    i i i i i i i i i i the the the the the the yo...
210518    i i i i the the the the the the the the the th...
Name: bow, Length: 210519, dtype: object

In [117]:
lyrics_df = lyrics_df.merge(
    genres_df[['track_id', 'majority_genre']], 
    on='track_id', 
    how='left'
)

In [118]:
lyrics_df

Unnamed: 0,track_id,mxm_track_id,bow,majority_genre
0,TRAAAAV128F421A322,4623710,"{'i': 6, 'the': 4, 'you': 2, 'to': 2, 'and': 5...",
1,TRAAABD128F429CF47,6477168,"{'i': 10, 'you': 17, 'to': 8, 'and': 2, 'a': 2...",Rock
2,TRAAAED128E0783FAB,2516445,"{'i': 28, 'the': 15, 'you': 2, 'to': 12, 'and'...",Jazz
3,TRAAAEF128F4273421,3759847,"{'i': 5, 'the': 4, 'you': 3, 'to': 2, 'and': 1...",Rock
4,TRAAAEW128F42930C0,3783760,"{'i': 4, 'to': 5, 'and': 7, 'a': 2, 'me': 4, '...",
...,...,...,...,...
210514,TRZZZWS128F429CF87,3080645,"{'a': 1, 'no': 9, 'que': 7, 'de': 1, 'y': 4, '...",
210515,TRZZZXA128F428ED56,2344272,"{'i': 1, 'the': 13, 'you': 6, 'to': 5, 'and': ...",
210516,TRZZZXV128F4289747,1417347,"{'i': 13, 'the': 3, 'you': 17, 'to': 5, 'and':...",
210517,TRZZZYV128F92E996D,6849828,"{'i': 10, 'the': 6, 'you': 20, 'and': 2, 'me':...",New Age


In [127]:
keyword = "Rock"

# Create a binary target column for the keyword
y = (lyrics_df['majority_genre'] == keyword).astype(int)

In [128]:
y.value_counts()

majority_genre
0    155597
1     54922
Name: count, dtype: int64

Convert text → numeric vectors that classifiers can process.

In [99]:
vectorizer = CountVectorizer(
    max_features=5000, # limit vocab size
    ngram_range=(1,2), # unigrams + bigrams
    stop_words="english"
)

In [102]:
X = vectorizer.fit_transform(texts) # learn vocabulary and transform to feature matrix

<210519x5000 sparse matrix of type '<class 'numpy.int64'>'
	with 14798314 stored elements in Compressed Sparse Row format>

In [130]:
clf = LogisticRegression(
    max_iter=500,
    solver='liblinear',
    random_state=42
)

clf.fit(X, y)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,500


In [131]:
probs = clf.predict_proba(X_train)[:, 1]

In [160]:
probs

array([0.27466296, 0.25953099, 0.19197542, ..., 0.15883331, 0.34645639,
       0.27952123])

In [134]:
scored = pd.DataFrame({
            "track_id": lyrics_df["track_id"],
            "score": probs
        })

In [135]:
scored

Unnamed: 0,track_id,score
0,TRAAAAV128F421A322,0.274663
1,TRAAABD128F429CF47,0.259531
2,TRAAAED128E0783FAB,0.191975
3,TRAAAEF128F4273421,0.248846
4,TRAAAEW128F42930C0,0.530273
...,...,...
210514,TRZZZWS128F429CF87,0.067474
210515,TRZZZXA128F428ED56,0.327225
210516,TRZZZXV128F4289747,0.158833
210517,TRZZZYV128F92E996D,0.346456


In [137]:
merged = (
            scored
            .merge(tracks_df, on="track_id", how="left")
            .merge(interactions_df, on="song_id", how="left")
        )

In [138]:
merged

Unnamed: 0,track_id,score,song_id,artist_name,track_title,user_id,play_count
0,TRAAAAV128F421A322,0.274663,SOQPWCR12A6D4FB2A3,Western Addiction,A Poor Recipe For Civic Cohesion,3d599f5bfc8998d2f002b3f3413d489b7c156ac0,1.0
1,TRAAAAV128F421A322,0.274663,SOQPWCR12A6D4FB2A3,Western Addiction,A Poor Recipe For Civic Cohesion,7557a1fb62b97d4cd895570e9a7bb722dea3f817,1.0
2,TRAAAAV128F421A322,0.274663,SOQPWCR12A6D4FB2A3,Western Addiction,A Poor Recipe For Civic Cohesion,eda7de506ad98b9710c1dd14c97d479d1369b0e7,1.0
3,TRAAABD128F429CF47,0.259531,SOCIWDW12A8C13D406,The Box Tops,Soul Deep,84ce6a9b05c928a12f052c78554ac74a88fb28b5,3.0
4,TRAAABD128F429CF47,0.259531,SOCIWDW12A8C13D406,The Box Tops,Soul Deep,6280dcb18e0a8d7e5b1e578b6e1d4edea1cfb9c8,1.0
...,...,...,...,...,...,...,...
23921796,TRZZZYX128F92D32C6,0.279521,SOVZGCC12A58A794FC,Donald Fagen,Trans-Island Skyway (Album Version),a9ff32453027093aa607bad48ad1a0b475d3dff0,1.0
23921797,TRZZZYX128F92D32C6,0.279521,SOVZGCC12A58A794FC,Donald Fagen,Trans-Island Skyway (Album Version),bc27ed8c33730fc00b42e805294cbf0582d3568c,1.0
23921798,TRZZZYX128F92D32C6,0.279521,SOVZGCC12A58A794FC,Donald Fagen,Trans-Island Skyway (Album Version),a769ef651be79a0474c5363c83b31d3dcd53a114,8.0
23921799,TRZZZYX128F92D32C6,0.279521,SOVZGCC12A58A794FC,Donald Fagen,Trans-Island Skyway (Album Version),f597ef8df997087c3ddb0d597cb3ba5d71cbe8de,1.0


In [139]:
merged["play_count"].isna().sum()

84700

In [140]:
merged["play_count"] = merged["play_count"].fillna(0)

If the same track appears multiple times, sum scores and play counts.

In [150]:
ranked = (
            merged.groupby(["track_id", "artist_name", "track_title"])[["play_count", "score"]]
            .sum()
        )

In [151]:
ranked

Unnamed: 0,play_count,score
0,3.0,0.823989
1,72.0,7.007337
2,315.0,30.908042
3,0.0,0.248846
4,11.0,1.060547
...,...,...
210512,0.0,0.067474
210513,1.0,0.327225
210514,0.0,0.158833
210515,51.0,3.811020


In [154]:
ranked["final_score"] = ranked["score"] * ranked["play_count"]
ranked = ranked.sort_values("final_score", ascending=False).head(k).reset_index(drop=True)

In [155]:
ranked

Unnamed: 0,play_count,score,final_score
0,726885.0,26229.671948,19065960000.0
1,648239.0,18292.828299,11858120000.0
2,292642.0,27625.715676,8084445000.0
3,268353.0,14984.911115,4021246000.0
4,274627.0,9757.150022,2679577000.0
5,114362.0,22507.560614,2574010000.0
6,128837.0,16509.323808,2127012000.0
7,192884.0,10602.468086,2045046000.0
8,174080.0,11665.016804,2030646000.0
9,244730.0,6705.289396,1640985000.0


In [156]:
ranked.index = ranked.index + 1
ranked.index.name = "rank"

In [157]:
ranked

Unnamed: 0_level_0,play_count,score,final_score
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,726885.0,26229.671948,19065960000.0
2,648239.0,18292.828299,11858120000.0
3,292642.0,27625.715676,8084445000.0
4,268353.0,14984.911115,4021246000.0
5,274627.0,9757.150022,2679577000.0
6,114362.0,22507.560614,2574010000.0
7,128837.0,16509.323808,2127012000.0
8,192884.0,10602.468086,2045046000.0
9,174080.0,11665.016804,2030646000.0
10,244730.0,6705.289396,1640985000.0


In [None]:
def collection_classifier(self, keyword: str, k: int = 50, vectorizer_type="count"):
        """
        Predict tracks about a keyword using a trained classifier.
         keyword: str, the genre/label to predict (must exist in lyrics_df['genre'] or similar)
        k: int, number of top tracks to return
        vectorizer_type: "count" for CountVectorizer or "tfidf" for TfidfVectorizer
        """
        from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
        from sklearn.linear_model import LogisticRegression

        # Prepare features
        texts = self.lyrics_df["bow"].apply(
            lambda bow: " ".join([w for w, c in bow.items() for _ in range(c)])
        )

        # Create a binary target column for the keyword
        y = (self.lyrics_df["majority_genre"] == keyword).astype(int)  # 1 if track is the keyword genre, else 0

        # Fit vectorizer
        if vectorizer_type == "count":
            vectorizer = CountVectorizer(max_features=5000, ngram_range=(1,2), stop_words="english")
        elif vectorizer_type == "tfidf":
            vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1,2), stop_words="english")
        else:
            raise ValueError("vectorizer_type must be 'count' or 'tfidf'")
        
        X = vectorizer.fit_transform(texts)

        # Train classifier
        classifier = LogisticRegression(max_iter=500, solver='liblinear', random_state=42)
        classifier.fit(X, y)
        
        # Predict probabilities
        probs = classifier.predict_proba(X)[:, 1]  # probability of positive class

        scored = pd.DataFrame({
            "track_id": self.lyrics_df["track_id"],
            "score": probs
        })

        # Merge metadata
        merged = (
            scored
            .merge(self.tracks_df, on="track_id", how="left")
            .merge(self.interactions_df, on="song_id", how="left")
        )
        
        merged["play_count"] = merged["play_count"].fillna(0)
        
        ranked = (
            merged.groupby(["track_id", "artist_name", "track_title"])[["play_count", "score"]]
            .sum()
            .reset_index()
        )
        
        # Final ranking
        ranked["final_score"] = ranked["score"] * ranked["play_count"]
        ranked = ranked.sort_values("final_score", ascending=False).head(k)
        
        ranked.index = ranked.index + 1
        ranked.index.name = "rank"
        
        return ranked