### Import

In [1]:
import os
import pandas as pd

### Datasets

In [2]:
data_path = './drive/MyDrive/S21/data/My_Spotify/'

os.listdir(data_path)

['p02_msd_tagtraum_cd2.cls',
 'p02_unique_tracks.txt',
 'train_triplets.txt',
 'mxm_dataset_train.txt']

Echo Nest Taste Profile Subset (triplets)

In [3]:
interactions_df = pd.read_csv(data_path + 'train_triplets.txt',
                      sep='\t',
                      header=None,
                      names=['user_id', 'song_id', 'play_count']
)

print(interactions_df.shape)
interactions_df.head()

(48373586, 3)


Unnamed: 0,user_id,song_id,play_count
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAPDEY12A81C210A9,1
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFNSP12AF72A0E22,1
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFOVM12A58A7D494,1


MusiXmatch Dataset (lyrics)

In [4]:
with open(data_path+'mxm_dataset_train.txt') as f:
  lines = f.readlines()

# for i, line in enumerate(lines[:20]):
#   print(i, line.strip())


# Find the line with %
vocab_line = [l for l in lines if l.startswith('%')][0]
vocab = vocab_line[1:].strip().split(',')   # remove '%' and split
print("Vocabulary size:", len(vocab))
print("First 10 words:", vocab[:10])


records = []
for line in lines:
    if line.startswith('#') or line.startswith('%'):
        continue  # skip comments and vocab line

    parts = line.strip().split(',')
    track_id = parts[0]
    mxm_track_id = parts[1]
    word_counts = parts[2:]

    bow = {}
    for wc in word_counts:
        idx, count = wc.split(':')
        bow[vocab[int(idx)-1]] = int(count)  # word indices start at 1

    records.append((track_id, mxm_track_id, bow))

Vocabulary size: 5000
First 10 words: ['i', 'the', 'you', 'to', 'and', 'a', 'me', 'it', 'not', 'in']


In [5]:
lyrics_df = pd.DataFrame(records, columns=['track_id', 'mxm_track_id', 'bow'])

print(lyrics_df.shape)
lyrics_df.head()

(210519, 3)


Unnamed: 0,track_id,mxm_track_id,bow
0,TRAAAAV128F421A322,4623710,"{'i': 6, 'the': 4, 'you': 2, 'to': 2, 'and': 5..."
1,TRAAABD128F429CF47,6477168,"{'i': 10, 'you': 17, 'to': 8, 'and': 2, 'a': 2..."
2,TRAAAED128E0783FAB,2516445,"{'i': 28, 'the': 15, 'you': 2, 'to': 12, 'and'..."
3,TRAAAEF128F4273421,3759847,"{'i': 5, 'the': 4, 'you': 3, 'to': 2, 'and': 1..."
4,TRAAAEW128F42930C0,3783760,"{'i': 4, 'to': 5, 'and': 7, 'a': 2, 'me': 4, '..."


Tagtraum Genre Annotations

In [6]:
genres_df = pd.read_csv(
    data_path + 'p02_msd_tagtraum_cd2.cls',
    sep='\t',
    comment='#',
    header=None,
    names=['track_id', 'majority_genre', 'minority_genre']
)

print(genres_df.shape)
genres_df.head(20)

(280831, 3)


Unnamed: 0,track_id,majority_genre,minority_genre
0,TRAAAAK128F9318786,Rock,
1,TRAAAAW128F429D538,Rap,
2,TRAAABD128F429CF47,Rock,RnB
3,TRAAADJ128F4287B47,Rock,
4,TRAAADZ128F9348C2E,Latin,
5,TRAAAED128E0783FAB,Jazz,
6,TRAAAEF128F4273421,Rock,
7,TRAAAEM128F93347B9,Electronic,
8,TRAAAFD128F92F423A,Punk,Rock
9,TRAAAGF12903CEC202,Pop,


Track ↔ Song Mapping

In [7]:
tracks_df = pd.read_csv(data_path + 'p02_unique_tracks.txt',
                               sep='<SEP>',
                               header=None,
                               engine="python", # needed because <SEP> is more than 1 character
                               names=["track_id", "song_id", "artist_name", "track_title"]
)

print(tracks_df.shape)
tracks_df.head()

(1000000, 4)


Unnamed: 0,track_id,song_id,artist_name,track_title
0,TRMMMYQ128F932D901,SOQMMHC12AB0180CB8,Faster Pussy cat,Silent Night
1,TRMMMKD128F425225D,SOVFVAK12A8C1350D9,Karkkiautomaatti,Tanssi vaan
2,TRMMMRX128F93187D9,SOGTUKN12AB017F4F1,Hudson Mohawke,No One Could Ever
3,TRMMMCH128F425532C,SOBNYVR12A8C13558C,Yerba Brava,Si Vos Querés
4,TRMMMWA128F426B589,SOHSBXH12A8C13B0DF,Der Mystic,Tangle Of Aspens


### Top 250 Tracks

In [20]:
interactions_df.head()

Unnamed: 0,user_id,song_id,play_count
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAPDEY12A81C210A9,1
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFNSP12AF72A0E22,1
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFOVM12A58A7D494,1


In [5]:
# 1. Aggregate play counts per song
track_playcounts = (
    interactions_df
    .groupby('song_id')['play_count']
    .sum()
    .reset_index()
)

In [6]:
# 2. Sort descending
track_playcounts = track_playcounts.sort_values(
    by="play_count", ascending=False
)

In [7]:
# 3. Take Top 250
top_250 = track_playcounts.head(250)

In [11]:
# 4. Merge with metadata (tracks_df)
top_250 = (
    top_250.merge(tracks_df, on="song_id", how="left")
    [["artist_name", "track_title", "play_count"]]
)

In [18]:
# 5. Add ranking index
top_250.index = top_250.index + 1
top_250.index.name = "rank"

In [19]:
top_250

Unnamed: 0_level_0,artist_name,track_title,play_count
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
4,Dwight Yoakam,You're The One,726885
5,Björk,Undo,648239
6,Kings Of Leon,Revelry,527893
7,Harmonia,Sehr kosmisch,425463
8,Barry Tuckwell/Academy of St Martin-in-the-Fie...,Horn Concerto No. 4 in E flat K495: II. Romanc...,389880
...,...,...,...
266,Triple Six Mafia,Now I'm High_ Really High,35253
267,The Red Jumpsuit Apparatus,Face Down (Album Version),35245
268,Linkin Park,New Divide (Album Version),35191
269,Selena Gomez & The Scene,Naturally,35074


### Top 100 tracks by genre

In [2]:
genres_df.head()

In [7]:
genre = "Rock"

In [None]:
# 1. Merge interactions with track metadata
merged = (
    interactions_df
    .merge(tracks_df, on="song_id", how="left")
    .merge(genres_df[["track_id", "majority_genre"]], on="track_id", how="left")
)

In [None]:
# 2. Filter by genre
genre_df = merged[merged["majority_genre"] == genre]

In [None]:
# 3. Aggregate play counts per track
track_playcounts = (
    genre_df
    .groupby(['track_id', 'artist_name', 'track_title'])['play_count']
    .sum()
    .reset_index()
)

In [None]:
# 4. Sort and take top k
top_100 = (
    track_playcounts
    .sort_values("play_count", ascending=False)
    .head(100)
    .reset_index(drop=True)
)

In [None]:
# 5. Add ranking index
top_100.index = top_100.index + 1
top_100.index.name = "rank"

In [None]:
def recommend_top_k_by_genre(interactions_df, unique_tracks_df, tagtraum_df, genre, k=100):
    """
    Return Top k tracks for a given genre.
    Output: DataFrame [index, artist_name, track_title, play_count]
    """
    # 1. Merge interactions with track metadata
    merged = (
        interactions_df
        .merge(unique_tracks_df, on="song_id", how="left")
        .merge(tagtraum_df[["track_id", "majority_genre"]], on="track_id", how="left")
    )

    # 2. Filter by genre
    genre_df = merged[merged["majority_genre"] == genre]

    # 3. Aggregate play counts per track
    track_playcounts = (
        genre_df.groupby(["track_id", "artist_name", "track_title"])["play_count"]
        .sum()
        .reset_index()
    )

    # 4. Sort and take top k
    top_k = (
        track_playcounts
        .sort_values("play_count", ascending=False)
        .head(k)
        .reset_index(drop=True)
    )

    # 5. Add ranking index
    top_k.index = top_k.index + 1
    top_k.index.name = "rank"

    return top_k


### Collections

#### Baseline

In [63]:
# lyrics_df.head()
# interactions_df.head()
# tracks_df.head()

In [72]:
keyword = "love"
k = 50
threshold = 50

In [74]:
mask = lyrics_df['bow'].apply(lambda bow: bow.get(keyword, 0) >= threshold)
keyword_tracks = lyrics_df[mask][["track_id"]]
keyword_tracks

Unnamed: 0,track_id
340,TRAAZKV12903CDDFE0
2604,TRAIATD128F9331CD6
6590,TRAUGTD128F9341258
8354,TRAZWJN128F92FAABA
10271,TRBFYNY128F92F8755
...,...
188765,TRXHMID128F42BA902
191998,TRXSCYN12903CE9E81
201635,TRYWXXY12903CB9898
203608,TRZDARI128F427D53F


In [75]:
merged = (
    keyword_tracks
    .merge(tracks_df, on="track_id", how="left")
    .merge(interactions_df, on="song_id", how="left")
)

In [83]:
# merged

In [77]:
track_playcounts = (
    merged.groupby(["artist_name", "track_title"])["play_count"]
    .sum()
    .reset_index()
    .sort_values(by="play_count", ascending=False)
    .head(k)
)

In [80]:
track_playcounts.index = track_playcounts.index + 1
track_playcounts.index.name = "rank"

In [81]:
track_playcounts.head(10)

Unnamed: 0_level_0,artist_name,track_title,play_count
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
75,Wilson Pickett,I'm In Love (Single/LP Version),9713.0
21,Enrique Iglesias / Ciara,Takin' Back My Love,3155.0
31,Jill Scott,It's Love,2648.0
76,Zapp & Roger,Computer Love,1896.0
46,Luther Vandross,Wait For Love,1330.0
48,Matt Redman,Here Is Love,1072.0
70,Third World,Try Jah Love,1027.0
62,Squeeze,If It's Love,734.0
36,Karyn White,The Way You Love Me (LP Version),636.0
49,N.E.R.D.,Backseat Love,621.0


In [None]:
def collection_baseline(self, keyword : str, threshold: int = 1, k: int = 50):
    """Return Top-k tracks containing a keyword in lyrics (baseline approach)."""
    mask = self.lyrics_df["bow"].apply(lambda bow: bow.get(keyword, 0) >= threshold)
    keyword_tracks = self.lyrics_df[mask]["track_id"]

    # Merge with play counts
    merged = (
        keyword_tracks
        .merge(self.tracks_df, on="track_id", how="left")
        .merge(self.interactions_df, on="song_id", how="left")
    )

    track_playcounts = (
        merged.groupby(["track_id", "artist_name", "track_title"])["play_count"]
        .sum().reset_index()
        .sort_values("play_count", ascending=False)
        .head(k)
    )

    track_playcounts.index = track_playcounts.index + 1
    track_playcounts.index.name = "rank"
    return track_playcounts

#### WORD2VEC

In [9]:
# !pip install --upgrade numpy scipy
!pip install -qqq gensim
!pip install numpy==1.26.2

In [None]:
 def collection_word2vec(self, keyword: str, model, topn: int = 10, threshold: int = 1, k: int = 50):
        """
        Use Word2Vec expansion of keyword.
        model: trained gensim Word2Vec model
        """
        similar_words = [w for w, _ in model.wv.most_similar(keyword, topn=topn)]
        all_keywords = [keyword] + similar_words

        mask = self.lyrics_df["bow"].apply(
            lambda bow: any(bow.get(w, 0) >= threshold for w in all_keywords)
        )
        keyword_tracks = self.lyrics_df[mask][["track_id"]]

        merged = (
            keyword_tracks
            .merge(self.interactions_df, on="track_id", how="left")
            .merge(self.tracks_df, on="track_id", how="left")
        )

        track_playcounts = (
            merged.groupby(["track_id", "artist_name", "track_title"])["play_count"]
            .sum()
            .reset_index()
            .sort_values("play_count", ascending=False)
            .head(k)
        )

        track_playcounts.index = track_playcounts.index + 1
        track_playcounts.index.name = "rank"
        return track_playcounts

In [None]:
import numpy
import gensim
from gensim.models import Word2Vec


# Convert bow dict -> tokenized list of words (repeated by count)
sentences = [
    [word for word, cnt in bow.items() for _ in range(cnt)]
    for bow in lyrics_df["bow"]
]

# Train Word2Vec
w2v_model = Word2Vec(
    sentences,
    vector_size=100,  # embedding dim
    window=5,
    min_count=5,      # ignore rare words
    workers=4,
    sg=1,             # skip-gram
    epochs=10
)

In [88]:
print(w2v_model.wv.most_similar("love", topn=10))

### CLASSIFIER

In [None]:
def collection_classifier(self, keyword: str, classifier, vectorizer, k: int = 50):
        """
        Predict tracks about a keyword using a trained classifier.
        classifier: sklearn model (e.g., LogisticRegression)
        vectorizer: TF-IDF or CountVectorizer fitted on lyrics
        """
        # Prepare features (BoW -> text)
        texts = self.lyrics_df["bow"].apply(
            lambda bow: " "
            .join(
                [w for w, c in bow.items()
                for _ in range(c)])
            )
        X = vectorizer.transform(texts)
        probs = classifier.predict_proba(X)[:, 1]  # probability of positive class

        scored = pd.DataFrame({
            "track_id": self.lyrics_df["track_id"],
            "score": probs
        })

        merged = (
            scored.merge(self.interactions_df, on="track_id", how="left")
            .merge(self.tracks_df, on="track_id", how="left")
        )

        ranked = (
            merged.groupby(["track_id", "artist_name", "track_title"])[["play_count", "score"]]
            .sum()
            .reset_index()
        )

        # Re-rank by score * popularity
        ranked["final_score"] = ranked["score"] * ranked["play_count"]
        ranked = ranked.sort_values("final_score", ascending=False).head(k)

        ranked.index = ranked.index + 1
        ranked.index.name = "rank"
        return ranked