# Music Taste Prediction Model: New Music Friday Recommender
In this model, I use my liked songs playlist, my recently loved and not loved albums, to train my regression model on what kind of music I do and don't like. At the end my test model will be the new music friday albums from the most recent Friday. 

## Library Imports

In [4]:
# Data Manipulation and Analysis
import pandas as pd
import numpy as np
from datetime import datetime
import os
import csv

# API and Network Requests
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed
from time import sleep

# Machine Learning
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score, make_scorer
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# Network Analysis
import networkx as nx

# Visualization
import matplotlib.pyplot as plt
import streamlit as st
import plotly.express as px
import plotly.graph_objects as go

# Type Hints
from typing import List, Dict, Tuple, Optional

# Load Datasets
Here we have 4 spotify playlists from my libarary, downloaded using Exportify.net. Plus a dataset of similar artists to the artists I've enjoyed prior to this new music friday.

In [6]:
df_liked = pd.read_csv("data/liked.csv")  # Liked playlist on Spotify
df_fav_albums = pd.read_csv("data/liked_albums.csv")  # Albums I've Liked in Recent Years
df_not_liked = pd.read_csv("data/did_not_like.csv")  # Albums I've not liked in Recent Years
df_nmf = pd.read_csv("data/nmf.csv")  # The most recent New Music Friday Playlist
df_liked_similar = pd.read_csv("data/liked_artists_only_similar.csv") #Lastfm pull of similar artists to my liked artists

## Pull Similar Artists to Your Favorite Artists
This api will pull similar artists to all the unique artists from my liked songs and liked albums of the last few years.

In [8]:
class LastFMAPI:
    def __init__(self, api_key: str, rate_limit_delay: float = 0.25, limit: int = 5):
        self.api_key = api_key
        self.base_url = "http://ws.audioscrobbler.com/2.0/"
        self.rate_limit_delay = rate_limit_delay
        self.limit = limit

    def get_similar_artists(self, artist_name: str) -> List[str]:
        """Fetch similar artists for a given artist from LastFM API."""
        params = {
            'method': 'artist.getSimilar',
            'artist': artist_name,
            'api_key': self.api_key,
            'limit': self.limit,  # Add limit parameter
            'format': 'json'
        }
        
        try:
            response = requests.get(self.base_url, params=params)
            response.raise_for_status()
            
            # Handle rate limiting
            if 'X-RateLimit-Remaining' in response.headers:
                remaining = int(response.headers['X-RateLimit-Remaining'])
                if remaining == 0:
                    sleep(self.rate_limit_delay)
            
            data = response.json()
            if 'similarartists' in data and 'artist' in data['similarartists']:
                return [artist['name'] for artist in data['similarartists']['artist'][:self.limit]]
            return []
            
        except Exception as e:
            print(f"Error fetching similar artists for {artist_name}: {e}")
            return []

def extract_primary_artist(artist_string: str) -> str:
    """Extract the first artist name before any comma."""
    if pd.isna(artist_string):
        return ""
    return artist_string.split(",")[0].strip()

def update_similar_artists(liked_path: str, 
                         albums_path: str, 
                         output_path: str, 
                         api_key: str) -> pd.DataFrame:
    """
    Update the similar artists database with new artists from liked playlists.
    Returns the complete DataFrame of artists and their similar artists.
    """
    
    print("Loading existing and new data...")
    
    # Load existing similar artists data
    existing_data: Dict[str, List[str]] = {}
    if os.path.exists(output_path):
        existing_df = pd.read_csv(output_path)
        existing_data = dict(zip(existing_df['Artist'], existing_df['Similar Artists']))
        print(f"Loaded {len(existing_data)} existing artists from database")
    
    # Load and process current playlists
    df_liked = pd.read_csv(liked_path)
    df_albums = pd.read_csv(albums_path)
    
    # Extract and combine primary artists
    current_artists = set(
        pd.concat([
            df_liked['Artist Name(s)'].apply(extract_primary_artist),
            df_albums['Artist Name(s)'].apply(extract_primary_artist)
        ]).unique()
    )
    current_artists.discard("")  # Remove empty strings
    
    # Find new artists not in existing data
    new_artists = current_artists - set(existing_data.keys())
    print(f"Found {len(new_artists)} new artists to process")
    
    if not new_artists:
        print("No new artists to process. Database is up to date!")
        # Create and return DataFrame even if no updates
        return pd.DataFrame({
            'Artist': list(existing_data.keys()),
            'Similar Artists': list(existing_data.values())
        })
    
    # Initialize LastFM API client
    api = LastFMAPI(api_key)
    
    # Process artists with concurrent requests
    results = {}
    with ThreadPoolExecutor(max_workers=5) as executor:
        future_to_artist = {
            executor.submit(api.get_similar_artists, artist): artist 
            for artist in new_artists
        }
        
        # Show progress bar while processing
        for future in tqdm(as_completed(future_to_artist), 
                         total=len(future_to_artist),
                         desc="Fetching similar artists"):
            artist = future_to_artist[future]
            similar_artists = future.result()
            results[artist] = ', '.join(similar_artists)
    
    # Combine existing and new data
    combined_data = {**existing_data, **results}
    
    # Create DataFrame
    output_df = pd.DataFrame({
        'Artist': list(combined_data.keys()),
        'Similar Artists': list(combined_data.values())
    })
    
    # Save updated data
    output_df.to_csv(output_path, index=False)
    print(f"Successfully updated database with {len(new_artists)} new artists")
    print(f"Total artists in database: {len(combined_data)}")
    
    return output_df

if __name__ == "__main__":
    # Configuration
    API_KEY = "74a510ecc9fc62bf3e0edc6adc2e99f9"
    LIKED_PATH = "data/liked.csv"
    ALBUMS_PATH = "data/liked_albums.csv"
    OUTPUT_PATH = "data/liked_artists_only_similar.csv"
    
    # Run the update and get the DataFrame
    df_liked_similar = update_similar_artists(
        LIKED_PATH, 
        ALBUMS_PATH, 
        OUTPUT_PATH, 
        API_KEY
    )
    
    # Now df_liked_similar is ready to use
    print("\nFirst few rows of the similar artists DataFrame:")
    print(df_liked_similar.head())

Loading existing and new data...
Loaded 2126 existing artists from database
Found 0 new artists to process
No new artists to process. Database is up to date!

First few rows of the similar artists DataFrame:
        Artist                                    Similar Artists
0         RY X                                                NaN
1     The Faim  Oh The Larceny, City Wolf, Random Hero, needsh...
2  Melody Lake  Ian Wong, Limelight Glow, Slow Rising Hope, Po...
3    Liza Anne  Miya Folick, Torres, Billie Marten, Pom Pom Sq...
4    The Kinks  Dave Davies, The Who, Small Faces, The Zombies...


In [9]:
df_liked_similar[df_liked_similar["Similar Artists"].isna()]

Unnamed: 0,Artist,Similar Artists
0,RY X,
22,Sampa the Great,
30,Spillage Village,
45,serpentwithfeet,
76,Omar Apollo,
...,...,...
2041,Urban Jams United,
2043,JAY-Z,
2045,PinkPantheress,
2067,Cate Le Bon,


In [10]:
"""
commented out for now because i might want to use this site for producers,
and other things. Good to have a sample code that worked.

# Load the existing CSV file into a DataFrame
df_liked_similar = pd.read_csv('data/liked_artists_only_similar.csv')

# Filter out the artists with missing similar artists (where 'Similar Artists' is NaN)
df_missing_similar = df_liked_similar[df_liked_similar["Similar Artists"].isna()]

# Function to fetch similar artists using ListenBrainz
def get_similar_artists(artist_mbid):
    api_url = f'https://labs.api.listenbrainz.org/similar-artists?artist_mbid={artist_mbid}&algorithm=session_based_days_9000_session_300_contribution_5_threshold_15_limit_50_skip_30'
    try:
        response = requests.get(api_url)
        if response.status_code == 200:
            data = response.json()
            similar_artists = [artist['artist_name'] for artist in data.get('payload', {}).get('artists', [])]
            return ', '.join(similar_artists)
        else:
            print(f"Failed to fetch data for MBID {artist_mbid}: {response.status_code}")
            return None
    except Exception as e:
        print(f"Error fetching data for MBID {artist_mbid}: {str(e)}")
        return None

# Function to get MBID from artist name using MusicBrainz API
def get_artist_mbid(artist_name):
    api_url = f'https://musicbrainz.org/ws/2/artist?query={artist_name}&limit=1&fmt=json'
    try:
        response = requests.get(api_url, headers={'User-Agent': 'YourApp/1.0'})
        if response.status_code == 200:
            data = response.json()
            if data.get('artists'):
                return data['artists'][0]['id']  # Return the first matching MBID
        else:
            print(f"Failed to fetch MBID for {artist_name}: {response.status_code}")
            return None
    except Exception as e:
        print(f"Error fetching MBID for {artist_name}: {str(e)}")
        return None

# To avoid overwhelming the API, we will process a batch at a time
batch_size = 5
delay_seconds = 3  # Delay in seconds between requests

# Iterate over missing artists in batches
updated_rows = []
for i, row in df_missing_similar.iterrows():
    artist_name = row['Artist']
    artist_mbid = get_artist_mbid(artist_name)
    
    if artist_mbid:
        similar_artists = get_similar_artists(artist_mbid)
        updated_rows.append((i, similar_artists))
    else:
        updated_rows.append((i, None))
    
    # Wait for the specified delay before the next request
    if (i + 1) % batch_size == 0:
        print(f"Processed {i + 1} artists, pausing for {delay_seconds} seconds...")
        time.sleep(delay_seconds)

# Update the 'Similar Artists' column for the rows that were processed
for index, similar_artists in updated_rows:
    df_liked_similar.at[index, 'Similar Artists'] = similar_artists

# Save the updated DataFrame back to the CSV file
df_liked_similar.to_csv('data/liked_artists_only_similar.csv', index=False)

# After the loop, you can check if all the missing artists have been filled
print("Updated DataFrame:")
print(df_liked_similar.head())
"""

'\ncommented out for now because i might want to use this site for producers,\nand other things. Good to have a sample code that worked.\n\n# Load the existing CSV file into a DataFrame\ndf_liked_similar = pd.read_csv(\'data/liked_artists_only_similar.csv\')\n\n# Filter out the artists with missing similar artists (where \'Similar Artists\' is NaN)\ndf_missing_similar = df_liked_similar[df_liked_similar["Similar Artists"].isna()]\n\n# Function to fetch similar artists using ListenBrainz\ndef get_similar_artists(artist_mbid):\n    api_url = f\'https://labs.api.listenbrainz.org/similar-artists?artist_mbid={artist_mbid}&algorithm=session_based_days_9000_session_300_contribution_5_threshold_15_limit_50_skip_30\'\n    try:\n        response = requests.get(api_url)\n        if response.status_code == 200:\n            data = response.json()\n            similar_artists = [artist[\'artist_name\'] for artist in data.get(\'payload\', {}).get(\'artists\', [])]\n            return \', \'.join(sim

## Quick Glance at our Refreshed Datasets

In [12]:
df_liked.head()

Unnamed: 0,Track ID,Track Name,Album Name,Artist Name(s),Release Date,Duration (ms),Popularity,Added By,Added At,Genres,...,Key,Loudness,Mode,Speechiness,Acousticness,Instrumentalness,Liveness,Valence,Tempo,Time Signature
0,0AAXgVpk3VBbjcwNBNt3Iz,Hope You Find What You're Looking For,Out Of The Blue,Morgan Saint,2025-02-14,190834,12,,2025-02-16T17:59:46Z,,...,6,-8.898,1,0.0372,0.722,0.00154,0.092,0.696,104.993,4
1,5FfGYE5eTM2K1Si4ucy2XC,Monuments & Bricks,Poison,CATHEDRALE,2025-02-14,243029,9,,2025-02-15T22:44:31Z,post-punk,...,9,-7.327,1,0.0322,3.6e-05,0.0036,0.264,0.234,137.635,4
2,6J1qJGfbB31D0WxiehfcoV,Did You Lose Your Heart,Out Of The Blue,Morgan Saint,2025-02-14,244143,11,,2025-02-15T19:50:54Z,,...,0,-7.904,1,0.0367,0.36,0.00743,0.115,0.914,94.995,4
3,5Iel8B8LQe12lEofB20OTp,Over and Over,Heartache in Room 14,The Altons,2025-02-14,228933,23,,2025-02-15T18:46:58Z,retro soul,...,6,-6.22,1,0.0261,0.78,2.1e-05,0.162,0.26,115.136,3
4,5sZ5AyRhjKSYAh7vqNJXuH,Victory Lap,Victory Lap,Valley Palace,2022-11-29,187629,24,,2025-02-12T16:12:28Z,dream pop,...,2,-6.695,1,0.0283,0.024,0.658,0.262,0.59,152.014,4


In [13]:
# Liked Albums in Recent Years
df_fav_albums.head()

Unnamed: 0,Track ID,Track Name,Album Name,Artist Name(s),Release Date,Duration (ms),Popularity,Added By,Added At,Genres,...,Key,Loudness,Mode,Speechiness,Acousticness,Instrumentalness,Liveness,Valence,Tempo,Time Signature
0,0UOeq7bSskoJa4cJaJOmFS,Ticking,Letter to Self,SPRINTS,2024-01-05,186949,31,mmr4r23xnc6oh1c77lysfbqg4,2025-01-29T00:53:10Z,post-punk,...,11.0,-6.49,1.0,0.344,0.025,0.0765,0.0934,0.291,175.574,4.0
1,02bA26OEe0nNFyE3YcNx4K,Heavy,Letter to Self,SPRINTS,2024-01-05,207409,46,mmr4r23xnc6oh1c77lysfbqg4,2025-01-29T00:53:10Z,post-punk,...,11.0,-5.925,1.0,0.0591,0.00435,0.000738,0.0877,0.189,88.581,4.0
2,7IPDhCIQlpvxVxtC1Q7Jq4,Cathedral,Letter to Self,SPRINTS,2024-01-05,179694,30,mmr4r23xnc6oh1c77lysfbqg4,2025-01-29T00:53:10Z,post-punk,...,7.0,-6.231,1.0,0.0473,0.00978,0.0027,0.0887,0.397,119.056,4.0
3,65fPteG9ctHt2rrJxlbMr8,Shaking Their Hands,Letter to Self,SPRINTS,2024-01-05,222489,28,mmr4r23xnc6oh1c77lysfbqg4,2025-01-29T00:53:10Z,post-punk,...,4.0,-5.658,0.0,0.0533,0.199,0.108,0.133,0.551,89.485,4.0
4,4UgkFdXpJD0fhw06BMk0bz,Adore Adore Adore,Letter to Self,SPRINTS,2024-01-05,157766,36,mmr4r23xnc6oh1c77lysfbqg4,2025-01-29T00:53:10Z,post-punk,...,4.0,-4.401,0.0,0.257,0.0107,0.000107,0.101,0.402,176.054,4.0


In [14]:
# Albums Not Liked in Recent Years
df_not_liked.head()

Unnamed: 0,Track ID,Track Name,Album Name,Artist Name(s),Release Date,Duration (ms),Popularity,Added By,Added At,Genres,...,Key,Loudness,Mode,Speechiness,Acousticness,Instrumentalness,Liveness,Valence,Tempo,Time Signature
0,54KEm0VI9i3ic7VHHKHKRx,¿Cómo Así?,ORQUÍDEAS,Kali Uchis,2024-01-12,169654,56,mmr4r23xnc6oh1c77lysfbqg4,2025-01-29T01:04:15Z,,...,6.0,-7.662,0.0,0.0892,0.0417,0.346,0.154,0.379,135.985,4.0
1,5mVkTPlTPxlQOn7kEvuM3j,Me Pongo Loca,ORQUÍDEAS,Kali Uchis,2024-01-12,177815,53,mmr4r23xnc6oh1c77lysfbqg4,2025-01-29T01:04:15Z,,...,7.0,-8.68,0.0,0.0426,0.0371,0.152,0.106,0.407,114.999,4.0
2,6XaJfhwof7qIgbbXO5tIQI,Igual Que Un Ángel (with Peso Pluma),ORQUÍDEAS,"Kali Uchis,Peso Pluma",2024-01-12,260370,75,mmr4r23xnc6oh1c77lysfbqg4,2025-01-29T01:04:15Z,"corrido,corridos tumbados,corridos bélicos,mús...",...,5.0,-5.34,0.0,0.032,0.00449,0.000663,0.185,0.482,108.001,4.0
3,52x8HIGuk1gGTlvO8CuLNS,Pensamientos Intrusivos,ORQUÍDEAS,Kali Uchis,2024-01-12,192027,60,mmr4r23xnc6oh1c77lysfbqg4,2025-01-29T01:04:15Z,,...,9.0,-8.333,0.0,0.0394,0.575,0.0129,0.11,0.511,119.994,4.0
4,3RleMgz4iO0BNezGdSxDnY,Diosa,ORQUÍDEAS,Kali Uchis,2024-01-12,156037,58,mmr4r23xnc6oh1c77lysfbqg4,2025-01-29T01:04:15Z,,...,5.0,-5.518,0.0,0.0668,0.0675,0.000101,0.078,0.698,107.994,4.0


In [15]:
# New Music Friday Playlist
df_nmf.head()

Unnamed: 0,Track ID,Track Name,Album Name,Artist Name(s),Release Date,Duration (ms),Popularity,Added By,Added At,Genres,...,Key,Loudness,Mode,Speechiness,Acousticness,Instrumentalness,Liveness,Valence,Tempo,Time Signature
0,3y0DIcBw075qzj3dOwJ5aL,From The Gods,Food From the Gods,"Black Milk,Fat Ray",2025-02-13,88678,26,jaytroymo,2025-02-14T06:13:23Z,"alternative hip hop,underground hip hop",...,9.0,-8.414,0.0,0.408,0.449,0.118,0.376,0.559,89.45,3.0
1,6TR2jKnq9yFwk3oqKUTnWp,ELDERBERRY,Food From the Gods,"Black Milk,Fat Ray",2025-02-13,189000,27,jaytroymo,2025-02-14T06:13:23Z,"alternative hip hop,underground hip hop",...,2.0,-7.407,1.0,0.347,0.454,0.00431,0.0916,0.634,81.092,4.0
2,6UC2W5IR7lx3BQGsWf4qD3,Talcum,Food From the Gods,"Black Milk,Fat Ray",2025-02-13,156144,27,jaytroymo,2025-02-14T06:13:23Z,"alternative hip hop,underground hip hop",...,11.0,-6.015,0.0,0.24,0.66,5e-06,0.215,0.628,79.591,4.0
3,7HSjLESxFyDRrmq6AdGkJh,CANE,Food From the Gods,"Black Milk,Fat Ray,Guilty Simpson",2025-02-13,149010,26,jaytroymo,2025-02-14T06:13:23Z,"alternative hip hop,underground hip hop,boom bap",...,8.0,-6.045,1.0,0.28,0.219,0.0,0.443,0.719,90.831,4.0
4,1XZqPCN9JU3xtnS0aC72UC,Just Say No,Food From the Gods,"Black Milk,Fat Ray,Danny Brown",2025-02-13,201276,31,jaytroymo,2025-02-14T06:13:23Z,"alternative hip hop,underground hip hop,experi...",...,4.0,-8.114,0.0,0.206,0.737,0.261,0.363,0.907,90.02,4.0


In [16]:
# Similar Artists to Recently Played Artists (Last.fm)

df_liked_similar.head()

Unnamed: 0,Artist,Similar Artists
0,RY X,
1,The Faim,"Oh The Larceny, City Wolf, Random Hero, needsh..."
2,Melody Lake,"Ian Wong, Limelight Glow, Slow Rising Hope, Po..."
3,Liza Anne,"Miya Folick, Torres, Billie Marten, Pom Pom Sq..."
4,The Kinks,"Dave Davies, The Who, Small Faces, The Zombies..."


> A quick reminder of the standard columns of a spotify export.

In [18]:
df_liked.columns

Index(['Track ID', 'Track Name', 'Album Name', 'Artist Name(s)',
       'Release Date', 'Duration (ms)', 'Popularity', 'Added By', 'Added At',
       'Genres', 'Record Label', 'Danceability', 'Energy', 'Key', 'Loudness',
       'Mode', 'Speechiness', 'Acousticness', 'Instrumentalness', 'Liveness',
       'Valence', 'Tempo', 'Time Signature'],
      dtype='object')

In [19]:
df_liked_similar.columns

Index(['Artist', 'Similar Artists'], dtype='object')

### Add Target Labels for Training Feature

In [21]:
# Assign liked scores before combining
df_liked['liked'] = 100
df_fav_albums['liked'] = 50
df_not_liked['liked'] = 0
df_nmf['liked'] = np.nan 

# Add playlist_origin column before combining
df_liked['playlist_origin'] = 'df_liked'
df_fav_albums['playlist_origin'] = 'df_fav_albums'
df_not_liked['playlist_origin'] = 'df_not_liked'
df_nmf['playlist_origin'] = 'df_nmf'
df_liked_similar['source'] = 'liked_similar'

### Check application of the target encoding

In [23]:
df_liked[['liked', 'playlist_origin']].head()

Unnamed: 0,liked,playlist_origin
0,100,df_liked
1,100,df_liked
2,100,df_liked
3,100,df_liked
4,100,df_liked


In [24]:
df_fav_albums[['liked', 'playlist_origin']].head()

Unnamed: 0,liked,playlist_origin
0,50,df_fav_albums
1,50,df_fav_albums
2,50,df_fav_albums
3,50,df_fav_albums
4,50,df_fav_albums


In [25]:
df_not_liked[['liked', 'playlist_origin']].head()

Unnamed: 0,liked,playlist_origin
0,0,df_not_liked
1,0,df_not_liked
2,0,df_not_liked
3,0,df_not_liked
4,0,df_not_liked


In [26]:
df_nmf[['liked', 'playlist_origin']].head()

Unnamed: 0,liked,playlist_origin
0,,df_nmf
1,,df_nmf
2,,df_nmf
3,,df_nmf
4,,df_nmf


In [27]:
df_liked_similar[['Artist', 'Similar Artists', 'source']].head()

Unnamed: 0,Artist,Similar Artists,source
0,RY X,,liked_similar
1,The Faim,"Oh The Larceny, City Wolf, Random Hero, needsh...",liked_similar
2,Melody Lake,"Ian Wong, Limelight Glow, Slow Rising Hope, Po...",liked_similar
3,Liza Anne,"Miya Folick, Torres, Billie Marten, Pom Pom Sq...",liked_similar
4,The Kinks,"Dave Davies, The Who, Small Faces, The Zombies...",liked_similar


## Merge The Datasets

In [29]:
df = pd.concat([df_liked, df_fav_albums, df_not_liked, df_nmf], ignore_index=True)

In [30]:
#How Large is the Dataset, Now?
df.shape

(13319, 25)

#### Remove the Duplicates

In [32]:
# Remove duplicates: Keep the highest 'liked' score (100 > 50)
df = df.sort_values(by='liked', ascending=False)  # Ensures 100-rated songs come first
df = df.drop_duplicates(subset=['Track Name', 'Artist Name(s)'], keep='first')
df.shape

(11053, 25)

In [33]:
df.columns #Checking to remind myself what is all available to drop, keep seperate as metadata, etc.

Index(['Track ID', 'Track Name', 'Album Name', 'Artist Name(s)',
       'Release Date', 'Duration (ms)', 'Popularity', 'Added By', 'Added At',
       'Genres', 'Record Label', 'Danceability', 'Energy', 'Key', 'Loudness',
       'Mode', 'Speechiness', 'Acousticness', 'Instrumentalness', 'Liveness',
       'Valence', 'Tempo', 'Time Signature', 'liked', 'playlist_origin'],
      dtype='object')

#### Drop columns that won't help the model (Track ID, Added By, Added At, Time Signature)

In [35]:
df.drop(columns=['Track ID', 'Added By', 'Added At', 'Time Signature', 'Key', 'Mode',
                'Loudness', 'Speechiness', 'Liveness', 'Valence'], inplace=True)

#### Getting missing Genres (of which Spotify is "Spotty" at best)

In [37]:
class LastFMAPI:
    def __init__(self, api_key: str, rate_limit_delay: float = 0.25):
        self.api_key = api_key
        self.base_url = "http://ws.audioscrobbler.com/2.0/"
        self.rate_limit_delay = rate_limit_delay
        
    def _make_request(self, params: dict) -> dict:
        try:
            # Handle rate limit by checking headers for remaining requests
            response = requests.get(self.base_url, params=params)
            response.raise_for_status()

            # Check for rate limit info in the response headers
            remaining = int(response.headers.get('X-RateLimit-Remaining', 1))
            if remaining == 0:
                reset_time = int(response.headers.get('X-RateLimit-Reset', 0))
                wait_time = reset_time - int(datetime.now().timestamp())
                print(f"Rate limit hit, waiting for {wait_time} seconds...")
                sleep(wait_time + 1)  # wait for the reset time plus 1 second for safety
                response = requests.get(self.base_url, params=params)  # retry after waiting
                response.raise_for_status()

            return response.json()

        except requests.exceptions.RequestException as e:
            print(f"API request failed: {e}")
            if response.status_code == 429:
                print("Rate limit exceeded, increasing delay.")
                self.rate_limit_delay *= 2
            return None

    def get_artist_tags(self, artist_name: str, limit: int = 5) -> List[str]:
        params = {
            'method': 'artist.getTopTags',
            'artist': artist_name,
            'api_key': self.api_key,
            'format': 'json',
            'limit': limit
        }
        data = self._make_request(params)
        if data and 'toptags' in data:
            return [tag['name'] for tag in data['toptags'].get('tag', [])]
        return []

def export_artist_tags(api_key: str, unique_artists: List[str], output_file: str = 'data/missing_genres.csv'):
    api = LastFMAPI(api_key)

    # Load existing data if the file exists
    existing_data = {}
    if os.path.exists(output_file):
        with open(output_file, 'r', encoding='utf-8') as csvfile:
            reader = csv.DictReader(csvfile)
            for row in reader:
                existing_data[row['Artist']] = row['Tags']

    # Identify new artists not in the existing data
    new_artists = [artist for artist in unique_artists if artist not in existing_data]
    print(f"Total new artists with missing genres: {len(new_artists)}")

    if not new_artists:
        print("No new missing artists to process.")
        return

    # Using ThreadPoolExecutor to parallelize API requests
    with ThreadPoolExecutor(max_workers=5) as executor:  # Adjust max_workers based on your needs
        future_to_artist = {executor.submit(api.get_artist_tags, artist): artist for artist in new_artists}
        
        try:
            with open(output_file, 'a', newline='', encoding='utf-8') as csvfile:
                writer = csv.DictWriter(csvfile, fieldnames=['Artist', 'Tags'])
                if not os.path.exists(output_file):  # Write header only if file doesn't exist
                    writer.writeheader()
                
                for i, future in enumerate(as_completed(future_to_artist), 1):
                    artist = future_to_artist[future]
                    tags = future.result()
                    
                    # Ensure only the top 5 tags are saved
                    top_5_tags = tags[:5]
                    writer.writerow({
                        'Artist': artist,
                        'Tags': ', '.join(top_5_tags)  # Join only the top 5 tags
                    })

                    # Print progress in increments of 100
                    if i % 100 == 0:
                        print(f"Processed tags for {i} new artists")
        
        except Exception as e:
            print(f"Fatal error during export: {e}")
    
    print(f"\nExport complete! Processed {len(new_artists)} new artists.")

# Extract first artist if multiple are listed
df['Primary Artist'] = df['Artist Name(s)'].apply(lambda x: x.split(',')[0] if pd.notna(x) else x)

# Get unique artists with missing genres
artists_missing_genres = df[df['Genres'].isna()]['Primary Artist'].unique()
print(f"Total artists with missing genres: {len(artists_missing_genres)}")

# Fetch tags for missing artists
API_KEY = '74a510ecc9fc62bf3e0edc6adc2e99f9'
export_artist_tags(API_KEY, artists_missing_genres, output_file='data/missing_genres.csv')

# Load the fetched tags
missing_genres_df = pd.read_csv('data/missing_genres.csv')

# Merge with the original dataframe
df = df.merge(
    missing_genres_df,
    how='left',
    left_on='Primary Artist',
    right_on='Artist',
    suffixes=('', '_tags')  # Add a suffix to overlapping columns from missing_genres_df
)

# Fill missing genres in the original 'Genres' column
df['Genres'] = df['Genres'].fillna(df['Tags'])

# Drop the temporary 'Tags' and 'Artist' columns if no longer needed
df.drop(columns=['Tags', 'Artist'], inplace=True)

# Now `df` is updated in memory with the new genre data!

Total artists with missing genres: 1135
Total new artists with missing genres: 2

Export complete! Processed 2 new artists.


In [38]:
# Fill missing 'Genres' and 'Record Label' with 'Unknown'
df['Genres'] = df['Genres'].fillna('Unknown')
df['Record Label'] = df['Record Label'].fillna('Unknown')

#### Handle missing values (if any)

In [40]:
 df.isna().sum()

Track Name            0
Album Name            0
Artist Name(s)        0
Release Date          0
Duration (ms)         0
Popularity            0
Genres                0
Record Label          0
Danceability          8
Energy                8
Acousticness          8
Instrumentalness      8
Tempo                 8
liked               383
playlist_origin       0
Primary Artist        0
dtype: int64

In [41]:
# Drop rows with nulls in any column except 'liked'
df = df[df.drop(columns=['liked']).notna().all(axis=1)].reset_index(drop=True)

In [42]:
 df.isna().sum()

Track Name            0
Album Name            0
Artist Name(s)        0
Release Date          0
Duration (ms)         0
Popularity            0
Genres                0
Record Label          0
Danceability          0
Energy                0
Acousticness          0
Instrumentalness      0
Tempo                 0
liked               382
playlist_origin       0
Primary Artist        0
dtype: int64

In [43]:
df.shape

(11045, 16)

In [44]:
# Count how many of each 'playlist_origin' are in the df dataset
playlist_origin_counts = df['playlist_origin'].value_counts()

print("Playlist Origin Counts:")
print(playlist_origin_counts)


Playlist Origin Counts:
playlist_origin
df_fav_albums    5334
df_liked         4128
df_not_liked     1201
df_nmf            382
Name: count, dtype: int64


## Target Encoding Record Labels

Record labels are currently having an outsized influence on our model, making up 48% of feature importance (at the time of this addition to the model). This is mainly due to the target encoding being too granular, with hundreds of individual labels. To address this, we're grouping labels by size and frequency first. This helps reduce the risk of overfitting to smaller labels, creates more meaningful categories based on their reach and influence, and makes it easier to handle rare or lesser-known labels without distorting the model.

In [47]:
def categorize_labels_by_size(df):
    # Count number of tracks per label
    label_counts = df[df['playlist_origin'] != 'df_nmf']['Record Label'].value_counts()
    
    # Calculate percentiles
    p75 = label_counts.quantile(0.75)
    p25 = label_counts.quantile(0.25)
    
    # Create category mapping
    label_categories = {}
    for label, count in label_counts.items():
        if count >= p75:
            label_categories[label] = 'Large Label'
        elif count <= p25:
            label_categories[label] = 'Small Label'
        else:
            label_categories[label] = 'Medium Label'
    
    # Map unknown labels to 'Unknown/DIY'
    df['Label_Category'] = df['Record Label'].map(label_categories).fillna('Unknown/DIY')
    
    # Print some statistics about the categorization
    print("\nLabel Category Distribution:")
    print(df['Label_Category'].value_counts())
    print("\nSample of Large Labels:", 
          list(label for label, count in label_counts.items() if count >= p75)[:5])
    
    return df

def target_encode_categories(df, column, target, smoothing=100):
    # Separate out df_nmf to ensure it's never used in encoding
    df_train = df[df['playlist_origin'] != 'df_nmf'].copy()
    mean_target = df_train[target].mean()
    
    # Calculate encoding values for each category
    label_means = df_train.groupby(column)[target].mean()
    label_counts = df_train[column].value_counts()
    smoothed_values = (label_means * label_counts + mean_target * smoothing) / (label_counts + smoothing)
    
    # Map with a fallback to the overall mean
    df[column + '_encoded'] = df[column].map(smoothed_values).fillna(mean_target)
    return df

# Apply the categorization and encoding
df = categorize_labels_by_size(df)
df = target_encode_categories(df, 'Label_Category', 'liked', smoothing=100)

# Print sample of results
print("\nSample of encoded values:")
print(df[['Label_Category', 'Label_Category_encoded', 'liked']].head())


Label Category Distribution:
Label_Category
Large Label     8688
Medium Label    1394
Small Label      820
Unknown/DIY      143
Name: count, dtype: int64

Sample of Large Labels: ['Anti/Epitaph', 'Loma Vista Recordings', 'Columbia', 'Mom+Pop', 'Dead Oceans']

Sample of encoded values:
  Label_Category  Label_Category_encoded  liked
0   Medium Label               82.183174  100.0
1    Large Label               57.356399  100.0
2    Large Label               57.356399  100.0
3    Large Label               57.356399  100.0
4    Large Label               57.356399  100.0


## Artists with Missing Genres (Last.fm to the rescue!)

> I noticed in the data previews that one of the common genres imported from last.fm was 'seen live', which I take to meen a lot of last.fm users have seen that artist. 

In [50]:
# Remove 'seen live' from the 'Genres' column
df['Genres'] = df['Genres'].apply(lambda x: ', '.join([genre for genre in x.split(', ') if genre != 'seen live']) if pd.notna(x) else x)

## Target Encode Genres

In [52]:
# Create a binary indicator column for 'Unknown' genres
df['is_unknown_genre'] = (df['Genres'] == 'Unknown').astype(int)

# Define the target encoding function
def target_encode_multi_genre(df, genre_column, target, smoothing=1, aggregation_method='mean', nmf_fallback=0):
    """
    Target encode a multi-genre column by splitting genres, encoding individually, and aggregating.
    Explicitly handles 'Unknown' genres and NMF rows.
    """
    # Separate out df_nmf to ensure it's never used in encoding
    df_train = df[df['playlist_origin'] != 'df_nmf'].copy()

    # Calculate the global mean of the target variable
    global_mean = df_train[target].mean()

    # Split genres into individual categories and exclude 'seen live' and 'Unknown'
    df_train['split_genres'] = df_train[genre_column].str.split(', ').apply(
        lambda x: [genre for genre in x if genre != 'seen live' and genre != 'Unknown'] if isinstance(x, list) else x
    )

    # Explode the list of genres into separate rows
    exploded_genres = df_train.explode('split_genres')

    # Calculate target encoding for individual genres
    label_means = exploded_genres.groupby('split_genres')[target].mean()
    label_counts = exploded_genres['split_genres'].value_counts()

    # Calculate smoothed target encoding for individual genres
    smoothed_values = (label_means * label_counts + global_mean * smoothing) / (label_counts + smoothing)

    # Map the smoothed values back to the exploded genres
    exploded_genres['genre_encoded'] = exploded_genres['split_genres'].map(smoothed_values).fillna(global_mean)

    # Aggregate encodings for multi-genre rows
    if aggregation_method == 'mean':
        aggregated_encodings = exploded_genres.groupby(exploded_genres.index)['genre_encoded'].mean()
    elif aggregation_method == 'max':
        aggregated_encodings = exploded_genres.groupby(exploded_genres.index)['genre_encoded'].max()
    else:
        raise ValueError(f"Unsupported aggregation method: {aggregation_method}")

    # Add the aggregated encodings to the original dataframe
    df[genre_column + '_encoded'] = aggregated_encodings

    # Handle 'Unknown' genres
    is_unknown = df[genre_column] == 'Unknown'
    df.loc[is_unknown, genre_column + '_encoded'] = global_mean  # Use global mean for non-NMF rows

    # Handle NMF rows with 'Unknown' genres separately
    is_nmf = df['playlist_origin'] == 'df_nmf'
    df.loc[is_nmf & is_unknown, genre_column + '_encoded'] = nmf_fallback  # Use nmf_fallback for NMF rows

    return df

# Apply the target encoding function
df = target_encode_multi_genre(df, 'Genres', 'liked', smoothing=100, nmf_fallback=0)

# Inspect the results
print(df[['Genres', 'Genres_encoded', 'is_unknown_genre', 'liked', 'playlist_origin']].head())

                                              Genres  Genres_encoded  \
0        electronic, indie, pop, chill, experimental       64.012642   
1  indie, folk, indie pop, british, female vocalists       69.553807   
2            indie rock, dream pop, indie, indie pop       67.790130   
3  indie, folk, indie pop, british, female vocalists       69.553807   
4  indie, folk, indie pop, british, female vocalists       69.553807   

   is_unknown_genre  liked playlist_origin  
0                 0  100.0        df_liked  
1                 0  100.0        df_liked  
2                 0  100.0        df_liked  
3                 0  100.0        df_liked  
4                 0  100.0        df_liked  


In [53]:
# After the genre target encoding, add:
def extract_genre_features(genres_str):
    genres = genres_str.split(', ')
    primary_genre = genres[0]  # First genre often most relevant
    return {
        'genre_count': len(genres),
        'has_location_genre': any(g for g in genres if g.title() in STATES_AND_CITIES),
        'primary_genre': primary_genre
    }

# Add location check (add at top with imports)
STATES_AND_CITIES = {'Minnesota', 'Seattle', 'Brooklyn', 'Portland', 'Austin'} # Add more as needed

# Apply extraction
genre_features = df['Genres'].apply(extract_genre_features).apply(pd.Series)
df = pd.concat([df, genre_features], axis=1)

# Target encode primary genre
df = target_encode_categories(df, 'primary_genre', 'liked', smoothing=50)

In [54]:
df[df['Genres'] == 'Unknown']['Genres_encoded'].value_counts()

Genres_encoded
63.72503    222
0.00000      13
Name: count, dtype: int64

In [55]:
# Fill NaN values in Genres_encoded
genres_encoded_mean = df['Genres_encoded'].mean()
df['Genres_encoded'] = df['Genres_encoded'].fillna(genres_encoded_mean)

#### Further Examination of Missing Genres

In [57]:
#Which rows still have Unknown Genres
unknown_genres = df[df['Genres'] == 'Unknown']

# Count the number of 'Unknown' genre tracks from each playlist_origin
unknown_genres_origin_counts = unknown_genres['playlist_origin'].value_counts()

print("Unknown Genres by Playlist Origin:")
print(unknown_genres_origin_counts)

Unknown Genres by Playlist Origin:
playlist_origin
df_fav_albums    133
df_liked          89
df_nmf            13
Name: count, dtype: int64


##### Getting Rid of Rows Where Genre is Unknown and Playlist Origin is Not df_nmf

In [59]:
# Keep rows where:
# 1. Genres is not 'Unknown', OR
# 2. Genres is 'Unknown' and playlist_origin is 'df_nmf'
df_filtered = df[~((df['Genres'] == 'Unknown') & (df['playlist_origin'] != 'df_nmf'))]

# Verify the result
print("Remaining Rows by Playlist Origin:")
print(df_filtered['playlist_origin'].value_counts())

# Check remaining 'Unknown' genre rows
remaining_unknown_genres = df_filtered[df_filtered['Genres'] == 'Unknown']

print("\nRemaining 'Unknown' Genres by Playlist Origin:")
print(remaining_unknown_genres['playlist_origin'].value_counts())

# Inspect the remaining NMF tracks with 'Unknown' genres
unknown_genres_nmf = df_filtered[(df_filtered['Genres'] == 'Unknown') & (df_filtered['playlist_origin'] == 'df_nmf')]

print("\nRemaining NMF Tracks with 'Unknown' Genres:")
print(unknown_genres_nmf[['Track Name', 'Artist Name(s)', 'Genres_encoded']])

# Save the filtered dataframe to a CSV file (optional)
df_filtered.to_csv('data/filtered_data.csv', index=False)

Remaining Rows by Playlist Origin:
playlist_origin
df_fav_albums    5201
df_liked         4039
df_not_liked     1201
df_nmf            382
Name: count, dtype: int64

Remaining 'Unknown' Genres by Playlist Origin:
playlist_origin
df_nmf    13
Name: count, dtype: int64

Remaining NMF Tracks with 'Unknown' Genres:
                    Track Name             Artist Name(s)  Genres_encoded
10674  The Night Comes For You        Last Days of Heaven             0.0
10675             Pain Machine        Last Days of Heaven             0.0
10677       Before the Sadness  Last Days of Heaven,Almma             0.0
10678                 Babygirl  Last Days of Heaven,Almma             0.0
10894               Millennium                      Gaiko             0.0
10895                 Setagaya                      Gaiko             0.0
10896          Misses Euphoria                      Gaiko             0.0
10897                  Subdued                      Gaiko             0.0
10898                

# Finding How Central an Artist is to My Music Taste

In [61]:
# Step 1: Build the graph
G = nx.Graph()

# Add nodes for liked artists
liked_artists = set(
    df[df['playlist_origin'].isin(['df_liked', 'df_fav_albums'])]['Artist Name(s)']
    .str.split(',').explode().str.strip()
)
G.add_nodes_from(liked_artists, type='liked')

# Add nodes for similar artists (from liked)
similar_artists_liked = set(
    df_liked_similar['Similar Artists']
    .dropna()  # Remove NaN values
    .str.split(',').explode().str.strip()
)
G.add_nodes_from(similar_artists_liked, type='similar_liked')

# Add edges based on similarity (from liked)
for _, row in df_liked_similar.iterrows():
    artist = row['Artist']
    # Check if Similar Artists is a string before splitting
    if isinstance(row['Similar Artists'], str):
        similar = row['Similar Artists'].split(', ')
        for s in similar:
            G.add_edge(artist, s, weight=1.0)

# Step 2: Calculate centrality scores
centrality_scores = nx.pagerank(G)

# Step 3: Map centrality scores back to DataFrame
df['Artist Centrality'] = (
    df['Artist Name(s)']
    .str.split(',').str[0].str.strip()
    .map(centrality_scores).fillna(0)
)

# Normalize centrality scores to 0-100
df['Artist Centrality'] = (df['Artist Centrality'] / df['Artist Centrality'].max()) * 100


# Check the result
print(df[['Artist Name(s)', 'Artist Centrality']].head())

  Artist Name(s)  Artist Centrality
0   Morgan Saint          46.729154
1  Billie Marten          52.044802
2     Blondshell          38.560083
3  Billie Marten          52.044802
4  Billie Marten          52.044802


In [62]:
print("Rows with NaN Similar Artists:")
print(df_liked_similar[df_liked_similar['Similar Artists'].isna()])

Rows with NaN Similar Artists:
                 Artist Similar Artists         source
0                  RY X             NaN  liked_similar
22      Sampa the Great             NaN  liked_similar
30     Spillage Village             NaN  liked_similar
45      serpentwithfeet             NaN  liked_similar
76          Omar Apollo             NaN  liked_similar
...                 ...             ...            ...
2041  Urban Jams United             NaN  liked_similar
2043              JAY-Z             NaN  liked_similar
2045     PinkPantheress             NaN  liked_similar
2067        Cate Le Bon             NaN  liked_similar
2106              GAYLE             NaN  liked_similar

[121 rows x 3 columns]


## Genre Strength Feature

In [64]:
# Get the set of liked genres
liked_genres = set(df[df['playlist_origin'].isin(['df_liked', 'df_fav_albums'])]['Genres'].str.split(',').explode().str.strip())

# Calculate genre strength for each artist
def calculate_genre_strength(genres):
    if pd.isna(genres):
        return 0
    artist_genres = set(genres.split(','))
    overlap = artist_genres.intersection(liked_genres)
    return len(overlap) / len(artist_genres) if artist_genres else 0

df['Genre Strength'] = df['Genres'].apply(calculate_genre_strength)

# Normalize genre strength to 0–100
df['Genre Strength'] = (df['Genre Strength'] / df['Genre Strength'].max()) * 100

# Check the result
print(df[['Genres', 'Genre Strength']].head())

                                              Genres  Genre Strength
0        electronic, indie, pop, chill, experimental            20.0
1  indie, folk, indie pop, british, female vocalists            20.0
2            indie rock, dream pop, indie, indie pop            25.0
3  indie, folk, indie pop, british, female vocalists            20.0
4  indie, folk, indie pop, british, female vocalists            20.0


## Extra Feature Engineering!

In [66]:
df.columns

Index(['Track Name', 'Album Name', 'Artist Name(s)', 'Release Date',
       'Duration (ms)', 'Popularity', 'Genres', 'Record Label', 'Danceability',
       'Energy', 'Acousticness', 'Instrumentalness', 'Tempo', 'liked',
       'playlist_origin', 'Primary Artist', 'Label_Category',
       'Label_Category_encoded', 'is_unknown_genre', 'Genres_encoded',
       'genre_count', 'has_location_genre', 'primary_genre',
       'primary_genre_encoded', 'Artist Centrality', 'Genre Strength'],
      dtype='object')

In [67]:
df.dtypes

Track Name                 object
Album Name                 object
Artist Name(s)             object
Release Date               object
Duration (ms)               int64
Popularity                  int64
Genres                     object
Record Label               object
Danceability              float64
Energy                    float64
Acousticness              float64
Instrumentalness          float64
Tempo                     float64
liked                     float64
playlist_origin            object
Primary Artist             object
Label_Category             object
Label_Category_encoded    float64
is_unknown_genre            int32
Genres_encoded            float64
genre_count                 int64
has_location_genre           bool
primary_genre              object
primary_genre_encoded     float64
Artist Centrality         float64
Genre Strength            float64
dtype: object

## Standardize the numeric columns

### Seperate New Music Friday and Save it for Later!

In [70]:
# Filter out 'df_nmf' from the main dataframe and save it for later
df_nmf = df[df['playlist_origin'] == 'df_nmf'].copy()

# Remove df_nmf entries from the original dataframe
df = df[df['playlist_origin'] != 'df_nmf'].copy()

# Save df_nmf to CSV for later use
df_nmf.to_csv('data/df_nmf_later.csv', index=False)

In [71]:
# Define numeric columns to standardize
numeric_columns = [
    'Duration (ms)', 'Popularity', 'Danceability', 'Energy',
    'Acousticness', 'Instrumentalness',
    'Tempo', 'Label_Category_encoded', 'Genres_encoded',  
    'Artist Centrality', 'Genre Strength' 
]
# Initialize the scaler
scaler = StandardScaler()

# Fit the scaler on the training data (df)
df[numeric_columns] = scaler.fit_transform(df[numeric_columns])

# Transform the test data (df_nmf) using the fitted scaler
df_nmf[numeric_columns] = scaler.transform(df_nmf[numeric_columns])

# Save the standardized df_nmf for later use
df_nmf.to_csv('data/df_nmf_later.csv', index=False)

In [72]:
df_nmf[numeric_columns].head()

Unnamed: 0,Duration (ms),Popularity,Danceability,Energy,Acousticness,Instrumentalness,Tempo,Label_Category_encoded,Genres_encoded,Artist Centrality,Genre Strength
10663,-1.798552,0.138851,-0.187197,0.483369,0.285986,0.035748,-1.040421,-0.484897,-0.020241,-1.534955,0.962292
10664,-0.360264,0.188928,-0.18097,0.9522,0.300948,-0.436113,-1.322086,-0.484897,-0.020241,-1.534955,0.962292
10665,-0.831311,0.188928,-0.766273,0.788754,0.917404,-0.453979,-1.37267,-0.484897,-0.020241,-1.534955,0.962292
10666,-0.933589,0.138851,0.42924,1.550065,-0.40229,-0.454001,-0.993881,-0.484897,-0.020241,-1.534955,0.962292
10667,-0.184266,0.389235,-0.336636,0.289816,1.147827,0.629257,-1.021212,-0.484897,-0.020241,-1.534955,0.962292


#### The data is now ready for modeling


## One last look at our columns before we run our model(s)

| Column Name           | Description                                                                                      | Data Type   | Drop From Model? |
|-----------------------|--------------------------------------------------------------------------------------------------|-------------|------------------|
| Track Name            | Name of the track (song)                                                                          | object      | Yes              |
| Album Name            | Name of the album the track belongs to                                                            | object      | Yes              |
| Artist Name(s)        | Name(s) of the artist(s) associated with the track                                                | object      | Yes              |
| Release Date          | Release date of the track (in object format for now, can be converted to datetime)               | object      | Yes              |
| Duration (ms)         | Duration of the track in milliseconds                                                             | int64       | No               |
| Popularity            | Popularity score of the track (higher is more popular)                                            | int64       | No               |
| Genres                | Genres associated with the track                                                                  | object      | Yes              |
| Record Label          | Record label associated with the track                                                            | object      | Yes              |
| Danceability          | Measure of the track's danceability (0-1 scale)                                                  | float64     | No               |
| Energy                | Energy level of the track (0-1 scale)                                                            | float64     | No               |
| Key                   | The key of the track (musical key)                                                                | float64     | No               |
| Loudness              | The loudness of the track (in decibels)                                                          | float64     | No               |
| Mode                  | Mode of the track (major or minor key)                                                           | float64     | No               |
| Speechiness           | Amount of speech-like content in the track                                                        | float64     | No               |
| Acousticness          | Measure of acoustic quality (0-1 scale)                                                           | float64     | No               |
| Instrumentalness      | Measure of instrumental content (0-1 scale)                                                      | float64     | No               |
| Liveness              | Measure of the track's liveness (0-1 scale)                                                      | float64     | No               |
| Valence               | Measure of the track's mood (0-1 scale, from negative to positive)                               | float64     | No               |
| Tempo                 | Tempo of the track (beats per minute)                                                            | float64     | No               |
| liked                 | Target variable: Whether the track was liked (1 = liked, 0 = not liked)                          | float64     | No               |
| playlist_origin       | The playlist where the track originates from (e.g., 'df_nmf' for New Music Friday)               | object      | Yes              |
| Primary Artist        | Main artist of the track (extracted from Artist Name(s))                                         | object      | Yes              |
| Record Label_encoded  | Encoded version of the record label (numeric representation)                                     | float64     | No               |
| is_unknown_genre      | Binary indicator if the track has an unknown genre (1 = unknown, 0 = known)                     | int32       | No               |
| Genres_encoded        | Encoded version of the genre (numeric representation)                                            | float64     | No               |
| Artist Centrality     | Measure of artist's importance in the similarity network (0-100 scale)                          | float64     | No               |
| NMF Similarity        | Similarity score based on NMF algorithm (0-100 scale)                                           | float64     | No               |
| Genre Strength        | Measure of how strongly a track belongs to its assigned genres                                   | float64     | No               |

# Run the Model

In [77]:
# Step 1: Calculate Genre_Diversity for the Entire Dataframe (df)
df['Genre_Diversity'] = df['Genres'].str.count(',') / df['Genres'].str.len()

# Step 2: Split the Data into Training and Test Sets
df_train = df[df['playlist_origin'] != 'df_nmf'].copy()  # Training data (liked, fav_albums, not_liked)
df_nmf = df[df['playlist_origin'] == 'df_nmf'].copy()    # Test data (New Music Friday)


# Define the Features List
features = [
    'Popularity', 'Genres_encoded', 'Artist Centrality',
]

# Prepare Training Data
X = df[features]  # Features
y = df['liked']   # Target variable

# Train LightGBM Model
lgb_model = LGBMRegressor(n_estimators=100, random_state=42)
lgb_model.fit(X, y)

# Normalize the Target Variable
y_mean = df['liked'].mean()
y_std = df['liked'].std()
y_normalized = (df['liked'] - y_mean) / y_std

# Use Normalized Target
X = df[features]  # Keep X the same
y = y_normalized  # Use normalized target

# Train Other Models
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
xgb_model = XGBRegressor(n_estimators=100, random_state=42)

rf_model.fit(X, y)
xgb_model.fit(X, y)

# Get Feature Importance from All Models
rf_importance = pd.DataFrame({
    'feature': features,
    'importance_rf': rf_model.feature_importances_
}).sort_values('importance_rf', ascending=False)

xgb_importance = pd.DataFrame({
    'feature': features,
    'importance_xgb': xgb_model.feature_importances_
}).sort_values('importance_xgb', ascending=False)

lgb_importance = pd.DataFrame({
    'feature': features,
    'importance_lgb': lgb_model.feature_importances_
}).sort_values('importance_lgb', ascending=False)

# Combine Importance Scores from All Three Models
feature_importance = pd.merge(rf_importance, xgb_importance, on='feature')
feature_importance = pd.merge(feature_importance, lgb_importance, on='feature')
feature_importance['avg_importance'] = (
    feature_importance['importance_rf'] + 
    feature_importance['importance_xgb'] + 
    feature_importance['importance_lgb']
) / 3

# Scale Feature Importances to Percentages
feature_importance['avg_importance'] = feature_importance['avg_importance'] * 100 / feature_importance['avg_importance'].sum()
feature_importance = feature_importance.sort_values('avg_importance', ascending=False)

# Prepare NMF Data for Prediction
df_nmf['Genre_Diversity'] = df_nmf['Genres'].str.count(',') / df_nmf['Genres'].str.len()
df_nmf_cleaned = df_nmf[features]

# Get Base Predictions and Scale Them to 0-100 Range
rf_predictions = np.clip((rf_model.predict(df_nmf_cleaned) * y_std + y_mean) / df['liked'].max() * 100, 0, 100)
xgb_predictions = np.clip((xgb_model.predict(df_nmf_cleaned) * y_std + y_mean) / df['liked'].max() * 100, 0, 100)
lgb_predictions = np.clip((lgb_model.predict(df_nmf_cleaned) * y_std + y_mean) / df['liked'].max() * 100, 0, 100)

# Get CV Scores for All Models (for Weighted Ensemble)
custom_scorer_func = make_scorer(lambda y_true, y_pred: -mean_squared_error(y_true * y_std + y_mean, y_pred * y_std + y_mean), greater_is_better=False)
rf_cv_scores = cross_val_score(rf_model, X, y, cv=5, scoring=custom_scorer_func)
xgb_cv_scores = cross_val_score(xgb_model, X, y, cv=5, scoring=custom_scorer_func)
lgb_cv_scores = cross_val_score(lgb_model, X, y, cv=5, scoring=custom_scorer_func)

# Create Weighted Ensemble Based on CV Scores
cv_scores = {
    'rf': abs(rf_cv_scores.mean()),
    'xgb': abs(xgb_cv_scores.mean()),
    'lgb': abs(lgb_cv_scores.mean())
}
total = sum(cv_scores.values())
weights = {k: v/total for k, v in cv_scores.items()}

# Combine Scaled Predictions
final_predictions = (
    rf_predictions * weights['rf'] +
    xgb_predictions * weights['xgb'] +
    lgb_predictions * weights['lgb']
)

df_nmf['predicted_score'] = final_predictions

# Get Prediction Intervals (Modified for Scaled Values)
def get_prediction_interval(X, model, percentile=95):
    predictions = []
    for estimator in model.estimators_:
        pred = estimator.predict(X) * y_std + y_mean
        pred = np.clip(pred / df['liked'].max() * 100, 0, 100)  # Scale to 0-100
        predictions.append(pred)
    predictions = np.array(predictions)
    lower = np.percentile(predictions, (100-percentile)/2, axis=0)
    upper = np.percentile(predictions, 100-(100-percentile)/2, axis=0)
    return lower, upper

# Calculate Prediction Intervals
lower_bound, upper_bound = get_prediction_interval(df_nmf_cleaned, rf_model)
df_nmf['prediction_lower'] = lower_bound
df_nmf['prediction_upper'] = upper_bound
df_nmf['prediction_uncertainty'] = upper_bound - lower_bound

# Get the Most Common Release Date from NMF Dataset
nmf_release_date = df_nmf['Release Date'].mode().iloc[0]

# Aggregate by Album (Working with Already-Scaled Values)
album_predictions = df_nmf.groupby('Album Name').agg({
    'Artist Name(s)': 'first',
    'predicted_score': ['mean', 'min', 'max', 'std', 'count'],
    'prediction_uncertainty': 'mean',
    'Genres': 'first',
    'Record Label': 'first'
}).reset_index()

# Flatten Column Names
album_predictions.columns = [
    'Album Name', 'Artist', 
    'avg_score', 'min_score', 'max_score', 'score_std', 'track_count',
    'avg_uncertainty', 'Genres', 'Record Label'
]

# Calculate Confidence Score
max_std = album_predictions['score_std'].max()
max_uncertainty = album_predictions['avg_uncertainty'].max()

album_predictions['confidence_score'] = (
    (1 - album_predictions['score_std'] / max_std) * 
    (1 - album_predictions['avg_uncertainty'] / max_uncertainty) * 
    (1 - 1/(1 + album_predictions['track_count']))
) * 100

# Clip Confidence Scores
album_predictions['confidence_score'] = np.clip(
    album_predictions['confidence_score'], a_min=1, a_max=100
)

# Add Weighted Score for Ranking
album_predictions['weighted_score'] = (
    album_predictions['avg_score'] * album_predictions['confidence_score'] / 100
)

# Format the Date for the Filename
date_str = datetime.strptime(nmf_release_date, '%Y-%m-%d').strftime('%m-%d-%y')
filename = f"{date_str}_Album_Recommendations.csv"

# Create Predictions Directory if It Doesn't Exist
os.makedirs('predictions', exist_ok=True)

# Reorder Columns and Sort by avg_score
album_recommendations = album_predictions.reindex(columns=[
    'Artist', 'Album Name', 'avg_score', 'confidence_score',
    'min_score', 'max_score', 'score_std', 'track_count',
    'avg_uncertainty', 'Genres', 'Record Label', 'weighted_score'
]).sort_values('avg_score', ascending=False)

# Save Recommendations
album_recommendations.to_csv(f'predictions/{filename}', index=False)

# Sort df_nmf by predicted_score Before Saving Detailed Predictions
df_nmf_sorted = df_nmf.sort_values('predicted_score', ascending=False)
df_nmf_sorted[['Artist Name(s)', 'Track Name', 'Album Name', 'predicted_score',
               'prediction_lower', 'prediction_upper', 'prediction_uncertainty']].to_csv(
                   f'predictions/nmf_predictions_with_uncertainty.csv', index=False)

# Print Results
print(f"\nNew Music Friday Release Date: {nmf_release_date}")
print("\nTop 20 Recommended Albums:")
print(album_recommendations[['Album Name', 'Artist', 'avg_score', 'track_count', 
                           'confidence_score']].head(20))

# Print Cross-Validation Results
print("\nModel Performance Metrics:")
print(f"Random Forest CV Score: {rf_cv_scores.mean():.3f} (+/- {rf_cv_scores.std() * 2:.3f})")
print(f"XGBoost CV Score: {xgb_cv_scores.mean():.3f} (+/- {xgb_cv_scores.std() * 2:.3f})")

# Print Top 20 Most Important Features
print("\nTop 20 Most Important Features:")
print(feature_importance[['feature', 'avg_importance']].head(20).to_string(index=False))

# Visualize Feature Importance
plt.figure(figsize=(10, 6))
plt.barh(feature_importance['feature'], feature_importance['avg_importance'])
plt.xlabel('Importance (%)')
plt.title('Feature Importance (% of Total Impact)')
plt.gca().invert_yaxis()
plt.show()

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000911 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 600
[LightGBM] [Info] Number of data points in the train set: 10663, number of used features: 3
[LightGBM] [Info] Start training from score 63.725030


ValueError: Found array with 0 sample(s) (shape=(0, 3)) while a minimum of 1 is required by RandomForestRegressor.