# Music Taste Prediction Model: New Music Friday Recommender
In this model, I use my liked songs playlist, my recently loved and not loved albums, to train my regression model on what kind of music I do and don't like. At the end my test model will be the new music friday albums from the most recent Friday. 

## Library Imports

In [5]:
# Data Manipulation and Analysis
import pandas as pd
import numpy as np
from datetime import datetime
import os
import csv

# API and Network Requests
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed
from time import sleep

# Machine Learning
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score, make_scorer
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

# Network Analysis
import networkx as nx

# Visualization
import matplotlib.pyplot as plt
import streamlit as st
import plotly.express as px
import plotly.graph_objects as go

# Type Hints
from typing import List, Dict, Tuple, Optional

In [6]:
# Load Datasets

In [7]:
df_liked = pd.read_csv("data/liked.csv")  # Liked playlist on Spotify
df_fav_albums = pd.read_csv("data/liked_albums.csv")  # Albums I've Liked in Recent Years
df_not_liked = pd.read_csv("data/did_not_like.csv")  # Albums I've not liked in Recent Years
df_nmf = pd.read_csv("data/nmf.csv")  # The most recent New Music Friday Playlist
df_nmf_similar = pd.read_csv("data/nmf_artist_adjacent.csv")  # Lastfm pull of similar artists to this weeks NMF artists

In [8]:
## Pull Similar Artists to Your Favorite Artists

In [9]:
import os
import pandas as pd
import requests
from datetime import datetime
from time import sleep
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import List, Dict, Set

class LastFMAPI:
    def __init__(self, api_key: str, rate_limit_delay: float = 0.25, limit: int = 5):
        self.api_key = api_key
        self.base_url = "http://ws.audioscrobbler.com/2.0/"
        self.rate_limit_delay = rate_limit_delay
        self.limit = limit

    def get_similar_artists(self, artist_name: str) -> List[str]:
        """Fetch similar artists for a given artist from LastFM API."""
        params = {
            'method': 'artist.getSimilar',
            'artist': artist_name,
            'api_key': self.api_key,
            'limit': self.limit,  # Add limit parameter
            'format': 'json'
        }
        
        try:
            response = requests.get(self.base_url, params=params)
            response.raise_for_status()
            
            # Handle rate limiting
            if 'X-RateLimit-Remaining' in response.headers:
                remaining = int(response.headers['X-RateLimit-Remaining'])
                if remaining == 0:
                    sleep(self.rate_limit_delay)
            
            data = response.json()
            if 'similarartists' in data and 'artist' in data['similarartists']:
                return [artist['name'] for artist in data['similarartists']['artist'][:self.limit]]
            return []
            
        except Exception as e:
            print(f"Error fetching similar artists for {artist_name}: {e}")
            return []

def extract_primary_artist(artist_string: str) -> str:
    """Extract the first artist name before any comma."""
    if pd.isna(artist_string):
        return ""
    return artist_string.split(",")[0].strip()

def update_similar_artists(liked_path: str, 
                         albums_path: str, 
                         output_path: str, 
                         api_key: str) -> pd.DataFrame:
    """
    Update the similar artists database with new artists from liked playlists.
    Returns the complete DataFrame of artists and their similar artists.
    """
    
    print("Loading existing and new data...")
    
    # Load existing similar artists data
    existing_data: Dict[str, List[str]] = {}
    if os.path.exists(output_path):
        existing_df = pd.read_csv(output_path)
        existing_data = dict(zip(existing_df['Artist'], existing_df['Similar Artists']))
        print(f"Loaded {len(existing_data)} existing artists from database")
    
    # Load and process current playlists
    df_liked = pd.read_csv(liked_path)
    df_albums = pd.read_csv(albums_path)
    
    # Extract and combine primary artists
    current_artists = set(
        pd.concat([
            df_liked['Artist Name(s)'].apply(extract_primary_artist),
            df_albums['Artist Name(s)'].apply(extract_primary_artist)
        ]).unique()
    )
    current_artists.discard("")  # Remove empty strings
    
    # Find new artists not in existing data
    new_artists = current_artists - set(existing_data.keys())
    print(f"Found {len(new_artists)} new artists to process")
    
    if not new_artists:
        print("No new artists to process. Database is up to date!")
        # Create and return DataFrame even if no updates
        return pd.DataFrame({
            'Artist': list(existing_data.keys()),
            'Similar Artists': list(existing_data.values())
        })
    
    # Initialize LastFM API client
    api = LastFMAPI(api_key)
    
    # Process artists with concurrent requests
    results = {}
    with ThreadPoolExecutor(max_workers=5) as executor:
        future_to_artist = {
            executor.submit(api.get_similar_artists, artist): artist 
            for artist in new_artists
        }
        
        # Show progress bar while processing
        for future in tqdm(as_completed(future_to_artist), 
                         total=len(future_to_artist),
                         desc="Fetching similar artists"):
            artist = future_to_artist[future]
            similar_artists = future.result()
            results[artist] = ', '.join(similar_artists)
    
    # Combine existing and new data
    combined_data = {**existing_data, **results}
    
    # Create DataFrame
    output_df = pd.DataFrame({
        'Artist': list(combined_data.keys()),
        'Similar Artists': list(combined_data.values())
    })
    
    # Save updated data
    output_df.to_csv(output_path, index=False)
    print(f"Successfully updated database with {len(new_artists)} new artists")
    print(f"Total artists in database: {len(combined_data)}")
    
    return output_df

if __name__ == "__main__":
    # Configuration
    API_KEY = "74a510ecc9fc62bf3e0edc6adc2e99f9"
    LIKED_PATH = "data/liked.csv"
    ALBUMS_PATH = "data/liked_albums.csv"
    OUTPUT_PATH = "data/liked_artists_only_similar.csv"
    
    # Run the update and get the DataFrame
    df_liked_similar = update_similar_artists(
        LIKED_PATH, 
        ALBUMS_PATH, 
        OUTPUT_PATH, 
        API_KEY
    )
    
    # Now df_liked_similar is ready to use
    print("\nFirst few rows of the similar artists DataFrame:")
    print(df_liked_similar.head())

Loading existing and new data...
Loaded 2126 existing artists from database
Found 0 new artists to process
No new artists to process. Database is up to date!

First few rows of the similar artists DataFrame:
        Artist                                    Similar Artists
0         RY X                                                NaN
1     The Faim  Oh The Larceny, City Wolf, Random Hero, needsh...
2  Melody Lake  Ian Wong, Limelight Glow, Slow Rising Hope, Po...
3    Liza Anne  Miya Folick, Torres, Billie Marten, Pom Pom Sq...
4    The Kinks  Dave Davies, The Who, Small Faces, The Zombies...


In [10]:
df_liked_similar[df_liked_similar["Similar Artists"].isna()]

Unnamed: 0,Artist,Similar Artists
0,RY X,
22,Sampa the Great,
30,Spillage Village,
45,serpentwithfeet,
76,Omar Apollo,
...,...,...
2041,Urban Jams United,
2043,JAY-Z,
2045,PinkPantheress,
2067,Cate Le Bon,


In [11]:
import time
import requests
import pandas as pd

# Load the existing CSV file into a DataFrame
df_liked_similar = pd.read_csv('data/liked_artists_only_similar.csv')

# Filter out the artists with missing similar artists (where 'Similar Artists' is NaN)
df_missing_similar = df_liked_similar[df_liked_similar["Similar Artists"].isna()]

# Function to fetch similar artists using ListenBrainz
def get_similar_artists(artist_mbid):
    api_url = f'https://labs.api.listenbrainz.org/similar-artists?artist_mbid={artist_mbid}&algorithm=session_based_days_9000_session_300_contribution_5_threshold_15_limit_50_skip_30'
    try:
        response = requests.get(api_url)
        if response.status_code == 200:
            data = response.json()
            similar_artists = [artist['artist_name'] for artist in data.get('payload', {}).get('artists', [])]
            return ', '.join(similar_artists)
        else:
            print(f"Failed to fetch data for MBID {artist_mbid}: {response.status_code}")
            return None
    except Exception as e:
        print(f"Error fetching data for MBID {artist_mbid}: {str(e)}")
        return None

# Function to get MBID from artist name using MusicBrainz API
def get_artist_mbid(artist_name):
    api_url = f'https://musicbrainz.org/ws/2/artist?query={artist_name}&limit=1&fmt=json'
    try:
        response = requests.get(api_url, headers={'User-Agent': 'YourApp/1.0'})
        if response.status_code == 200:
            data = response.json()
            if data.get('artists'):
                return data['artists'][0]['id']  # Return the first matching MBID
        else:
            print(f"Failed to fetch MBID for {artist_name}: {response.status_code}")
            return None
    except Exception as e:
        print(f"Error fetching MBID for {artist_name}: {str(e)}")
        return None

# To avoid overwhelming the API, we will process a batch at a time
batch_size = 5
delay_seconds = 3  # Delay in seconds between requests

# Iterate over missing artists in batches
updated_rows = []
for i, row in df_missing_similar.iterrows():
    artist_name = row['Artist']
    artist_mbid = get_artist_mbid(artist_name)
    
    if artist_mbid:
        similar_artists = get_similar_artists(artist_mbid)
        updated_rows.append((i, similar_artists))
    else:
        updated_rows.append((i, None))
    
    # Wait for the specified delay before the next request
    if (i + 1) % batch_size == 0:
        print(f"Processed {i + 1} artists, pausing for {delay_seconds} seconds...")
        time.sleep(delay_seconds)

# Update the 'Similar Artists' column for the rows that were processed
for index, similar_artists in updated_rows:
    df_liked_similar.at[index, 'Similar Artists'] = similar_artists

# Save the updated DataFrame back to the CSV file
df_liked_similar.to_csv('data/liked_artists_only_similar.csv', index=False)

# After the loop, you can check if all the missing artists have been filled
print("Updated DataFrame:")
print(df_liked_similar.head())

Error fetching data for MBID 3b4cd16e-3a25-4c7b-ada6-33f5ea91e1b1: Expecting value: line 1 column 1 (char 0)
Error fetching data for MBID 9d79c790-9897-464e-aef0-db5bd3290f00: Expecting value: line 1 column 1 (char 0)
Error fetching data for MBID 4b1d3ebc-b45a-45ec-a97e-426a20c1c6ab: Expecting value: line 1 column 1 (char 0)
Error fetching data for MBID cc9ddb3d-1217-4a40-9864-d85920cfa1ed: Expecting value: line 1 column 1 (char 0)
Error fetching data for MBID 0b966a38-5ad9-43c1-b86b-c07079523165: Expecting value: line 1 column 1 (char 0)
Error fetching data for MBID a148ffde-581c-4c39-a8f1-bc49dec7fe68: Expecting value: line 1 column 1 (char 0)
Error fetching data for MBID 950c20e2-dbab-4c7d-8784-cee86be11787: Expecting value: line 1 column 1 (char 0)
Error fetching data for MBID 66c662b6-6e2f-4930-8610-912e24c63ed1: Expecting value: line 1 column 1 (char 0)
Error fetching data for MBID 85c18775-22bd-4a66-8415-3fe11c026040: Expecting value: line 1 column 1 (char 0)
Error fetching data

In [12]:
df_liked_similar[df_liked_similar["Similar Artists"].isna()]

Unnamed: 0,Artist,Similar Artists
0,RY X,
22,Sampa the Great,
30,Spillage Village,
45,serpentwithfeet,
76,Omar Apollo,
...,...,...
2041,Urban Jams United,
2043,JAY-Z,
2045,PinkPantheress,
2067,Cate Le Bon,


In [13]:
import pandas as pd
import requests
import time

# Load the CSV with liked artists (make sure the path is correct)
df_liked_similar = pd.read_csv('data/liked_artists_only.csv')

# Target rows where 'Similar Artists' is NaN
df_target = df_liked_similar[df_liked_similar["Similar Artists"].isna()]

# Function to get similar artists using MusicBrainz API
def get_similar_artists(artist_name):
    # API URL to search for artists
    url = f'https://musicbrainz.org/ws/2/artist?query={artist_name}&fmt=json'
    
    try:
        # Request data from MusicBrainz
        response = requests.get(url)
        response.raise_for_status()
        
        data = response.json()
        
        # Check if results exist
        if data['artists']:
            # Get the first artist in the results
            artist_data = data['artists'][0]
            artist_id = artist_data['id']
            
            # Get related artists (if any) using artist-rels
            related_url = f'https://musicbrainz.org/ws/2/artist/{artist_id}?inc=artist-rels&fmt=json'
            related_response = requests.get(related_url)
            related_response.raise_for_status()
            
            related_data = related_response.json()
            
            # Check if 'artist-rels' exists and has data
            if 'artist-rels' in related_data:
                related_artists = [rel['artist']['name'] for rel in related_data['artist-rels']]
                return related_artists
            else:
                return []
        else:
            return []
    except requests.exceptions.RequestException as e:
        print(f"Error retrieving data for {artist_name}: {e}")
        return []

# Iterate through the target rows and get similar artists
for index, row in df_target.iterrows():
    artist_name = row['Artist Name']  # Assuming the artist name column is 'Artist Name'
    
    # Get similar artists
    similar_artists = get_similar_artists(artist_name)
    
    # If similar artists are found, update the 'Similar Artists' column
    if similar_artists:
        df_liked_similar.at[index, 'Similar Artists'] = ', '.join(similar_artists)
    
    # Pause between requests to avoid overwhelming the API
    time.sleep(1)

# Save the updated DataFrame back to CSV
df_liked_similar.to_csv('data/liked_artists_only_similar.csv', index=False)

print("Updated 'Similar Artists' for target rows.")


FileNotFoundError: [Errno 2] No such file or directory: 'data/liked_artists_only.csv'

## Quick Glance at our Refreshed Datasets

In [None]:
df_liked.head()

In [None]:
# Liked Albums in Recent Years
df_fav_albums.head()

In [None]:
# Albums Not Liked in Recent Years
df_not_liked.head()

In [None]:
# New Music Friday Playlist
df_nmf.head()

In [None]:
# Similar Artists to Recently Played Artists (Last.fm)

df_liked_similar.head()

In [None]:
# Similar Artists to NMF Artists (Last.fm)
df_nmf_similar.head()

> A quick reminder of the standard columns of a spotify export.

In [None]:
df_liked.columns

In [None]:
df_liked_similar.columns

In [None]:
df_nmf_similar.columns

### Add Target Labels for Training Feature

In [None]:
# Assign liked scores before combining
df_liked['liked'] = 100
df_fav_albums['liked'] = 50
df_not_liked['liked'] = 0
df_nmf['liked'] = np.nan 

# Add playlist_origin column before combining
df_liked['playlist_origin'] = 'df_liked'
df_fav_albums['playlist_origin'] = 'df_fav_albums'
df_not_liked['playlist_origin'] = 'df_not_liked'
df_nmf['playlist_origin'] = 'df_nmf'
df_liked_similar['source'] = 'liked_similar'
df_nmf_similar['source'] = 'nmf_similar'

### Check application of the target encoding

In [None]:
df_liked[['liked', 'playlist_origin']].head()

In [None]:
df_fav_albums[['liked', 'playlist_origin']].head()

In [None]:
df_not_liked[['liked', 'playlist_origin']].head()

In [None]:
df_nmf[['liked', 'playlist_origin']].head()

In [None]:
df_liked_similar[['Artist', 'Similar Artists', 'source']].head()

In [None]:
df_nmf_similar[['Artist', 'Similar Artists', 'source']].head()

## Merge The Datasets

In [None]:
df = pd.concat([df_liked, df_fav_albums, df_not_liked, df_nmf], ignore_index=True)

In [None]:
#How Large is the Dataset, Now?
df.shape

#### Remove the Duplicates

In [None]:
# Remove duplicates: Keep the highest 'liked' score (100 > 50)
df = df.sort_values(by='liked', ascending=False)  # Ensures 100-rated songs come first
df = df.drop_duplicates(subset=['Track Name', 'Artist Name(s)'], keep='first')
df.shape

In [None]:
df.columns #Checking to remind myself what is all available to drop, keep seperate as metadata, etc.

#### Drop columns that won't help the model (Track ID, Added By, Added At, Time Signature)

In [None]:
df.drop(columns=['Track ID', 'Added By', 'Added At', 'Time Signature'], inplace=True)

#### Getting missing Genres (of which Spotify is "Spotty" at best)

In [None]:
class LastFMAPI:
    def __init__(self, api_key: str, rate_limit_delay: float = 0.25):
        self.api_key = api_key
        self.base_url = "http://ws.audioscrobbler.com/2.0/"
        self.rate_limit_delay = rate_limit_delay
        
    def _make_request(self, params: dict) -> dict:
        try:
            # Handle rate limit by checking headers for remaining requests
            response = requests.get(self.base_url, params=params)
            response.raise_for_status()

            # Check for rate limit info in the response headers
            remaining = int(response.headers.get('X-RateLimit-Remaining', 1))
            if remaining == 0:
                reset_time = int(response.headers.get('X-RateLimit-Reset', 0))
                wait_time = reset_time - int(datetime.now().timestamp())
                print(f"Rate limit hit, waiting for {wait_time} seconds...")
                sleep(wait_time + 1)  # wait for the reset time plus 1 second for safety
                response = requests.get(self.base_url, params=params)  # retry after waiting
                response.raise_for_status()

            return response.json()

        except requests.exceptions.RequestException as e:
            print(f"API request failed: {e}")
            if response.status_code == 429:
                print("Rate limit exceeded, increasing delay.")
                self.rate_limit_delay *= 2
            return None

    def get_artist_tags(self, artist_name: str, limit: int = 5) -> List[str]:
        params = {
            'method': 'artist.getTopTags',
            'artist': artist_name,
            'api_key': self.api_key,
            'format': 'json',
            'limit': limit
        }
        data = self._make_request(params)
        if data and 'toptags' in data:
            return [tag['name'] for tag in data['toptags'].get('tag', [])]
        return []

def export_artist_tags(api_key: str, unique_artists: List[str], output_file: str = 'data/missing_genres.csv'):
    api = LastFMAPI(api_key)

    # Load existing data if the file exists
    existing_data = {}
    if os.path.exists(output_file):
        with open(output_file, 'r', encoding='utf-8') as csvfile:
            reader = csv.DictReader(csvfile)
            for row in reader:
                existing_data[row['Artist']] = row['Tags']

    # Identify new artists not in the existing data
    new_artists = [artist for artist in unique_artists if artist not in existing_data]
    print(f"Total new artists with missing genres: {len(new_artists)}")

    if not new_artists:
        print("No new missing artists to process.")
        return

    # Using ThreadPoolExecutor to parallelize API requests
    with ThreadPoolExecutor(max_workers=5) as executor:  # Adjust max_workers based on your needs
        future_to_artist = {executor.submit(api.get_artist_tags, artist): artist for artist in new_artists}
        
        try:
            with open(output_file, 'a', newline='', encoding='utf-8') as csvfile:
                writer = csv.DictWriter(csvfile, fieldnames=['Artist', 'Tags'])
                if not os.path.exists(output_file):  # Write header only if file doesn't exist
                    writer.writeheader()
                
                for i, future in enumerate(as_completed(future_to_artist), 1):
                    artist = future_to_artist[future]
                    tags = future.result()
                    
                    # Ensure only the top 5 tags are saved
                    top_5_tags = tags[:5]
                    writer.writerow({
                        'Artist': artist,
                        'Tags': ', '.join(top_5_tags)  # Join only the top 5 tags
                    })

                    # Print progress in increments of 100
                    if i % 100 == 0:
                        print(f"Processed tags for {i} new artists")
        
        except Exception as e:
            print(f"Fatal error during export: {e}")
    
    print(f"\nExport complete! Processed {len(new_artists)} new artists.")

# Extract first artist if multiple are listed
df['Primary Artist'] = df['Artist Name(s)'].apply(lambda x: x.split(',')[0] if pd.notna(x) else x)

# Get unique artists with missing genres
artists_missing_genres = df[df['Genres'].isna()]['Primary Artist'].unique()
print(f"Total artists with missing genres: {len(artists_missing_genres)}")

# Fetch tags for missing artists
API_KEY = '74a510ecc9fc62bf3e0edc6adc2e99f9'
export_artist_tags(API_KEY, artists_missing_genres, output_file='data/missing_genres.csv')

# Load the fetched tags
missing_genres_df = pd.read_csv('data/missing_genres.csv')

# Merge with the original dataframe
df = df.merge(
    missing_genres_df,
    how='left',
    left_on='Primary Artist',
    right_on='Artist',
    suffixes=('', '_tags')  # Add a suffix to overlapping columns from missing_genres_df
)

# Fill missing genres in the original 'Genres' column
df['Genres'] = df['Genres'].fillna(df['Tags'])

# Drop the temporary 'Tags' and 'Artist' columns if no longer needed
df.drop(columns=['Tags', 'Artist'], inplace=True)

# Now `df` is updated in memory with the new genre data!

In [None]:
# Fill missing 'Genres' and 'Record Label' with 'Unknown'
df['Genres'] = df['Genres'].fillna('Unknown')
df['Record Label'] = df['Record Label'].fillna('Unknown')

#### Handle missing values (if any)

In [None]:
 df.isna().sum()

In [None]:
# Drop rows with nulls in any column except 'liked'
df = df[df.drop(columns=['liked']).notna().all(axis=1)].reset_index(drop=True)

In [None]:
 df.isna().sum()

In [None]:
df.shape

In [None]:
# Count how many of each 'playlist_origin' are in the df dataset
playlist_origin_counts = df['playlist_origin'].value_counts()

print("Playlist Origin Counts:")
print(playlist_origin_counts)


## Target Encoding Record Labels

In [None]:
def target_encode(df, column, target, smoothing=1):
    # Separate out df_nmf to ensure it's never used in encoding
    df_train = df[df['playlist_origin'] != 'df_nmf'].copy()

    mean_target = df_train[target].mean()
    label_means = df_train.groupby(column)[target].mean()
    label_counts = df_train[column].value_counts()

    smoothed_values = (label_means * label_counts + mean_target * smoothing) / (label_counts + smoothing)

    # Map with a fallback to the overall mean
    df[column + '_encoded'] = df[column].map(smoothed_values).fillna(mean_target)

    return df

# Target encode only on the training data (excludes df_nmf)
df = target_encode(df, 'Record Label', 'liked', smoothing=10)
df[['Record Label', 'Record Label_encoded', 'liked']].head()


## Artists with Missing Genres (Last.fm to the rescue!)

> I noticed in the data previews that one of the common genres imported from last.fm was 'seen live', which I take to meen a lot of last.fm users have seen that artist. 

In [None]:
# Remove 'seen live' from the 'Genres' column
df['Genres'] = df['Genres'].apply(lambda x: ', '.join([genre for genre in x.split(', ') if genre != 'seen live']) if pd.notna(x) else x)

## Target Encode Genres

In [None]:
# Create a binary indicator column for 'Unknown' genres
df['is_unknown_genre'] = (df['Genres'] == 'Unknown').astype(int)

# Define the target encoding function
def target_encode_multi_genre(df, genre_column, target, smoothing=1, aggregation_method='mean', nmf_fallback=0):
    """
    Target encode a multi-genre column by splitting genres, encoding individually, and aggregating.
    Explicitly handles 'Unknown' genres and NMF rows.
    """
    # Separate out df_nmf to ensure it's never used in encoding
    df_train = df[df['playlist_origin'] != 'df_nmf'].copy()

    # Calculate the global mean of the target variable
    global_mean = df_train[target].mean()

    # Split genres into individual categories and exclude 'seen live' and 'Unknown'
    df_train['split_genres'] = df_train[genre_column].str.split(', ').apply(
        lambda x: [genre for genre in x if genre != 'seen live' and genre != 'Unknown'] if isinstance(x, list) else x
    )

    # Explode the list of genres into separate rows
    exploded_genres = df_train.explode('split_genres')

    # Calculate target encoding for individual genres
    label_means = exploded_genres.groupby('split_genres')[target].mean()
    label_counts = exploded_genres['split_genres'].value_counts()

    # Calculate smoothed target encoding for individual genres
    smoothed_values = (label_means * label_counts + global_mean * smoothing) / (label_counts + smoothing)

    # Map the smoothed values back to the exploded genres
    exploded_genres['genre_encoded'] = exploded_genres['split_genres'].map(smoothed_values).fillna(global_mean)

    # Aggregate encodings for multi-genre rows
    if aggregation_method == 'mean':
        aggregated_encodings = exploded_genres.groupby(exploded_genres.index)['genre_encoded'].mean()
    elif aggregation_method == 'max':
        aggregated_encodings = exploded_genres.groupby(exploded_genres.index)['genre_encoded'].max()
    else:
        raise ValueError(f"Unsupported aggregation method: {aggregation_method}")

    # Add the aggregated encodings to the original dataframe
    df[genre_column + '_encoded'] = aggregated_encodings

    # Handle 'Unknown' genres
    is_unknown = df[genre_column] == 'Unknown'
    df.loc[is_unknown, genre_column + '_encoded'] = global_mean  # Use global mean for non-NMF rows

    # Handle NMF rows with 'Unknown' genres separately
    is_nmf = df['playlist_origin'] == 'df_nmf'
    df.loc[is_nmf & is_unknown, genre_column + '_encoded'] = nmf_fallback  # Use nmf_fallback for NMF rows

    return df

# Apply the target encoding function
df = target_encode_multi_genre(df, 'Genres', 'liked', smoothing=100, nmf_fallback=0)

# Inspect the results
print(df[['Genres', 'Genres_encoded', 'is_unknown_genre', 'liked', 'playlist_origin']].head())

In [None]:
df[df['Genres'] == 'Unknown']['Genres_encoded'].value_counts()

#### Further Examination of Missing Genres

In [None]:
#Which rows still have Unknown Genres
unknown_genres = df[df['Genres'] == 'Unknown']

# Count the number of 'Unknown' genre tracks from each playlist_origin
unknown_genres_origin_counts = unknown_genres['playlist_origin'].value_counts()

print("Unknown Genres by Playlist Origin:")
print(unknown_genres_origin_counts)

##### Getting Rid of Rows Where Genre is Unknown and Playlist Origin is Not df_nmf

In [None]:
# Keep rows where:
# 1. Genres is not 'Unknown', OR
# 2. Genres is 'Unknown' and playlist_origin is 'df_nmf'
df_filtered = df[~((df['Genres'] == 'Unknown') & (df['playlist_origin'] != 'df_nmf'))]

# Verify the result
print("Remaining Rows by Playlist Origin:")
print(df_filtered['playlist_origin'].value_counts())

# Check remaining 'Unknown' genre rows
remaining_unknown_genres = df_filtered[df_filtered['Genres'] == 'Unknown']

print("\nRemaining 'Unknown' Genres by Playlist Origin:")
print(remaining_unknown_genres['playlist_origin'].value_counts())

# Inspect the remaining NMF tracks with 'Unknown' genres
unknown_genres_nmf = df_filtered[(df_filtered['Genres'] == 'Unknown') & (df_filtered['playlist_origin'] == 'df_nmf')]

print("\nRemaining NMF Tracks with 'Unknown' Genres:")
print(unknown_genres_nmf[['Track Name', 'Artist Name(s)', 'Genres_encoded']])

# Save the filtered dataframe to a CSV file (optional)
df_filtered.to_csv('data/filtered_data.csv', index=False)

In [None]:
# check all encoded columns to confirm the encoding
df[['Record Label', 'Record Label_encoded', 'Genres', 'Genres_encoded']].head()


# Finding How Central an Artist is to My Music Taste

In [None]:
# Step 1: Build the graph
G = nx.Graph()

# Add nodes for liked artists
liked_artists = set(
    df[df['playlist_origin'].isin(['df_liked', 'df_fav_albums'])]['Artist Name(s)']
    .str.split(',').explode().str.strip()
)
G.add_nodes_from(liked_artists, type='liked')

# Add nodes for similar artists (from liked)
similar_artists_liked = set(
    df_liked_similar['Similar Artists']
    .dropna()  # Remove NaN values
    .str.split(',').explode().str.strip()
)
G.add_nodes_from(similar_artists_liked, type='similar_liked')

# Add edges based on similarity (from liked)
for _, row in df_liked_similar.iterrows():
    artist = row['Artist']
    # Check if Similar Artists is a string before splitting
    if isinstance(row['Similar Artists'], str):
        similar = row['Similar Artists'].split(', ')
        for s in similar:
            G.add_edge(artist, s, weight=1.0)

# Step 2: Calculate centrality scores
centrality_scores = nx.pagerank(G)

# Step 3: Map centrality scores back to DataFrame
df['Artist Centrality'] = (
    df['Artist Name(s)']
    .str.split(',').str[0].str.strip()
    .map(centrality_scores).fillna(0)
)

# Normalize centrality scores to 0-100
df['Artist Centrality'] = (df['Artist Centrality'] / df['Artist Centrality'].max()) * 100

# Step 4: Calculate NMF Similarity Scores
# For NMF artists, calculate similarity to liked artists
nmf_artists = df[df['playlist_origin'] == 'df_nmf']['Artist Name(s)'].str.split(',').str[0].str.strip()
nmf_similarity_scores = {}

for artist in nmf_artists:
    similar_artists = (
        df_nmf_similar[df_nmf_similar['Artist'] == artist]['Similar Artists']
        .dropna()  # Add dropna here too
        .str.split(',').explode().str.strip()
    )
    similarity_score = similar_artists.map(centrality_scores).mean()  # Average centrality of similar artists
    nmf_similarity_scores[artist] = similarity_score if not pd.isna(similarity_score) else 0

# Map NMF similarity scores back to DataFrame
df['NMF Similarity'] = (
    df['Artist Name(s)']
    .str.split(',').str[0].str.strip()
    .map(nmf_similarity_scores).fillna(0)
)

# Normalize NMF similarity scores to 0-100
df['NMF Similarity'] = (df['NMF Similarity'] / df['NMF Similarity'].max()) * 100

# Check the result
print(df[['Artist Name(s)', 'Artist Centrality', 'NMF Similarity']].head())

In [None]:
print("Rows with NaN Similar Artists:")
print(df_liked_similar[df_liked_similar['Similar Artists'].isna()])

## Genre Strength Feature

In [None]:
# Get the set of liked genres
liked_genres = set(df[df['playlist_origin'].isin(['df_liked', 'df_fav_albums'])]['Genres'].str.split(',').explode().str.strip())

# Calculate genre strength for each artist
def calculate_genre_strength(genres):
    if pd.isna(genres):
        return 0
    artist_genres = set(genres.split(','))
    overlap = artist_genres.intersection(liked_genres)
    return len(overlap) / len(artist_genres) if artist_genres else 0

df['Genre Strength'] = df['Genres'].apply(calculate_genre_strength)

# Normalize genre strength to 0–100
df['Genre Strength'] = (df['Genre Strength'] / df['Genre Strength'].max()) * 100

# Check the result
print(df[['Genres', 'Genre Strength']].head())

In [None]:
# Fill NaN values in Genres_encoded
genres_encoded_mean = df['Genres_encoded'].mean()
df['Genres_encoded'] = df['Genres_encoded'].fillna(genres_encoded_mean)

In [None]:
df.columns

In [None]:
df.dtypes

## Standardize the numeric columns

### Seperate New Music Friday and Save it for Later!

In [None]:
# Filter out 'df_nmf' from the main dataframe and save it for later
df_nmf = df[df['playlist_origin'] == 'df_nmf'].copy()

# Remove df_nmf entries from the original dataframe
df = df[df['playlist_origin'] != 'df_nmf'].copy()

# Save df_nmf to CSV for later use
df_nmf.to_csv('data/df_nmf_later.csv', index=False)

In [None]:
# Define numeric columns to standardize
numeric_columns = [
    'Duration (ms)', 'Popularity', 'Danceability', 'Energy', 'Key', 'Loudness',
    'Mode', 'Speechiness', 'Acousticness', 'Instrumentalness', 'Liveness',
    'Valence', 'Tempo', 'Record Label_encoded', 'Genres_encoded',
    'Artist Centrality', 'NMF Similarity'
]

# Initialize the scaler
scaler = StandardScaler()

# Fit the scaler on the training data (df)
df[numeric_columns] = scaler.fit_transform(df[numeric_columns])

# Transform the test data (df_nmf) using the fitted scaler
df_nmf[numeric_columns] = scaler.transform(df_nmf[numeric_columns])

# Save the standardized df_nmf for later use
df_nmf.to_csv('data/df_nmf_later.csv', index=False)

In [None]:
df_nmf[numeric_columns].head()

#### The data is now ready for modeling


## One last look at our columns before we run our model(s)

| Column Name           | Description                                                                                      | Data Type   | Drop From Model? |
|-----------------------|--------------------------------------------------------------------------------------------------|-------------|------------------|
| Track Name            | Name of the track (song)                                                                          | object      | Yes              |
| Album Name            | Name of the album the track belongs to                                                            | object      | Yes              |
| Artist Name(s)        | Name(s) of the artist(s) associated with the track                                                | object      | Yes              |
| Release Date          | Release date of the track (in object format for now, can be converted to datetime)               | object      | Yes              |
| Duration (ms)         | Duration of the track in milliseconds                                                             | int64       | No               |
| Popularity            | Popularity score of the track (higher is more popular)                                            | int64       | No               |
| Genres                | Genres associated with the track                                                                  | object      | Yes              |
| Record Label          | Record label associated with the track                                                            | object      | Yes              |
| Danceability          | Measure of the track's danceability (0-1 scale)                                                  | float64     | No               |
| Energy                | Energy level of the track (0-1 scale)                                                            | float64     | No               |
| Key                   | The key of the track (musical key)                                                                | float64     | No               |
| Loudness              | The loudness of the track (in decibels)                                                          | float64     | No               |
| Mode                  | Mode of the track (major or minor key)                                                           | float64     | No               |
| Speechiness           | Amount of speech-like content in the track                                                        | float64     | No               |
| Acousticness          | Measure of acoustic quality (0-1 scale)                                                           | float64     | No               |
| Instrumentalness      | Measure of instrumental content (0-1 scale)                                                      | float64     | No               |
| Liveness              | Measure of the track's liveness (0-1 scale)                                                      | float64     | No               |
| Valence               | Measure of the track's mood (0-1 scale, from negative to positive)                               | float64     | No               |
| Tempo                 | Tempo of the track (beats per minute)                                                            | float64     | No               |
| liked                 | Target variable: Whether the track was liked (1 = liked, 0 = not liked)                          | float64     | No               |
| playlist_origin       | The playlist where the track originates from (e.g., 'df_nmf' for New Music Friday)               | object      | Yes              |
| Primary Artist        | Main artist of the track (extracted from Artist Name(s))                                         | object      | Yes              |
| Record Label_encoded  | Encoded version of the record label (numeric representation)                                     | float64     | No               |
| is_unknown_genre      | Binary indicator if the track has an unknown genre (1 = unknown, 0 = known)                     | int32       | No               |
| Genres_encoded        | Encoded version of the genre (numeric representation)                                            | float64     | No               |
| Artist Centrality     | Measure of artist's importance in the similarity network (0-100 scale)                          | float64     | No               |
| NMF Similarity        | Similarity score based on NMF algorithm (0-100 scale)                                           | float64     | No               |
| Genre Strength        | Measure of how strongly a track belongs to its assigned genres                                   | float64     | No               |

# Run the Model

In [None]:
# [Previous code until model training remains the same]
features = [
    'Duration (ms)', 'Popularity', 'Danceability', 'Energy', 'Key', 
    'Loudness', 'Mode', 'Speechiness', 'Acousticness', 'Instrumentalness',
    'Liveness', 'Valence', 'Tempo', 'Record Label_encoded', 
    'is_unknown_genre', 'Genres_encoded', 'Artist Centrality',
    'NMF Similarity', 'Genre Strength'
]

# Normalize the target variable
y_mean = df['liked'].mean()
y_std = df['liked'].std()
y_normalized = (df['liked'] - y_mean) / y_std

# Prepare training data
X = df[features]
y = y_normalized  # Use normalized target

# Train models
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
xgb_model = XGBRegressor(n_estimators=100, random_state=42)

rf_model.fit(X, y)
xgb_model.fit(X, y)

# Get feature importance from both models
rf_importance = pd.DataFrame({
    'feature': features,
    'importance_rf': rf_model.feature_importances_
}).sort_values('importance_rf', ascending=False)

xgb_importance = pd.DataFrame({
    'feature': features,
    'importance_xgb': xgb_model.feature_importances_
}).sort_values('importance_xgb', ascending=False)

# Combine importance scores
feature_importance = pd.merge(rf_importance, xgb_importance, on='feature')
feature_importance['avg_importance'] = (feature_importance['importance_rf'] + feature_importance['importance_xgb']) / 2
feature_importance = feature_importance.sort_values('avg_importance', ascending=False)

# Prepare NMF data for prediction
df_nmf_cleaned = df_nmf[features]

# Make predictions and denormalize
rf_predictions = rf_model.predict(df_nmf_cleaned) * y_std + y_mean
xgb_predictions = xgb_model.predict(df_nmf_cleaned) * y_std + y_mean

# Combine predictions (ensemble)
final_predictions = (rf_predictions + xgb_predictions) / 2

# Get prediction intervals
def get_prediction_interval(X, model, percentile=95):
    predictions = []
    for estimator in model.estimators_:
        predictions.append(estimator.predict(X) * y_std + y_mean)
    predictions = np.array(predictions)
    lower = np.percentile(predictions, (100-percentile)/2, axis=0)
    upper = np.percentile(predictions, 100-(100-percentile)/2, axis=0)
    return lower, upper

# Calculate prediction intervals
lower_bound, upper_bound = get_prediction_interval(df_nmf_cleaned, rf_model)
df_nmf['predicted_score'] = final_predictions
df_nmf['prediction_lower'] = lower_bound
df_nmf['prediction_upper'] = upper_bound
df_nmf['prediction_uncertainty'] = upper_bound - lower_bound

# Get the most common release date from NMF dataset
nmf_release_date = df_nmf['Release Date'].mode().iloc[0]

# Aggregate by album
album_predictions = df_nmf.groupby('Album Name').agg({
    'Artist Name(s)': 'first',
    'predicted_score': ['mean', 'min', 'max', 'std', 'count'],
    'prediction_uncertainty': 'mean',
    'Genres': 'first',
    'Record Label': 'first',
    'Release Date': 'first'
}).reset_index()

# Flatten column names
album_predictions.columns = [
    'Album Name', 'Artist', 
    'avg_score', 'min_score', 'max_score', 'score_std', 'track_count',
    'avg_uncertainty', 'Genres', 'Record Label', 'Release Date'
]

# Calculate confidence score (revised)
max_std = album_predictions['score_std'].max()
max_uncertainty = album_predictions['avg_uncertainty'].max()

album_predictions['confidence_score'] = (
    (1 - album_predictions['score_std'] / max_std) * 
    (1 - album_predictions['avg_uncertainty'] / max_uncertainty) * 
    (1 - 1/(1 + album_predictions['track_count']))
) * 100

# Clip confidence scores to avoid extremely low values
album_predictions['confidence_score'] = np.clip(
    album_predictions['confidence_score'], a_min=1, a_max=100
)

# Add weighted score for ranking
album_predictions['weighted_score'] = (
    album_predictions['avg_score'] * album_predictions['confidence_score'] / 100
)

# Add NMF release date
album_predictions['NMF_Date'] = nmf_release_date

# Sort by weighted score
album_recommendations = album_predictions.sort_values('weighted_score', ascending=False)

# Format the date for the filename
date_str = datetime.strptime(nmf_release_date, '%Y-%m-%d').strftime('%m-%d-%y')
filename = f"{date_str}_Album_Recommendations.csv"

# Create predictions directory if it doesn't exist
os.makedirs('predictions', exist_ok=True)

# Save recommendations to the root predictions folder
album_recommendations.to_csv(f'predictions/{filename}', index=False)

# Sort df_nmf by predicted_score before saving detailed predictions
df_nmf_sorted = df_nmf.sort_values('predicted_score', ascending=False)
df_nmf_sorted[['Artist Name(s)', 'Track Name', 'Album Name', 'predicted_score',
               'prediction_lower', 'prediction_upper', 'prediction_uncertainty']].to_csv(
                   f'predictions/nmf_predictions_with_uncertainty.csv', index=False)

# Print results
print(f"\nNew Music Friday Release Date: {nmf_release_date}")
print("\nTop 10 Recommended Albums:")
print(album_recommendations[['Album Name', 'Artist', 'avg_score', 'track_count', 
                           'confidence_score', 'NMF_Date']].head(10))

# Define the custom scorer for cross-validation
def custom_scorer(y_true, y_pred):
    y_true_denormalized = y_true * y_std + y_mean
    y_pred_denormalized = y_pred * y_std + y_mean
    return -mean_squared_error(y_true_denormalized, y_pred_denormalized)  # Negative MSE for scoring

# Wrap the custom scorer for use with cross_val_score
custom_scorer_func = make_scorer(custom_scorer, greater_is_better=False)

# Evaluate models using cross-validation
rf_cv_scores = cross_val_score(rf_model, X, y, cv=5, scoring=custom_scorer_func)
xgb_cv_scores = cross_val_score(xgb_model, X, y, cv=5, scoring=custom_scorer_func)

# Print cross-validation results
print("\nModel Performance Metrics:")
print(f"Random Forest CV Score: {rf_cv_scores.mean():.3f} (+/- {rf_cv_scores.std() * 2:.3f})")
print(f"XGBoost CV Score: {xgb_cv_scores.mean():.3f} (+/- {xgb_cv_scores.std() * 2:.3f})")

# Print top 20 most important features
print("\nTop 20 Most Important Features:")
print(feature_importance[['feature', 'avg_importance']].head(20).to_string(index=False))

# Visualize feature importance
plt.figure(figsize=(10, 6))
plt.barh(feature_importance['feature'], feature_importance['avg_importance'])
plt.xlabel('Average Importance')
plt.title('Top 20 Most Important Features')
plt.gca().invert_yaxis()  # Highest importance at the top
plt.show()