# Code Pudding 2024
***

The purpose of this notebook will be to analyse data retrieved from the [Spotify Web API](https://developer.spotify.com/documentation/web-api) in order to train various machine learning models to predict the genre of any given song. Once the models have been trained, validated and tested, a function will be built that feeds the data from the API to the best preforming model, and it's genre will be predicted.

## Initialization

In [1]:
import os
import random
import pandas as pd
import numpy as np
from collections import Counter
from tqdm import tqdm
import seaborn as sns

from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

from dotenv import load_dotenv
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, KFold
from sklearn.preprocessing import StandardScaler
import sklearn.metrics as metrics
from sklearn.dummy import DummyClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb
from catboost import CatBoostClassifier

## Data Gathering (Isaiah Montoya)
### Spotify API Setup and Authentication

In [2]:
# Load environment variables from the .env file
load_dotenv()

# Access the client_id and client_secret from environment variables
client_id = os.getenv("Client_ID")
client_secret = os.getenv("Client_secret")

# Authentication
client_credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager, requests_timeout=10)

In [3]:
# Increase timeout and add retry logic
session = sp._session
retry = Retry(
    total=5,  # Total number of retries
    backoff_factor=0.3,  # Wait time between retries
    status_forcelist=[500, 502, 503, 504],  # Retry on these HTTP status codes
    raise_on_status=False
)
adapter = HTTPAdapter(max_retries=retry)  # Increase timeout to 10 seconds
session.mount("https://", adapter)

# Set random seed
random.seed(42)

### Functions

In [4]:
def test_genre_string(genre, debug=False):
    """
    Input:
        genre: str, genre string to test
        debug: bool, print debug messages
    Output:
        bool: True if the genre is valid, False if not
        
    Tests if a genre string is valid by searching for tracks with that genre. If no tracks are found, the genre is invalid.
    """
    results = sp.search(q=f'genre:{genre}', type='track', limit=1)
    number_of_tracks = len(results['tracks']['items'])
    if number_of_tracks == 0:
        print(f"No tracks found for genre: {genre}")
        return False
    else:
        if debug:
            print(f"Found {number_of_tracks} tracks for genre: {genre}")
        return True
    
def get_genres_of_interest(genres_dict, genre_record_limit, pagination_limit=45, debug=False):
    """
    Fetches track IDs for specified genres from Spotify, ensuring a balanced representation of each genre.
    Args:
        genres_dict (dict): A dictionary where keys are super genres and values are lists of sub-genres.
        genre_record_limit (int): The maximum number of tracks to fetch per super genre.
        pagination_limit (int, optional): The number of tracks to fetch per API call. Defaults to 45.
        debug (bool, optional): If True, prints debug information. Defaults to False.
    Returns:
        tuple: A tuple containing two lists:
            - track_ids (list): A list of track IDs fetched from Spotify.
            - track_genre (list): A list of super genres corresponding to each track ID.
    Raises:
        AssertionError: If duplicate track IDs are found in the final list.
    """
    
    # Initialize variables
    track_ids = []
    track_genre = []
    seen_track_ids = set()

    for super_genre, sub_genres in genres_dict.items():
        print(f"Getting records for super genre: {super_genre}")
        super_genre_track_ids = []

        # Dictionary to track how many tracks we pulled per sub-genre
        sub_genre_counts = {sub_genre: 0 for sub_genre in sub_genres}

        # Loop until we hit the genre_record_limit for the super genre
        total_tracks_pulled = 0
        
        while total_tracks_pulled < genre_record_limit and sub_genres:
            # Calculate remaining tracks needed for the super genre
            tracks_needed = genre_record_limit - total_tracks_pulled

            # Shuffle sub-genres to randomize the pulls
            random.shuffle(sub_genres)

            for sub_genre in sub_genres[:]:
                
                # Adjust the batch size to ensure we don't exceed the genre_record_limit
                batch_size = min(pagination_limit, tracks_needed)

                # Fetch a batch of tracks for the sub-genre
                results = sp.search(q=f'genre:{sub_genre}', type='track', limit=batch_size, offset=sub_genre_counts[sub_genre])
                
                # If no items are returned, remove the sub-genre and move on
                if not results['tracks']['items']:
                    if debug:
                        print(f"No items for sub-genre: {sub_genre}, removing from sub-genres")
                    sub_genres.remove(sub_genre)  # Remove sub-genre if no more tracks are returned
                    continue  # Skip the rest of the code for this sub-genre
            
                # Add new track IDs that are not already seen, but ensure we don't exceed the genre_record_limit
                new_track_ids = [track['id'] for track in results['tracks']['items'] if track['id'] not in seen_track_ids]
                new_tracks_needed = genre_record_limit - total_tracks_pulled
                
                # Only add as many tracks as needed to reach the limit
                new_track_ids = new_track_ids[:new_tracks_needed]
                
                for track_id in new_track_ids:
                    # Add the new track to the super genre's collection
                    super_genre_track_ids.append(track_id)
                    track_genre.append(super_genre)  # Label the track with the super genre
                    seen_track_ids.add(track_id)
                
                # Update counts and totals
                sub_genre_counts[sub_genre] += len(new_track_ids)
                total_tracks_pulled += len(new_track_ids)

                if debug:
                    print(f"Fetched {len(new_track_ids)} new tracks for sub-genre: {sub_genre}")

                # If we've reached the limit for the super genre, stop
                if total_tracks_pulled >= genre_record_limit:
                    break
                
            # Check again if we've exhausted all sub-genres
            if not sub_genres:
                if debug:
                    print(f"All sub-genres for super genre exhausted.")
                break

        print(f"{len(super_genre_track_ids)} records found for super genre: {super_genre}\n")
        # Add the super genre track ids to the main list
        track_ids.extend(super_genre_track_ids)

    print("Total number of records:", len(track_ids), "\n")

    # Check for duplicates in the final list
    track_id_counts = Counter(track_ids)
    duplicates = {track_id: count for track_id, count in track_id_counts.items() if count > 1}

    # Assert no duplicates
    assert not duplicates, f"Duplicate track IDs found: {duplicates}"

    return track_ids, track_genre

def get_other_genres(genres_of_interest, genre_record_limit, already_seen_ids, pagination_limit=45, debug=False):
    """
    Fetches tracks from genres not included in the genres_of_interest.
    Ensures an even distribution of tracks across genres and respects the genre_record_limit.
    Args:
        genres_of_interest (dict): A dictionary where keys are super genres and values are lists of sub-genres of interest.
        genre_record_limit (int): The maximum number of tracks to fetch.
        already_seen_ids (list or set): A list or set of track IDs that have already been gathered.
        pagination_limit (int, optional): The number of tracks to fetch per API call. Defaults to 45.
        debug (bool, optional): If True, prints debug information. Defaults to False.
    Returns:
        tuple: A tuple containing:
            - other_track_ids (list): A list of track IDs from the 'other' genres.
            - genre_labels (list): A list of genre labels corresponding to the fetched track IDs.
            - genre_counts (dict): A dictionary with genres as keys and the count of fetched tracks as values.
    Raises:
        AssertionError: If duplicate track IDs are found in the final list of track IDs.
    """

    print("Getting 'other' genres.")
    already_seen_ids = set(already_seen_ids)  # Ensure it's a set for fast lookup

    # Flatten the dictionary to get all sub-genres in genres_of_interest
    sub_genres_of_interest = {sub_genre for super_genre, sub_genres in genres_of_interest.items() for sub_genre in sub_genres}

    # All genres excluding genres_of_interest
    other_genres = [genre for genre in sp.recommendation_genre_seeds()['genres'] if genre not in sub_genres_of_interest]

    # Dictionary to track how many tracks we pulled per genre
    genre_counts = {genre: 0 for genre in other_genres}

    # List to hold unique track IDs for this function
    other_track_ids = []

    # Track total number of new tracks pulled
    total_tracks_pulled = 0

    # Loop until we hit the genre_record_limit
    while total_tracks_pulled < genre_record_limit and other_genres:
        # Calculate remaining tracks needed for the overall genre
        tracks_needed = genre_record_limit - total_tracks_pulled

        # Shuffle genres to randomize the pulls
        random.shuffle(other_genres)

        for genre in other_genres[:]:
            # Adjust the batch size to ensure we don't exceed the genre_record_limit
            batch_size = min(pagination_limit, tracks_needed)

            # Fetch a batch of tracks for the genre
            results = sp.search(q=f'genre:{genre}', type='track', limit=batch_size, offset=genre_counts[genre])

            # If no items are returned, remove the genre and move on
            if not results['tracks']['items']:
                if debug:
                    print(f"No items for genre: {genre}, removing from other_genres")
                other_genres.remove(genre)
                continue

            # Add new track IDs that are not already seen, but ensure we don't exceed the genre_record_limit
            new_track_ids = [track['id'] for track in results['tracks']['items'] if track['id'] not in already_seen_ids]
            new_tracks_needed = genre_record_limit - total_tracks_pulled

            # Only add as many tracks as needed to reach the limit
            new_track_ids = new_track_ids[:new_tracks_needed]

            for track_id in new_track_ids:
                # Add the new track to the other track IDs collection
                other_track_ids.append(track_id)
                already_seen_ids.add(track_id)  # Also add to already seen IDs to avoid duplicates
                genre_counts[genre] += 1
                total_tracks_pulled += 1

            if debug:
                print(f"Fetched {len(new_track_ids)} new tracks for genre: {genre}")

            # If we've reached the genre record limit, stop
            if total_tracks_pulled >= genre_record_limit:
                break

    print(f"Total number of new records: {len(other_track_ids)} new tracks")

    if debug:
        # Print genre counts only for genres with tracks > 0
        print(f"\nGenre Counts:")
        for genre, count in genre_counts.items():
            if count > 0:
                print(f"{genre}: {count}")

    # Create genre labels for the new tracks
    genre_labels = ["other"] * len(other_track_ids)

    # Check for duplicates in the final list of track IDs
    track_id_counts = Counter(other_track_ids)
    duplicates = {track_id: count for track_id, count in track_id_counts.items() if count > 1}

    # Assert no duplicates
    assert not duplicates, f"Duplicate track IDs found: {duplicates}"

    return other_track_ids, genre_labels, genre_counts

def amend_sub_genres(sub_genres):
    """
    Input: A list of sub-genres
    Output: The same list with any sub-genres removed that do not return results from the Spotify API
    """
    sub_genres = sub_genres.copy()
    # Modify the sub_genres list in place
    before = len(sub_genres)
    print(f"Number of sub genres before check: {before}")
    
    # Create a copy of the list to avoid modifying it while iterating
    for genre in sub_genres[:]:
        if not test_genre_string(genre):
            sub_genres.remove(genre)
            print(f"Removed {genre} from sub_genres.")
    
    print(f"Number of sub genres after check: {len(sub_genres)}, {before - len(sub_genres)} removed.")
    
    return sub_genres

In [5]:
def get_artist_and_album(track_ids, limit=50):
    """
    Input: A list of track IDs
    Output: A DataFrame with the artist and album for each track ID
    """
    
    # Initialize lists to hold artist and album data
    artists = []
    albums = []
    
    # Break the track IDs into chunks of limit to avoid hitting the API rate limit
    chunks = [track_ids[i:i + limit] for i in range(0, len(track_ids), limit)]
    
    with tqdm(total=len(chunks), desc="Fetching artist and album data") as pbar:
        for chunk in chunks:
            # Fetch the track data
            tracks = sp.tracks(chunk)['tracks']
            for track in tracks:
                artists.append(track['artists'][0]['name'])
                albums.append(track['album']['name'])
            pbar.update(1)
            
    # Create a DataFrame from the lists
    df = pd.DataFrame({'track_id': track_ids, 'artist': artists, 'album': albums})
    
    # Assert that the DataFrame has the correct number of rows
    assert len(df) == len(track_ids), "Number of rows in DataFrame does not match number of track IDs"
    
    return df

### Define Genres of Interest

In [6]:
# Here you can add any string to any list in the dictionary. 
genres_of_interest = {
    'rock': [
            'rock',
            'alt-rock',
            'hard-rock',
            'j-rock',
            'psych-rock',
            'punk-rock',
            'rock-n-roll',
            'rockabilly',
            'grunge',
            'punk'
            ],
    'pop': [
            "pop",
            "Dance Pop",
            "Electropop",
            "Indie Pop",
            "Synth-pop",
            "Pop Rock",
            "Teen Pop",
            "Power Pop",
            "Art Pop",
            "Pop Punk",
            "K-Pop",
            "J-Pop",
            "Latin Pop",
            "Dream Pop",
            "Bubblegum Pop",
            "Euro Pop",
            "Pop Rap",
            "Chamber Pop",
            "Baroque Pop",
            "Pop Soul",
            "Acoustic Pop",
            "j-pop",
            "k-pop",
            ],
    'rap/hip-hop': [
                "Hip Hop",
                "Hip-Hop",
                "Rap",
                "Trap",
                "Gangsta Rap",
                "East Coast Hip Hop",
                "West Coast Hip Hop",
                "Conscious Hip Hop",
                "Alternative Hip Hop",
                "Boom Bap",
                "Dirty South",
                "Crunk",
                "Drill",
                "Grime",
                "Cloud Rap",
                "Underground Hip Hop",
                "Emo Rap",
                "Hardcore Hip Hop",
                "Lofi Hip Hop",
                "Old School Hip Hop",
                "Christian Hip Hop",
                "Latin Hip Hop"
                ],
    'classical': [
                "Classical",
                "Baroque",
                "Romantic",
                "Classical",
                "Chamber Music",
                "Symphony",
                "Opera",
                "Choral",
                "Contemporary Classical",
                "Minimalism",
                "Orchestral",
                "Piano",
                "String Quartet",
                "Early Music",
                "Renaissance",
                "Modern Classical",
                "Neoclassical",
                "Impressionism",
                "Avant-Garde",
                "Sacred Classical",
                "Cantata",
                "Piano"
                ],
    'jazz': [
                "Jazz",
                "Bebop",
                "Swing",
                "Smooth Jazz",
                "Cool Jazz",
                "Hard Bop",
                "Free Jazz",
                "Fusion",
                "Modal Jazz",
                "Latin Jazz",
                "Avant-Garde Jazz",
                "Gypsy Jazz",
                "Vocal Jazz",
                "Jazz Funk",
                "Jazz Blues",
                "Soul Jazz",
                "Post-Bop",
                "Ragtime",
                "Big Band",
                "Dixieland",
                "Nu Jazz",
                "Jazz Fusion",
                ]
}

# This validates each string in the lists per super genre. If the string is not a recognized genre, it gets removed from the super genre list - preventing unnecessary API calls.

for super_genre in genres_of_interest:
    print(f"\nChecking sub-genres for {super_genre}")
    genres_of_interest[super_genre] = amend_sub_genres(genres_of_interest[super_genre])


Checking sub-genres for rock
Number of sub genres before check: 10
Number of sub genres after check: 10, 0 removed.

Checking sub-genres for pop
Number of sub genres before check: 23
Number of sub genres after check: 23, 0 removed.

Checking sub-genres for rap/hip-hop
Number of sub genres before check: 22
No tracks found for genre: Lofi Hip Hop
Removed Lofi Hip Hop from sub_genres.
Number of sub genres after check: 21, 1 removed.

Checking sub-genres for classical
Number of sub genres before check: 22
No tracks found for genre: Sacred Classical
Removed Sacred Classical from sub_genres.
No tracks found for genre: Cantata
Removed Cantata from sub_genres.
Number of sub genres after check: 20, 2 removed.

Checking sub-genres for jazz
Number of sub genres before check: 22
No tracks found for genre: Modal Jazz
Removed Modal Jazz from sub_genres.
Number of sub genres after check: 21, 1 removed.


### Data Retrieval
You can adjust the genre record limit, each super genre gets a maximum of genre_record_limit records.  
Pagination is passed to the api as the limit parameter. Documentation says the max should be 50, but 45 seems to work best.

In [7]:
genre_record_limit = 1500
pagination_limit = 45

In [8]:
track_ids, track_genre = get_genres_of_interest(genres_of_interest, genre_record_limit, pagination_limit)

Getting records for super genre: rock
1500 records found for super genre: rock

Getting records for super genre: pop
1500 records found for super genre: pop

Getting records for super genre: rap/hip-hop
1500 records found for super genre: rap/hip-hop

Getting records for super genre: classical
1500 records found for super genre: classical

Getting records for super genre: jazz
1500 records found for super genre: jazz

Total number of records: 7500 



In [9]:
# Assert length of track_ids is equal to genre_record_limit * number of super genres
assert len(track_ids) == genre_record_limit * len(genres_of_interest), f"Expected {genre_record_limit * len(genres_of_interest)} tracks, but got {len(track_ids)}"

# Assert no duplicates
assert len(track_ids) == len(set(track_ids)), "Duplicate tracks found"

In [10]:
# Get other genres
other_track_ids, other_genre_labels, other_genre_counts = get_other_genres(genres_of_interest, genre_record_limit, track_ids)

# # This is helpful to see how many tracks were fetched for each super genre
# for genre, count in other_genre_counts.items():
#     if count > 0:
#         print(f"{genre}: {count}")

Getting 'other' genres.
Total number of new records: 1500 new tracks


In [11]:
# Combine the two lists
all_track_ids = track_ids + other_track_ids
all_track_genre = track_genre + other_genre_labels

# Create a DataFrame
track_genres_df = pd.DataFrame({"track_id": all_track_ids, "genre": all_track_genre})

# Assert no duplicates
assert track_genres_df['track_id'].nunique() == len(track_genres_df), "Duplicate track IDs found in the final DataFrame"

In [12]:
# Get audio features for each track with a progress bar
track_features = []
for i in tqdm(range(0, len(track_genres_df), pagination_limit), desc="Fetching audio features"):
    features = sp.audio_features(all_track_ids[i:i+pagination_limit])
    
    # Raise error if no features are returned
    if not features:
        raise ValueError(f"No audio features returned for tracks: {all_track_ids[i:i+pagination_limit]}")
    
    for feature in features:
        # Raise error if no features are returned for individual tracks
        if not feature:
            raise ValueError(f"No audio features returned for track: {all_track_ids[i:i+pagination_limit]}")
        track_features.append(feature)

Fetching audio features: 100%|██████████| 200/200 [00:29<00:00,  6.76it/s]


In [13]:
# To DataFrame
track_features_df = pd.DataFrame(track_features)
# Rename id to track_id
track_features_df.rename(columns={'id': 'track_id'}, inplace=True)

# Assert no duplicates
assert track_features_df['track_id'].nunique() == len(track_features_df), "Duplicate track IDs found in the final DataFrame"

# Assert same length as track_genres_df
assert len(track_features_df) == len(track_genres_df), "Length of track_features_df and track_genres_df do not match"

In [14]:
# Merge the two DataFrames
all_data = pd.merge(track_genres_df, track_features_df, on='track_id')

# Get artist and album data
artist_album_df = get_artist_and_album(all_data['track_id'].tolist())


Fetching artist and album data: 100%|██████████| 180/180 [01:08<00:00,  2.64it/s]


In [15]:
# Merge artist and album data
all_data = pd.merge(all_data, artist_album_df, on='track_id')

# Save the data
all_data.to_csv('spotify_data.csv', index=False)

## (End of Isaiah's work)
# EDA

In [16]:
data = pd.read_csv('spotify_data.csv')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9000 entries, 0 to 8999
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   track_id          9000 non-null   object 
 1   genre             9000 non-null   object 
 2   danceability      9000 non-null   float64
 3   energy            9000 non-null   float64
 4   key               9000 non-null   int64  
 5   loudness          9000 non-null   float64
 6   mode              9000 non-null   int64  
 7   speechiness       9000 non-null   float64
 8   acousticness      9000 non-null   float64
 9   instrumentalness  9000 non-null   float64
 10  liveness          9000 non-null   float64
 11  valence           9000 non-null   float64
 12  tempo             9000 non-null   float64
 13  type              9000 non-null   object 
 14  uri               9000 non-null   object 
 15  track_href        9000 non-null   object 
 16  analysis_url      9000 non-null   object 


In [17]:
data = data.drop(['type', 'uri', 'track_href', 'analysis_url'], axis=1)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9000 entries, 0 to 8999
Data columns (total 17 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   track_id          9000 non-null   object 
 1   genre             9000 non-null   object 
 2   danceability      9000 non-null   float64
 3   energy            9000 non-null   float64
 4   key               9000 non-null   int64  
 5   loudness          9000 non-null   float64
 6   mode              9000 non-null   int64  
 7   speechiness       9000 non-null   float64
 8   acousticness      9000 non-null   float64
 9   instrumentalness  9000 non-null   float64
 10  liveness          9000 non-null   float64
 11  valence           9000 non-null   float64
 12  tempo             9000 non-null   float64
 13  duration_ms       9000 non-null   int64  
 14  time_signature    9000 non-null   int64  
 15  artist            9000 non-null   object 
 16  album             8999 non-null   object 


In [18]:
data.head()

Unnamed: 0,track_id,genre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,artist,album
0,1hQFF33xi8ruavZNyovtUN,rock,0.737,0.495,1,-13.489,1,0.027,0.0588,9e-06,0.0542,0.967,104.409,197147,4,Frankie Valli & The Four Seasons,Who Loves You
1,44AyOl4qVkzS48vBsbNXaC,rock,0.396,0.293,2,-14.062,1,0.0275,0.941,0.000196,0.105,0.343,100.307,182360,3,Elvis Presley,Blue Hawaii
2,2QfiRTz5Yc8DdShCxG1tB2,rock,0.534,0.803,10,-9.129,1,0.0743,0.741,6.1e-05,0.307,0.969,167.983,161560,4,Chuck Berry,Berry Is On Top
3,7zMUCLm1TN9o9JlLISztxO,rock,0.66,0.748,2,-11.206,1,0.0284,0.0993,0.00585,0.283,0.972,143.549,170293,4,Elvis Presley,Elvis (Fool)
4,48i055G1OT5KxGGftwFxWy,rock,0.619,0.603,9,-9.481,1,0.0342,0.712,0.0,0.0721,0.958,127.433,178933,4,Roy Orbison,"Oh, Pretty Woman"


In [19]:
print(data.duplicated().sum())
print(data['track_id'].duplicated().sum())

0
0


In [20]:
data['genre'].value_counts()

genre
rock           1500
pop            1500
rap/hip-hop    1500
classical      1500
jazz           1500
other          1500
Name: count, dtype: int64

In [21]:
data.isna().sum()

track_id            0
genre               0
danceability        0
energy              0
key                 0
loudness            0
mode                0
speechiness         0
acousticness        0
instrumentalness    0
liveness            0
valence             0
tempo               0
duration_ms         0
time_signature      0
artist              0
album               1
dtype: int64

Initial review shows our data has object and int/float columns. Object columns, aside from the genre, all can be dropped since they dont contribute to how genres are assigned. Int and float columns are labeled properly and can be used for training the model. There are no missing data and no duplicate rows.

In [None]:
# Initialize list of attributes
attributes = data.columns.drop('genre').tolist()

# Violinplot of attributes per genre
fig, axs = plt.subplots(2, 7, figsize=(20,10))
axs = axs.flatten()

for i, attribute in enumerate(attributes):
    sns.violinplot(data, x=attribute, y='genre', hue='genre', ax=axs[i], palette=['#006450', '#477D95', '#90EDDA', '#7D4B32', '#8D67AB', '#777777'])
    # Remove labels for figures not in first column
    if i % 7 != 0:
        axs[i].set_ylabel('')
        axs[i].set_yticks([])

fig.delaxes(axs[-1])
plt.suptitle('Distribution of Attributes per Genre', fontsize=30)

Visually, we can see which attributes have a tendency to distiguish between genres (e.g. low energy score for classical music, higher speechiness for rap/hiphop), but we also want to verify if they are statistically different from each other per attribute so we can statistically determine whether to keep an attribute for the model training or not. If we were to just rely on visuals, we may drop key and mode due to indistinguishable differences, but let's use ANOVA to prove or disprove otherwise.

We can also generate the grouped means of each attribute per genre to get a more numerical sense.

In [None]:
# Get grouped attribute means by genre
grouped_means = data.groupby('genre').mean()
grouped_means

In [None]:
# Initialize list for values
attribute_stats = []

# Perform ANOVA
for attribute in attributes:
    anova_results = stats.f_oneway(
        data[data['genre'] == 'classical'][attribute],
        data[data['genre'] == 'jazz'][attribute],
        data[data['genre'] == 'pop'][attribute],
        data[data['genre'] == 'rock'][attribute],
        data[data['genre'] == 'rap/hip-hop'][attribute],
        data[data['genre'] == 'other'][attribute],
    )
    
    alpha = 0.05
    if anova_results.pvalue < alpha:
        attribute_stats.append([attribute, anova_results.statistic, anova_results.pvalue, 'Yes'])
    else:
        attribute_stats.append([attribute, anova_results.statistic, anova_results.pvalue, 'No'])

In [None]:
attribute_summary = pd.DataFrame(attribute_stats, columns=['Attribute', 'F-statistic', 'p-value', 'Significantly Different?'])
attribute_summary = attribute_summary.set_index('Attribute')
attribute_summary = attribute_summary.sort_values(by='F-statistic', ascending=False)
attribute_summary

In [None]:
sns.barplot(attribute_summary, y='F-statistic', x='Attribute')
plt.xticks(rotation=60)
plt.tight_layout()

Looking at the differences statistically, we can see based on the ANOVA test that there are some attributes that have more differences between each genre than the other, confirming what we were able to see visually in the violinplots. Additionally, the results of the key and mode attributes show that they are still significantly different per genre, although not as pronounced as the other attributes, but can still be used to distinguish between genres. All the attributes got a p-value less than 0.05, so we reject the null hypothesis that attributes per genre are not significantly different.

In [None]:
corr_matrix = data.iloc[:,1:].corr()

plt.figure(figsize=(10,8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, linewidths=0.5, fmt=".2f")

To avoid multicollinearity, we may need to omit some features/attributes that have a high correlation coefficient. Coupling this information with the F-statistic results, it's best to keep the attribute with the highest F-statistic since it has a better ratio of variance between the grouped means and variance within the group. This gives the model a better gauge of distinction between the classes and should improve our results.

# Model Training

Before spliting the data into training and test datasets, the data is scaled between -1 and 1 to provide optimal training conditions for the models.

In [None]:
scaler = StandardScaler()
scaled_data = data.drop(['genre', 'track_id'], axis=1)
scaled_data = pd.DataFrame(scaler.fit_transform(scaled_data), index=scaled_data.index, columns=scaled_data.columns)
data[scaled_data.columns] = scaled_data[scaled_data.columns]
data.head()

In [4]:
train, test = train_test_split(data, test_size=0.1, random_state=42)
print(train.shape)
print(test.shape)

X_train = train.drop(['genre'], axis=1)
y_train = train['genre']
X_test = test.drop(['genre'], axis=1)
y_test = test['genre']

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(8100, 14)
(900, 14)
(8100, 13) (8100,)
(900, 13) (900,)


A single KFold instance will be used for training all the models.

In [5]:
# Initializing KFold Instance
cross_validator = KFold(n_splits=3, shuffle=True, random_state=42)

### Dummy Model- Sanity Check

In [None]:
# Creating Dummy model param grid
dummy_params = {
    'strategy':['most_frequent', 'prior', 'stratified', 'uniform', 'constant'],
    'constant':['rock']
}

# Initializing DummyClassifier and its GridSearchCV
dummy_model = DummyClassifier(random_state=42)
dummy_grid = GridSearchCV(dummy_model, dummy_params, scoring='f1_weighted', cv=cross_validator)

# Training Grid
dummy_grid.fit(X_train, y_train)
dummy_roc_auc = cross_val_score(dummy_grid.best_estimator_, X_train, y_train, scoring='roc_auc_ovo_weighted', cv=cross_validator).mean()

# Plotting Confusion Matrix
dummy_pred = dummy_grid.best_estimator_.predict(X_train)
dummy_cm = metrics.confusion_matrix(y_train, dummy_pred)
plt.figure(figsize=(7,7))
sns.heatmap(dummy_cm, annot=True, fmt='d', cmap='YlGnBu', xticklabels=y_train.value_counts().index, yticklabels=y_train.value_counts().index)
plt.ylabel('Prediction',fontsize=12)
plt.xlabel('Actual',fontsize=12)
plt.title('Dummy Model Confusion Matrix',fontsize=16)
plt.show()

print(dummy_grid.best_estimator_)
print(f'F1: {dummy_grid.best_score_}')
print('ROC_AUC of Dummy model: ', dummy_roc_auc)

### Decision Tree Model

In [None]:
# Creating DecisionTree model param grid
tree_params = {
    'max_depth':np.arange(3, 11, 1),
    'min_samples_split':[2,4,6]
}

# Initializing DecisionTree and its GridSearchCV
tree_model = DecisionTreeClassifier(random_state=42)
tree_grid = GridSearchCV(tree_model, tree_params, scoring='f1_weighted', cv=cross_validator)

# Training Grid
tree_grid.fit(X_train, y_train)
tree_roc_auc = cross_val_score(tree_grid.best_estimator_, X_train, y_train, scoring='roc_auc_ovo_weighted', cv=cross_validator).mean()

# Plotting Confusion Matrix
tree_pred = tree_grid.best_estimator_.predict(X_train)
tree_cm = metrics.confusion_matrix(y_train, tree_pred)
plt.figure(figsize=(7,7))
sns.heatmap(tree_cm, annot=True, fmt='d', cmap='YlGnBu', xticklabels=y_train.value_counts().index, yticklabels=y_train.value_counts().index)
plt.ylabel('Prediction',fontsize=12)
plt.xlabel('Actual',fontsize=12)
plt.title('Decision Tree Model Confusion Matrix',fontsize=16)
plt.show()

print(tree_grid.best_estimator_)
print(f'F1: {tree_grid.best_score_}')
print('ROC_AUC of Tree model: ', tree_roc_auc)

### Light GBM Model

In [None]:
# Creating LightGBM model param grid
lightgbm_params = {
    'num_leaves':[31, 100, 200],
    'learning_rate':[0.01]
}

# Initializing LightGBM and its GridSearchCV
lightgbm_model = lgb.LGBMClassifier(random_state=42, verbosity=-1)
lightgbm_grid = GridSearchCV(lightgbm_model, lightgbm_params, scoring='f1_weighted', cv=cross_validator)

# Training Grid
lightgbm_grid.fit(X_train, y_train)
lightgbm_roc_auc = cross_val_score(lightgbm_grid.best_estimator_, X_train, y_train, scoring='roc_auc_ovo_weighted', cv=cross_validator).mean()

# Plotting Confusion Matrix
lightgbm_pred = lightgbm_grid.best_estimator_.predict(X_train)
lightgbm_cm = metrics.confusion_matrix(y_train, lightgbm_pred)
plt.figure(figsize=(7,7))
sns.heatmap(lightgbm_cm, annot=True, fmt='d', cmap='YlGnBu', xticklabels=y_train.value_counts().index, yticklabels=y_train.value_counts().index)
plt.ylabel('Prediction',fontsize=12)
plt.xlabel('Actual',fontsize=12)
plt.title('Light GBM Model Confusion Matrix',fontsize=16)
plt.show()

print(lightgbm_grid.best_estimator_)
print(f'F1: {lightgbm_grid.best_score_}')
print('ROC_AUC of LightGBM model: ', lightgbm_roc_auc)

### CatBoost model

In [None]:
# Creating CatBoost model param grid
catboost_params = {
    'iterations':[1001, 2001],
    'learning_rate':[0.01]
}

# Initializing CatBoost and its GridSearchCV
catboost_model = CatBoostClassifier(random_seed=42, verbose=1000)
catboost_grid = GridSearchCV(catboost_model, catboost_params, scoring='f1_weighted', cv=cross_validator)

# Training Grid
catboost_grid.fit(X_train, y_train)
catboost_roc_auc = cross_val_score(catboost_grid.best_estimator_, X_train, y_train, scoring='roc_auc_ovo_weighted', cv=cross_validator).mean()

# Plotting Confusion Matrix
catboost_pred = catboost_grid.best_estimator_.predict(X_train)
catboost_cm = metrics.confusion_matrix(y_train, catboost_pred)
plt.figure(figsize=(7,7))
sns.heatmap(catboost_cm, annot=True, fmt='d', cmap='YlGnBu', xticklabels=y_train.value_counts().index, yticklabels=y_train.value_counts().index)
plt.ylabel('Prediction',fontsize=12)
plt.xlabel('Actual',fontsize=12)
plt.title('CatBoost Model Confusion Matrix',fontsize=16)
plt.show()

print(catboost_grid.best_estimator_)
print( f'F1: {catboost_grid.best_score_}')
print('ROC_AUC of CatBoost model: ', catboost_roc_auc)

### Random Forest Model

In [None]:
# Creating RandomForest model param grid
forest_params = {
    'n_estimators':[1000, 1500],
    'max_depth':np.arange(18, 27, 2),
}

# Initializing RandomForest and its GridSearchCV
forest_model = RandomForestClassifier(random_state=42)
forest_grid = GridSearchCV(forest_model, forest_params, scoring='f1_weighted', cv=cross_validator)

# Training Grid
forest_grid.fit(X_train, y_train)
forest_roc_auc = cross_val_score(forest_grid.best_estimator_, X_train, y_train, scoring='roc_auc_ovo_weighted', cv=cross_validator).mean()

# Plotting Confusion Matrix
forest_pred = forest_grid.best_estimator_.predict(X_train)
forest_cm = metrics.confusion_matrix(y_train, forest_pred)
plt.figure(figsize=(7,7))
sns.heatmap(forest_cm, annot=True, fmt='d', cmap='YlGnBu', xticklabels=y_train.value_counts().index, yticklabels=y_train.value_counts().index)
plt.ylabel('Prediction',fontsize=12)
plt.xlabel('Actual',fontsize=12)
plt.title('Random Forest Model Confusion Matrix',fontsize=16)
plt.show()

print(forest_grid.best_estimator_)
print(f'F1: {forest_grid.best_score_}')
print('ROC_AUC of Random Forest model: ', forest_roc_auc)

# Test

In [None]:
test_model = forest_grid.best_estimator_

# Predicting test dataset
test_predict = test_model.predict(X_test)
test_proba = test_model.predict_proba(X_test)
test_roc_auc = metrics.roc_auc_score(y_test, test_proba, average='weighted', multi_class='ovo')
test_f1 = metrics.f1_score(y_test, test_predict, average='weighted')
test_cm = metrics.confusion_matrix(y_test, test_predict)

plt.figure(figsize=(7,7))
sns.heatmap(test_cm, annot=True,fmt='d', cmap='YlGnBu', xticklabels=y_train.value_counts().index, yticklabels=y_train.value_counts().index)
plt.ylabel('Prediction',fontsize=12)
plt.xlabel('Actual',fontsize=12)
plt.title('Test Model Confusion Matrix',fontsize=16)
plt.show()

print(test_model)
print('Test ROC_AUC:', test_roc_auc)
print('Test F1 : ', test_f1)

## Song Prediction

In [None]:
def get_track_features(song_title, artist_name):
    # Search for the song using Spotipy's search function
    result = sp.search(q=f"track:{song_title} artist:{artist_name}", type='track', limit=1)
    
    if result['tracks']['items']:
        # Extract the track ID from the search result
        track = result['tracks']['items'][0]
        track_id = track['id']
        track_name = track['name']
        artist_name = track['artists'][0]['name']
        
        print(f"Found track: {track_name} by {artist_name}")
        
        # Use the track ID to get the song's features
        features = sp.audio_features(track_id)
        return features[0]  # Return the features dictionary
    else:
        print(f"No results found for {song_title} by {artist_name}")
        return None

# Example usage
song_title = "Spybreak-Short One"
artist_name = "Propellerheads"
features = get_track_features(song_title, artist_name)

if features:
    print("Audio Features:")
    print(features)

# Removing columns & scaling data
song_data = pd.DataFrame(features, index=[0], columns=features.keys()).drop(['type', 'uri', 'track_href', 'analysis_url', 'id'], axis=1)
song_data[scaled_data.columns] = scaler.transform(song_data[scaled_data.columns])
search_genre = test_model.predict(song_data)
print(f'Genre of chosen track: {search_genre}')