# Supervised Learning Music Genre Classification

## Import Statements

Start by importing necessary libraries.

In [12]:
import os
import librosa
import pandas as pd
from pydub import AudioSegment
import tempfile
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

## Helper Functions

Convert to MP3 function.

In [13]:
def convert_mp3_to_wav(mp3_file):
    try:
        sound = AudioSegment.from_mp3(mp3_file)
        wav_file = tempfile.mktemp(suffix='.wav')
        sound.export(wav_file, format="wav")
        return wav_file
    except Exception as e:
        print(f"Error converting {mp3_file} to WAV: {e}")
        return None

Feature extraction from a segment. 

In [14]:
# Function to extract features from an audio segment
def extract_features_from_segment(y, sr, start_time, end_time):
    segment = y[start_time:end_time]

    # Compute chroma feature from the waveform and sample rate
    chroma_stft = librosa.feature.chroma_stft(y=segment, sr=sr)
    # Compute Root Mean Square (RMS) energy for each frame
    rms = librosa.feature.rms(y=segment)
    # Compute spectral centroid, which indicates where the center of mass for a sound is located
    spectral_centroid = librosa.feature.spectral_centroid(y=segment, sr=sr)
    # Compute spectral bandwidth, which is a measure of the width of the band of frequencies
    spectral_bandwidth = librosa.feature.spectral_bandwidth(y=segment, sr=sr)
    # Compute spectral rolloff point, which is the frequency below which a specified percentage of the total spectral energy lies
    rolloff = librosa.feature.spectral_rolloff(y=segment, sr=sr)
    # Compute zero crossing rate, which is the rate at which the signal changes sign
    zero_crossing_rate = librosa.feature.zero_crossing_rate(y=segment)
    # Compute harmony and perceptr (percussive) components of the audio
    harmony, perceptr = librosa.effects.hpss(segment)
    # Compute tempo (beats per minute)
    tempo, _ = librosa.beat.beat_track(y=segment, sr=sr)
    # Compute Mel-frequency cepstral coefficients (MFCCs)
    mfcc = librosa.feature.mfcc(y=segment, sr=sr, n_mfcc=20)

    # Aggregate the features into a dictionary
    features = {
        'chroma_stft_mean': chroma_stft.mean() if chroma_stft.size else 0,
        'chroma_stft_var': chroma_stft.var() if chroma_stft.size else 0,
        'rms_mean': rms.mean() if rms.size else 0,
        'rms_var': rms.var() if rms.size else 0,
        'spectral_centroid_mean': spectral_centroid.mean() if spectral_centroid.size else 0,
        'spectral_centroid_var': spectral_centroid.var() if spectral_centroid.size else 0,
        'spectral_bandwidth_mean': spectral_bandwidth.mean() if spectral_bandwidth.size else 0,
        'spectral_bandwidth_var': spectral_bandwidth.var() if spectral_bandwidth.size else 0,
        'rolloff_mean': rolloff.mean() if rolloff.size else 0,
        'rolloff_var': rolloff.var() if rolloff.size else 0,
        'zero_crossing_rate_mean': zero_crossing_rate.mean() if zero_crossing_rate.size else 0,
        'zero_crossing_rate_var': zero_crossing_rate.var() if zero_crossing_rate.size else 0,
        'harmony_mean': harmony.mean() if harmony.size else 0,
        'harmony_var': harmony.var() if harmony.size else 0,
        'perceptr_mean': perceptr.mean() if perceptr.size else 0,
        'perceptr_var': perceptr.var() if perceptr.size else 0,
        'tempo': tempo,
    }

    for i in range(1, 21):
        features[f'mfcc{i}_mean'] = mfcc[i-1].mean() if mfcc.shape[0] >= i else 0
        features[f'mfcc{i}_var'] = mfcc[i-1].var() if mfcc.shape[0] >= i else 0

    return features


Segment data and call feature extraction.

In [15]:
# Function to extract features from an audio file
def extract_features(audio_file, segment_duration=3):
    try:
        # Load the audio file
        y, sr = librosa.load(audio_file, sr=None)
        total_duration = len(y) / sr
        segment_length = int(sr * segment_duration)

        features_list = []

        for start in range(0, len(y), segment_length):
            end = start + segment_length
            if end <= len(y):
                segment_features = extract_features_from_segment(y, sr, start, end)
                segment_features['filename'] = os.path.basename(audio_file)
                segment_features['start'] = start / sr
                segment_features['end'] = end / sr
                features_list.append(segment_features)

        return features_list

    except Exception as e:
        print(f"Error extracting features from {audio_file}: {e}")
        return []

## Define Data Paths

Specify the paths to the CSV files containing data.

In [16]:
# List of genres
genres = ["blues", "classical", "country", "disco", "hiphop", "jazz", "metal", "pop", "reggae", "rock"]

# Base folder containing genre subfolders
base_folder_path = '/Users/isaiah/Desktop/Career/Projects/music-genre-detector/GTZan/genres_original'

## Process Input Audio Files

Process input files and label the data. 

In [17]:
# Main function to process all audio files in a folder
def process_audio_folder(folder_path, genre_label):
    results = []
    for filename in os.listdir(folder_path):
        if filename.endswith('.wav') or filename.endswith('.mp3'):
            file_path = os.path.join(folder_path, filename)
            if filename.endswith('.mp3'):
                try:
                    file_path = convert_mp3_to_wav(file_path)
                except Exception as e:
                    print(f"Error converting {file_path} to WAV: {e}")
                    continue
            features_list = extract_features(file_path)
            for features in features_list:
                features['genre'] = genre_label
                results.append(features)
    return results

## Create Singular CSV

Aggregate results of the feature extraction into a singular CSV.

In [18]:
# Collect results from all genres
all_results = []

# Process each genre
for genre in genres:
    print(f"Processing genre: {genre}")
    folder_path = os.path.join(base_folder_path, genre)
    genre_results = process_audio_folder(folder_path, genre)
    all_results.extend(genre_results)
    print(f"Completed processing genre: {genre}")

# Write all results to a single CSV file
print("Writing results to CSV file...")
df = pd.DataFrame(all_results)
csv_file_path = '/Users/isaiah/Desktop/Career/Projects/music-genre-detector/all_genres_audio_features.csv'
df.to_csv(csv_file_path, index=False)
print("CSV file generation completed.")

# I (simon) mainly added this for testing just so I could see when everything is done
print("Feature extraction and CSV generation completed for all genres.")

df.head()

Processing genre: blues
Completed processing genre: blues
Processing genre: classical
Completed processing genre: classical
Processing genre: country
Completed processing genre: country
Processing genre: disco
Completed processing genre: disco
Processing genre: hiphop
Completed processing genre: hiphop
Processing genre: jazz


  y, sr = librosa.load(audio_file, sr=None)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Error extracting features from /Users/isaiah/Desktop/Career/Projects/music-genre-detector/GTZan/genres_original/jazz/jazz.00054.wav: 
Completed processing genre: jazz
Processing genre: metal
Completed processing genre: metal
Processing genre: pop
Completed processing genre: pop
Processing genre: reggae
Completed processing genre: reggae
Processing genre: rock
Completed processing genre: rock
Writing results to CSV file...
CSV file generation completed.
Feature extraction and CSV generation completed for all genres.


Unnamed: 0,chroma_stft_mean,chroma_stft_var,rms_mean,rms_var,spectral_centroid_mean,spectral_centroid_var,spectral_bandwidth_mean,spectral_bandwidth_var,rolloff_mean,rolloff_var,...,mfcc18_mean,mfcc18_var,mfcc19_mean,mfcc19_var,mfcc20_mean,mfcc20_var,filename,start,end,genre
0,0.399003,0.09466,0.056844,0.00188,640.531249,160926.527211,1022.053919,104654.734035,1190.123573,726941.687695,...,0.315695,44.211178,7.07478,56.459846,2.348529,45.106159,blues.00093.wav,0.0,3.0,blues
1,0.37666,0.088501,0.080834,0.002519,607.890994,240755.406608,934.617893,137064.948571,1069.703275,951525.794395,...,-4.404086,38.417225,2.518581,29.093346,3.841909,31.415276,blues.00093.wav,3.0,6.0,blues
2,0.426684,0.09577,0.044714,0.000568,534.128623,15999.601033,994.35444,41982.502498,812.133038,87253.547063,...,4.660423,30.106455,5.132306,35.961784,1.360925,18.663179,blues.00093.wav,6.0,9.0,blues
3,0.394977,0.102008,0.06887,0.001562,555.363338,194792.027978,1008.076563,127401.321491,870.769606,871780.442348,...,0.496577,22.756107,0.56417,26.510843,-4.204146,16.829517,blues.00093.wav,9.0,12.0,blues
4,0.43266,0.093615,0.047667,0.000905,543.095038,30510.827127,1029.849967,73572.166629,817.930439,152939.004439,...,0.099773,23.850695,2.277985,56.035744,-4.323173,74.503784,blues.00093.wav,12.0,15.0,blues


## Examine Data

Gain insights into data. 

In [19]:
# Get summary statistics
df.describe()


Unnamed: 0,chroma_stft_mean,chroma_stft_var,rms_mean,rms_var,spectral_centroid_mean,spectral_centroid_var,spectral_bandwidth_mean,spectral_bandwidth_var,rolloff_mean,rolloff_var,...,mfcc17_mean,mfcc17_var,mfcc18_mean,mfcc18_var,mfcc19_mean,mfcc19_var,mfcc20_mean,mfcc20_var,start,end
count,9981.0,9981.0,9981.0,9981.0,9981.0,9981.0,9981.0,9981.0,9981.0,9981.0,...,9981.0,9981.0,9981.0,9981.0,9981.0,9981.0,9981.0,9981.0,9981.0,9981.0
mean,0.379964,0.084882,0.130039,0.002672434,2201.910957,415925.5,2244.56246,118312.1,4571.568401,1623468.0,...,-4.193187,51.838394,0.724376,52.343689,-2.497094,54.811691,-0.929246,57.142101,13.487827,16.487827
std,0.090624,0.009675,0.068168,0.003561535,750.540439,433967.5,541.420376,100250.1,1639.481644,1482634.0,...,5.668772,36.301769,5.175787,38.067753,5.107198,41.505917,5.247203,46.342815,8.611614,8.611614
min,0.108073,0.015217,0.000947,4.055916e-08,479.905803,2161.498,499.577101,1295.35,673.906438,1130.834,...,-27.93222,1.531856,-20.749746,3.445751,-27.359076,3.147764,-35.614895,0.253587,0.0,3.0
25%,0.316037,0.07982,0.083223,0.000628582,1634.097151,122833.6,1890.204723,49414.1,3389.905912,556238.5,...,-7.948162,29.821222,-2.524088,29.405123,-5.734853,30.384863,-4.01272,29.925747,6.0,9.0
50%,0.385163,0.085137,0.120488,0.001500287,2211.777107,264393.1,2233.071917,90371.22,4634.773513,1155826.0,...,-4.444726,42.235081,0.730936,41.686157,-2.700388,43.264107,-1.045194,44.173588,12.0,15.0
75%,0.442906,0.091154,0.175334,0.00311376,2713.457812,561220.0,2590.295338,157892.9,5597.307692,2251497.0,...,-0.731065,61.467625,3.871771,61.85434,0.521315,65.165131,2.192562,68.00071,21.0,24.0
max,0.751176,0.120717,0.440458,0.03237973,5432.278842,4801847.0,3708.279664,1237131.0,9486.121357,12936610.0,...,33.869503,523.140564,36.923035,628.774414,31.367567,1147.502441,34.130856,914.816223,27.0,30.0


Check for duplicates. 

In [20]:
def check_duplicates_or_nulls(data):
    # Check for duplicates
    duplicates = data.duplicated().sum() > 0
    
    # Check for any null values
    nulls = data.isnull().sum().sum() > 0
    
    # Return true if either condition met
    return duplicates or nulls

# Assuming csv_file_path is a string representing a file path
data = pd.read_csv(csv_file_path)  # Read the CSV file
if check_duplicates_or_nulls(data):
    print('Duplicates or null values found in data')
else:
    print('No duplicates or null values found in data')

No duplicates or null values found in data


## Shuffle and Split data

Split data into training and testing sets (90/10).

In [21]:
# Shuffle and split the data into training and testing sets
print("Shuffling and splitting the data into training and testing sets...")
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['genre'])

Shuffling and splitting the data into training and testing sets...


## Begin Preprocessing

Apply the low pass filter through use of a moving average with a window size of 3.

In [22]:
# Apply moving average filter
def apply_moving_average_filter(df, window_size=3):
    feature_columns = df.columns.difference(['filename', 'start', 'end', 'genre'])
    df[feature_columns] = df[feature_columns].rolling(window=window_size, min_periods=1).mean()
    return df

train_df = apply_moving_average_filter(train_df)
test_df = apply_moving_average_filter(test_df)

## Z Score normalization is done to the extracted features 

Using StandardScaler to properly normalize the individual features for the train and test set, converting back to data frames upon completion. 

In [23]:

# Apply z-score normalization
def apply_zscore_normalization(train_df, test_df):
    feature_columns = train_df.columns.difference(['filename', 'start', 'end', 'genre'])
    scaler = StandardScaler()
    
    # Fit the scaler on the training data
    train_df[feature_columns] = scaler.fit_transform(train_df[feature_columns])
    
    # Transform the testing data
    test_df[feature_columns] = scaler.transform(test_df[feature_columns])
    
    return train_df, test_df

train_df, test_df = apply_zscore_normalization(train_df, test_df)

## Separate features and labels for training and testing sets

Must make sure that the labels and features themselves remain separate. 

In [25]:
# Separate features and labels for training and testing sets
X_train = train_df.drop(columns=['filename', 'start', 'end', 'genre'])
y_train = train_df['genre']
X_test = test_df.drop(columns=['filename', 'start', 'end', 'genre'])
y_test = test_df['genre']

## Save train and test data

save data to separate csv to visualize before feeding into the model

In [26]:
# Save the training and testing sets to separate CSV files
train_features_csv_path = '/Users/isaiah/Desktop/Career/Projects/music-genre-detector/train_audio_features.csv'
train_labels_csv_path = '/Users/isaiah/Desktop/Career/Projects/music-genre-detector/train_audio_labels.csv'
test_features_csv_path = '/Users/isaiah/Desktop/Career/Projects/music-genre-detector/test_audio_features.csv'
test_labels_csv_path = '/Users/isaiah/Desktop/Career/Projects/music-genre-detector/test_audio_labels.csv'

X_train.to_csv(train_features_csv_path, index=False)
y_train.to_csv(train_labels_csv_path, index=False)
X_test.to_csv(test_features_csv_path, index=False)
y_test.to_csv(test_labels_csv_path, index=False)
print("Training and testing CSV files generation completed.")

Training and testing CSV files generation completed.


## Run model

Model is intialized, trained and test with accuracy scores displayed. 

## Accuracy Testing

Check the model to ensure it is doing well in terms of classifications.

## Cross-validation 

To ensure that the model is functioning well, cross-validation is done with an average result shown. 