# Supervised Learning Music Genre Classification

## Import Statements

Start by importing necessary libraries.

In [1]:
import os
import librosa
import pandas as pd
from pydub import AudioSegment
import tempfile
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
import soundfile as sf
import joblib
import librosa.display
from skimage import io
from skimage.transform import resize



## Helper Functions

Helper that converts an MP3 file to a temporary WAV file.

In [2]:
# Function to convert MP3 to WAV
def convert_mp3_to_wav(mp3_file):
    """Decode an MP3 file and write it to a newly created temporary WAV file.

    Args:
        mp3_file: Path of the MP3 file to decode.

    Returns:
        Path of the temporary WAV file, or None if decoding or exporting
        failed (the error is printed). The caller owns the returned file and
        is responsible for deleting it.
    """
    wav_file = None
    try:
        sound = AudioSegment.from_mp3(mp3_file)
        # mkstemp creates the file atomically. tempfile.mktemp is deprecated
        # because another process can claim the name before it is used.
        fd, wav_file = tempfile.mkstemp(suffix='.wav')
        os.close(fd)
        sound.export(wav_file, format="wav")
        return wav_file
    except Exception as e:
        print(f"Error converting {mp3_file} to WAV: {e}")
        # Do not leave a partially written temp file behind.
        if wav_file is not None and os.path.exists(wav_file):
            os.remove(wav_file)
        return None

Feature extraction from a segment. 

In [3]:
# Function to extract features from an audio segment
def extract_features_from_segment(y, sr, start_time, end_time):
    """Compute summary audio features for one slice of a signal.

    Args:
        y: Audio samples; the segment is ``y[start_time:end_time]``.
        sr: Sampling rate passed to the librosa feature extractors.
        start_time: Index of the first sample of the segment. Despite the
            name this is a sample offset, not a time in seconds.
        end_time: Index one past the last sample of the segment.

    Returns:
        dict of scalar features, in this key order: ``<name>_mean`` and
        ``<name>_var`` for chroma_stft, rms, spectral_centroid,
        spectral_bandwidth, rolloff, zero_crossing_rate, harmony and
        perceptr (the two outputs of ``librosa.effects.hpss``), then
        ``tempo``, then ``mfcc{i}_mean`` / ``mfcc{i}_var`` for i in 1..20.
        Statistics of an empty array are reported as 0.
    """
    segment = y[start_time:end_time]

    chroma_stft = librosa.feature.chroma_stft(y=segment, sr=sr)
    rms = librosa.feature.rms(y=segment)
    spectral_centroid = librosa.feature.spectral_centroid(y=segment, sr=sr)
    spectral_bandwidth = librosa.feature.spectral_bandwidth(y=segment, sr=sr)
    rolloff = librosa.feature.spectral_rolloff(y=segment, sr=sr)
    zero_crossing_rate = librosa.feature.zero_crossing_rate(y=segment)
    harmony, perceptr = librosa.effects.hpss(segment)
    tempo, _ = librosa.beat.beat_track(y=segment, sr=sr)
    mfcc = librosa.feature.mfcc(y=segment, sr=sr, n_mfcc=20)

    # (feature name, values) pairs; the order here fixes the column order of
    # the DataFrame built from these dicts.
    stat_sources = (
        ('chroma_stft', chroma_stft),
        ('rms', rms),
        ('spectral_centroid', spectral_centroid),
        ('spectral_bandwidth', spectral_bandwidth),
        ('rolloff', rolloff),
        ('zero_crossing_rate', zero_crossing_rate),
        ('harmony', harmony),
        ('perceptr', perceptr),
    )

    features = {}
    for name, values in stat_sources:
        features[f'{name}_mean'] = values.mean() if values.size else 0
        features[f'{name}_var'] = values.var() if values.size else 0

    # Depending on the librosa version, beat_track returns the tempo either
    # as a scalar or as a one-element array; always store a plain float so
    # the column stays numeric when written to CSV.
    features['tempo'] = float(np.asarray(tempo).ravel()[0])

    n_coefficients = mfcc.shape[0]
    for i in range(1, 21):
        has_coefficient = n_coefficients >= i
        features[f'mfcc{i}_mean'] = mfcc[i-1].mean() if has_coefficient else 0
        features[f'mfcc{i}_var'] = mfcc[i-1].var() if has_coefficient else 0

    return features

Load audio helper function.

In [4]:
# Function to load audio file
def load_audio(file_path):
    """Load an audio file with librosa, keeping the file's own sampling rate.

    Args:
        file_path: Path of the audio file to read.

    Returns:
        ``(samples, sampling_rate)`` on success, or ``(None, None)`` if the
        file could not be read (the error is printed, not raised).
    """
    failure = (None, None)
    try:
        samples, rate = librosa.load(file_path, sr=None)
    except sf.LibsndfileError:
        # Reported separately from other failures, without the error text.
        print(f"LibsndfileError: {file_path}")
        return failure
    except Exception as e:
        print(f"Error loading {file_path}: {e}")
        return failure
    else:
        return samples, rate

Generate spectrogram helper function. 

In [5]:
# Function to generate and save spectrogram
def generate_spectrogram(y, sr, file_path):
    """Render a mel spectrogram of ``y`` and save it as a PNG.

    Args:
        y: Audio samples to analyse.
        sr: Sampling rate of ``y``.
        file_path: Path of the source audio file. The image is written next
            to it as ``<file_path without extension>_spectrogram.png``.

    Returns:
        Path of the saved PNG.

    Note:
        The output name depends only on ``file_path``, so calling this for
        several segments of the same file overwrites the same image.
    """
    spectrogram_file = os.path.splitext(file_path)[0] + '_spectrogram.png'

    # Compute the data before creating the figure so a failure here cannot
    # leave an open figure behind.
    S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128)
    S_dB = librosa.power_to_db(S, ref=np.max)

    fig, ax = plt.subplots(figsize=(10, 4))
    try:
        mesh = librosa.display.specshow(S_dB, sr=sr, x_axis='time', y_axis='mel', ax=ax)
        fig.colorbar(mesh, ax=ax, format='%+2.0f dB')
        ax.set_title('Mel-frequency spectrogram')
        fig.tight_layout()
        fig.savefig(spectrogram_file)
    finally:
        # Always release the figure; this runs once per segment, and leaked
        # figures accumulate in memory.
        plt.close(fig)
    return spectrogram_file

Extract features from spectrogram. 

In [6]:
# Function to extract numerical features from the spectrogram
def extract_spectrogram_features(spectrogram_path, target_size=(128, 128)):
    """Turn a saved spectrogram image into a fixed-length feature vector.

    Args:
        spectrogram_path: Path of the image file to read.
        target_size: ``(rows, cols)`` the greyscale image is resized to
            before flattening. Defaults to ``(128, 128)``.

    Returns:
        1-D array of ``rows * cols`` values. If the image cannot be read or
        resized, the error is printed and an all-zero vector of the same
        length is returned.
    """
    try:
        image = io.imread(spectrogram_path, as_gray=True)
        image_resized = resize(image, target_size)  # Resize to a fixed size
        return image_resized.flatten()  # Flatten to 1D array
    except Exception as e:
        print(f"Error extracting spectrogram features from {spectrogram_path}: {e}")
        # Keep the fallback the same length as a successful result.
        return np.zeros(target_size[0] * target_size[1])

Segment data and call feature extraction.

In [7]:
# Function to extract features from an audio file
def extract_features(audio_file, segment_duration=3):
    """Split an audio file into fixed-length segments and extract features.

    Args:
        audio_file: Path of the audio file to process.
        segment_duration: Length of each segment in seconds (default 3).

    Returns:
        List with one feature dict per full-length segment. Each dict holds
        the output of ``extract_features_from_segment`` plus ``filename``
        (base name of ``audio_file``), ``start`` and ``end`` (segment bounds
        in seconds). A trailing partial segment is dropped. Returns an empty
        list if the file cannot be loaded, the segment length is not
        positive, or any error occurs (errors are printed, not raised).
        Spectrogram-image features are not included.
    """
    try:
        if segment_duration <= 0:
            print(f"Error extracting features from {audio_file}: segment_duration must be positive")
            return []

        y, sr = load_audio(audio_file)
        if y is None:
            return []

        segment_length = int(sr * segment_duration)
        if segment_length <= 0:
            # A zero step would make range() raise; report it explicitly.
            print(f"Error extracting features from {audio_file}: segment length is zero samples")
            return []

        features_list = []

        # The stop value makes the last start the final one that still leaves
        # room for a full segment.
        for start in range(0, len(y) - segment_length + 1, segment_length):
            end = start + segment_length
            segment_features = extract_features_from_segment(y, sr, start, end)
            segment_features['filename'] = os.path.basename(audio_file)
            segment_features['start'] = start / sr
            segment_features['end'] = end / sr
            features_list.append(segment_features)

        return features_list

    except Exception as e:
        print(f"Error extracting features from {audio_file}: {e}")
        return []

## Define Data Paths

Define the list of genres, the folder containing the GTZAN audio files (one subfolder per genre), and the folder used for saved spectrograms.

In [8]:
# List of genres
genres = ["blues", "classical", "country", "disco", "hiphop", "jazz", "metal", "pop", "reggae", "rock"]

# Project root. Defaults to the original author's checkout; set the
# MUSIC_GENRE_PROJECT_ROOT environment variable to run the notebook from a
# different location. Other cells in this notebook still spell out absolute
# CSV paths and are not affected by this setting.
project_root = os.environ.get(
    'MUSIC_GENRE_PROJECT_ROOT',
    '/Users/isaiah/Desktop/Career/Projects/music-genre-detector',
)

# Base folder containing genre subfolders
base_folder_path = os.path.join(project_root, 'GTZan', 'genres_original')

# Create a directory to save spectrograms
spectrogram_base_path = os.path.join(project_root, 'spectrograms')
os.makedirs(spectrogram_base_path, exist_ok=True)

## Process Input Audio Files

Process input files and label the data. 

In [9]:
# Function to process a folder of audio files
def process_audio_folder(folder_path, genre_label):
    """Extract segment features for every .wav/.mp3 file in a folder.

    Args:
        folder_path: Directory whose audio files are processed (not
            recursive).
        genre_label: Value stored under ``genre`` in every feature dict.

    Returns:
        List of feature dicts, one per segment, each with ``filename`` set to
        the name of the file inside ``folder_path`` and ``genre`` set to
        ``genre_label``. Files that fail to convert or load contribute no
        rows.
    """
    results = []
    # os.listdir returns entries in arbitrary order; sort so the row order of
    # the result is reproducible between runs.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(('.wav', '.mp3')):
            continue

        source_path = os.path.join(folder_path, filename)
        temp_wav = None
        if filename.endswith('.mp3'):
            temp_wav = convert_mp3_to_wav(source_path)
            if temp_wav is None:
                # Conversion failed and has already been reported; skip the
                # file instead of passing None on to the loader.
                continue
            audio_path = temp_wav
        else:
            audio_path = source_path

        try:
            features_list = extract_features(audio_path)
        finally:
            # The converted WAV is only needed while extracting features.
            if temp_wav is not None and os.path.exists(temp_wav):
                os.remove(temp_wav)

        for features in features_list:
            # For MP3 input the extractor saw the temp file's name; record
            # the original file name instead.
            features['filename'] = filename
            features['genre'] = genre_label
            results.append(features)
    return results

## Create Singular CSV

Aggregate results of the feature extraction into a singular CSV.

In [10]:
all_results = []
csv_file_path = '/Users/isaiah/Desktop/Career/Projects/music-genre-detector/all_genres_audio_features.csv'

if os.path.exists(csv_file_path):
    # Feature extraction over the whole dataset is slow, and the CSV is only
    # written after every genre has finished, so an existing file is a
    # complete earlier result. Delete the CSV to force re-extraction.
    print("Loading previously extracted features from CSV file...")
    df = pd.read_csv(csv_file_path)
else:
    for genre in genres:
        print(f"Processing genre: {genre}")
        folder_path = os.path.join(base_folder_path, genre)
        genre_results = process_audio_folder(folder_path, genre)
        all_results.extend(genre_results)
        print(f"Completed processing genre: {genre}")

    print("Writing results to CSV file...")
    df = pd.DataFrame(all_results)
    df.to_csv(csv_file_path, index=False)
    print("CSV file generation completed.")

df.head()

Processing genre: blues
Completed processing genre: blues
Processing genre: classical


KeyboardInterrupt: 

## Examine Data

Gain insights into data. 

In [None]:
# Get summary statistics (count, mean, std, min, quartiles, max) for each
# numeric column of the extracted features
df.describe()


Unnamed: 0,chroma_stft_mean,chroma_stft_var,rms_mean,rms_var,spectral_centroid_mean,spectral_centroid_var,spectral_bandwidth_mean,spectral_bandwidth_var,rolloff_mean,rolloff_var,...,mfcc17_mean,mfcc17_var,mfcc18_mean,mfcc18_var,mfcc19_mean,mfcc19_var,mfcc20_mean,mfcc20_var,start,end
count,9981.0,9981.0,9981.0,9981.0,9981.0,9981.0,9981.0,9981.0,9981.0,9981.0,...,9981.0,9981.0,9981.0,9981.0,9981.0,9981.0,9981.0,9981.0,9981.0,9981.0
mean,0.379964,0.084882,0.130039,0.002672434,2201.910957,415925.5,2244.56246,118312.1,4571.568401,1623468.0,...,-4.193187,51.838394,0.724376,52.343689,-2.497094,54.811691,-0.929246,57.142101,13.487827,16.487827
std,0.090624,0.009675,0.068168,0.003561535,750.540439,433967.5,541.420376,100250.1,1639.481644,1482634.0,...,5.668772,36.301769,5.175787,38.067753,5.107198,41.505917,5.247203,46.342815,8.611614,8.611614
min,0.108073,0.015217,0.000947,4.055916e-08,479.905803,2161.498,499.577101,1295.35,673.906438,1130.834,...,-27.93222,1.531856,-20.749746,3.445751,-27.359076,3.147764,-35.614895,0.253587,0.0,3.0
25%,0.316037,0.07982,0.083223,0.000628582,1634.097151,122833.6,1890.204723,49414.1,3389.905912,556238.5,...,-7.948162,29.821222,-2.524088,29.405123,-5.734853,30.384863,-4.01272,29.925747,6.0,9.0
50%,0.385163,0.085137,0.120488,0.001500287,2211.777107,264393.1,2233.071917,90371.22,4634.773513,1155826.0,...,-4.444726,42.235081,0.730936,41.686157,-2.700388,43.264107,-1.045194,44.173588,12.0,15.0
75%,0.442906,0.091154,0.175334,0.00311376,2713.457812,561220.0,2590.295338,157892.9,5597.307692,2251497.0,...,-0.731065,61.467625,3.871771,61.85434,0.521315,65.165131,2.192562,68.00071,21.0,24.0
max,0.751176,0.120717,0.440458,0.03237973,5432.278842,4801847.0,3708.279664,1237131.0,9486.121357,12936610.0,...,33.869503,523.140564,36.923035,628.774414,31.367567,1147.502441,34.130856,914.816223,27.0,30.0


Check for duplicate rows and null values.

In [None]:
def check_duplicates_or_nulls(data):
    """Report whether a DataFrame has any duplicated rows or null cells.

    Args:
        data: DataFrame to inspect.

    Returns:
        Truthy if at least one row is a duplicate of an earlier row or at
        least one cell is null; falsy otherwise.
    """
    has_duplicates = data.duplicated().sum() > 0
    has_nulls = data.isnull().sum().sum() > 0
    if has_duplicates:
        return has_duplicates
    return has_nulls

# Re-read the saved CSV so the check covers exactly what was written to disk.
data = pd.read_csv(csv_file_path)
check_message = (
    'Duplicates or null values found in data'
    if check_duplicates_or_nulls(data)
    else 'No duplicates or null values found in data'
)
print(check_message)

No duplicates or null values found in data


## Shuffle and Split data

Split data into training and testing sets (80/20, matching `test_size=0.2` in the code below).

In [None]:
# Shuffle and split the data into training and testing sets
print("Shuffling and splitting the data into training and testing sets...")

# Every row is one short segment of a recording. Splitting row by row lets
# segments of the same recording land in both sets, which inflates the test
# score. Split the *files* 80/20 (stratified by genre) and assign each
# segment to the side its file belongs to.
file_genres = df[['filename', 'genre']].drop_duplicates(subset='filename')
train_files, test_files = train_test_split(
    file_genres['filename'],
    test_size=0.2,
    random_state=42,
    stratify=file_genres['genre'],
)

# sample(frac=1) shuffles the rows and returns independent copies, so later
# cells can modify train_df / test_df without touching df.
train_df = df[df['filename'].isin(train_files)].sample(frac=1, random_state=42)
test_df = df[df['filename'].isin(test_files)].sample(frac=1, random_state=42)

Shuffling and splitting the data into training and testing sets...


## Begin Preprocessing

Apply the low pass filter through use of a moving average with a window size of 3.

In [None]:
# Apply moving average filter
def apply_moving_average_filter(df, window_size=3):
    """Smooth the feature columns with a trailing moving average.

    The average is taken over consecutive segments of the same file, ordered
    by ``start``, so values from unrelated recordings are never mixed, however
    the rows of ``df`` happen to be ordered. If ``df`` has no ``filename``
    column the average runs over the rows in their current order.

    Args:
        df: DataFrame of segment features. Every column except ``filename``,
            ``start``, ``end`` and ``genre`` is treated as a feature.
        window_size: Number of segments averaged (the current one and the
            preceding ``window_size - 1``); shorter windows are used at the
            start of each file.

    Returns:
        A new DataFrame with the same index, columns and row order as ``df``
        and smoothed feature columns. ``df`` itself is not modified.
    """
    feature_columns = list(df.columns.difference(['filename', 'start', 'end', 'genre']))
    filtered = df.copy()
    if not feature_columns or df.empty:
        return filtered

    # Work on a positional index so results can be written back by position,
    # whatever labels (or duplicates) df.index contains.
    work = df.reset_index(drop=True)

    if 'filename' in work.columns:
        sort_keys = ['filename', 'start'] if 'start' in work.columns else ['filename']
        ordered = work.sort_values(sort_keys)
        smoothed = (
            ordered.groupby('filename', sort=False)[feature_columns]
            .rolling(window=window_size, min_periods=1)
            .mean()
            .droplevel(0)          # drop the group key, keep the row position
            .reindex(work.index)   # restore the original row order
        )
    else:
        smoothed = work[feature_columns].rolling(window=window_size, min_periods=1).mean()

    filtered[feature_columns] = smoothed[feature_columns].to_numpy()
    return filtered

# Filter the training and testing sets separately (each call only sees the
# rows of its own set), rebinding the names to the filtered results.
train_df = apply_moving_average_filter(train_df)
test_df = apply_moving_average_filter(test_df)

## Z Score normalization is done to the extracted features 

Using StandardScaler to properly normalize the individual features for the train and test set, converting back to data frames upon completion. 

In [None]:

# Apply z-score normalization
def apply_zscore_normalization(train_df, test_df):
    """Standardise the feature columns using statistics of the training set.

    Every column except ``filename``, ``start``, ``end`` and ``genre`` is
    treated as a feature. The scaler is fitted on the training rows only and
    then applied to both sets, so no test-set statistics leak into training.

    Args:
        train_df: Training DataFrame.
        test_df: Testing DataFrame with the same feature columns.

    Returns:
        ``(train_scaled, test_scaled)``: new DataFrames with scaled feature
        columns. The inputs are not modified. The fitted scaler is not
        returned.
    """
    feature_columns = train_df.columns.difference(['filename', 'start', 'end', 'genre'])
    scaler = StandardScaler()

    # Work on copies: the inputs are typically slices of a larger frame, and
    # assigning into them in place triggers SettingWithCopyWarning and
    # silently changes the caller's data.
    train_scaled = train_df.copy()
    test_scaled = test_df.copy()

    # Fit the scaler on the training data
    train_scaled[feature_columns] = scaler.fit_transform(train_df[feature_columns])

    # Transform the testing data
    test_scaled[feature_columns] = scaler.transform(test_df[feature_columns])

    return train_scaled, test_scaled

# Rebind both names to the normalised frames returned by the helper.
train_df, test_df = apply_zscore_normalization(train_df, test_df)

## Separate features and labels for training and testing sets

Must make sure that the labels and features themselves remain separate. 

In [None]:
# Separate features and labels for training and testing sets.
# Identifier/bookkeeping columns and the label itself are not model inputs.
non_feature_columns = ['filename', 'start', 'end', 'genre']
X_train, y_train = train_df.drop(columns=non_feature_columns), train_df['genre']
X_test, y_test = test_df.drop(columns=non_feature_columns), test_df['genre']

## Save train and test data

Save the data to separate CSV files so it can be inspected before being fed into the models.

In [None]:
# Save the training and testing sets to separate CSV files
train_features_csv_path = '/Users/isaiah/Desktop/Career/Projects/music-genre-detector/train_audio_features.csv'
train_labels_csv_path = '/Users/isaiah/Desktop/Career/Projects/music-genre-detector/train_audio_labels.csv'
test_features_csv_path = '/Users/isaiah/Desktop/Career/Projects/music-genre-detector/test_audio_features.csv'
test_labels_csv_path = '/Users/isaiah/Desktop/Career/Projects/music-genre-detector/test_audio_labels.csv'

# (data, destination) pairs, written in the same order as before and
# without the index column.
csv_outputs = (
    (X_train, train_features_csv_path),
    (y_train, train_labels_csv_path),
    (X_test, test_features_csv_path),
    (y_test, test_labels_csv_path),
)
for dataset, destination in csv_outputs:
    dataset.to_csv(destination, index=False)
print("Training and testing CSV files generation completed.")

Training and testing CSV files generation completed.


## Call for Data

Retrieve the prepared data from the csv files to avoid the need to run the feature extraction cell every attempt. 

In [None]:
# File paths (repeated here so this cell can be run on its own in a fresh
# kernel, without re-running the cells above)
train_features_csv_path = '/Users/isaiah/Desktop/Career/Projects/music-genre-detector/train_audio_features.csv'
train_labels_csv_path = '/Users/isaiah/Desktop/Career/Projects/music-genre-detector/train_audio_labels.csv'
test_features_csv_path = '/Users/isaiah/Desktop/Career/Projects/music-genre-detector/test_audio_features.csv'
test_labels_csv_path = '/Users/isaiah/Desktop/Career/Projects/music-genre-detector/test_audio_labels.csv'

# Load the datasets: feature tables as DataFrames, labels as the 'genre'
# column of their files.
X_train, X_test = (
    pd.read_csv(features_path)
    for features_path in (train_features_csv_path, test_features_csv_path)
)
y_train, y_test = (
    pd.read_csv(labels_path)['genre']
    for labels_path in (train_labels_csv_path, test_labels_csv_path)
)

## Model initialization function

Set up scaling, label encoding, and a shared helper for the model training and hyperparameter tuning phase.

In [None]:
# Fit the scaler on the training features only, then apply it to both sets.
# NOTE(review): the features loaded from CSV were already z-scored before
# being saved, so this scaler is fitted on standardised data - confirm that
# this is the scaler that should be persisted for later inference.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Map genre names to integer class ids; the mapping is learned from the
# training labels and reused for the test labels.
encoder = LabelEncoder().fit(y_train)
y_train_encoded = encoder.transform(y_train)
y_test_encoded = encoder.transform(y_test)

def train_and_evaluate_model(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1):
    """Tune a model with a cross-validated grid search and score it on the test set.

    Reads the module-level ``X_train_scaled``, ``y_train_encoded``,
    ``X_test_scaled``, ``y_test_encoded`` and ``encoder``.

    Args:
        model: Unfitted scikit-learn estimator.
        param_grid: Parameter grid (dict, or list of dicts) for GridSearchCV.
        cv: Number of cross-validation folds (default 5).
        scoring: Metric used to pick the best parameters (default accuracy).
        n_jobs: Parallel jobs for the search. Defaults to -1 (all cores)
            because the grids in this notebook are large; pass None to run
            serially.

    Returns:
        ``(best_model, accuracy, report)``: the best estimator refitted on
        the full training set, its accuracy on the test set, and the text
        classification report with genre names as class labels.
    """
    grid_search = GridSearchCV(model, param_grid, cv=cv, scoring=scoring, n_jobs=n_jobs)
    grid_search.fit(X_train_scaled, y_train_encoded)
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test_scaled)
    accuracy = accuracy_score(y_test_encoded, y_pred)
    report = classification_report(y_test_encoded, y_pred, target_names=encoder.classes_)
    return best_model, accuracy, report

Random Forest Model.

In [None]:
rf_param_grid = {
    'n_estimators': [50, 100, 200],
    # 'auto' meant the same as 'sqrt' for classifiers and is rejected by
    # recent scikit-learn releases, so it is left out of the search.
    'max_features': ['sqrt', 'log2'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
# Fixed seed so the search and the reported scores are reproducible.
rf_model = RandomForestClassifier(random_state=42)
rf_best_model, rf_accuracy, rf_report = train_and_evaluate_model(rf_model, rf_param_grid)
print("Random Forest Accuracy:", rf_accuracy)
print("Random Forest Classification Report:\n", rf_report)

# The SVM and Gradient Boosting searches run in their own cells below; the
# copies that used to follow here made each of those searches run twice.

KeyboardInterrupt: 

Support Vector Machine Model. 

In [None]:
# One grid per kernel: 'gamma' has no effect on the linear kernel, so
# crossing it with 'linear' would refit identical models for every gamma.
svm_param_grid = [
    {
        'kernel': ['rbf'],
        'C': [0.1, 1, 10, 100],
        'gamma': [1, 0.1, 0.01, 0.001],
    },
    {
        'kernel': ['linear'],
        'C': [0.1, 1, 10, 100],
    },
]
svm_model = SVC()
svm_best_model, svm_accuracy, svm_report = train_and_evaluate_model(svm_model, svm_param_grid)
print("SVM Accuracy:", svm_accuracy)
print("SVM Classification Report:\n", svm_report)

Gradient Boost Model.

In [None]:
gb_param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.5],
    'max_depth': [3, 5, 7]
}
# Fixed seed so repeated runs of the search give the same model and scores.
gb_model = GradientBoostingClassifier(random_state=42)
gb_best_model, gb_accuracy, gb_report = train_and_evaluate_model(gb_model, gb_param_grid)
print("Gradient Boosting Accuracy:", gb_accuracy)
print("Gradient Boosting Classification Report:\n", gb_report)

## Save models for future use

Use joblib to save the trained models, the scaler and encoder, and the prepared data so they can be reloaded later.

In [None]:
# Everything written to disk, keyed by output file name, in the order it is
# saved: the best models, then the scaler and encoder, then the scaled data.
artifacts = {
    'rf_best_model.pkl': rf_best_model,
    'svm_best_model.pkl': svm_best_model,
    'gb_best_model.pkl': gb_best_model,
    'scaler.pkl': scaler,
    'encoder.pkl': encoder,
    'X_train_scaled.pkl': X_train_scaled,
    'X_test_scaled.pkl': X_test_scaled,
    'y_train_encoded.pkl': y_train_encoded,
    'y_test_encoded.pkl': y_test_encoded,
}
for artifact_file, artifact in artifacts.items():
    joblib.dump(artifact, artifact_file)