# Supervised Learning Music Genre Classification

## Import Statements

Start by importing necessary libraries.

In [28]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, learning_curve, validation_curve
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import soundfile as sf
import joblib

## Import Data from prepared CSV

In [29]:
# Read the prepared feature table and discard the `filename`, `start` and `end`
# columns in one chained step.
data = pd.read_csv('all_genres_audio_features.csv').drop(
    columns=['filename', 'start', 'end']
)

# The ten genre names, in alphabetical order.
genres = [
    "blues", "classical", "country", "disco", "hiphop",
    "jazz", "metal", "pop", "reggae", "rock",
]

data.shape

(9981, 58)

Check for duplicate rows and null values.

In [30]:
def check_duplicates_or_nulls(data):
    """Tell whether a DataFrame contains any duplicated row or any null cell.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to inspect.

    Returns
    -------
    bool-like
        Truthy if at least one row is a duplicate of an earlier row or at
        least one cell is null; falsy otherwise.
    """
    has_duplicates = data.duplicated().sum() > 0
    if has_duplicates:
        return has_duplicates
    # No duplicated rows: the answer now depends only on missing values.
    return data.isnull().sum().sum() > 0

# Print a one-line verdict on whether the table has duplicate rows or nulls.
print(
    'Duplicates or null values found in data'
    if check_duplicates_or_nulls(data)
    else 'No duplicates or null values found in data'
)

Duplicates or null values found in data


Convert string-valued features to numeric: strip the square brackets from the `tempo` column and cast it to float.

In [31]:
# Clean the 'tempo' column: strip the surrounding square brackets and convert
# the remaining text to float.
# Casting to str first keeps this cell re-runnable: after the first run the
# column is already float, and a bare `.str` accessor on a numeric column
# raises AttributeError.
data['tempo'] = data['tempo'].astype(str).str.strip('[]').astype(float)

# Verify the changes
data.describe(include='all')

Unnamed: 0,chroma_stft_mean,chroma_stft_var,rms_mean,rms_var,spectral_centroid_mean,spectral_centroid_var,spectral_bandwidth_mean,spectral_bandwidth_var,rolloff_mean,rolloff_var,...,mfcc16_var,mfcc17_mean,mfcc17_var,mfcc18_mean,mfcc18_var,mfcc19_mean,mfcc19_var,mfcc20_mean,mfcc20_var,genre
count,9981.0,9981.0,9981.0,9981.0,9981.0,9981.0,9981.0,9981.0,9981.0,9981.0,...,9981.0,9981.0,9981.0,9981.0,9981.0,9981.0,9981.0,9981.0,9981.0,9981
unique,,,,,,,,,,,...,,,,,,,,,,10
top,,,,,,,,,,,...,,,,,,,,,,blues
freq,,,,,,,,,,,...,,,,,,,,,,1000
mean,0.379964,0.084882,0.130039,0.002672433,2201.910957,415925.5,2244.56246,118312.1,4571.568401,1623468.0,...,49.879672,-4.193187,51.838396,0.724376,52.343694,-2.497094,54.811697,-0.929246,57.142098,
std,0.090624,0.009675,0.068168,0.003561532,750.540439,433967.5,541.420376,100250.1,1639.481644,1482634.0,...,34.35813,5.668784,36.301829,5.175797,38.06779,5.107192,41.505894,5.247202,46.342797,
min,0.108073,0.015217,0.000947,4.055916e-08,479.905803,2161.498,499.577102,1295.35,673.906438,1130.834,...,1.343237,-27.932222,1.531855,-20.749748,3.445752,-27.359076,3.147765,-35.614895,0.253587,
25%,0.316037,0.07982,0.083223,0.000628582,1634.097152,122833.6,1890.204723,49414.1,3389.905912,556238.5,...,29.52179,-7.948162,29.82122,-2.524088,29.405123,-5.734853,30.38486,-4.01272,29.925747,
50%,0.385163,0.085137,0.120488,0.001500287,2211.777107,264393.1,2233.071916,90371.22,4634.773513,1155826.0,...,41.505714,-4.444725,42.23508,0.730935,41.686157,-2.700388,43.264107,-1.045194,44.173588,
75%,0.442906,0.091154,0.175334,0.00311376,2713.457812,561220.0,2590.295338,157892.9,5597.307692,2251497.0,...,59.06206,-0.731065,61.46762,3.871771,61.854343,0.521315,65.16512,2.192562,68.00071,


## Begin Preprocessing

Apply a low-pass filter to the feature columns using a moving average with a window size of 3.

In [32]:
def apply_moving_average_filter(df, window_size=3):
    """Return a copy of `df` whose feature columns are smoothed row-wise.

    Every column except 'genre' is replaced by its trailing moving average
    over `window_size` consecutive rows. The first rows, which have fewer than
    `window_size` predecessors, are averaged over the rows available
    (`min_periods=1`), so no NaNs are introduced by the filter itself.

    The input frame is left untouched; earlier versions modified it in place,
    which silently changed the caller's data and made repeated cell runs
    smooth the same table again.

    Parameters
    ----------
    df : pandas.DataFrame
        Table of numeric feature columns, optionally with a 'genre' column
        that is passed through unchanged.
    window_size : int, default 3
        Number of consecutive rows averaged together.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns and index as `df`.

    Notes
    -----
    The window slides over rows in table order and knows nothing about which
    rows belong together; callers are responsible for the row order.
    """
    smoothed = df.copy()
    feature_columns = smoothed.columns.difference(['genre'])
    smoothed[feature_columns] = (
        smoothed[feature_columns]
        .rolling(window=window_size, min_periods=1)
        .mean()
    )
    return smoothed

# Smooth every feature column with the default 3-row moving average.
# NOTE(review): the window slides over consecutive rows of the whole table and
# this runs before the train/test split further down, so a smoothed row carries
# values from its neighbours, which may end up on the other side of the split.
# Confirm this is intended.
# NOTE(review): `data` is rebound to its own filtered version, so re-running
# this cell smooths the table a second time.
data = apply_moving_average_filter(data)

## Shuffle and Split data

Split data into training and testing sets (70/30).

In [33]:
# The last column is the target; every column before it is a feature.
y = data.iloc[:, -1]
X = data.iloc[:, :-1].to_numpy(dtype=float, copy=True)

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42
)

## Normalize the data set and encode the labels

In [34]:
# Encode the target labels; the label mapping is learned from the training labels
encoder = LabelEncoder().fit(y_train)
y_train = encoder.transform(y_train)
y_test = encoder.transform(y_test)

# Scale the features; the scaling statistics are learned from the training
# rows only and then applied unchanged to the test rows
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

## Learning curves 

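Define a helper function that plots the learning curve (training score and cross-validation score versus training-set size).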

In [35]:
def plot_learning_curve(estimator, title, X, y, cv=None, n_jobs=None):
    """Draw an estimator's learning curve and show the figure.

    Scores are computed with `sklearn.model_selection.learning_curve` at five
    evenly spaced training-set fractions from 10% to 100%, then averaged over
    the cross-validation folds.

    Parameters
    ----------
    estimator : estimator object
        Model passed to `learning_curve`.
    title : str
        Figure title.
    X, y : array-like
        Features and targets passed to `learning_curve`.
    cv : optional
        Cross-validation setting forwarded to `learning_curve`.
    n_jobs : int, optional
        Parallelism setting forwarded to `learning_curve`.

    Returns
    -------
    None
        The figure is displayed with `plt.show()`.
    """
    _, ax = plt.subplots()
    ax.set_title(title)

    sizes, fit_scores, val_scores = learning_curve(
        estimator, X, y,
        cv=cv,
        n_jobs=n_jobs,
        train_sizes=np.linspace(.1, 1.0, 5),
    )

    # One line per score type: red for training, green for cross-validation.
    curves = (
        (fit_scores, "r", "Training score"),
        (val_scores, "g", "Cross-validation score"),
    )
    for per_fold_scores, colour, legend_label in curves:
        ax.plot(sizes, per_fold_scores.mean(axis=1), 'o-', color=colour, label=legend_label)

    ax.set_xlabel("Training examples")
    ax.set_ylabel("Score")
    ax.legend(loc="best")
    ax.grid()
    plt.show()

## Model initialization and hyperparameter tuning

Define a function for model training and evaluation.

In [36]:
# Tune a model by grid search, then report cross-validation and test-set performance
def train_and_evaluate_model(model, param_grid):
    """Grid-search `model`, print cross-validation scores, and score it on the test set.

    Uses the notebook-level `X_train`, `y_train`, `X_test`, `y_test` and
    `encoder` variables.

    Parameters
    ----------
    model : estimator object
        Unfitted estimator to tune.
    param_grid : dict or list of dict
        Hyperparameter grid handed to `GridSearchCV`.

    Returns
    -------
    tuple
        `(best_model, accuracy, report)`: the refitted best estimator, its
        accuracy on the test set, and the text classification report with
        class names taken from `encoder.classes_`.
    """
    search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy').fit(X_train, y_train)
    tuned_model = search.best_estimator_

    # 5-fold accuracy of the selected configuration on the training data.
    fold_scores = cross_val_score(tuned_model, X_train, y_train, cv=5, scoring='accuracy')
    print(f'Cross-validation scores: {fold_scores}')
    print(f'Mean cross-validation score: {np.mean(fold_scores)}')

    # Held-out evaluation.
    predictions = tuned_model.predict(X_test)
    return (
        tuned_model,
        accuracy_score(y_test, predictions),
        classification_report(y_test, predictions, target_names=encoder.classes_),
    )


Random Forest Model.

In [None]:
# Hyperparameter grid searched for the random forest.
rf_param_grid = {
    'n_estimators': [100, 250, 1000],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
# Seed the forest (same seed as the train/test split) so the selected
# hyperparameters and the reported scores are reproducible between runs.
rf_model = RandomForestClassifier(random_state=42)
rf_best_model, rf_accuracy, rf_report = train_and_evaluate_model(rf_model, rf_param_grid)
print("Random Forest Accuracy:", rf_accuracy)
print("Random Forest Classification Report:\n", rf_report)

# Plot the learning curve on the scaled features and encoded labels the model
# was tuned on (X_train / y_train), not on the raw X and string-valued y,
# which also contain the held-out test rows.
plot_learning_curve(rf_best_model, "Learning Curves (Random Forest)", X_train, y_train, cv=5)

Support Vector Machine Model. 

In [None]:
# Hyperparameter grids searched for the SVM. `gamma` only affects the RBF
# kernel, so it is searched there alone; crossing it with the linear kernel
# would just repeat identical linear fits once per gamma value.
svm_param_grid = [
    {'kernel': ['rbf'], 'C': [0.1, 1, 10, 100], 'gamma': [1, 0.1, 0.01, 0.001]},
    {'kernel': ['linear'], 'C': [0.1, 1, 10, 100]},
]
svm_model = SVC()
svm_best_model, svm_accuracy, svm_report = train_and_evaluate_model(svm_model, svm_param_grid)
print("SVM Accuracy:", svm_accuracy)
print("SVM Classification Report:\n", svm_report)

# The SVM is sensitive to feature scale and its hyperparameters were tuned on
# the standardized X_train, so the learning curve must use the same scaled
# features and encoded labels rather than the raw X / y.
plot_learning_curve(svm_best_model, "Learning Curves (SVM)", X_train, y_train, cv=5)

Gradient Boost Model.

In [None]:
# Hyperparameter grid searched for gradient boosting.
gb_param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.5],
    'max_depth': [3, 5, 7]
}
# Seed the booster (same seed as the train/test split) so the selected
# hyperparameters and the reported scores are reproducible between runs.
gb_model = GradientBoostingClassifier(random_state=42)
gb_best_model, gb_accuracy, gb_report = train_and_evaluate_model(gb_model, gb_param_grid)
print("Gradient Boosting Accuracy:", gb_accuracy)
print("Gradient Boosting Classification Report:\n", gb_report)

# Plot the learning curve on the scaled features and encoded labels the model
# was tuned on (X_train / y_train), not on the raw X and string-valued y,
# which also contain the held-out test rows.
plot_learning_curve(gb_best_model, "Learning Curves (Gradient Boosting)", X_train, y_train, cv=5)


K-Nearest Neighbors Model.

In [None]:
# Hyperparameter grid searched for k-nearest neighbours.
knn_param_grid = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}
knn_model = KNeighborsClassifier()
knn_best_model, knn_accuracy, knn_report = train_and_evaluate_model(knn_model, knn_param_grid)
print("KNeighbors Accuracy:", knn_accuracy)
print("KNeighbors Classification Report:\n", knn_report)

# KNN is distance-based and was tuned on the standardized X_train, so the
# learning curve must use the same scaled features and encoded labels; on the
# raw X the largest-magnitude features would dominate every distance.
plot_learning_curve(knn_best_model, "Learning Curves (KNN)", X_train, y_train, cv=5)

## Save models for future use

Use `joblib` to save the models, the scaler and encoder, and the preprocessed data for later import.

In [None]:
# Make sure the target directory exists before writing; joblib.dump does not
# create missing directories and would fail on a fresh checkout.
os.makedirs('outputs', exist_ok=True)

# Everything needed to reuse the results later, keyed by output file stem:
# the tuned models, the fitted scaler and encoder, and the scaled features
# with their encoded labels.
artifacts = {
    'rf_best_model': rf_best_model,
    'svm_best_model': svm_best_model,
    'gb_best_model': gb_best_model,
    'knn_best_model': knn_best_model,
    'scaler': scaler,
    'encoder': encoder,
    'X_train': X_train,
    'X_test': X_test,
    'y_train': y_train,
    'y_test': y_test,
}
for artifact_name, artifact in artifacts.items():
    joblib.dump(artifact, f'outputs/{artifact_name}.pkl')

## External validation

Preprocess the external data.

In [38]:
# Load external data
external_data = pd.read_csv('file_features.csv')
external_data = external_data.drop(columns=['filename', 'start', 'end'])

# Clean the 'tempo' column: strip the square brackets and convert to float.
# Casting to str first also covers a CSV whose tempo was written as a plain
# number: pandas would then parse the column as float and a bare `.str`
# accessor would raise AttributeError.
external_data['tempo'] = external_data['tempo'].astype(str).str.strip('[]').astype(float)

# Apply the same moving average filter
external_data = apply_moving_average_filter(external_data)

# Scale with the scaler fitted on the training features.
# NOTE(review): columns are matched to the training features by position, so
# this assumes the external CSV has exactly the training feature columns, in
# the same order and with no label column. Verify against the file.
X_external = scaler.transform(external_data.to_numpy(dtype=float))

Apply the best models on the external data. 

In [40]:
# Restore the four tuned models from disk, in the order RF, SVM, GB, KNN.
(
    rf_best_model,
    svm_best_model,
    gb_best_model,
    knn_best_model,
) = map(joblib.load, (
    'outputs/rf_best_model.pkl',
    'outputs/svm_best_model.pkl',
    'outputs/gb_best_model.pkl',
    'outputs/knn_best_model.pkl',
))

# Define a function to evaluate a model on unlabeled external data
def evaluate_model_on_external_data(model, X_ext, model_name, label_encoder=None):
    """Predict genres for unlabeled samples and print them as genre names.

    Predicted class indices are decoded with the label encoder's
    `inverse_transform`, so the names always follow the mapping the models
    were trained with instead of a separately maintained list that has to
    stay in the same order by hand.

    Parameters
    ----------
    model : fitted estimator
        Object exposing `predict`.
    X_ext : array-like
        Preprocessed feature matrix to predict on.
    model_name : str
        Label used in the printed header.
    label_encoder : object with `inverse_transform`, optional
        Decoder for the predicted class indices. When omitted, the
        notebook-level `encoder` fitted on the training labels is used.

    Returns
    -------
    None
        The header and the list of predicted genre names are printed.
    """
    decoder = encoder if label_encoder is None else label_encoder
    y_pred = model.predict(X_ext)
    # Map predicted numbers to genre names; tolist() yields plain Python
    # strings so the printed list looks the same under every NumPy version.
    y_pred_genre = np.asarray(decoder.inverse_transform(y_pred)).tolist()
    print(f"{model_name} Predictions on External Data:")
    print(y_pred_genre)

# Run every tuned model over the external samples, in a fixed display order.
for display_name, fitted_model in (
    ("Random Forest", rf_best_model),
    ("SVM", svm_best_model),
    ("Gradient Boosting", gb_best_model),
    ("K-Nearest Neighbors", knn_best_model),
):
    evaluate_model_on_external_data(fitted_model, X_external, display_name)

Random Forest Predictions on External Data:
['hiphop', 'hiphop', 'hiphop', 'hiphop', 'hiphop', 'hiphop', 'hiphop', 'hiphop']
SVM Predictions on External Data:
['hiphop', 'hiphop', 'hiphop', 'hiphop', 'hiphop', 'hiphop', 'hiphop', 'hiphop']
Gradient Boosting Predictions on External Data:
['hiphop', 'hiphop', 'hiphop', 'hiphop', 'hiphop', 'hiphop', 'hiphop', 'pop']
K-Nearest Neighbors Predictions on External Data:
['reggae', 'pop', 'pop', 'hiphop', 'hiphop', 'reggae', 'reggae', 'reggae']
