# Supervised Learning Music Genre Classification

## Import Statements

Start by importing necessary libraries.

In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import soundfile as sf
import joblib

## Import Data from prepared CSV

In [None]:
# Load the prepared feature table and discard the 'filename', 'start' and
# 'end' columns, which are not used as model inputs.
data = pd.read_csv('all_genres_audio_features.csv').drop(
    columns=['filename', 'start', 'end']
)

# Genre names, kept for reference.
genres = [
    "blues", "classical", "country", "disco", "hiphop",
    "jazz", "metal", "pop", "reggae", "rock",
]

# Notebook output: (rows, columns) of the loaded frame.
data.shape

Check for duplicate rows and null values.

In [None]:
def check_duplicates_or_nulls(data):
    """Tell whether ``data`` contains a duplicated row or a null cell.

    Args:
        data: pandas DataFrame to inspect.

    Returns:
        A truthy value when at least one fully duplicated row or at least
        one missing value is present, a falsy value otherwise.
    """
    has_duplicate_rows = data.duplicated().any()
    has_missing_values = data.isnull().any().any()
    return has_duplicate_rows or has_missing_values

# Run the sanity check on the loaded frame and report the outcome.
if not check_duplicates_or_nulls(data):
    print('No duplicates or null values found in data')
else:
    print('Duplicates or null values found in data')

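Convert string-valued features to numbers: the `tempo` column is read as text wrapped in square brackets, so strip the brackets and cast it to float.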

In [None]:
# Clean the 'tempo' column
# Strip the surrounding square brackets from 'tempo' and convert it to float.
# The column is cast to str first so the cell can be re-run safely: once the
# column is numeric, the .str accessor would otherwise raise AttributeError.
data['tempo'] = data['tempo'].astype(str).str.strip('[]').astype(float)

# Verify the changes
data.describe(include='all')

## Begin Preprocessing

Apply a low-pass filter to the features using a moving average with a window size of 3.

In [None]:
def apply_moving_average_filter(df, window_size=3):
    """Smooth every feature column with a trailing moving average.

    Each column except ``'genre'`` is replaced by the mean of the current
    row and up to ``window_size - 1`` preceding rows. The first rows, where
    fewer rows are available, use a shorter window (``min_periods=1``), so
    no NaN values are introduced at the start.

    The window runs over consecutive rows of the whole frame in its current
    order, regardless of which genre neighbouring rows belong to.

    Args:
        df: pandas DataFrame holding the feature columns and, optionally,
            a ``'genre'`` column that is left untouched.
        window_size: number of rows in the averaging window.

    Returns:
        A new DataFrame with the smoothed features. ``df`` itself is not
        modified.
    """
    # Work on a copy so the caller's frame is not silently overwritten.
    smoothed = df.copy()
    feature_columns = smoothed.columns.difference(['genre'])
    smoothed[feature_columns] = (
        smoothed[feature_columns]
        .rolling(window=window_size, min_periods=1)
        .mean()
    )
    return smoothed

# Smooth the feature columns with a 3-row moving average.
data = apply_moving_average_filter(data, window_size=3)

## Shuffle and Split data

Split data into training and testing sets (70/30).

In [None]:
# Split the data into features and target variable
# Split the data into features and target variable. The target is selected by
# name ('genre', the same column the moving-average filter excludes) rather
# than by position, so the split does not depend on the column order.
y = data['genre']
X = np.array(data.drop(columns=['genre']), dtype=float)

# Split the dataset with 70% for training set and 30% for test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

## Normalize the data set and encode the labels

In [None]:
# Scale the features
# Scale the features: the scaler is fitted on the training split only and the
# fitted transformation is then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# Encode the target labels: the encoder is fitted on the training labels and
# then used to convert both label sets to integer codes.
encoder = LabelEncoder().fit(y_train)
y_train, y_test = encoder.transform(y_train), encoder.transform(y_test)

## Model initialization function

Define a function for model training and evaluation.

In [None]:
# Define the model training and evaluation function with cross-validation
def train_and_evaluate_model(model, param_grid, cv=5, scoring='accuracy', n_jobs=None):
    """Tune ``model`` with a grid search and evaluate the winner on the test set.

    Reads the module-level ``X_train``, ``y_train``, ``X_test``, ``y_test``
    and ``encoder`` defined earlier in the notebook.

    Args:
        model: unfitted scikit-learn estimator.
        param_grid: parameter grid (dict, or list of dicts) passed to
            ``GridSearchCV``.
        cv: cross-validation setting forwarded to ``GridSearchCV``.
        scoring: single scoring metric used to rank the candidates.
        n_jobs: number of parallel jobs for the search (``None`` keeps the
            scikit-learn default; ``-1`` uses all cores).

    Returns:
        Tuple ``(best_model, accuracy, report)``: the best estimator refitted
        on the whole training set, its accuracy on the test set, and the
        text classification report for the test set.
    """
    grid_search = GridSearchCV(model, param_grid, cv=cv, scoring=scoring, n_jobs=n_jobs)
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_

    # Cross-validation: the search already scored the winning configuration
    # on every fold, so read those scores back instead of re-fitting the
    # model once more per fold with cross_val_score.
    best_index = grid_search.best_index_
    cv_scores = np.array([
        grid_search.cv_results_[f'split{fold}_test_score'][best_index]
        for fold in range(grid_search.n_splits_)
    ])
    print(f'Cross-validation scores: {cv_scores}')
    print(f'Mean cross-validation score: {np.mean(cv_scores)}')

    # Held-out evaluation of the refitted best estimator.
    y_pred = best_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, target_names=encoder.classes_)
    return best_model, accuracy, report


Random Forest Model.

In [None]:
# Hyper-parameter grid for the random forest.
rf_param_grid = {
    'n_estimators': [100, 250, 1000],
    # 'auto' is omitted: for RandomForestClassifier it was only an alias of
    # 'sqrt', and it has been removed from scikit-learn (deprecated in 1.1,
    # removed in 1.3), so keeping it either fails or duplicates 'sqrt'.
    'max_features': ['sqrt', 'log2'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
# Seeded (same value as the train/test split) so the search and the reported
# scores are repeatable between runs.
rf_model = RandomForestClassifier(random_state=42)
rf_best_model, rf_accuracy, rf_report = train_and_evaluate_model(rf_model, rf_param_grid)
print("Random Forest Accuracy:", rf_accuracy)
print("Random Forest Classification Report:\n", rf_report)

Support Vector Machine Model. 

In [None]:
# Hyper-parameter grid for the SVM, split per kernel: 'gamma' has no effect
# on the linear kernel, so combining it with 'linear' would fit the same
# linear model once per gamma value.
svm_param_grid = [
    {
        'kernel': ['rbf'],
        'C': [0.1, 1, 10, 100],
        'gamma': [1, 0.1, 0.01, 0.001],
    },
    {
        'kernel': ['linear'],
        'C': [0.1, 1, 10, 100],
    },
]
svm_model = SVC()
svm_best_model, svm_accuracy, svm_report = train_and_evaluate_model(svm_model, svm_param_grid)
print("SVM Accuracy:", svm_accuracy)
print("SVM Classification Report:\n", svm_report)

Gradient Boost Model.

In [None]:
# Hyper-parameter grid for gradient boosting: ensemble size, shrinkage and
# depth of the individual trees.
gb_param_grid = dict(
    n_estimators=[50, 100, 200],
    learning_rate=[0.01, 0.1, 0.5],
    max_depth=[3, 5, 7],
)
gb_model = GradientBoostingClassifier()

# Tune on the training set, then report the held-out results.
gb_results = train_and_evaluate_model(gb_model, gb_param_grid)
gb_best_model, gb_accuracy, gb_report = gb_results
print("Gradient Boosting Accuracy:", gb_accuracy)
print("Gradient Boosting Classification Report:\n", gb_report)

K-Nearest Neighbors Model.

In [None]:
# Hyper-parameter grid for k-nearest neighbours: neighbourhood size, vote
# weighting and distance metric.
knn_param_grid = dict(
    n_neighbors=[3, 5, 7, 9],
    weights=['uniform', 'distance'],
    metric=['euclidean', 'manhattan'],
)
knn_model = KNeighborsClassifier()

# Tune on the training set, then report the held-out results.
knn_results = train_and_evaluate_model(knn_model, knn_param_grid)
knn_best_model, knn_accuracy, knn_report = knn_results
print("KNeighbors Accuracy:", knn_accuracy)
print("KNeighbors Classification Report:\n", knn_report)

## Save models for future use

Use joblib to save the tuned models, the scaler, the encoder, and the data splits for later export/import.

In [None]:
# Make sure the target directory exists first: joblib.dump does not create
# missing directories and would fail with FileNotFoundError otherwise.
os.makedirs('outputs', exist_ok=True)

# Save the best models
joblib.dump(rf_best_model, 'outputs/rf_best_model.pkl')
joblib.dump(svm_best_model, 'outputs/svm_best_model.pkl')
joblib.dump(gb_best_model, 'outputs/gb_best_model.pkl')
joblib.dump(knn_best_model, 'outputs/knn_best_model.pkl')

# Save the fitted scaler and encoder so new data can be transformed the same way
joblib.dump(scaler, 'outputs/scaler.pkl')
joblib.dump(encoder, 'outputs/encoder.pkl')

# Save the scaled data and encoded labels
joblib.dump(X_train, 'outputs/X_train.pkl')
joblib.dump(X_test, 'outputs/X_test.pkl')
joblib.dump(y_train, 'outputs/y_train.pkl')
joblib.dump(y_test, 'outputs/y_test.pkl')