This project involves building a machine learning model for audio classification, potentially focused on music genre classification. It uses the librosa library for audio feature extraction, while sklearn is employed for data preprocessing and model training.

In [34]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from glob import glob
import librosa
import librosa.display

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import StackingClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier


from xgboost import XGBClassifier

import warnings
warnings.filterwarnings("ignore")


In [35]:
tracks = pd.read_csv("fma_small/fma_metadata/tracks.csv", index_col=0, header=[0, 1])

In [36]:
audio_files = glob("./fma_medium/*/*.mp3")
print(f"Number of audio files found: {len(audio_files)}")

Number of audio files found: 25000


In [37]:
def extract_features(file_path):
    try:
        x, sr = librosa.load(file_path, sr=None)
        
        mfcc = np.mean(librosa.feature.mfcc(y=x, sr=sr, n_mfcc=13).T, axis=0)
        spectral_contrast = np.mean(librosa.feature.spectral_contrast(y=x, sr=sr).T, axis=0)
        zero_crossing_rate = np.mean(librosa.feature.zero_crossing_rate(y=x))
        spectral_centroid = np.mean(librosa.feature.spectral_centroid(y=x, sr=sr))
        chroma_stft = np.mean(librosa.feature.chroma_stft(y=x, sr=sr).T, axis=0)
        spectral_rolloff = np.mean(librosa.feature.spectral_rolloff(y=x, sr=sr))
        rms = np.mean(librosa.feature.rms(y=x))
        spectral_bandwidth = np.mean(librosa.feature.spectral_bandwidth(y=x, sr=sr))
        tempo = librosa.beat.tempo(y=x, sr=sr)[0]
        
        features = np.hstack([
            mfcc, spectral_contrast, zero_crossing_rate, spectral_centroid, 
            chroma_stft, spectral_rolloff, rms, spectral_bandwidth, tempo
        ])
        return features
    except Exception as e:
        print(f"Could not process file {file_path}: {e}")
        return None


In [None]:
from joblib import Parallel, delayed
from tqdm import tqdm

def process_file(file):
    try:
        track_id = int(file.split("\\")[-1].split(".")[0])
        
        if track_id in tracks.index:
            features = extract_features(file)
            
            if features is not None:
                genre = tracks.loc[track_id, ('track', 'genre_top')]
                return [track_id, genre] + list(features)
    except Exception as e:
        print(f"Could not process file {file}: {e}")
        return None

data = Parallel(n_jobs=-1, backend="threading")(delayed(process_file)(file) for file in tqdm(audio_files, desc="Extracting Features"))

data = [d for d in data if d is not None]

columns = ['track_id', 'genre'] + [f'mfcc_{i+1}' for i in range(13)] + \
          [f'spectral_contrast_{i+1}' for i in range(7)] + \
          ['zero_crossing_rate', 'spectral_centroid'] + \
          [f'chroma_{i+1}' for i in range(12)] + \
          ['spectral_rolloff', 'rms', 'spectral_bandwidth', 'tempo']

df = pd.DataFrame(data, columns=columns)

df.head()


In [117]:
df.to_csv('genre_features.csv', index=False)
df = pd.read_csv('genre_features.csv')

In [39]:
le = LabelEncoder()
df['genre'] = le.fit_transform(df['genre'])

X = df.drop(['track_id', 'genre'], axis=1)
y = df['genre']

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)


In [None]:
estimators = [
    ('xgb', XGBClassifier(random_state=42)),
    ('svc', SVC(kernel='linear', probability=True)),
    ('mlp', MLPClassifier(hidden_layer_sizes=(100,), max_iter=500))
]

stacking = StackingClassifier(estimators=estimators, final_estimator=XGBClassifier(random_state=42))
stacking.fit(X_train, y_train)

y_pred_stack = stacking.predict(X_test)
print("Stacking Ensemble Accuracy:", accuracy_score(y_test, y_pred_stack))


In [None]:
print("\nClassification Report:")
print(classification_report(y_test, y_pred_stack, target_names=le.classes_))

In [110]:
def extract_features_custom(file_path):
    try:
        x, sr = librosa.load(file_path, sr=None)

        mfcc = np.mean(librosa.feature.mfcc(y=x, sr=sr, n_mfcc=13).T, axis=0)
        spectral_contrast = np.mean(librosa.feature.spectral_contrast(y=x, sr=sr).T, axis=0)
        zero_crossing_rate = np.mean(librosa.feature.zero_crossing_rate(y=x))
        spectral_centroid = np.mean(librosa.feature.spectral_centroid(y=x, sr=sr))
        chroma_stft = np.mean(librosa.feature.chroma_stft(y=x, sr=sr).T, axis=0)
        spectral_rolloff = np.mean(librosa.feature.spectral_rolloff(y=x, sr=sr))
        rms = np.mean(librosa.feature.rms(y=x))
        spectral_bandwidth = np.mean(librosa.feature.spectral_bandwidth(y=x, sr=sr))
        tempo = librosa.beat.tempo(y=x, sr=sr)[0]

        features = np.hstack([mfcc, spectral_contrast, zero_crossing_rate, spectral_centroid, chroma_stft,
                              spectral_rolloff, rms, spectral_bandwidth, tempo])
        return features
    except Exception as e:
        print(f"Could not process file {file_path}: {e}")
        return None

In [111]:
def predict_genre_top_n(file_path, top_n=3):
    features = extract_features_custom(file_path)
    
    if features is not None:
        features = np.array(features).reshape(1, -1)
        features = scaler.transform(features)
        
        probabilities = stacking.predict_proba(features)[0]

        top_n_indices = np.argsort(probabilities)[::-1][:top_n]
        
        top_n_genres = le.inverse_transform(top_n_indices)
        top_n_probabilities = probabilities[top_n_indices]
        
        print(f"Top {top_n} Predictions for '{file_path}':")
        for i in range(top_n):
            print(f"{i+1}: {top_n_genres[i]} ({top_n_probabilities[i]*100:.2f}% probability)")
        
        return top_n_genres, top_n_probabilities
    else:
        print("Feature extraction failed for the given track.")
        return None

In [120]:
predict_genre_top_n("FANFARA.mp3", top_n=16)

Top 16 Predictions for 'FANFARA.mp3':
1: Electronic (89.05% probability)
2: Rock (5.62% probability)
3: Experimental (3.11% probability)
4: Hip-Hop (1.18% probability)
5: International (0.39% probability)
6: Instrumental (0.22% probability)
7: Pop (0.18% probability)
8: Soul-RnB (0.14% probability)
9: Jazz (0.05% probability)
10: Folk (0.03% probability)
11: Classical (0.01% probability)
12: Easy Listening (0.01% probability)
13: Blues (0.00% probability)
14: Country (0.00% probability)
15: Spoken (0.00% probability)
16: Old-Time / Historic (0.00% probability)


(array(['Electronic', 'Rock', 'Experimental', 'Hip-Hop', 'International',
        'Instrumental', 'Pop', 'Soul-RnB', 'Jazz', 'Folk', 'Classical',
        'Easy Listening', 'Blues', 'Country', 'Spoken',
        'Old-Time / Historic'], dtype=object),
 array([8.90514553e-01, 5.62253706e-02, 3.11183445e-02, 1.17971925e-02,
        3.86611465e-03, 2.20152270e-03, 1.75020471e-03, 1.39427418e-03,
        5.23860799e-04, 2.86897674e-04, 1.17606549e-04, 6.49761423e-05,
        4.87718353e-05, 4.72633947e-05, 3.23445165e-05, 1.06693451e-05],
       dtype=float32))