In [1]:
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import time
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import (
    accuracy_score, balanced_accuracy_score, precision_score, average_precision_score,
    recall_score, f1_score, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay
)
from sklearn.model_selection import cross_val_score, train_test_split

import IPython
os.environ['NUMBA_CACHE_DIR'] = IPython.paths.get_ipython_cache_dir()

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Get the user's home directory (e.g., 'C:\Users\user')
DATA_DIR = r"C:\Users\ferit\MLPC2025_classification"
metadata_dir = os.path.join(DATA_DIR, 'metadata.csv')
features_dir = os.path.join(DATA_DIR, 'audio_features')
labels_dir = os.path.join(DATA_DIR, 'labels')

#### Load Metadata

In [3]:
# Load Metadata & Training Files
metadata = pd.read_csv(metadata_dir)
train_files = metadata.sample(len(metadata), random_state=42)["filename"].unique()

#### Load categories

In [4]:
# Select a random audio file and its annotations
np.random.seed(1)
sample_file = np.random.choice(metadata['filename'].tolist())

# Load corresponding label file
label_filename = sample_file.replace('.mp3', '_labels.npz')
label_path = os.path.join(DATA_DIR, 'labels', label_filename)
labels = np.load(label_path)

# Load categories
categories = list(labels.keys())
print("Names of Label Classes: \n", categories)

Names of Label Classes: 
 ['Airplane', 'Alarm', 'Beep/Bleep', 'Bell', 'Bicycle', 'Bird Chirp', 'Bus', 'Car', 'Cat Meow', 'Chainsaw', 'Clapping', 'Cough', 'Cow Moo', 'Cowbell', 'Crying', 'Dog Bark', 'Doorbell', 'Drip', 'Drums', 'Fire', 'Footsteps', 'Guitar', 'Hammer', 'Helicopter', 'Hiccup', 'Horn Honk', 'Horse Neigh', 'Insect Buzz', 'Jackhammer', 'Laughter', 'Lawn Mower', 'Motorcycle', 'Piano', 'Pig Oink', 'Power Drill', 'Power Saw', 'Rain', 'Rooster Crow', 'Saxophone', 'Sewing Machine', 'Sheep/Goat Bleat', 'Ship/Boat', 'Shout', 'Singing', 'Siren', 'Sneeze', 'Snoring', 'Speech', 'Stream/River', 'Thunder', 'Train', 'Truck', 'Trumpet', 'Vacuum Cleaner', 'Violin', 'Washing Machine', 'Waves', 'Wind']


#### Aggregate Labels

In [5]:
# Aggregate Labels
def aggregate_labels(file_labels):
    __y = []
    for frame_labels in file_labels:
        if(sum(frame_labels) == 0):
            __y.append([0])
        elif(np.count_nonzero(frame_labels) == len(frame_labels)):
             __y.append([1])
        else: #The annotators don't agree on the label
            __y.append([np.random.choice(frame_labels)])
    return __y

### Split Train Data

#### Read Files

In [6]:
# Read files and split train data
import itertools
def read_files(file_names, num_to_read=(len(train_files))):
    X_train = []
    Y_train = {}
    for c in categories:
        Y_train[c] = []
    for f in file_names[:num_to_read]:
        if not os.path.exists(os.path.join(features_dir , f.split('.')[0] + '.npz')):
            continue
        features = np.load(os.path.join(features_dir , f.split('.')[0] + '.npz'))["embeddings"]
        X_train.append(features)
        y = np.load(os.path.join(labels_dir , f.split('.')[0] + '_labels.npz'))
        for c in categories:
            _y = aggregate_labels(y[c])
            Y_train[c].extend(list(itertools.chain.from_iterable(_y)))
    X_train = np.concatenate(X_train)
    return X_train, Y_train

#### Split data without data leakage

In [7]:
# Split data without data leakage
train_files, test_files = train_test_split(metadata["filename"].unique(), test_size=0.3, random_state=42)
valid_files, test_files = train_test_split(test_files, test_size=1/3, random_state=42)
#So final split: 70% train, 20% validation, 10% test

# Print number of train, validation and test files
print("Number of train files      :",len(train_files))
print("Number of validation files :",len(valid_files))
print("Number of test files       :",len(test_files))

Number of train files      : 5761
Number of validation files : 1646
Number of test files       : 823


#### Load train & test (subset) data

In [8]:
# High computational effort!!!

In [9]:
# Load train & test data
split = 40
X_train_raw, Y_train = read_files(train_files, split * 7)
X_valid_raw, Y_valid = read_files(valid_files, split * 2)
X_test_raw, Y_test = read_files(test_files, split * 1)

# print raw feature tensor shapes
print("Train shapes      (raw):", X_train_raw.shape, len(Y_train))
print("Validation shapes (raw):", X_valid_raw.shape, len(Y_valid))
print("Test shapes       (raw):", X_test_raw.shape, len(Y_test))

Train shapes      (raw): (52214, 768) 58
Validation shapes (raw): (14673, 768) 58
Test shapes       (raw): (7431, 768) 58


### Apply Feature Normalization & PCA dimensionality reduction (with 95% explained variance)

#### Feature Tensor Scaler

In [10]:
from sklearn.preprocessing import StandardScaler

# Compute scaling parameters ONLY on training data
scaler = StandardScaler().fit(X_train_raw)

#### Scale & Normalize Feature Tensor

In [11]:
# Scale Train and Test Feature Tensor
X_train_scaled = scaler.transform(X_train_raw)
X_valid_scaled = scaler.transform(X_valid_raw)
X_test_scaled = scaler.transform(X_test_raw)

# print scaled feature tensor shapes
print("Train shapes      (scaled):", X_train_scaled.shape, len(Y_train))
print("Validation shapes (scaled):", X_valid_scaled.shape, len(Y_test))
print("Test shapes       (scaled):", X_test_scaled.shape, len(Y_test))

Train shapes      (scaled): (52214, 768) 58
Validation shapes (scaled): (14673, 768) 58
Test shapes       (scaled): (7431, 768) 58


#### Dimensionality reduction for Feature Tensor with PCA

In [12]:
from sklearn.decomposition import PCA

# PCA dimensionality reduction with keeping 95% of the variance
pca = PCA(n_components=0.95)
X_train = pca.fit_transform(X_train_scaled) # fit and transform train feature tensor
X_valid = pca.transform(X_valid_scaled)     # transform validation feature tensor
X_test = pca.transform(X_test_scaled)       # transform test feature tensor

# print reduced feature tensor shapes
print("Train shapes      (reduced):", X_train.shape, len(Y_train))
print("Validation shapes (reduced):", X_valid.shape, len(Y_valid))
print("Test shapes       (reduced):", X_test.shape, len(Y_test))

Train shapes      (reduced): (52214, 149) 58
Validation shapes (reduced): (14673, 149) 58
Test shapes       (reduced): (7431, 149) 58


### Training Experiments

#### Class labels for GridSearch

In [13]:
selected_classes = {'Speech'}

# Multi-class labels
Y_train_multi = np.array([Y_train[cls] for cls in selected_classes]).T
Y_valid_multi = np.array([Y_valid[cls] for cls in selected_classes]).T
Y_test_multi = np.array([Y_test[cls] for cls in selected_classes]).T

# print multi-class label class shapes
print("Train labels:      ", len(Y_train_multi))
print("Validation labels: ", len(Y_valid_multi))
print("Test labels:       ", len(Y_test_multi))

Train labels:       52214
Validation labels:  14673
Test labels:        7431


#### Macro-Averaged Balanced Accuracy Metric

In [14]:
from sklearn.metrics import balanced_accuracy_score

# Macro-Averaged Balanced Accuracy
def balanced_accuracy(Y_valid, Y_pred):
    # Ensure arrays are 2D (reshape if 1D)
    if Y_valid.ndim == 1:
        Y_valid = Y_valid.reshape(-1, 1)
    if Y_pred.ndim == 1:
        Y_pred = Y_pred.reshape(-1, 1)
        
    n_labels = Y_valid.shape[1]
    balanced_accuracies = []
    
    for i in range(n_labels):
        score = balanced_accuracy_score(Y_valid[:, i], Y_pred[:, i])
        balanced_accuracies.append(score)
    
    # Macro-average across labels
    balanced_accuracy_macro = np.mean(balanced_accuracies)
    
    print(f"Macro-Averaged Balanced Accuracy: {balanced_accuracy_macro:.2f}")

#### Grid-kNN Classifier

In [15]:
# High computational effort!!!

In [16]:
# Grid-kNN Classifier
kNN = KNeighborsClassifier(weights='distance')
grid_kNN = MultiOutputClassifier(kNN)

# Parameter grid
param_grid = {
    'estimator__n_neighbors': [3, 10, 50, 100],
    'estimator__metric': ['euclidean', 'manhattan', 'cosine']
}

# Initialize GridSearchCV
grid_search = GridSearchCV(
    estimator=grid_kNN,
    param_grid=param_grid,
    cv=3,                         # n-fold cross-validation
    scoring='balanced_accuracy',
    verbose=2,                    # Print progress
    n_jobs=-1                     # Use all CPU cores
)

# Fit on training data
start_time = time.time()                  # Start timer
grid_search.fit(X_train, Y_train_multi)
end_time = time.time()                    # End timer

Fitting 3 folds for each of 12 candidates, totalling 36 fits


#### Grid-kNN Performance Metrics

In [17]:
# Print traing time
total_seconds = end_time - start_time
hours = int(total_seconds // 3600)
remaining_seconds = total_seconds % 3600
minutes = int(remaining_seconds // 60)
seconds = remaining_seconds % 60

print(f"\nTraining completed in: {hours}h {minutes}m {seconds:.2f}s")

# Best parameters
print("\nBest parameters:", grid_search.best_params_)
best_knn_model = grid_search.best_estimator_

# Predict on validation set
Y_valid_pred = grid_search.predict(X_valid)

# Grid-kNN Performance Metrics 
print("\nPerformance Metrics – Grid-kNN Classifier")
print("=" * 60)
grid_kNN_perf = balanced_accuracy(Y_valid_multi, Y_valid_pred)


Training completed in: 0h 3m 32.88s

Best parameters: {'estimator__metric': 'cosine', 'estimator__n_neighbors': 50}

Performance Metrics – Grid-kNN Classifier
Macro-Averaged Balanced Accuracy: 0.77


#### Save best kNN model

In [18]:
from joblib import dump, load

# Save best model
dump(best_knn_model, 'best_knn_model.joblib')

['best_knn_model.joblib']

#### Grid-Random Forest Classifier

In [19]:
# Random Forest Classifier
rf = RandomForestClassifier(random_state=42)
grid_rf = MultiOutputClassifier(rf)

# Parameter grid
param_grid = {
    'estimator__n_estimators': [100, 200, 300],
    'estimator__max_depth': [5, 10, 20],
    'estimator__min_samples_split': [2, 5],
    'estimator__min_samples_leaf': [1, 2],
    'estimator__max_features': ['sqrt', 'log2']
}

# Initialize GridSearchCV
grid_search = GridSearchCV(
    estimator=grid_rf,
    param_grid=param_grid,
    cv=3,                         # n-fold cross-validation
    scoring='balanced_accuracy',
    verbose=3,
    n_jobs=-1                     # Use all CPU cores
)

# Fit on training data
start_time = time.time()                  # Start timer
grid_search.fit(X_train, Y_train_multi)
end_time = time.time()                    # End timer

Fitting 3 folds for each of 72 candidates, totalling 216 fits


#### Grid-Random Forest Performance Metrics

In [20]:
# Print traing time
total_seconds = end_time - start_time
hours = int(total_seconds // 3600)
remaining_seconds = total_seconds % 3600
minutes = int(remaining_seconds // 60)
seconds = remaining_seconds % 60

print(f"\nTraining completed in: {hours}h {minutes}m {seconds:.2f}s")

# Best parameters
print("\nBest parameters:", grid_search.best_params_)
best_rf_model = grid_search.best_estimator_

# Predict on validation set
Y_valid_pred = grid_search.predict(X_valid)

# Grid-RF Performance Metrics 
print("\nPerformance Metrics – Grid-Random Forest Classifier")
print("=" * 60)
grid_rf_perf = balanced_accuracy(Y_valid_multi, Y_valid_pred)


Training completed in: 0h 27m 34.95s

Best parameters: {'estimator__max_depth': 20, 'estimator__max_features': 'sqrt', 'estimator__min_samples_leaf': 2, 'estimator__min_samples_split': 2, 'estimator__n_estimators': 100}

Performance Metrics – Grid-Random Forest Classifier
Macro-Averaged Balanced Accuracy: 0.76


#### Save best Random Forest model

In [21]:
from joblib import dump, load

# Save best model
dump(best_rf_model, 'best_rf_model.joblib')

['best_rf_model.joblib']

#### Grid-XGBoost Classifier

In [22]:
# Grid-XGBoost Classifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

# XGBoost Model Configuration
grid_xgb = XGBClassifier(random_state=42)

# Parameter grid
param_grid = {
    'estimator__learning_rate': [0.05, 0.1],
    'estimator__n_estimators': [300, 400],
    'estimator__max_depth': [6, 8],
    'estimator__max_bin': [256, 512],
    'estimator__gamma': [0, 0.1],
    'estimator__reg_alpha': [0, 0.1],
    'estimator__reg_lambda': [0.5, 1],
    'estimator__subsample': [0.8, 1.0],
    'estimator__colsample_bytree': [0.8, 1.0]
}

# Initialize GridSearchCV
grid_search = GridSearchCV(
    estimator=grid_xgb,
    param_grid=param_grid,
    cv=3,
    scoring='balanced_accuracy',
    verbose=1,
    n_jobs=1
)

# Fit on training data
start_time = time.time()                  # Start timer
grid_search.fit(X_train, Y_train_multi)
end_time = time.time()                    # End timer

Fitting 3 folds for each of 512 candidates, totalling 1536 fits


#### Grid-XGBoost Performance Metrics

In [23]:
# Print traing time
total_seconds = end_time - start_time
hours = int(total_seconds // 3600)
remaining_seconds = total_seconds % 3600
minutes = int(remaining_seconds // 60)
seconds = remaining_seconds % 60

print(f"\nTraining completed in: {hours}h {minutes}m {seconds:.2f}s")

# Best parameters
print("\nBest parameters:", grid_search.best_params_)
best_xgb_model = grid_search.best_estimator_

# Predict on validation set
Y_valid_pred = grid_search.predict(X_valid)

# Grid-XGBoost Performance Metrics 
print("\nPerformance Metrics – Grid-XGBoost Classifier")
print("=" * 60)
grid_xgb_perf = balanced_accuracy(Y_valid_multi, Y_valid_pred)


Training completed in: 0h 21m 10.23s

Best parameters: {'estimator__colsample_bytree': 0.8, 'estimator__gamma': 0, 'estimator__learning_rate': 0.05, 'estimator__max_bin': 256, 'estimator__max_depth': 6, 'estimator__n_estimators': 300, 'estimator__reg_alpha': 0, 'estimator__reg_lambda': 0.5, 'estimator__subsample': 0.8}

Performance Metrics – Grid-XGBoost Classifier
Macro-Averaged Balanced Accuracy: 0.78


#### Save best XGBoost model

In [24]:
from joblib import dump, load

# Save best model
dump(best_xgb_model, 'best_xgb_model.joblib')

['best_xgb_model.joblib']

#### Grid-Neural Network Classifier

#### CUDA availability check

In [15]:
import torch
print("CUDA version:",torch.version.cuda)

# CUDA availability check
if torch.cuda.is_available():
    device = torch.device('cuda')
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")
else:
    device = torch.device('cpu')
    print("Using CPU")

CUDA version: 12.8
Using GPU: NVIDIA GeForce RTX 3050 Ti Laptop GPU


#### Convert data to tensors

In [16]:
from torch.utils.data import TensorDataset, DataLoader

# Convert data to PyTorch tensors
X_train_tensor = torch.FloatTensor(X_train)
Y_train_tensor = torch.FloatTensor(Y_train_multi)
X_test_tensor = torch.FloatTensor(X_valid)
Y_test_tensor = torch.FloatTensor(Y_valid_multi)

#### Neural Network Classifier

In [17]:
# Neural Network Classifier
import torch
import torch.nn as nn

# Simple neural network model
class SimpleNN(nn.Module):
    def __init__(self, input_size, hidden_size1=128, hidden_size2=64, output_size=1):
        super(SimpleNN, self).__init__()
        self.layer1 = nn.Linear(input_size, hidden_size1)
        self.layer2 = nn.Linear(hidden_size1, hidden_size2)
        self.output = nn.Linear(hidden_size2, output_size)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x):
        x = self.relu(self.layer1(x))
        x = self.relu(self.layer2(x))
        x = self.sigmoid(self.output(x))
        return x

In [18]:
from skorch import NeuralNetClassifier
from sklearn.model_selection import GridSearchCV
import torch.optim as optim

# Wrap the SimpleNN in a skorch classifier
net = NeuralNetClassifier(
    module=SimpleNN,
    module__input_size=X_train_tensor.shape[1],
    criterion=nn.BCELoss,
    optimizer=optim.Adam,
    max_epochs=10,
    batch_size=256,
    device='cuda' if torch.cuda.is_available() else 'cpu',
    verbose=0
)

# Parameter grid
param_grid = {
    'lr': [0.001, 0.01],
    'module__hidden_size1': [64, 128],
    'module__hidden_size2': [32, 64],
    'optimizer__weight_decay': [0, 0.001]
}

# Initialize GridSearchCV
grid_search_nn = GridSearchCV(
    estimator=net,
    param_grid=param_grid,
    cv=3,
    scoring='balanced_accuracy',
    verbose=1,
    n_jobs=1
)

# Fit on training data
start_time = time.time()
grid_search_nn.fit(X_train_tensor, Y_train_tensor)
end_time = time.time()

Fitting 3 folds for each of 16 candidates, totalling 48 fits


#### Grid-SimlpleNN Performance Metrics

In [19]:
# Print training time
total_seconds = end_time - start_time
hours = int(total_seconds // 3600)
remaining_seconds = total_seconds % 3600
minutes = int(remaining_seconds // 60)
seconds = remaining_seconds % 60
print(f"\nTraining completed in: {hours}h {minutes}m {seconds:.2f}s")

# Best parameters
print("\nBest parameters:", grid_search_nn.best_params_)
best_nn_model = grid_search_nn.best_estimator_

# Predict on validation set
Y_valid_pred = best_nn_model.predict(X_test_tensor)

# Neural Network Performance Metrics 
print("\nPerformance Metrics - Neural Network Classifier")
print("=" * 60)
balanced_accuracy(Y_valid_multi, Y_valid_pred)


Training completed in: 0h 3m 46.87s

Best parameters: {'lr': 0.01, 'module__hidden_size1': 64, 'module__hidden_size2': 32, 'optimizer__weight_decay': 0.001}

Performance Metrics - Neural Network Classifier
Macro-Averaged Balanced Accuracy: 0.81


#### Save best SimpleNN model

In [20]:
import torch

# Save the entire model (architecture + weights + optimizer state)
torch.save(best_nn_model.module_, 'best_nn_model.pt')