# 1. Load dataset and environemnt setup
### Section 1.1: libraries import and data path

In [6]:
import numpy as np
import librosa
import os
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import StandardScaler, LabelEncoder
import matplotlib.pyplot as plt
import pickle
from tqdm import tqdm

# Paths for dataset and label files
DATASET_PATH = "/mnt/c/DF/LA/ASVspoof2019_LA_train/flac"
LABEL_FILE_PATH = "/mnt/c/DF/LA/ASVspoof2019_LA_cm_protocols/ASVspoof2019.LA.cm.train.trn.txt"
TEST_DATASET_PATH = "/mnt/c/DF/LA/ASVspoof2019_LA_dev/flac"
TEST_LABEL_FILE_PATH = "/mnt/c/DF/LA/ASVspoof2019_LA_cm_protocols/ASVspoof2019.LA.cm.dev.trl.txt"

RANDOM_STATE = 2159081

### Section 1.2: function to extract mfcc features

In [2]:
# Extract features from audio files
def extract_features(file_path, n_mfcc=20):
    """
    Extract MFCC (Mel-Frequency Cepstral Coefficients) features from an audio file.
    
    Steps:
    1. Load the audio file using librosa.
    2. Compute the MFCCs, which capture the spectral characteristics of the audio.
    3. Take the mean of the MFCC coefficients across the time axis to summarize the audio into a fixed-length feature vector.

    Args:
        file_path (str): Path to the audio file.
        n_mfcc (int): Number of MFCC coefficients to compute.

    Returns:
        np.ndarray: A fixed-length feature vector representing the audio.
    """
    y, sr = librosa.load(file_path, sr=None)  # Load the audio signal and its sampling rate.
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)  # Compute MFCC features.
    return np.mean(mfcc.T, axis=0)  # Return the mean MFCC values across time.

### Section 1.3: function ti load labels file

In [3]:
# Load labels from the label file
def load_labels(label_file):
    """
    Load labels from a protocol file. The protocol file contains mappings between file IDs and their corresponding labels.

    Args:
        label_file (str): Path to the protocol file.

    Returns:
        dict: A dictionary mapping file IDs to labels (e.g., bona fide or spoof).
    """
    labels = {}
    with open(label_file, 'r') as f:
        for line in f:
            parts = line.strip().split()  # Split the line into parts.
            file_id, label = parts[1], parts[-1]  # Extract file ID and label.
            labels[file_id] = label
    return labels

### Section 1.4 function to load files with librosa

In [4]:
def load_and_preprocess_data(dataset_path, label_file):
    """
    Load audio data and extract features based on the provided label file.

    Steps:
    1. Load labels using the `load_labels` function.
    2. Iterate through all .flac files in the dataset directory.
    3. For each file, extract features and retrieve the corresponding label from the protocol file.

    Args:
        dataset_path (str): Path to the directory containing audio files.
        label_file (str): Path to the protocol file with labels.

    Returns:
        tuple: A tuple containing:
            - np.ndarray: Extracted features for all audio files.
            - np.ndarray: Corresponding labels for the audio files.
    """
    labels = load_labels(label_file)  # Load the labels.
    features = []
    labels_list = []

    for file in tqdm(os.listdir(dataset_path), desc="Processing audio files", unit="file"):
        if file.endswith(".flac"):  # Check if the file is a .flac audio file.
            file_path = os.path.join(dataset_path, file)  # Construct the full file path.
            file_id = file.replace(".flac", "")  # Extract the file ID (without extension).

            if file_id in labels:  # Check if the file ID exists in the labels.
                features.append(extract_features(file_path))  # Extract features from the audio file.
                labels_list.append(labels[file_id])  # Append the corresponding label.
            else:
                print(f"Warning: File ID '{file_id}' not found in the label file. Skipping.")

    return np.array(features), np.array(labels_list)

### Section 1.5: load training set and its labels + splitting in validation

In [5]:
# Load training data
print("Loading training data...")
train_features, train_labels = load_and_preprocess_data(DATASET_PATH, LABEL_FILE_PATH)

# Encode labels into numerical format
label_encoder = LabelEncoder()
train_labels = label_encoder.fit_transform(train_labels)

Loading training data...


Processing audio files: 100%|███████████████████████████████████████████████████| 25380/25380 [04:18<00:00, 98.35file/s]


NameError: name 'train_test_split' is not defined

In [7]:
# Split into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(train_features, train_labels, test_size=0.2, random_state=RANDOM_STATE, stratify=train_labels)

# Scale features for better performance
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

# 2. Define One Class SVM
### Section 2.1: Definition

In [8]:
# Define parameter grid for GridSearchCV
param_grid = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'rbf', 'poly'],
    'degree': [2, 3, 4],
    'gamma': ['scale', 'auto']
}

# Perform grid search
print("Performing grid search to find the best SVM parameters...")
grid_search = GridSearchCV(SVC(probability=True, random_state=42), param_grid, cv=5, scoring='roc_auc', verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best parameters and model
best_svm = grid_search.best_estimator_
print(f"Best Parameters: {grid_search.best_params_}")

# Evaluate the best model on the validation set
val_predictions = best_svm.predict(X_val)
val_probabilities = best_svm.predict_proba(X_val)[:, 1]
val_roc_auc = roc_auc_score(y_val, val_probabilities)
print(f"Validation ROC AUC Score: {val_roc_auc:.4f}")

Performing grid search to find the best SVM parameters...
Fitting 5 folds for each of 72 candidates, totalling 360 fits


Traceback (most recent call last):
  File "/home/matteo/miniconda3/envs/sklearn-env/lib/python3.12/site-packages/IPython/core/interactiveshell.py", line 3577, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/tmp/ipykernel_5065/2444792097.py", line 12, in <module>
    grid_search.fit(X_train, y_train)
  File "/home/matteo/miniconda3/envs/sklearn-env/lib/python3.12/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/matteo/miniconda3/envs/sklearn-env/lib/python3.12/site-packages/sklearn/model_selection/_search.py", line 1019, in fit
    self._run_search(evaluate_candidates)
  File "/home/matteo/miniconda3/envs/sklearn-env/lib/python3.12/site-packages/sklearn/model_selection/_search.py", line 1573, in _run_search
    evaluate_candidates(ParameterGrid(self.param_grid))
Exception ignored in: 'zmq.backend.cython._zmq.Frame.__del__'
Traceback (most rece

KeyboardInterrupt: 

### Section 2.2 Load testset and scale it

In [None]:
# Load test data
print("Loading test data...")
test_features, test_labels = load_and_preprocess_data(TEST_DATASET_PATH, TEST_LABEL_FILE_PATH)
test_labels = label_encoder.transform(test_labels)
test_features = scaler.transform(test_features)

### Section 2.3: Predict testset labels

In [None]:
# Evaluate the model on the test set
print("Evaluating model on test data...")
test_predictions = best_svm.predict(test_features)
test_probabilities = best_svm.predict_proba(test_features)[:, 1]

### Section 2.4: Show confusion matrix

In [None]:
# Compute evaluation metrics
roc_auc = roc_auc_score(test_labels, test_probabilities)
print(f"Test ROC AUC Score: {roc_auc:.4f}")

# Confusion Matrix
cm = confusion_matrix(test_labels, test_predictions)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=label_encoder.classes_)
disp.plot(cmap=plt.cm.Blues)
plt.title("Confusion Matrix - SVM")
plt.show()

### Section 2.6: Show ROC

In [None]:
# Plot ROC Curve
fpr, tpr, thresholds = roc_curve(test_labels, test_probabilities)
plt.figure()
plt.plot(fpr, tpr, label=f"ROC Curve (AUC = {roc_auc:.4f})")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve for SVM")
plt.legend()
plt.grid()
plt.show()

### Section 2.7: Save the model

In [None]:
# Save the best model and scaler
with open("best_svm_model.pkl", "wb") as model_file:
    pickle.dump(best_svm, model_file)

with open("scaler.pkl", "wb") as scaler_file:
    pickle.dump(scaler, scaler_file)

print("Best model and scaler saved successfully.")