# Audio Classification using Support Vector Machines (SVMs)

## Model Training

In [1]:
# Import Relevant libraries
import librosa
import os

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler

from tensorflow.keras.preprocessing.sequence import pad_sequences

import pickle

In [2]:
# Define a function to load audio clips and extract features
def load_and_extract_features(audio_folder_path:str, n_mfcc=20)->list:
    """
    Load audio clips from a folder and extract MFCC features.
    
    Parameters:
        - audio_folder_path (str): Path to the folder containing subfolders correspond to each person (class) and each containing audio clips.
        - n_mfcc (int): Number of Mel-frequency cepstral coefficients (MFCC) to extract.
    
    Returns:
        - X (list): Extracted features (shape: [[n_samples, n_mfcc]]).
        - y (list): Labels corresponding to the audio clips (shape: [n_samples,]).
    """
    X = []
    y = []
    for label in range(1, 6):
        folder_path = f"{audio_folder_path}/{label}"
        for file_name in os.listdir(folder_path):
            if not file_name.startswith('.'): 
                file_path = f"{folder_path}/{file_name}"
                # print (file_path)
                # Load audio clip
                signal, sr = librosa.load(file_path, duration=3.0)  # Set duration to 3 seconds
                # Extract MFCC features
                mfcc = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=n_mfcc)
                # Flatten the list
                mfcc_flattened=mfcc.T.flatten().tolist()
                # print(len(mfcc_flattened))
                # Append features and label to X and y
                X.append(mfcc_flattened)
                y.append(label)

    return X, y

In [3]:
# Call the function to load and extract features from audio clips
audio_folder_path = "Audio"  # Path to the folder containing the audio clips
X, y = load_and_extract_features(audio_folder_path)

In [4]:
# Check the length of each element in the X
len(X[1])

2560

In [5]:
# Check the length of X
len(X)

500

In [6]:
# Check the length of y
len(y)

500

In [7]:
# Split the data into training and testing sets (20% test, 80% train)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize an SVM classifier, using linear kernel for the best performance
svm = SVC(kernel='linear', probability=True)


# Pad or truncate MFCC features to a consistent length
X_train = pad_sequences(X_train, maxlen=2500, dtype='float32', padding='post', truncating='post')
X_test = pad_sequences(X_test, maxlen=2500, dtype='float32', padding='post', truncating='post')

# Standardize the data
# REMOVED AS IT IS CAUSING ACCURACY ISSUE
# scaler = StandardScaler()
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)

# Check the size of the data
# print(len(X_train))
# print(len(X_test))

# print(len(X_train[3]))
# print(len(X_test[5]))

# print (len(y_train))
# print (len(y_test))

In [8]:
# Train the SVM classifier on the training data
svm.fit(X_train, y_train)

# Predict labels for the testing data
y_pred = svm.predict(X_test)

## Evaluate the Model

### Confusion Matrix

In [9]:
confusion_mat = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", confusion_mat)

Confusion Matrix:
 [[27  1  0  0  0]
 [ 0 14  0  0  0]
 [ 0  0 10  0  0]
 [ 0  0  0 24  0]
 [ 0  0  0  0 24]]


### Accuracy

In [10]:
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.99


### Precision

In [11]:
# Calculate precision score for each person separately
precision_none = precision_score(y_test, y_pred, average=None)

# Calculate macro-averaged precision score
precision_macro = precision_score(y_test, y_pred, average='macro')
print("Precision (None):", precision_none)
print("Precision (macro):", precision_macro)

Precision (None): [1.         0.93333333 1.         1.         1.        ]
Precision (macro): 0.9866666666666667


### Recall

In [12]:
recall_none = recall_score(y_test, y_pred, average=None)
recall_macro = recall_score(y_test, y_pred, average='macro')
print("Recall (None):", recall_none)
print("Recall (macro):", recall_macro)

Recall (None): [0.96428571 1.         1.         1.         1.        ]
Recall (macro): 0.9928571428571429


### F1 score

In [13]:
f1_none = f1_score(y_test, y_pred, average=None)
f1_macro= f1_score(y_test, y_pred, average="macro")
print("F1 Score (None):", f1_none)
print("F1 Score (macro):", f1_macro)

F1 Score (None): [0.98181818 0.96551724 1.         1.         1.        ]
F1 Score (macro): 0.9894670846394984


## Prediction Function

In [14]:
def predict_new_audio(new_audio_path:str, svm:object, n_mfcc=20)->int:
    
    """
    Predict the source of new audio at new_audio_path based on the model svm trained. 
    
    Parameters:
        - new_audio_path (str): Path to the folder containing new audio to be predicted. 
        - svm: trained SVM model 
        - n_mfcc (int): Number of Mel-frequency cepstral coefficients (MFCC) to extract, default = 20. 
    
    Returns:
        - y_pred[0](int): The predicted label for the given new audio. 
    """
    
    input_list = []
    # Load audio clip 
    signal_new, sr_new = librosa.load(new_audio_path, duration=3.0)
    # abstract the mfcc features
    mfcc_new = librosa.feature.mfcc(y=signal_new, sr=sr_new, n_mfcc=n_mfcc)
    # flatten the list 
    mfcc_flattened=mfcc_new.T.flatten().tolist()
    input_list.append(mfcc_flattened)
    # padd the list to ensure consistent length
    input_list = pad_sequences(input_list, maxlen=2500, dtype='float32', padding='post', truncating='post')
    # predict the label from the list using the model. 
    y_pred = svm.predict(input_list)
    return y_pred[0]

## Test Cases

In [15]:
# Expected output: 1
print(predict_new_audio("Audio/1/1677931349968.mp3", svm))

1


In [16]:
# Expected output: 4
print(predict_new_audio("Audio/4/1678429656280.mp3", svm))

4


In [17]:
# Expected output: 2
print(predict_new_audio("Audio/2/1677933179540.mp3", svm))

2


## Probability for each class

In [20]:
probabilities = svm.predict_proba(X_test)
print(probabilities)

[[6.86389841e-02 5.87236433e-02 3.27653903e-02 8.16691842e-01
  2.31801400e-02]
 [9.72506939e-01 1.46940940e-02 2.21316774e-03 3.59078599e-03
  6.99501327e-03]
 [1.29726181e-03 5.10195301e-04 1.16188093e-02 9.68280532e-01
  1.82932019e-02]
 [6.51174288e-02 9.33424240e-01 5.25074486e-04 4.80335882e-04
  4.52921247e-04]
 [2.40199510e-02 9.53986202e-01 6.09747874e-03 6.53124400e-03
  9.36512466e-03]
 [8.31310246e-04 5.72815662e-04 1.26267292e-02 9.68845760e-01
  1.71233852e-02]
 [9.97699390e-04 3.58303949e-04 6.85329482e-03 9.83921840e-01
  7.86886228e-03]
 [4.04529193e-02 9.44592187e-01 3.62933051e-03 6.90437848e-03
  4.42118489e-03]
 [8.93532233e-01 9.73085829e-02 1.63576306e-03 2.71847994e-03
  4.80494121e-03]
 [1.89647241e-03 2.34571753e-03 4.86211261e-03 6.15559775e-03
  9.84740100e-01]
 [9.54914075e-01 2.61632039e-02 7.99624513e-03 5.36980121e-03
  5.55667496e-03]
 [2.73833694e-02 9.00672510e-01 1.24879379e-02 1.23523756e-02
  4.71038071e-02]
 [9.88376701e-03 1.55519659e-02 3.418718

## Export the Model

In [None]:
# Save the trained model to a file
filename = 'audio_svm_model.pkl'
with open(filename, 'wb') as file:
    pickle.dump(svm, file)

In [None]:
filename = 'audio_svm_model.pkl'
with open(filename, 'rb') as file:
    clf = pickle.load(file)