# Audio Classification Using Support Vector Machines (SVMs)

## Model Training

In [36]:
# Import Relevant libraries
import librosa
import os

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

from tensorflow.keras.preprocessing.sequence import pad_sequences

import pickle

In [37]:
# Define a function to load audio clips and extract features
def load_and_extract_features(audio_folder_path:str, n_mfcc=20, *, labels=range(1, 6), duration=3.0)->tuple:
    """
    Load audio clips from a folder and extract MFCC features.

    Parameters:
        - audio_folder_path (str): Path to the folder containing one subfolder per person (class).
          Each subfolder is named after its label and contains that person's audio clips.
        - n_mfcc (int): Number of Mel-frequency cepstral coefficients (MFCC) to extract per frame.
        - labels (iterable): Subfolder names to read; each value is also stored as the class label
          of the clips found in that subfolder. Defaults to 1..5.
        - duration (float): Maximum number of seconds loaded from each clip. Defaults to 3.0.

    Returns:
        - X (list): One flattened MFCC vector (list of floats) per clip. The MFCC matrix is
          transposed before flattening, so values are grouped frame by frame. Vector length
          depends on the clip length, so it can differ between clips.
        - y (list): Label of each clip, aligned index-by-index with X.

    Raises:
        - FileNotFoundError: Raised by os.listdir when a label subfolder does not exist.
    """
    X = []
    y = []
    for label in labels:
        folder_path = os.path.join(audio_folder_path, str(label))
        # os.listdir returns entries in arbitrary order; sorting keeps the sample
        # order (and therefore any seeded train/test split) reproducible.
        for file_name in sorted(os.listdir(folder_path)):
            # Skip hidden entries such as .DS_Store
            if file_name.startswith('.'):
                continue
            file_path = os.path.join(folder_path, file_name)
            # Load at most `duration` seconds of the clip
            signal, sr = librosa.load(file_path, duration=duration)
            # Extract MFCC features
            mfcc = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=n_mfcc)
            # Transpose, then flatten into a plain list of floats
            X.append(mfcc.T.flatten().tolist())
            y.append(label)

    return X, y

# Build the feature vectors (X) and labels (y) from the clips stored under "Audio"
audio_folder_path = "Audio"  # Root folder holding one subfolder of clips per class
X, y = load_and_extract_features(audio_folder_path=audio_folder_path)

In [38]:
# Length of the flattened MFCC vector of the second clip (index 1)
len(X[1])

2560

In [39]:
# Number of labels collected, i.e. one per loaded clip
len(y)

500

In [40]:
# Hold out 20% of the clips for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Force every flattened MFCC vector to exactly 2500 values:
# shorter vectors are padded at the end, longer ones are cut at the end
pad_options = dict(maxlen=2500, dtype='float32', padding='post', truncating='post')
X_train = pad_sequences(X_train, **pad_options)
X_test = pad_sequences(X_test, **pad_options)

# NOTE: standardizing the data with StandardScaler (fit_transform on X_train,
# transform on X_test) was tried at this point and removed because it was
# causing an accuracy issue.

# SVM classifier; the linear kernel was chosen for the best performance
svm = SVC(kernel='linear')

In [41]:
# Fit on the training split, then predict labels for the held-out clips
# (fit() returns the estimator itself, so the two calls can be chained)
y_pred = svm.fit(X_train, y_train).predict(X_test)

# Share of test clips whose predicted label equals the true label
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.99


## Prediction Function

In [45]:
def predict_new_audio(new_audio_path:str, svm:object, n_mfcc=20, max_len=2500)->int:
    """
    Predict the class label of a single audio clip with a trained classifier.

    The clip is loaded (first 3 seconds at most), converted to MFCCs, flattened
    frame by frame, and padded/truncated at the end to a fixed length before
    being passed to the classifier.

    Parameters:
        - new_audio_path (str): Path to the audio file to classify.
        - svm (object): Fitted classifier exposing a predict() method.
        - n_mfcc (int): Number of MFCCs to extract per frame; must match the value used for training.
        - max_len (int): Length every feature vector is padded/truncated to; must match the
          length the classifier was trained on (2500 in this notebook).

    Returns:
        - The predicted label for the clip (first element of svm.predict's output).
    """
    # Load the clip and compute its MFCC matrix
    signal_new, sr_new = librosa.load(new_audio_path, duration=3.0)
    mfcc_new = librosa.feature.mfcc(y=signal_new, sr=sr_new, n_mfcc=n_mfcc)
    # pad_sequences expects a batch, so wrap the single flattened vector in a list
    features = [mfcc_new.T.flatten().tolist()]
    features = pad_sequences(features, maxlen=max_len, dtype='float32', padding='post', truncating='post')
    return svm.predict(features)[0]

2


## Test Cases

In [47]:
# Sanity check on a clip taken from the "Audio/1" folder; expected prediction: 1
print(predict_new_audio("Audio/1/1677931349968.mp3", svm))

1


In [48]:
# Sanity check on a clip taken from the "Audio/4" folder; expected prediction: 4
print(predict_new_audio("Audio/4/1678429656280.mp3", svm))

4


In [49]:
# Sanity check on a clip taken from the "Audio/2" folder; expected prediction: 2
print(predict_new_audio("Audio/2/1677933179540.mp3", svm))

2


## Export the Model

In [50]:
# Persist the fitted classifier to disk with pickle
filename = 'audio_svm_model.pkl'
with open(filename, mode='wb') as model_file:
    pickle.dump(svm, model_file)

In [None]:
# Restore the pickled classifier into `clf`.
# Only unpickle files you created or trust: pickle can execute arbitrary code on load.
filename = 'audio_svm_model.pkl'
with open(filename, mode='rb') as model_file:
    clf = pickle.load(model_file)