In [6]:
import librosa
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

def extract_features(file_path):
    y, sr = librosa.load(file_path, sr=None)
    
    # Extract pitch
    f0, voiced_flag, _ = librosa.pyin(y, fmin=librosa.note_to_hz('C2'), fmax=librosa.note_to_hz('C7'))
    mean_pitch = np.mean(f0[voiced_flag]) if any(voiced_flag) else 0

    # Extract loudness (RMS)
    rms = librosa.feature.rms(y=y)[0]
    mean_rms = np.mean(rms)

    # Extract Zero Crossing Rate
    zcr = librosa.feature.zero_crossing_rate(y=y)[0]
    mean_zcr = np.mean(zcr)

    # Extract MFCCs and their deltas
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    delta_mfccs = librosa.feature.delta(mfccs)
    delta_delta_mfccs = librosa.feature.delta(mfccs, order=2)
    
    mean_mfccs = np.mean(mfccs, axis=1)
    mean_delta_mfccs = np.mean(delta_mfccs, axis=1)
    mean_delta_delta_mfccs = np.mean(delta_delta_mfccs, axis=1)

    # Extract Spectral Features
    spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)[0]
    spectral_bandwidth = librosa.feature.spectral_bandwidth(y=y, sr=sr)[0]
    spectral_contrast = librosa.feature.spectral_contrast(y=y, sr=sr)[0]

    mean_spectral_centroid = np.mean(spectral_centroid)
    mean_spectral_bandwidth = np.mean(spectral_bandwidth)
    mean_spectral_contrast = np.mean(spectral_contrast)
    
    return np.concatenate([
        mean_mfccs, mean_delta_mfccs, mean_delta_delta_mfccs,
        [mean_pitch, mean_rms, mean_zcr, mean_spectral_centroid, mean_spectral_bandwidth, mean_spectral_contrast]
    ])

In [75]:
import glob

# Example audio files and labels
audio_files = glob.glob('AudioWAV/*.wav')
labels = [file[-10:].split('_')[0] for file in audio_files]

In [76]:
rem = labels.index('')
labels.pop(rem)
audio_files.pop(rem)

'AudioWAV/1040_ITH_SAD_X.wav'

In [77]:
# Extract features for each file
features = [extract_features(file) for file in audio_files]

In [78]:
# Create DataFrame
columns = [f'mfcc{i+1}' for i in range(13)] + [f'delta_mfcc{i+1}' for i in range(13)] + [f'delta_delta_mfcc{i+1}' for i in range(13)]
columns += ['pitch', 'rms', 'zcr', 'spectral_centroid', 'spectral_bandwidth', 'spectral_contrast']
df = pd.DataFrame(features, columns=columns)
df['label'] = labels

In [79]:
# Split into features and labels
X = df.drop(columns=['label'])
y = df['label']

In [80]:
from sklearn.preprocessing import StandardScaler, LabelEncoder
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
encoder = LabelEncoder()
y_encoded = encoder.fit_transform(y)

In [81]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_encoded, test_size=0.2, random_state=42)

In [82]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn

In [95]:
device = torch.device('mps')

In [96]:
class AudioDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32).to(device)
        self.y = torch.tensor(y, dtype=torch.long).to(device)
    
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

In [97]:
train_dataset = AudioDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_dataset = AudioDataset(X_test, y_test) 
test_loader = DataLoader(test_dataset, batch_size=32)

In [101]:
class AudioClassifier(nn.Module):
    def __init__(self):
        super(AudioClassifier, self).__init__()
        self.fc1 = nn.Linear(45, 64)
        self.fc2 = nn.Linear(64, 128)
        self.fc3 = nn.Linear(128, 256)
        self.fc4 = nn.Linear(256, 6)
        self.dropout = nn.Dropout(0.1)
        self.norm1 = nn.BatchNorm1d(64)
        self.norm2 = nn.BatchNorm1d(128)
    
    def forward(self, x):
        x = self.dropout(torch.relu(self.norm1(self.fc1(x))))
        x = self.dropout(torch.relu(self.norm2(self.fc2(x))))
        x = self.dropout(torch.relu(self.fc3(x)))
        x = torch.log_softmax(self.fc4(x), dim=1)
        return x

In [104]:
model = AudioClassifier().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.3, patience=5)

model.train()
for epoch in range(1000):
    epoch_loss = 0.0
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)
        y_pred = model(X_batch)
        loss = criterion(y_pred, y_batch)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    
    epoch_loss /= len(train_loader)
    scheduler.step(epoch_loss)
    
    if epoch % 10 == 0:
        print(f'Epoch {epoch} Loss: {epoch_loss}')

Epoch 0 Loss: 1.5274528181681069
Epoch 10 Loss: 1.1808171602346564
Epoch 20 Loss: 1.063637500809085
Epoch 30 Loss: 1.001248804151371
Epoch 40 Loss: 0.9297041681505018
Epoch 50 Loss: 0.8869503880700758
Epoch 60 Loss: 0.8296023254753441
Epoch 70 Loss: 0.8044262590908235
Epoch 80 Loss: 0.7950448613333446
Epoch 90 Loss: 0.7028702907664801
Epoch 100 Loss: 0.6811089425958613
Epoch 110 Loss: 0.6646129647570271


In [87]:
model.eval()
with torch.no_grad():
    y_pred = []
    for X_batch, y_batch in test_loader:
        y_pred.extend(model(X_batch).argmax(dim=1).tolist())

In [88]:
y_pred = np.array(y_pred)

In [89]:
print(classification_report(y_test, y_pred, target_names=encoder.classes_))

              precision    recall  f1-score   support

         ANG       0.62      0.69      0.65       239
         DIS       0.33      0.36      0.34       239
         FEA       0.34      0.33      0.34       267
         HAP       0.42      0.40      0.41       258
         NEU       0.38      0.33      0.36       219
         SAD       0.41      0.39      0.40       267

    accuracy                           0.42      1489
   macro avg       0.42      0.42      0.42      1489
weighted avg       0.41      0.42      0.41      1489

