In [1]:
# Step 1: Install Necessary Libraries
!pip install transformers torchaudio



In [3]:
!pip install -U transformers


  pid, fd = os.forkpty()


Collecting transformers
  Downloading transformers-4.46.1-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
Downloading transformers-4.46.1-py3-none-any.whl (10.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m61.9 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.45.1
    Uninstalling transformers-4.45.1:
      Successfully uninstalled transformers-4.45.1
Successfully installed transformers-4.46.1


In [8]:
!du -h /kaggle/working  # List file sizes in the working directory
!rm -rf /kaggle/working/large_file_or_directory  # Replace with the path to delete

import shutil
shutil.rmtree('/root/.cache/huggingface', ignore_errors=True)
shutil.rmtree('/root/.cache/torch', ignore_errors=True)


  pid, fd = os.forkpty()


486M	/kaggle/working/results/checkpoint-534
1.1G	/kaggle/working/results/checkpoint-8960
1.1G	/kaggle/working/results/checkpoint-7840
1.1G	/kaggle/working/results/checkpoint-560
1.1G	/kaggle/working/results/checkpoint-5600
1.1G	/kaggle/working/results/checkpoint-3360
1.1G	/kaggle/working/results/checkpoint-1680
1.1G	/kaggle/working/results/checkpoint-5040
1.1G	/kaggle/working/results/checkpoint-6160
1.1G	/kaggle/working/results/checkpoint-4480
1.1G	/kaggle/working/results/checkpoint-9520
1.1G	/kaggle/working/results/checkpoint-2240
1.1G	/kaggle/working/results/checkpoint-2800
1.1G	/kaggle/working/results/checkpoint-1120
1.1G	/kaggle/working/results/checkpoint-3920
1.1G	/kaggle/working/results/checkpoint-10080
1.1G	/kaggle/working/results/checkpoint-6720
1.1G	/kaggle/working/results/checkpoint-7280
1.1G	/kaggle/working/results/checkpoint-8400
20G	/kaggle/working/results
4.0K	/kaggle/working/.virtual_documents
20G	/kaggle/working


In [2]:
# Step 2: Import Necessary Libraries
import os
import pandas as pd
import numpy as np
import librosa
import torch
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from transformers import Wav2Vec2FeatureExtractor, HubertForSequenceClassification, Trainer, TrainingArguments

# Check if GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

# Step 3: Define the Data Directory and Emotion Labels
data_dir = '/kaggle/input/augnito-speech-task/Emotions'
emotion_labels = ['Angry', 'Happy', 'Sad', 'Neutral', 'Fearful', 'Disgusted', 'Suprised']

# Step 4: Create a DataFrame with File Paths and Labels
file_paths = []
labels = []

# Load file paths and labels based on folder names
for emotion in emotion_labels:
    emotion_dir = os.path.join(data_dir, emotion)
    if not os.path.isdir(emotion_dir):
        print(f"Warning: Directory for emotion '{emotion}' not found.")
        continue

    for root, _, files in os.walk(emotion_dir):
        for file in files:
            if file.endswith('.wav'):
                file_path = os.path.join(root, file)
                file_paths.append(file_path)
                labels.append(emotion)

if not file_paths:
    raise ValueError("No .wav files found. Check the data directory path.")

# Confirm the lengths of file_paths and labels are the same
if len(file_paths) != len(labels):
    raise ValueError("Mismatch between file paths and labels.")

# Create DataFrame
df = pd.DataFrame({'file_path': file_paths, 'emotion': labels})

# Encode Labels
df['label'] = df['emotion'].astype('category').cat.codes
label2id = {label: idx for idx, label in enumerate(emotion_labels)}
id2label = {v: k for k, v in label2id.items()}

# Step 5: Split Data into Train, Validation, and Test Sets (70-15-15) with Stratification
train_df, test_df = train_test_split(df, test_size=0.15, stratify=df['label'], random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.1765, stratify=train_df['label'], random_state=42)

# Step 6: Define a Dataset Class for Loading Data and Tokenizing
class EmotionDataset(Dataset):
    def __init__(self, dataframe, feature_extractor, max_length=16000):
        self.dataframe = dataframe.reset_index(drop=True)
        self.feature_extractor = feature_extractor
        self.max_length = max_length

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        file_path = self.dataframe.loc[idx, 'file_path']
        label = self.dataframe.loc[idx, 'label']
        
        # Load audio file and truncate or pad to max_length
        speech_array, _ = librosa.load(file_path, sr=16000)
        if len(speech_array) > self.max_length:
            speech_array = speech_array[:self.max_length]
        else:
            pad_length = self.max_length - len(speech_array)
            speech_array = np.pad(speech_array, (0, pad_length), 'constant')
        
        # Extract features and remove the extra dimension
        inputs = self.feature_extractor(speech_array, sampling_rate=16000, return_tensors="pt", padding="longest")
        inputs['input_values'] = inputs['input_values'].squeeze()
        
        # Convert label to torch.long to avoid RuntimeError
        inputs['labels'] = torch.tensor(label, dtype=torch.long)
        
        return inputs

# Step 7: Initialize the Feature Extractor and Model
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/hubert-base-ls960")
model = HubertForSequenceClassification.from_pretrained(
    "facebook/hubert-base-ls960",
    num_labels=len(label2id),
    problem_type="single_label_classification"
).to(device)

# Step 8: Create Datasets
train_dataset = EmotionDataset(train_df, feature_extractor)
val_dataset = EmotionDataset(val_df, feature_extractor)
test_dataset = EmotionDataset(test_df, feature_extractor)

# Step 9: Define Training Arguments with adaptive learning and checkpoint management
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=20,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    report_to="none",
    save_total_limit=1  # Only keep the best model checkpoint
)

# Define compute_metrics function
def compute_metrics(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    
    # Calculate accuracy
    accuracy = accuracy_score(labels, preds)
    
    # Generate classification report for precision, recall, and f1-score
    report = classification_report(
        labels, preds, labels=list(label2id.values()), target_names=emotion_labels, output_dict=True, zero_division=0
    )
    
    return {
        'accuracy': accuracy,
        'precision': report['weighted avg']['precision'],
        'recall': report['weighted avg']['recall'],
        'f1': report['weighted avg']['f1-score']
    }

# Define optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=training_args.learning_rate)

# Define ReduceLROnPlateau scheduler with desired parameters
from torch.optim.lr_scheduler import ReduceLROnPlateau
scheduler = ReduceLROnPlateau(
    optimizer=optimizer,
    mode='max',           # Maximize accuracy
    factor=0.5,           # Reduce learning rate by half
    patience=3,           # Wait 3 epochs for improvement
    threshold=0.001,      # Minimum change to qualify as improvement
    verbose=True          # Log LR adjustments
)

# Custom Trainer to support ReduceLROnPlateau
class TrainerWithReduceLROnPlateau(Trainer):
    def evaluate(self, eval_dataset=None, ignore_keys=None, metric_key_prefix="eval"):
        metrics = super().evaluate(eval_dataset, ignore_keys, metric_key_prefix)
        # Extract accuracy and adjust learning rate
        if 'eval_accuracy' in metrics:
            scheduler.step(metrics['eval_accuracy'])
        return metrics

# Initialize the Trainer with the custom Trainer class
trainer = TrainerWithReduceLROnPlateau(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, scheduler)  # Provide optimizer and scheduler
)

# Step 10: Train the Model
trainer.train()

# Step 11: Evaluate on Test Data
test_results = trainer.evaluate(test_dataset)
print(f"Test Results: {test_results}")

# Step 12: Model Predictions and Classification Report
predictions = trainer.predict(test_dataset)
print(classification_report(predictions.label_ids, np.argmax(predictions.predictions, axis=1), target_names=emotion_labels))


Using device: cuda


Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at facebook/hubert-base-ls960 and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7798,1.631117,0.344792,0.396553,0.344792,0.261679
2,1.5897,1.432611,0.454167,0.53244,0.454167,0.395898
3,1.4344,1.313417,0.504167,0.543898,0.504167,0.489897
4,1.3348,1.234539,0.532292,0.590122,0.532292,0.520463
5,1.2715,1.221803,0.541667,0.616314,0.541667,0.525393
6,1.1893,1.133391,0.558333,0.596985,0.558333,0.562612
7,1.1543,1.123258,0.567708,0.616985,0.567708,0.556111
8,1.1003,1.11504,0.576042,0.630896,0.576042,0.565356
9,1.0372,1.069845,0.592187,0.623005,0.592187,0.593442
10,1.0021,1.140706,0.585938,0.620855,0.585938,0.579552


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(devic

Test Results: {'eval_loss': 1.0623122453689575, 'eval_accuracy': 0.6307291666666667, 'eval_precision': 0.642927336852053, 'eval_recall': 0.6307291666666667, 'eval_f1': 0.6305208988662325, 'eval_runtime': 32.4509, 'eval_samples_per_second': 59.166, 'eval_steps_per_second': 3.698, 'epoch': 20.0}


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


              precision    recall  f1-score   support

       Angry       0.81      0.72      0.76       325
       Happy       0.57      0.57      0.57       280
         Sad       0.66      0.48      0.55       307
     Neutral       0.57      0.64      0.61       325
     Fearful       0.61      0.79      0.68       269
   Disgusted       0.55      0.61      0.58       325
    Suprised       0.90      0.62      0.73        89

    accuracy                           0.63      1920
   macro avg       0.67      0.63      0.64      1920
weighted avg       0.64      0.63      0.63      1920



In [3]:
# Step 1: Import Necessary Libraries
import os
import pandas as pd
import numpy as np
import librosa
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from transformers import Wav2Vec2FeatureExtractor, WavLMModel
from torch import nn, optim
from torch.optim.lr_scheduler import ReduceLROnPlateau

# Check if GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

# Step 2: Define the Data Directory and Emotion Labels
data_dir = '/kaggle/input/augnito-speech-task/Emotions'
emotion_labels = ['Angry', 'Happy', 'Sad', 'Neutral', 'Fearful', 'Disgusted', 'Suprised']

# Step 3: Create a DataFrame with File Paths and Labels
file_paths, labels = [], []
for emotion in emotion_labels:
    emotion_dir = os.path.join(data_dir, emotion)
    if not os.path.isdir(emotion_dir):
        print(f"Warning: Directory for emotion '{emotion}' not found.")
        continue
    for root, _, files in os.walk(emotion_dir):
        for file in files:
            if file.endswith('.wav'):
                file_path = os.path.join(root, file)
                file_paths.append(file_path)
                labels.append(emotion)

if not file_paths:
    raise ValueError("No .wav files found. Check the data directory path.")

# Create DataFrame
df = pd.DataFrame({'file_path': file_paths, 'emotion': labels})
df['label'] = df['emotion'].astype('category').cat.codes
label2id = {label: idx for idx, label in enumerate(emotion_labels)}
id2label = {v: k for k, v in label2id.items()}

# Step 4: Extract Embeddings Using WavLM Base
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("microsoft/wavlm-base")
wavlm_model = WavLMModel.from_pretrained("microsoft/wavlm-base").to(device)
wavlm_model.eval()

def extract_embedding(file_path, feature_extractor, model):
    speech_array, _ = librosa.load(file_path, sr=16000)
    inputs = feature_extractor(speech_array, sampling_rate=16000, return_tensors="pt", padding="longest").to(device)
    with torch.no_grad():
        outputs = model(**inputs)
        embedding = outputs.last_hidden_state.mean(dim=1).cpu().numpy()  # Mean pooling
    return embedding

# Extract embeddings
embeddings, labels = [], []
for _, row in df.iterrows():
    file_path, label = row['file_path'], row['label']
    embedding = extract_embedding(file_path, feature_extractor, wavlm_model)
    embeddings.append(embedding)
    labels.append(label)

# Convert embeddings and labels into a DataFrame
embedding_df = pd.DataFrame(np.concatenate(embeddings, axis=0))
embedding_df['label'] = labels
embedding_df.to_csv("precomputed_embeddings.csv", index=False)

# Step 5: Train-Test Split
X = embedding_df.drop(columns=["label"]).values  # Features (embeddings)
y = embedding_df["label"].values  # Labels
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

# Step 6: Define a Complex Classifier Model
class EnhancedClassifier(nn.Module):
    def __init__(self, input_dim, num_classes):
        super(EnhancedClassifier, self).__init__()
        self.network = nn.Sequential(
            nn.Linear(input_dim, 1024),
            nn.BatchNorm1d(1024),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(1024, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.Linear(512, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, num_classes)
        )

    def forward(self, x):
        return self.network(x)

input_dim = X_train.shape[1]
num_classes = len(np.unique(y))
model = EnhancedClassifier(input_dim, num_classes).to(device)

# Step 7: Training the Classifier with Scheduler and Best Model Saving
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=5e-4)  # Initial learning rate
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=1, verbose=True)  # Reduce on val loss increase

X_train_tensor = torch.tensor(X_train, dtype=torch.float32).to(device)
y_train_tensor = torch.tensor(y_train, dtype=torch.long).to(device)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32).to(device)
y_test_tensor = torch.tensor(y_test, dtype=torch.long).to(device)

best_val_loss = float('inf')
best_model_path = "best_model.pth"

# Training Loop
num_epochs = 200
for epoch in range(num_epochs):
    # Training phase
    model.train()
    optimizer.zero_grad()
    outputs = model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)
    loss.backward()
    optimizer.step()

    # Validation phase
    model.eval()
    with torch.no_grad():
        val_outputs = model(X_test_tensor)
        val_loss = criterion(val_outputs, y_test_tensor).item()
        _, val_predicted = torch.max(val_outputs, 1)
        val_accuracy = accuracy_score(y_test_tensor.cpu(), val_predicted.cpu())

    # Step the scheduler with validation loss
    scheduler.step(val_loss)

    # Check if current model is the best and save it
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), best_model_path)  # Overwrite best model

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}, Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}")

# Load the best model for final evaluation
model.load_state_dict(torch.load(best_model_path))

# Step 8: Final Evaluation on Test Set
model.eval()
with torch.no_grad():
    test_outputs = model(X_test_tensor)
    _, predicted = torch.max(test_outputs, 1)
    accuracy = accuracy_score(y_test_tensor.cpu(), predicted.cpu())
    print(f"Test Accuracy: {accuracy:.4f}")
    print(classification_report(y_test_tensor.cpu(), predicted.cpu(), target_names=emotion_labels))


Using device: cuda




Epoch [1/200], Loss: 2.0145, Val Loss: 1.9380, Val Accuracy: 0.1714
Epoch [2/200], Loss: 1.8145, Val Loss: 1.9320, Val Accuracy: 0.1776
Epoch [3/200], Loss: 1.6897, Val Loss: 1.9250, Val Accuracy: 0.1906
Epoch [4/200], Loss: 1.5845, Val Loss: 1.9167, Val Accuracy: 0.2120
Epoch [5/200], Loss: 1.5001, Val Loss: 1.9071, Val Accuracy: 0.2479
Epoch [6/200], Loss: 1.4190, Val Loss: 1.8957, Val Accuracy: 0.2969
Epoch [7/200], Loss: 1.3485, Val Loss: 1.8821, Val Accuracy: 0.3125
Epoch [8/200], Loss: 1.2854, Val Loss: 1.8663, Val Accuracy: 0.3333
Epoch [9/200], Loss: 1.2186, Val Loss: 1.8484, Val Accuracy: 0.3474
Epoch [10/200], Loss: 1.1632, Val Loss: 1.8280, Val Accuracy: 0.3589
Epoch [11/200], Loss: 1.1219, Val Loss: 1.8047, Val Accuracy: 0.3766
Epoch [12/200], Loss: 1.0755, Val Loss: 1.7784, Val Accuracy: 0.3922
Epoch [13/200], Loss: 1.0409, Val Loss: 1.7488, Val Accuracy: 0.4099
Epoch [14/200], Loss: 1.0013, Val Loss: 1.7160, Val Accuracy: 0.4266
Epoch [15/200], Loss: 0.9746, Val Loss: 1.6

  model.load_state_dict(torch.load(best_model_path))


In [3]:
!pip install --upgrade --force-reinstall torch torchvision torchaudio



Collecting torch
  Downloading torch-2.5.1-cp310-cp310-manylinux1_x86_64.whl.metadata (28 kB)
Collecting torchvision
  Downloading torchvision-0.20.1-cp310-cp310-manylinux1_x86_64.whl.metadata (6.1 kB)
Collecting torchaudio
  Downloading torchaudio-2.5.1-cp310-cp310-manylinux1_x86_64.whl.metadata (6.4 kB)
Collecting filelock (from torch)
  Downloading filelock-3.16.1-py3-none-any.whl.metadata (2.9 kB)
Collecting typing-extensions>=4.8.0 (from torch)
  Downloading typing_extensions-4.12.2-py3-none-any.whl.metadata (3.0 kB)
Collecting networkx (from torch)
  Downloading networkx-3.4.2-py3-none-any.whl.metadata (6.3 kB)
Collecting jinja2 (from torch)
  Downloading jinja2-3.1.4-py3-none-any.whl.metadata (2.6 kB)
Collecting fsspec (from torch)
  Downloading fsspec-2024.10.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-

In [17]:
import os
import pandas as pd
import numpy as np
import librosa
import torch
import torchvision.models as models
from torchvision import transforms
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from torch import nn, optim
from torch.optim.lr_scheduler import CosineAnnealingLR
from torch.utils.data import DataLoader

# Check if GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

# Define the Data Directory and Emotion Labels
data_dir = '/kaggle/input/augnito-speech-task/Emotions'
emotion_labels = ['Angry', 'Happy', 'Sad', 'Neutral', 'Fearful', 'Disgusted', 'Suprised']

# Create DataFrame with File Paths and Labels
file_paths, labels = [], []
for emotion in emotion_labels:
    emotion_dir = os.path.join(data_dir, emotion)
    for root, _, files in os.walk(emotion_dir):
        for file in files:
            if file.endswith('.wav'):
                file_paths.append(os.path.join(root, file))
                labels.append(emotion)

if not file_paths:
    raise ValueError("No .wav files found. Check the data directory path.")

# Check total samples loaded
print(f'Total samples loaded: {len(file_paths)}')

# Create DataFrame
df = pd.DataFrame({'file_path': file_paths, 'emotion': labels})
df['label'] = df['emotion'].astype('category').cat.codes
label2id = {label: idx for idx, label in enumerate(emotion_labels)}
id2label = {v: k for k, v in label2id.items()}

# Convert audio to mel spectrogram
def audio_to_mel_spectrogram(file_path, n_mels=128):
    audio, sr = librosa.load(file_path, sr=16000)
    mel_spectrogram = librosa.feature.melspectrogram(y=audio, sr=sr, n_mels=n_mels)
    log_mel_spectrogram = librosa.power_to_db(mel_spectrogram, ref=np.max)
    return log_mel_spectrogram

# Preprocess all audio files and save as tensors
spectrograms, labels = [], []
for _, row in df.iterrows():
    spectrogram = audio_to_mel_spectrogram(row['file_path'])
    spectrograms.append(spectrogram)
    labels.append(row['label'])

# Check total samples after conversion to spectrograms
print(f'Samples after conversion to spectrograms: {len(spectrograms)}')

# Split data
X_train, X_test, y_train, y_test = train_test_split(spectrograms, labels, test_size=0.15, random_state=42)

# Check training and testing samples
print(f'Training samples: {len(X_train)}, Testing samples: {len(X_test)}')

# PyTorch Dataset
class AudioDataset(torch.utils.data.Dataset):
    def __init__(self, spectrograms, labels, transform=None):
        self.spectrograms = spectrograms
        self.labels = labels
        self.transform = transform

    def __len__(self):
        return len(self.spectrograms)

    def __getitem__(self, idx):
        spectrogram = self.spectrograms[idx]
        spectrogram = np.stack([spectrogram]*3, axis=0)  # Convert to 3 channels
        spectrogram = torch.tensor(spectrogram).float()
        if self.transform:
            spectrogram = self.transform(spectrogram)
        label = torch.tensor(self.labels[idx])
        return spectrogram, label

# Transformations for ResNet input
transform = transforms.Compose([
    transforms.Resize((224, 224)),  # Resize to 224x224
    transforms.Normalize([0.5]*3, [0.5]*3)
])

# Create PyTorch DataLoader
train_data = AudioDataset(X_train, y_train, transform=transform)
test_data = AudioDataset(X_test, y_test, transform=transform)
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
test_loader = DataLoader(test_data, batch_size=32, shuffle=False)

# Check DataLoader batch counts
print(f'Train batches: {len(train_loader)}, Test batches: {len(test_loader)}')

# Load ResNet18 model and freeze initial layers
model = models.resnet18(pretrained=True)

# Freeze initial layers
for name, param in model.named_parameters():
    if "layer3" not in name and "layer4" not in name:  # Freezing up to layer2, tuning layers 3 and 4
        param.requires_grad = False

# Modify the final fully connected layer for 7-class classification
model.fc = nn.Linear(model.fc.in_features, len(emotion_labels))
model = model.to(device)

# Training the Classifier with Cosine Annealing Scheduler and Weight Decay
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=1e-4, weight_decay=1e-4)
scheduler = CosineAnnealingLR(optimizer, T_max=10, eta_min=1e-6)

best_val_accuracy = 0
best_model_path = "best_model.pth"

# Training Loop
num_epochs = 20
for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)  # Move data to GPU
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    scheduler.step()

    # Validation phase
    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    val_accuracy = correct / total
    print(f"Epoch [{epoch+1}/{num_epochs}], Train Loss: {train_loss/len(train_loader):.4f}, "
          f"Val Loss: {val_loss/len(test_loader):.4f}, Val Accuracy: {val_accuracy:.4f}")

    if val_accuracy > best_val_accuracy:
        best_val_accuracy = val_accuracy
        torch.save(model.state_dict(), best_model_path)

# Final Evaluation on Test Set
model.load_state_dict(torch.load(best_model_path))
model.eval()
all_labels = []
all_preds = []
with torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        _, predicted = torch.max(outputs, 1)
        all_labels.extend(labels.cpu().numpy())
        all_preds.extend(predicted.cpu().numpy())

accuracy = accuracy_score(all_labels, all_preds)
print(f"Test Accuracy: {accuracy:.4f}")
print(classification_report(all_labels, all_preds, target_names=emotion_labels))


Using device: cuda
Total samples loaded: 12798
Samples after conversion to spectrograms: 12798
Training samples: 10878, Testing samples: 1920
Train batches: 340, Test batches: 60




Epoch [1/20], Train Loss: 1.0501, Val Loss: 0.8348, Val Accuracy: 0.6854
Epoch [2/20], Train Loss: 0.5680, Val Loss: 0.7786, Val Accuracy: 0.7078
Epoch [3/20], Train Loss: 0.2118, Val Loss: 0.8559, Val Accuracy: 0.7109
Epoch [4/20], Train Loss: 0.0611, Val Loss: 0.9690, Val Accuracy: 0.7130
Epoch [5/20], Train Loss: 0.0266, Val Loss: 1.0628, Val Accuracy: 0.7094
Epoch [6/20], Train Loss: 0.0157, Val Loss: 0.9976, Val Accuracy: 0.7229
Epoch [7/20], Train Loss: 0.0074, Val Loss: 0.9935, Val Accuracy: 0.7182
Epoch [8/20], Train Loss: 0.0057, Val Loss: 0.9922, Val Accuracy: 0.7245
Epoch [9/20], Train Loss: 0.0045, Val Loss: 1.0047, Val Accuracy: 0.7224
Epoch [10/20], Train Loss: 0.0039, Val Loss: 0.9978, Val Accuracy: 0.7219
Epoch [11/20], Train Loss: 0.0040, Val Loss: 1.0101, Val Accuracy: 0.7271
Epoch [12/20], Train Loss: 0.0045, Val Loss: 1.0210, Val Accuracy: 0.7214
Epoch [13/20], Train Loss: 0.0039, Val Loss: 1.0166, Val Accuracy: 0.7240
Epoch [14/20], Train Loss: 0.0045, Val Loss: 1.

  model.load_state_dict(torch.load(best_model_path))


Test Accuracy: 0.7276
              precision    recall  f1-score   support

       Angry       0.83      0.84      0.84       329
       Happy       0.68      0.63      0.66       258
         Sad       0.63      0.62      0.63       315
     Neutral       0.79      0.71      0.75       348
     Fearful       0.75      0.75      0.75       244
   Disgusted       0.64      0.73      0.68       344
    Suprised       0.91      0.98      0.94        82

    accuracy                           0.73      1920
   macro avg       0.75      0.75      0.75      1920
weighted avg       0.73      0.73      0.73      1920

