In [6]:
# Step 1: Install Necessary Libraries
!pip install transformers torchaudio

  pid, fd = os.forkpty()




In [12]:
from transformers import logging
import torch

logging.set_verbosity_error()  # Suppresses detailed logs for space
torch.cuda.empty_cache()       # Empties GPU cache if used

!rm -rf ~/.cache/huggingface  # Clear Hugging Face cache
!rm -rf ~/.cache/torch        # Clear PyTorch cache

!du -h /kaggle/working  # List file sizes in the working directory
!rm -rf /kaggle/working/large_file_or_directory  # Replace with the path to delete

import shutil
shutil.rmtree('/root/.cache/huggingface', ignore_errors=True)
shutil.rmtree('/root/.cache/torch', ignore_errors=True)


  pid, fd = os.forkpty()


4.0K	/kaggle/working/.virtual_documents
1.1G	/kaggle/working/results/checkpoint-2240
1.1G	/kaggle/working/results/checkpoint-13440
1.1G	/kaggle/working/results/checkpoint-19040
1.1G	/kaggle/working/results/checkpoint-8960
1.1G	/kaggle/working/results/checkpoint-10080
1.1G	/kaggle/working/results/checkpoint-12320
1.1G	/kaggle/working/results/checkpoint-16800
1.1G	/kaggle/working/results/checkpoint-15680
1.1G	/kaggle/working/results/checkpoint-5600
1.1G	/kaggle/working/results/checkpoint-4480
1.1G	/kaggle/working/results/checkpoint-11200
486M	/kaggle/working/results/checkpoint-21280
1.1G	/kaggle/working/results/checkpoint-7840
1.1G	/kaggle/working/results/checkpoint-14560
1.1G	/kaggle/working/results/checkpoint-17920
1.1G	/kaggle/working/results/checkpoint-3360
1.1G	/kaggle/working/results/checkpoint-20160
1.1G	/kaggle/working/results/checkpoint-1120
1.1G	/kaggle/working/results/checkpoint-6720
20G	/kaggle/working/results
20G	/kaggle/working


In [1]:
# Step 1: Import Necessary Libraries
import os
import pandas as pd
import numpy as np
import librosa
import torch
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2ForSequenceClassification, Trainer, TrainingArguments

# Check if GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

# Step 2: Define the Data Directory and Emotion Labels
data_dir = '/kaggle/input/augnito-task-dataset/Emotions'
emotion_labels = ['Angry', 'Happy', 'Sad', 'Neutral', 'Fearful', 'Disgusted', 'Suprised']

# Step 3: Create a DataFrame with File Paths and Labels
file_paths, labels = [], []
for emotion in emotion_labels:
    emotion_dir = os.path.join(data_dir, emotion)
    if not os.path.isdir(emotion_dir):
        print(f"Warning: Directory for emotion '{emotion}' not found.")
        continue
    for root, _, files in os.walk(emotion_dir):
        for file in files:
            if file.endswith('.wav'):
                file_path = os.path.join(root, file)
                file_paths.append(file_path)
                labels.append(emotion)

if not file_paths:
    raise ValueError("No .wav files found. Check the data directory path.")

# Create DataFrame
df = pd.DataFrame({'file_path': file_paths, 'emotion': labels})
df['label'] = df['emotion'].astype('category').cat.codes
label2id = {label: idx for idx, label in enumerate(emotion_labels)}
id2label = {v: k for k, v in label2id.items()}

# Step 4: Dataset Preparation
class EmotionDataset(Dataset):
    def __init__(self, dataframe, feature_extractor, max_length=16000):
        self.dataframe = dataframe.reset_index(drop=True)
        self.feature_extractor = feature_extractor
        self.max_length = max_length

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        file_path = self.dataframe.loc[idx, 'file_path']
        label = self.dataframe.loc[idx, 'label']
        speech_array, _ = librosa.load(file_path, sr=16000)
        inputs = self.feature_extractor(speech_array, sampling_rate=16000, return_tensors="pt", padding="longest")
        inputs['input_values'] = inputs['input_values'].squeeze(0)  # Ensure input is 1D
        return {
            "input_values": inputs["input_values"],
            "labels": torch.tensor(label, dtype=torch.long)
        }

# Step 5: Split Data into Train, Validation, and Test Sets
train_df, test_df = train_test_split(df, test_size=0.15, stratify=df['label'], random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.1765, stratify=train_df['label'], random_state=42)

feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-base")
train_dataset = EmotionDataset(train_df, feature_extractor)
val_dataset = EmotionDataset(val_df, feature_extractor)
test_dataset = EmotionDataset(test_df, feature_extractor)

# Verify dataset sizes
print(f"Train Dataset Size: {len(train_dataset)}")
print(f"Validation Dataset Size: {len(val_dataset)}")
print(f"Test Dataset Size: {len(test_dataset)}")

# Step 6: Load Pretrained Wav2Vec2 Base Model with a Classification Head
model = Wav2Vec2ForSequenceClassification.from_pretrained(
    "facebook/wav2vec2-base",
    num_labels=len(emotion_labels),
    problem_type="single_label_classification"
).to(device)

# Unfreeze the last few layers for fine-tuning
for name, param in model.wav2vec2.named_parameters():
    if not name.startswith("encoder.layers.11"):  # Adjust number of layers as needed
        param.requires_grad = False

# Step 7: Set up Training Arguments and Trainer
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-3,
    per_device_train_batch_size=8,  
    per_device_eval_batch_size=8,
    num_train_epochs=20,  # Shorten for testing
    save_total_limit=1,
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
     report_to="none"  # Disables wandb logging
)

# Define metrics for evaluation
def compute_metrics(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    accuracy = accuracy_score(labels, preds)
    report = classification_report(
        labels, preds, labels=list(label2id.values()), target_names=emotion_labels, output_dict=True, zero_division=0
    )
    return {
        'accuracy': accuracy,
        'precision': report['weighted avg']['precision'],
        'recall': report['weighted avg']['recall'],
        'f1': report['weighted avg']['f1-score']
    }

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics
)

# Step 8: Train the Model
trainer.train()

# Step 9: Evaluate the Fine-Tuned Model on Test Set
test_results = trainer.evaluate(test_dataset)
print(f"Test Results: {test_results}")

# Step 10: Model Predictions and Classification Report
predictions = trainer.predict(test_dataset)
print(classification_report(predictions.label_ids, np.argmax(predictions.predictions, axis=1), target_names=emotion_labels))


Using device: cuda


preprocessor_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

Train Dataset Size: 8958
Validation Dataset Size: 1920
Test Dataset Size: 1920


config.json:   0%|          | 0.00/1.84k [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/380M [00:00<?, ?B/s]

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4087,1.525669,0.410417,0.475502,0.410417,0.388822
2,1.0282,1.396975,0.517188,0.547526,0.517188,0.511451
3,1.0331,1.407183,0.551562,0.614815,0.551562,0.533548
4,1.224,1.120048,0.619792,0.624538,0.619792,0.613568
5,1.0718,1.136039,0.638542,0.655762,0.638542,0.632547
6,0.723,1.297699,0.607292,0.656584,0.607292,0.603721
7,0.7079,1.105451,0.658333,0.693052,0.658333,0.656315
8,0.5438,1.234137,0.643229,0.6751,0.643229,0.639962
9,0.5477,1.115114,0.671354,0.683882,0.671354,0.671523
10,0.6064,1.244289,0.666667,0.698913,0.666667,0.663972


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast

Test Results: {'eval_loss': 1.0372107028961182, 'eval_accuracy': 0.6963541666666667, 'eval_precision': 0.7043298057037982, 'eval_recall': 0.6963541666666667, 'eval_f1': 0.6943641531155033, 'eval_runtime': 40.2873, 'eval_samples_per_second': 47.658, 'eval_steps_per_second': 5.957, 'epoch': 20.0}
              precision    recall  f1-score   support

       Angry       0.86      0.82      0.84       325
       Happy       0.64      0.72      0.68       280
         Sad       0.60      0.62      0.61       307
     Neutral       0.74      0.53      0.62       325
     Fearful       0.64      0.86      0.73       269
   Disgusted       0.68      0.62      0.65       325
    Suprised       0.83      0.82      0.82        89

    accuracy                           0.70      1920
   macro avg       0.71      0.71      0.71      1920
weighted avg       0.70      0.70      0.69      1920

