In [None]:
# Import necessary libraries
import os
os.environ["WANDB_DISABLED"] = "true"

import torch
import librosa
import pandas as pd
import numpy as np
import soundfile as sf
from datasets import Features, Value, Sequence, Dataset
import evaluate
from transformers import (
    AutoModelForAudioClassification,
    Wav2Vec2FeatureExtractor,
    TrainingArguments,
    Trainer,
)
from sklearn.preprocessing import LabelEncoder

import warnings
warnings.filterwarnings("ignore")

# Pick CUDA when PyTorch reports a usable GPU, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

Using device: cpu


In [19]:
# Dataset layout: the metadata CSVs and the audio folders all sit under dataset_dir
dataset_dir = '.'
train_csv, test_csv = (
    os.path.join(dataset_dir, csv_name) for csv_name in ('train.csv', 'test.csv')
)
train_audio_dir, test_audio_dir = (
    os.path.join(dataset_dir, folder) for folder in ('train', 'test')
)

# Emotion name -> class index, numbered in listing order
e_labels = {
    name: index
    for index, name in enumerate(["marah", "jijik", "takut", "bahagia", "netral", "sedih"])
}

# Pretrained checkpoint to fine-tune, and where training artifacts are written
checkpoint = "ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition"
out_dir = "./wav2vec2-emotion-6"

In [20]:
# Load metadata
train_df = pd.read_csv(train_csv)
test_df = pd.read_csv(test_csv)

# Encode labels
# `le` is never fitted; it only carries the class names in e_labels order.
le = LabelEncoder()
le.classes_ = np.array(list(e_labels.keys()))

# Series.map yields NaN for every value that is not a key of e_labels.
# Fail loudly here instead of letting unlabeled rows reach preprocessing.
encoded_labels = train_df['label'].map(e_labels)
unknown_labels = train_df.loc[encoded_labels.isna(), 'label'].unique()
if len(unknown_labels) > 0:
    raise ValueError(
        f"Labels in {train_csv} missing from e_labels: {list(unknown_labels)}"
    )
train_df['label'] = encoded_labels.astype('int64')

# Create HuggingFace Datasets
ds_train = Dataset.from_pandas(train_df)
ds_test = Dataset.from_pandas(test_df)

In [None]:
# Feature extractor that matches the pretrained checkpoint
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(checkpoint)

# Size the classification head from e_labels rather than a literal, and store
# the label names in the model config so saved checkpoints report emotion
# names instead of generic LABEL_i placeholders.
model = AutoModelForAudioClassification.from_pretrained(
    checkpoint,
    num_labels=len(e_labels),
    label2id=dict(e_labels),
    id2label={index: name for name, index in e_labels.items()},
)
model.gradient_checkpointing_enable()

Some weights of the model checkpoint at ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition were not used when initializing Wav2Vec2ForSequenceClassification: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.output.bias', 'classifier.output.weight']
- This IS expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition and are newly initialized: ['classifier.bias', 'classifier.weight', '

In [22]:
def custom_data_collator(features):
    """Pad a list of examples into one batch of tensors.

    Args:
        features: non-empty list of dicts. Each dict has an "input_values"
            1-D sequence of floats and optionally an integer "label".

    Returns:
        dict with:
            "input_values": float32 tensor (batch, max_len), right-padded with 0.
            "attention_mask": float32 tensor (batch, max_len), 1 for real
                samples and 0 for padding.
            "labels": int64 tensor (batch,), present only when the first
                example carries a "label" key.

    Raises:
        ValueError: if `features` is empty.
    """
    if not features:
        raise ValueError("custom_data_collator received an empty batch")

    sequences = [
        np.asarray(feature["input_values"], dtype=np.float32) for feature in features
    ]
    max_length = max(len(sequence) for sequence in sequences)

    # Fill preallocated arrays so the tensors are built from one contiguous
    # buffer rather than from a Python list of arrays.
    padded_inputs = np.zeros((len(sequences), max_length), dtype=np.float32)
    attention_masks = np.zeros((len(sequences), max_length), dtype=np.float32)
    for row, sequence in enumerate(sequences):
        padded_inputs[row, :len(sequence)] = sequence
        attention_masks[row, :len(sequence)] = 1.0

    batch = {
        "input_values": torch.from_numpy(padded_inputs),
        "attention_mask": torch.from_numpy(attention_masks),
    }
    # Whether the batch is labelled is decided from the first example.
    if "label" in features[0]:
        batch["labels"] = torch.tensor(
            [feature["label"] for feature in features], dtype=torch.long
        )
    return batch

def _fallback_label(batch):
    """Return the example's own label as int64, or 0 when it is missing or invalid."""
    try:
        return np.int64(batch["label"])
    except (KeyError, TypeError, ValueError):
        return np.int64(0)


def prepare_batch(batch, audio_dir):
    """Load one audio file and convert it to model inputs.

    Args:
        batch: a single example with an "id" (file name, with or without the
            ".wav" suffix) and optionally an integer "label".
        audio_dir: directory that contains the audio file.

    Returns:
        dict with float32 "input_values", float32 "attention_mask" and an
        int64 "label" (0 when the example has no label). If anything fails,
        the error is printed and one second of silence at 16 kHz is returned,
        keeping the example's own label when it is valid.
    """
    try:
        file_path = os.path.join(audio_dir, batch["id"])
        if not os.path.exists(file_path) and os.path.exists(file_path + ".wav"):
            file_path = file_path + ".wav"

        speech, sr = sf.read(file_path)

        # Downmix before resampling: multi-channel audio arrives as
        # (frames, channels) and librosa.resample works on the last axis by
        # default, so resampling first would run along the channel axis.
        if speech.ndim > 1:
            speech = np.mean(speech, axis=1)

        if sr != 16000:
            speech = librosa.resample(speech, orig_sr=sr, target_sr=16000)

        inputs = feature_extractor(
            speech,
            sampling_rate=16000,
            return_tensors="np",
            padding=True
        )

        input_values = inputs.input_values[0].astype(np.float32)
        if "attention_mask" in inputs:
            attention_mask = inputs.attention_mask[0].astype(np.float32)
        else:
            # No mask from the extractor: every sample is real.
            attention_mask = np.ones_like(input_values).astype(np.float32)

        return {
            "input_values": input_values,
            "attention_mask": attention_mask,
            "label": np.int64(batch["label"]) if "label" in batch else np.int64(0)
        }
    except Exception as e:
        print(f"Error processing {batch['id']}: {e}")
        # Best-effort fallback so the map keeps going; the label is preserved
        # so a failed file is not silently relabelled as class 0.
        return {
            "input_values": np.zeros(16000, dtype=np.float32),
            "attention_mask": np.ones(16000, dtype=np.float32),
            "label": _fallback_label(batch)
        }

# Schema of the preprocessed examples produced by prepare_batch
features = Features({
    'input_values': Sequence(feature=Value(dtype='float32')),
    'attention_mask': Sequence(feature=Value(dtype='float32')),
    'label': Value(dtype='int64')
})

# Rebuild the raw datasets from the dataframes so this cell can be re-run.
# Mapping ds_train/ds_test onto themselves would drop the "id" column that
# prepare_batch needs, so a second run would fail.
raw_train = Dataset.from_pandas(train_df)
ds_train = raw_train.map(
    lambda x: prepare_batch(x, train_audio_dir),
    remove_columns=raw_train.column_names,
    features=features
)

raw_test = Dataset.from_pandas(test_df)
ds_test = raw_test.map(
    lambda x: prepare_batch(x, test_audio_dir),
    remove_columns=raw_test.column_names,
    features=features
)

Map:   0%|          | 0/6798 [00:00<?, ? examples/s]

Map:   0%|          | 0/1700 [00:00<?, ? examples/s]

In [23]:
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Score an evaluation pass with the loaded accuracy metric.

    `eval_pred` unpacks into (logits, labels); the predicted class of each
    example is the index of its largest logit along the last axis.
    """
    scores, references = eval_pred
    return metric.compute(
        predictions=np.argmax(scores, axis=-1),
        references=references,
    )

In [24]:
args = TrainingArguments(
    # Schedule and batch sizes
    num_train_epochs=6,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluation
    # NOTE(review): no evaluation strategy is set here and the training log
    # shows only training loss, so eval_steps may have no effect; confirm.
    do_eval=True,
    eval_steps=100,
    # Checkpointing: keep only the two most recent checkpoints
    output_dir=out_dir,
    save_steps=500,
    save_total_limit=2,
    # Logging: local only, no external reporting integrations
    logging_dir='./logs',
    logging_steps=100,
    report_to=[],
)

In [None]:
# NOTE(review): eval_dataset is the training set, so any reported accuracy is
# measured on training data rather than held-out data; confirm this is intended.
trainer = Trainer(
    args=args,
    model=model,
    tokenizer=feature_extractor,
    data_collator=custom_data_collator,
    compute_metrics=compute_metrics,
    train_dataset=ds_train,
    eval_dataset=ds_train,
)

In [None]:
# Run fine-tuning; the TrainOutput summary is displayed as the cell result
train_output = trainer.train()
train_output

Step,Training Loss
100,1.3718
200,1.0676
300,0.9533
400,0.9372
500,0.9841
600,0.8878
700,0.8417
800,0.8702
900,0.7356
1000,0.6263


TrainOutput(global_step=5100, training_loss=0.49674665189256856, metrics={'train_runtime': 6601.0289, 'train_samples_per_second': 6.179, 'train_steps_per_second': 0.773, 'total_flos': 4.572850895162956e+18, 'train_loss': 0.49674665189256856, 'epoch': 6.0})

In [48]:
# Predict on the test split; the predicted class is the highest-scoring logit
preds_output = trainer.predict(ds_test)
preds = np.argmax(preds_output.predictions, axis=-1)

# Class index -> emotion name (inverse of e_labels)
inv_map = dict(zip(e_labels.values(), e_labels.keys()))
pred_labels = list(map(inv_map.__getitem__, preds))

# Write the submission file: one row per test id with its predicted label
submission = pd.DataFrame({'id': test_df['id'], 'label': pred_labels})
submission.to_csv('submission.csv', index=False)
print("Saved submission.csv")


Saved submission.csv
