<h1>Install Libraries<h1>

In [1]:
import transformers
print(transformers.__version__)  # Should show 4.57.3

  from .autonotebook import tqdm as notebook_tqdm


4.57.3


In [2]:
%pip install --upgrade pip

Note: you may need to restart the kernel to use updated packages.


In [3]:
%pip install transformers datasets soundfile librosa torchaudio scikit-learn jiwer seaborn matplotlib noisereduce

Note: you may need to restart the kernel to use updated packages.


<h1>Imports<h1>

In [4]:
import os
import numpy as np
import pandas as pd
import torch
import librosa
import soundfile as sf
import noisereduce as nr

from datasets import Dataset, Audio
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix

import seaborn as sns
import matplotlib.pyplot as plt

from transformers import (
    Wav2Vec2FeatureExtractor,
    Wav2Vec2ForSequenceClassification,
    TrainingArguments,
    Trainer
)

<h1>Load Label CSV<h1>

In [5]:
df = pd.read_csv("C:\\Users\\dilit\\OneDrive - Sri Lanka Institute of Information Technology\\Research\\ai-powered-interview-training-voicebot\\backend\\app\\inputs\\labels_from_filenames.csv")
df = df[["file_path", "emotion_label"]]
df.head()

Unnamed: 0,file_path,emotion_label
0,C:\Users\dilit\OneDrive - Sri Lanka Institute ...,neutral
1,C:\Users\dilit\OneDrive - Sri Lanka Institute ...,neutral
2,C:\Users\dilit\OneDrive - Sri Lanka Institute ...,neutral
3,C:\Users\dilit\OneDrive - Sri Lanka Institute ...,neutral
4,C:\Users\dilit\OneDrive - Sri Lanka Institute ...,calm


<h1>Encode Emotion Labels<h1>

In [6]:
emotion_list = sorted(df["emotion_label"].unique())

label2id = {emotion: i for i, emotion in enumerate(emotion_list)}
id2label = {i: emotion for emotion, i in label2id.items()}

df["label"] = df["emotion_label"].map(label2id)
df.head()

Unnamed: 0,file_path,emotion_label,label
0,C:\Users\dilit\OneDrive - Sri Lanka Institute ...,neutral,5
1,C:\Users\dilit\OneDrive - Sri Lanka Institute ...,neutral,5
2,C:\Users\dilit\OneDrive - Sri Lanka Institute ...,neutral,5
3,C:\Users\dilit\OneDrive - Sri Lanka Institute ...,neutral,5
4,C:\Users\dilit\OneDrive - Sri Lanka Institute ...,calm,1


<h1>Audio Preprocessing Function<h1>

In [7]:
def preprocess_audio(file_path, target_sr=16000):
    # Load audio
    audio, sr = librosa.load(file_path, sr=None)

    # 1. Noise Reduction
    reduced_noise = nr.reduce_noise(y=audio, sr=sr)

    # 2. Trim leading & trailing silence
    trimmed, _ = librosa.effects.trim(reduced_noise, top_db=20)

    # 3. Normalization
    normalized = librosa.util.normalize(trimmed)

    # 4. Resample to 16k Hz
    if sr != target_sr:
        normalized = librosa.resample(normalized, orig_sr=sr, target_sr=target_sr)

    return normalized, target_sr

<h1>Assign Preprocessing to the dataset<h1>

In [31]:
# Add audio loading
dataset = Dataset.from_pandas(df)

def load_and_preprocess(batch):
    audio_array, sr = preprocess_audio(batch["file_path"])
    batch["audio"] = {"array": audio_array, "sampling_rate": sr}
    return batch

dataset = dataset.map(load_and_preprocess)

Map: 100%|██████████| 1320/1320 [03:15<00:00,  6.75 examples/s]
Map: 100%|██████████| 1320/1320 [03:15<00:00,  6.75 examples/s]


<h1>Feature Extraction<h>

In [32]:
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
    "facebook/wav2vec2-large-960h"
)

<h1>Convert Audio to Model Input<h1>

In [33]:
def encode(batch):
    audio = batch["audio"]["array"]
    sampling_rate = batch["audio"]["sampling_rate"]

    # Don't pad here; let the data collator handle padding at batch time.
    inputs = feature_extractor(
        audio,
        sampling_rate=sampling_rate,
        max_length=160000,  # around 10 sec max audio
        truncation=True,
    )
    # Ensure input_values is a plain 1D Python list (not numpy nor nested list)
    iv = inputs.get("input_values")
    # handle numpy arrays
    try:
        import numpy as _np
        if isinstance(iv, _np.ndarray):
            iv = _np.squeeze(iv).tolist()
    except Exception:
        pass

    # handle nested lists like [[...]]
    if isinstance(iv, list) and len(iv) > 0 and isinstance(iv[0], (list, tuple)):
        # flatten 1-element nesting
        if len(iv) == 1:
            iv = iv[0]
        else:
            # otherwise, try to flatten one level
            flat = []
            for e in iv:
                if isinstance(e, (list, tuple)):
                    flat.extend(e)
                else:
                    flat.append(e)
            iv = flat
    inputs["input_values"] = iv
    # Ensure labels are scalars
    inputs["labels"] = int(batch["label"]) if batch.get("label") is not None else None
    return inputs

dataset = dataset.map(
    encode,
    remove_columns=["file_path", "emotion_label", "audio"]
)

Map: 100%|██████████| 1320/1320 [00:19<00:00, 66.74 examples/s]
Map: 100%|██████████| 1320/1320 [00:19<00:00, 66.74 examples/s]


<h1>Train and Test Split<h1>

In [34]:
"""
Diagnostic cell: print types, shapes and sample values for a few dataset entries so we can see what's fed
to the collator and detect any nested/inhomogeneous sequences.
"""
from pprint import pprint
sample_size = min(5, len(train_ds))
for i in range(sample_size):
    s = train_ds[i]
    iv = s.get('input_values')
    lab = s.get('labels') or s.get('label')
    print(f'[{i}] type(input_values)={type(iv)}; sample len=', end='')
    try:
        print(len(iv))
    except Exception:
        print('N/A')
    print(f'    sample type first element: {type(iv[0]) if (hasattr(iv, "__len__") and len(iv)>0) else None}')
    print(f'    label type: {type(lab)}; value: {lab}')
    print('-' * 60)

[0] type(input_values)=<class 'list'>; sample len=1
    sample type first element: <class 'list'>
    label type: <class 'int'>; value: 5
------------------------------------------------------------
[1] type(input_values)=<class 'list'>; sample len=1
    sample type first element: <class 'list'>
    label type: <class 'int'>; value: 7
------------------------------------------------------------
[2] type(input_values)=<class 'list'>; sample len=1
    sample type first element: <class 'list'>
    label type: <class 'int'>; value: 0
------------------------------------------------------------
[3] type(input_values)=<class 'list'>; sample len=1
    sample type first element: <class 'list'>
    label type: <class 'int'>; value: 1
------------------------------------------------------------
[4] type(input_values)=<class 'list'>; sample len=1
    sample type first element: <class 'list'>
    label type: <class 'int'>; value: 1
------------------------------------------------------------


In [35]:
split = dataset.train_test_split(test_size=0.2, seed=42)
train_ds = split["train"]
val_ds = split["test"]

<h1>Load wav2vec2 Model<h1>

In [36]:
model = Wav2Vec2ForSequenceClassification.from_pretrained(
    "facebook/wav2vec2-large-960h",
    num_labels=len(emotion_list),
    label2id=label2id,
    id2label=id2label,
)

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-large-960h and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight', 'wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
from huggingface_hub import hf_hub_download

path = hf_hub_download("facebook/wav2vec2-large-960h", "config.json")
print(path)

'(MaxRetryError('HTTPSConnectionPool(host=\'huggingface.co\', port=443): Max retries exceeded with url: /facebook/wav2vec2-large-960h/resolve/main/config.json (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x000002D769DFB010>: Failed to resolve \'huggingface.co\' ([Errno 11001] getaddrinfo failed)"))'), '(Request ID: 5b0f4ad3-8860-4495-9881-6e79f925cab9)')' thrown while requesting HEAD https://huggingface.co/facebook/wav2vec2-large-960h/resolve/main/config.json
Retrying in 1s [Retry 1/5].
Retrying in 1s [Retry 1/5].
'(MaxRetryError('HTTPSConnectionPool(host=\'huggingface.co\', port=443): Max retries exceeded with url: /facebook/wav2vec2-large-960h/resolve/main/config.json (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x000002D769D84890>: Failed to resolve \'huggingface.co\' ([Errno 11001] getaddrinfo failed)"))'), '(Request ID: c37253ea-a978-4950-9ae7-802fbfca447a)')' thrown while requesting HEAD https://huggingface.co/face

C:\Users\dilit\.cache\huggingface\hub\models--facebook--wav2vec2-large-960h\snapshots\bdeaacdf88f7a155f50a2704bc967aa81fbbb2ab\config.json


<h1>Metrics (Accuracy, Precision, Recall, F1)<h1>

In [37]:
def compute_metrics(pred):
    preds = np.argmax(pred.predictions, axis=1)
    labels = pred.label_ids

    accuracy = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='weighted'
    )

    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

<h1>Training Arguments<h1>

In [15]:
# %pip install --upgrade pip
%pip install --upgrade torch transformers accelerate

Note: you may need to restart the kernel to use updated packages.




In [16]:
import torch
import transformers
import accelerate

print(torch.__version__)
print(transformers.__version__)
print(accelerate.__version__)

2.9.1+cpu
4.57.3
1.12.0


In [17]:
%pip install --upgrade --force-reinstall torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu122

Looking in indexes: https://download.pytorch.org/whl/cu122
Note: you may need to restart the kernel to use updated packages.


ERROR: Could not find a version that satisfies the requirement torch (from versions: none)
ERROR: No matching distribution found for torch


In [18]:
import transformers
print(transformers.__version__)

4.57.3


In [19]:
import sys
print(sys.executable)
%pip list

c:\Users\dilit\AppData\Local\Programs\Python\Python311\python.exe
Package                 Version
----------------------- -----------
accelerate              1.12.0
aiohappyeyeballs        2.6.1
aiohttp                 3.13.2
aiosignal               1.4.0
anyio                   4.12.0
asttokens               3.0.1
attrs                   25.4.0
audioread               3.1.0
certifi                 2025.11.12
cffi                    2.0.0
charset-normalizer      3.4.4
click                   8.3.1
colorama                0.4.6
comm                    0.2.3
contourpy               1.3.3
cycler                  0.12.1
datasets                4.4.1
debugpy                 1.8.17
decorator               5.2.1
dill                    0.4.0
executing               2.2.1
filelock                3.20.0
fonttools               4.61.0
frozenlist              1.8.0
fsspec                  2025.10.0
h11                     0.16.0
httpcore                1.0.9
httpx                   0.28.1
hugging

In [38]:
# Define training arguments in a way that is compatible across Transformers versions
from transformers import TrainingArguments
from packaging import version
import transformers as _transformers

transformers_version = _transformers.__version__

training_kwargs = dict(
    output_dir="./wav2vec2_emotion_preprocessed",
    num_train_epochs=5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    learning_rate=2e-5,
    warmup_steps=200,
    logging_steps=20,
    save_total_limit=2,
    load_best_model_at_end=True,
    fp16=False,  # CPU only
    # Prefer eval_steps/save_steps for backward compatibility
    eval_steps=100,
    save_steps=100,
)

# Add evaluation/save strategy only if the TrainingArguments signature accepts them
import inspect
from transformers import TrainingArguments as _TrainingArguments

# Inspect TrainingArguments signature and filter kwargs to avoid passing unsupported keys
sig = inspect.signature(_TrainingArguments.__init__)
# Allowed param names in TrainingArguments.__init__ (exclude self and kwargs)
allowed_param_names = {name for name in sig.parameters.keys() if name not in ('self', 'kwargs')}

# Add evaluation/save strategy only if supported
# Find the canonical parameter name for evaluation strategy in this Transformers version: `evaluation_strategy` or `eval_strategy`.
eval_param = None
if 'evaluation_strategy' in allowed_param_names:
    eval_param = 'evaluation_strategy'
elif 'eval_strategy' in allowed_param_names:
    eval_param = 'eval_strategy'

if 'save_strategy' in allowed_param_names:
    training_kwargs['save_strategy'] = 'steps'

# If an eval parameter name exists, set it to match the save strategy to allow load_best_model_at_end
if eval_param:
    training_kwargs[eval_param] = 'steps'
else:
    # There is no evaluation strategy parameter; fall back to eval_steps & save_steps only.
    # If load_best_model_at_end requires matching eval/save strategy, switch it off to avoid the ValueError.
    if 'load_best_model_at_end' in training_kwargs and training_kwargs['load_best_model_at_end']:
        print('# NOTE: Disabling load_best_model_at_end because evaluation strategy param is missing in this Transformers version')
        training_kwargs['load_best_model_at_end'] = False

# Filter training_kwargs to only include allowed parameters (prevent TypeError on unknown args)
filtered_kwargs = {k: v for k, v in training_kwargs.items() if k in allowed_param_names}

# Show what will be passed to TrainingArguments (helpful for debugging in notebook)
print('# Filtered TrainingArgs keys:', sorted(filtered_kwargs.keys()))

# Note: If you recently upgraded the `transformers` package in the same kernel, restart the kernel
# to ensure imports/loading reflect the newly installed version (otherwise you may still see old signatures).

training_args = TrainingArguments(**filtered_kwargs)


# Filtered TrainingArgs keys: ['eval_steps', 'eval_strategy', 'fp16', 'learning_rate', 'load_best_model_at_end', 'logging_steps', 'num_train_epochs', 'output_dir', 'per_device_eval_batch_size', 'per_device_train_batch_size', 'save_steps', 'save_strategy', 'save_total_limit', 'warmup_steps']


In [39]:
# Final verification: print Transformers version and check installed TrainingArguments parameters
import inspect
import transformers
from transformers import TrainingArguments
print('Transformers version:', transformers.__version__)
print('\nTrainingArguments signature:')
print(inspect.signature(TrainingArguments.__init__))

# Print the instantiated training_args properties for debugging
try:
    print('\nTraining args loaded:')
    print(' - load_best_model_at_end:', training_args.load_best_model_at_end)
    # mapping for evaluation strategy attribute names
    eval_attr = None
    if hasattr(training_args, 'evaluation_strategy'):
        eval_attr = 'evaluation_strategy'
    elif hasattr(training_args, 'eval_strategy'):
        eval_attr = 'eval_strategy'
    print(' - evaluation strategy attr:', eval_attr)
    if eval_attr:
        print('   ->', getattr(training_args, eval_attr))
    print(' - save_strategy:', getattr(training_args, 'save_strategy', None))
    print(' - eval_steps:', getattr(training_args, 'eval_steps', None))
    print(' - save_steps:', getattr(training_args, 'save_steps', None))
except Exception as e:
    print('Error while printing training args properties:', e)

Transformers version: 4.57.3

TrainingArguments signature:

Training args loaded:
 - load_best_model_at_end: True
 - evaluation strategy attr: eval_strategy
   -> IntervalStrategy.STEPS
 - save_strategy: SaveStrategy.STEPS
 - eval_steps: 100
 - save_steps: 100


<h1>Trainer<h1>

In [40]:
from transformers import DataCollatorWithPadding, Wav2Vec2Processor

processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base")

data_collator = DataCollatorWithPadding(tokenizer=processor, padding=True)

In [41]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    data_collator=data_collator  # correct
)

<h1>Train the Model<h1>

In [42]:
import torch
from dataclasses import dataclass
from typing import Dict, List, Union

@dataclass
class DataCollatorCTCWithPadding:
    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True

    def __call__(self, features: List[Dict[str, Union[List[float], int]]]) -> Dict[str, torch.Tensor]:
        # Extract input_values and labels, convert numpy arrays to Python lists to be padded by the processor
        input_features = []
        label_features = []
        for f in features:
            iv = f.get("input_values")
            # Normalize input_values to 1D Python list: handle numpy arrays, nested lists, or (n,1) shapes
            try:
                import numpy as _np
                # Convert numpy arrays to list then flatten 1-element nesting
                if hasattr(iv, "tolist"):
                    iv = iv.tolist()
                # If it's a nested list like [[...]] or list of arrays, convert & squeeze
                if isinstance(iv, list) and len(iv) > 0 and isinstance(iv[0], (list, tuple, _np.ndarray)):
                    arr = _np.asarray(iv)
                    arr = _np.squeeze(arr)
                    iv = arr.tolist()
            except Exception:
                # Fallback: try to flatten trivial nesting
                if isinstance(iv, list) and len(iv) > 0 and isinstance(iv[0], (list, tuple)):
                    if len(iv) == 1:
                        iv = iv[0]
            input_features.append({"input_values": iv})

            lab = f.get("labels")
            # Ensure labels are ints (sequence classification expects scalar labels)
            try:
                lab_int = int(lab)
            except Exception:
                # Fallback: if it's an array or list, take the first element
                if hasattr(lab, "__len__"):
                    lab_int = int(lab[0])
                else:
                    lab_int = int(lab)
            label_features.append(lab_int)

        # Pad the input_values using processor (processor expects list of lists)
        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            return_tensors="pt",
        )

        # Convert labels to tensor
        batch["labels"] = torch.tensor(label_features, dtype=torch.long)
        return batch

In [3]:
# Load processor
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base")

# Create collator
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    data_collator=data_collator
)

NameError: name 'Wav2Vec2Processor' is not defined

In [44]:
# Quick sanity test: run the collator on a small batch and inspect shapes
from pprint import pprint
try:
    sample_features = [train_ds[i] for i in range(min(4, len(train_ds)))]
    batch = data_collator(sample_features)
    print('Batch keys:', list(batch.keys()))
    for k, v in batch.items():
        try:
            print(k, type(v), v.shape)
        except Exception:
            print(k, type(v))
    print('Sanity check OK — collator produced tensors')
except Exception as e:
    print('Collator test error:', type(e).__name__, e)

Batch keys: ['input_values', 'labels']
input_values <class 'torch.Tensor'> torch.Size([4, 31062])
labels <class 'torch.Tensor'> torch.Size([4])
Sanity check OK — collator produced tensors


In [45]:
trainer.train()

Step,Training Loss,Validation Loss
100,2.0986,2.080244
200,2.0735,2.073848
300,2.0358,2.065224
400,2.1292,2.072175
500,2.0524,2.067372
600,2.0375,2.078077
700,2.0907,2.068405
800,2.0898,2.073665
900,2.1263,2.080536
1000,2.0936,2.062241




TrainOutput(global_step=2640, training_loss=2.0711076259613037, metrics={'train_runtime': 167944.5913, 'train_samples_per_second': 0.031, 'train_steps_per_second': 0.016, 'total_flos': 2.7919664495120093e+17, 'train_loss': 2.0711076259613037, 'epoch': 5.0})

<h1>Save the Train Model<h1>

In [2]:
# STEP 17 — Save final model and processor
trainer.save_model("./wav2vec2_emotion_model")
processor.save_pretrained("./wav2vec2_emotion_model")

NameError: name 'trainer' is not defined

<h1>Evaluate the Model<h1>

In [1]:
metrics = trainer.evaluate()
metrics

NameError: name 'trainer' is not defined