<a href="https://colab.research.google.com/github/HamdanXI/nlp_adventure/blob/main/803/w2v2_finetune_best_DONE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers datasets torch
!pip install transformers[torch]



In [3]:
from datasets import load_dataset
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2Config, Wav2Vec2Processor, TrainingArguments, Trainer
import torch
from sklearn.metrics import accuracy_score

# Load the dataset
dataset = load_dataset("HamdanXI/fb_labeled_v5")

# Filter out invalid entries
def is_audio_valid(batch):
    return batch["audio"] is not None and batch["Output"] is not None

valid_dataset = dataset.filter(is_audio_valid)

# Map labels to zero-indexed
label_list = sorted(set(valid_dataset['train']['Output']))
label_dict = {label: idx for idx, label in enumerate(label_list)}

def map_labels(batch):
    batch["Output"] = label_dict[batch["Output"]]
    return batch

valid_dataset = valid_dataset.map(map_labels)

# Load configuration and create model
NUM_CLASSES = len(label_dict)
config = Wav2Vec2Config.from_pretrained("facebook/wav2vec2-base-960h", num_labels=NUM_CLASSES)
model = Wav2Vec2ForSequenceClassification(config)
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")

# Prepare the dataset
def prepare_dataset(batch):
    audio = batch["audio"]
    inputs = processor(audio["array"], sampling_rate=audio["sampling_rate"], return_tensors="pt", padding=True)
    input_values = inputs.input_values.squeeze()  # Adjust as needed
    labels = torch.tensor(batch["Output"], dtype=torch.long)
    return {"input_values": input_values, "labels": labels}

processed_dataset = valid_dataset.map(prepare_dataset, remove_columns=valid_dataset.column_names['train'])

# Split the dataset
train_test_split = processed_dataset['train'].train_test_split(test_size=0.1)

# Set training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",
    learning_rate=1e-4,
    per_device_train_batch_size=16,
    gradient_accumulation_steps=2,
    num_train_epochs=3,
    save_steps=500,
    eval_steps=500,
    logging_steps=500,
    load_best_model_at_end=True,
)

# Compute metrics function
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc}

import torch
from torch.nn.utils.rnn import pad_sequence

class DataCollatorForWav2Vec2:
    def __init__(self, processor):
        self.processor = processor

    def __call__(self, features):
        # Ensure each input value is a tensor, then pad
        input_values = [torch.tensor(feature['input_values'], dtype=torch.float) for feature in features]
        labels = [feature['labels'] for feature in features]

        # Pad input values
        input_values_padded = pad_sequence(input_values, batch_first=True, padding_value=0.0)  # Update padding value if needed

        # Convert labels to tensors
        labels = torch.tensor(labels, dtype=torch.long)

        return {
            'input_values': input_values_padded,
            'labels': labels
        }

# Initialize the custom data collator with the processor
data_collator = DataCollatorForWav2Vec2(processor=processor)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_test_split['train'],
    eval_dataset=train_test_split['test'],
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()

Step,Training Loss,Validation Loss,Accuracy
500,1.5813,1.552469,0.520591


TrainOutput(global_step=798, training_loss=1.5743725998957354, metrics={'train_runtime': 956.4288, 'train_samples_per_second': 26.731, 'train_steps_per_second': 0.834, 'total_flos': 6.95182282800768e+17, 'train_loss': 1.5743725998957354, 'epoch': 2.9943714821763603})

In [4]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [5]:
trainer.push_to_hub("HamdanXI/w2v2_fb_labeled")

model.safetensors:   0%|          | 0.00/378M [00:00<?, ?B/s]

events.out.tfevents.1714507814.7db393de5383.729.0:   0%|          | 0.00/6.52k [00:00<?, ?B/s]

Upload 8 LFS files:   0%|          | 0/8 [00:00<?, ?it/s]

events.out.tfevents.1714507944.7db393de5383.729.1:   0%|          | 0.00/6.48k [00:00<?, ?B/s]

events.out.tfevents.1714508244.7db393de5383.3206.0:   0%|          | 0.00/4.18k [00:00<?, ?B/s]

events.out.tfevents.1714508420.7db393de5383.4036.0:   0%|          | 0.00/4.18k [00:00<?, ?B/s]

events.out.tfevents.1714508461.7db393de5383.4551.0:   0%|          | 0.00/6.48k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/4.98k [00:00<?, ?B/s]

events.out.tfevents.1714508480.7db393de5383.4551.1:   0%|          | 0.00/7.37k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/HamdanXI/results/commit/287bef80609d6d009a95177b5d1913804ddbb82e', commit_message='HamdanXI/w2v2_fb_labeled', commit_description='', oid='287bef80609d6d009a95177b5d1913804ddbb82e', pr_url=None, pr_revision=None, pr_num=None)

In [6]:
# F1 Score
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)
    f1_scores = f1_score(labels, preds, average=None)  # Returns an array of F1 scores for each class
    return {"accuracy": acc, **{f"f1_score_class_{i}": score for i, score in enumerate(f1_scores)}}

In [10]:
from transformers import TrainingArguments, Trainer

# Setup training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=1e-4,
    per_device_train_batch_size=16,
    num_train_epochs=10,
    logging_dir='./logs',
    logging_steps=10,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_test_split['train'],
    eval_dataset=train_test_split['test'],
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1 Score Class 0,F1 Score Class 1,F1 Score Class 2,F1 Score Class 3,F1 Score Class 4,F1 Score Class 5,F1 Score Class 6,F1 Score Class 7
1,1.624,1.556209,0.520591,0.0,0.0,0.0,0.0,0.0,0.0,0.684722,0.0
2,1.6758,1.563488,0.520591,0.0,0.0,0.0,0.0,0.0,0.0,0.684722,0.0
3,1.5986,1.555887,0.520591,0.0,0.0,0.0,0.0,0.0,0.0,0.684722,0.0
4,1.722,1.55387,0.520591,0.0,0.0,0.0,0.0,0.0,0.0,0.684722,0.0
5,1.7561,1.555756,0.520591,0.0,0.0,0.0,0.0,0.0,0.0,0.684722,0.0
6,1.724,1.554321,0.520591,0.0,0.0,0.0,0.0,0.0,0.0,0.684722,0.0
7,1.485,1.553403,0.520591,0.0,0.0,0.0,0.0,0.0,0.0,0.684722,0.0
8,1.6086,1.552441,0.520591,0.0,0.0,0.0,0.0,0.0,0.0,0.684722,0.0
9,1.4925,1.553025,0.520591,0.0,0.0,0.0,0.0,0.0,0.0,0.684722,0.0
10,1.611,1.552675,0.520591,0.0,0.0,0.0,0.0,0.0,0.0,0.684722,0.0


TrainOutput(global_step=5330, training_loss=1.5694520359862365, metrics={'train_runtime': 3478.8717, 'train_samples_per_second': 24.496, 'train_steps_per_second': 1.532, 'total_flos': 2.32108737424704e+18, 'train_loss': 1.5694520359862365, 'epoch': 10.0})

In [11]:
results = trainer.evaluate()
print(results)

{'eval_loss': 1.5526747703552246, 'eval_accuracy': 0.5205913410770855, 'eval_f1_score_class_0': 0.0, 'eval_f1_score_class_1': 0.0, 'eval_f1_score_class_2': 0.0, 'eval_f1_score_class_3': 0.0, 'eval_f1_score_class_4': 0.0, 'eval_f1_score_class_5': 0.0, 'eval_f1_score_class_6': 0.6847222222222222, 'eval_f1_score_class_7': 0.0, 'eval_runtime': 29.4249, 'eval_samples_per_second': 32.184, 'eval_steps_per_second': 4.044, 'epoch': 10.0}


In [12]:
trainer.push_to_hub("HamdanXI/w2v2_fb_labeled_v2")

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/378M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/4.92k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/HamdanXI/results/commit/e14c66ba7fe35eb33b4c3fb320b5924ed5649d67', commit_message='HamdanXI/w2v2_fb_labeled_v2', commit_description='', oid='e14c66ba7fe35eb33b4c3fb320b5924ed5649d67', pr_url=None, pr_revision=None, pr_num=None)