<a href="https://colab.research.google.com/github/HamdanXI/nlp_adventure/blob/main/803/w2v2_finetune_best_veryClose.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers datasets torch



In [2]:
!pip install transformers[torch]



In [3]:
from datasets import load_dataset

dataset = load_dataset("HamdanXI/fb_labeled_v5")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [4]:
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2Config
import torch

NUM_CLASSES = len(set(dataset['train']['Output']))

# Load the configuration and modify it for sequence classification
config = Wav2Vec2Config.from_pretrained("facebook/wav2vec2-base-960h", num_labels=NUM_CLASSES)

# Create the classification model
model = Wav2Vec2ForSequenceClassification(config)

In [5]:
def is_audio_valid(batch):
    return batch["audio"] is not None and batch["Output"] is not None

# Filter out invalid entries
valid_dataset = dataset.filter(is_audio_valid)

In [6]:
from transformers import Wav2Vec2Processor

processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")

def prepare_dataset(batch):
     # Process audio
    audio = batch["audio"]
    inputs = processor(audio["array"], sampling_rate=audio["sampling_rate"], return_tensors="pt", padding=True)

    # Adjust input_values to maintain batch dimensions
    input_values = inputs.input_values.squeeze(0) if inputs.input_values.ndim == 3 and inputs.input_values.shape[0] == 1 else inputs.input_values

    # Ensure label is a tensor
    label_tensor = torch.tensor(batch["Output"], dtype=torch.long)

    batch["input_values"] = input_values
    batch["labels"] = label_tensor
    return batch

# Map the function across the dataset
processed_dataset = valid_dataset.map(prepare_dataset, remove_columns=dataset.column_names["train"])

In [7]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",
    learning_rate=1e-4,
    per_device_train_batch_size=16,
    gradient_accumulation_steps=2,
    num_train_epochs=3,
    save_steps=500,
    eval_steps=500,
    logging_steps=500,
    load_best_model_at_end=True,
)

In [9]:
from datasets import DatasetDict

# Assuming the entire dataset is loaded as 'train'
train_test_split = dataset["train"].train_test_split(test_size=0.1)  # Adjust the test size as needed

# Now you have a new dataset with both 'train' and 'test' splits
dataset = DatasetDict({
    'train': train_test_split['train'],
    'test': train_test_split['test']
})

In [10]:
from sklearn.metrics import accuracy_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc}

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    compute_metrics=compute_metrics
)

In [11]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [12]:
# dataset.push_to_hub("HamdanXI/fb_labeled_v6_w2v2")

Uploading the dataset shards:   0%|          | 0/2 [00:00<?, ?it/s]

Map:   0%|          | 0/4293 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/43 [00:00<?, ?ba/s]

Map:   0%|          | 0/4292 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/43 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/954 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/10 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/HamdanXI/fb_labeled_v6_w2v2/commit/ca03b380ca22e9f8f2483b15a76fab6286532b63', commit_message='Upload dataset', commit_description='', oid='ca03b380ca22e9f8f2483b15a76fab6286532b63', pr_url=None, pr_revision=None, pr_num=None)

In [13]:
trainer.train()

  return table.fast_gather(key % table.num_rows)


IndexError: list index out of range

In [14]:
print("Train set size:", len(dataset['train']))
print("Test set size:", len(dataset['test']))

Train set size: 8585
Test set size: 954


In [15]:
print(dataset['train'][0])  # Inspect the first sample from the train set
print(dataset['test'][0])   # Inspect the first sample from the test set

{'file_name': 'FluencyBank_046_11.wav', 'audio': {'path': None, 'array': array([ 0.00643921, -0.00021362,  0.0038147 , ...,  0.00787354,
        0.00582886,  0.00402832]), 'sampling_rate': 16000}, 'EpId': '46', 'ClipId': '11', 'Output': 4}
{'file_name': None, 'audio': {'path': None, 'array': array([-0.00997925, -0.01071167, -0.00836182, ...,  0.00909424,
        0.01080322,  0.01177979]), 'sampling_rate': 16000}, 'EpId': '81', 'ClipId': '14', 'Output': 7}
