<a href="https://colab.research.google.com/github/HamdanXI/nlp_adventure/blob/main/803/w2v2_finetune.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers torchaudio datasets pandas



In [3]:
import torchaudio
from datasets import load_dataset, Dataset, DatasetDict
import pandas as pd
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2Processor
from transformers import TrainingArguments, Trainer
import os

# Path to your folders
audio_folder = "FluencyBank"
label_file = "output.csv"

# Load the labels
labels_df = pd.read_csv(label_file)
labels_df['path'] = labels_df.apply(lambda row: f"FluencyBank_{int(row['EpId']):03}_{row['ClipId']}.wav", axis=1)

# Prepare dataset
def prepare_dataset(row):
    speech_array, sampling_rate = torchaudio.load(row['path'])
    return {"speech": speech_array[0], "sampling_rate": sampling_rate, "labels": row['Output']}

dataset = Dataset.from_pandas(labels_df)
dataset = dataset.map(prepare_dataset, remove_columns=["EpId", "ClipId", "Output", "path"])

# Split dataset into train and test sets
train_test_split = dataset.train_test_split(test_size=0.1)
dataset_dict = DatasetDict({"train": train_test_split['train'], "test": train_test_split['test']})

# Load processor and model
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base")
model = Wav2Vec2ForSequenceClassification.from_pretrained("facebook/wav2vec2-base", num_labels=dataset['train'].features['labels'].num_classes)

# Preprocess data
def preprocess_data(batch):
    input_values = processor(batch["speech"], sampling_rate=batch["sampling_rate"][0], return_tensors="pt").input_values
    return {"input_values": input_values.squeeze(), "labels": batch["labels"]}

train_dataset = dataset_dict["train"].map(preprocess_data, batched=True, batch_size=8, remove_columns=dataset_dict["train"].column_names)
test_dataset = dataset_dict["test"].map(preprocess_data, batched=True, batch_size=8, remove_columns=dataset_dict["test"].column_names)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=1e-4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",
    logging_dir='./logs',
    logging_steps=10,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=processor.feature_extractor,
)

# Train the model
trainer.train()

# You can save the model and processor with the following commands
model.save_pretrained("./wav2vec2-finetuned")
processor.save_pretrained("./wav2vec2-finetuned")

Map:   0%|          | 0/5477 [00:00<?, ? examples/s]

RuntimeError: Failed to open the input "FluencyBank_010_0.wav" (No such file or directory).
Exception raised from get_input_format_context at /__w/audio/audio/pytorch/audio/src/libtorio/ffmpeg/stream_reader/stream_reader.cpp:42 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x7fc4c40f4d87 in /usr/local/lib/python3.10/dist-packages/torch/lib/libc10.so)
frame #1: c10::detail::torchCheckFail(char const*, char const*, unsigned int, std::string const&) + 0x64 (0x7fc4c40a575f in /usr/local/lib/python3.10/dist-packages/torch/lib/libc10.so)
frame #2: <unknown function> + 0x42904 (0x7fc4aa2ca904 in /usr/local/lib/python3.10/dist-packages/torio/lib/libtorio_ffmpeg4.so)
frame #3: torio::io::StreamingMediaDecoder::StreamingMediaDecoder(std::string const&, std::optional<std::string> const&, std::optional<std::map<std::string, std::string, std::less<std::string>, std::allocator<std::pair<std::string const, std::string> > > > const&) + 0x14 (0x7fc4aa2cd304 in /usr/local/lib/python3.10/dist-packages/torio/lib/libtorio_ffmpeg4.so)
frame #4: <unknown function> + 0x3a58e (0x7fc3da49358e in /usr/local/lib/python3.10/dist-packages/torio/lib/_torio_ffmpeg4.so)
frame #5: <unknown function> + 0x32147 (0x7fc3da48b147 in /usr/local/lib/python3.10/dist-packages/torio/lib/_torio_ffmpeg4.so)
frame #6: <unknown function> + 0x15a10e (0x575a22bf610e in /usr/bin/python3)
frame #7: _PyObject_MakeTpCall + 0x25b (0x575a22beca7b in /usr/bin/python3)
frame #8: <unknown function> + 0x168c20 (0x575a22c04c20 in /usr/bin/python3)
frame #9: <unknown function> + 0x165087 (0x575a22c01087 in /usr/bin/python3)
frame #10: <unknown function> + 0x150e2b (0x575a22bece2b in /usr/bin/python3)
frame #11: <unknown function> + 0xf244 (0x7fc4d4c14244 in /usr/local/lib/python3.10/dist-packages/torchaudio/lib/_torchaudio.so)
frame #12: _PyObject_MakeTpCall + 0x25b (0x575a22beca7b in /usr/bin/python3)
frame #13: _PyEval_EvalFrameDefault + 0x6a79 (0x575a22be5629 in /usr/bin/python3)
frame #14: _PyObject_FastCallDictTstate + 0xc4 (0x575a22bebc14 in /usr/bin/python3)
frame #15: <unknown function> + 0x164a64 (0x575a22c00a64 in /usr/bin/python3)
frame #16: _PyObject_MakeTpCall + 0x1fc (0x575a22beca1c in /usr/bin/python3)
frame #17: _PyEval_EvalFrameDefault + 0x6a79 (0x575a22be5629 in /usr/bin/python3)
frame #18: _PyFunction_Vectorcall + 0x7c (0x575a22bf69fc in /usr/bin/python3)
frame #19: _PyEval_EvalFrameDefault + 0x6bd (0x575a22bdf26d in /usr/bin/python3)
frame #20: _PyFunction_Vectorcall + 0x7c (0x575a22bf69fc in /usr/bin/python3)
frame #21: _PyEval_EvalFrameDefault + 0x614a (0x575a22be4cfa in /usr/bin/python3)
frame #22: _PyFunction_Vectorcall + 0x7c (0x575a22bf69fc in /usr/bin/python3)
frame #23: _PyEval_EvalFrameDefault + 0x614a (0x575a22be4cfa in /usr/bin/python3)
frame #24: _PyFunction_Vectorcall + 0x7c (0x575a22bf69fc in /usr/bin/python3)
frame #25: _PyEval_EvalFrameDefault + 0x2a27 (0x575a22be15d7 in /usr/bin/python3)
frame #26: _PyFunction_Vectorcall + 0x7c (0x575a22bf69fc in /usr/bin/python3)
frame #27: _PyEval_EvalFrameDefault + 0x198c (0x575a22be053c in /usr/bin/python3)
frame #28: <unknown function> + 0x177cc2 (0x575a22c13cc2 in /usr/bin/python3)
frame #29: _PyEval_EvalFrameDefault + 0xaa0 (0x575a22bdf650 in /usr/bin/python3)
frame #30: _PyFunction_Vectorcall + 0x7c (0x575a22bf69fc in /usr/bin/python3)
frame #31: PyObject_Call + 0x122 (0x575a22c05492 in /usr/bin/python3)
frame #32: _PyEval_EvalFrameDefault + 0x2a27 (0x575a22be15d7 in /usr/bin/python3)
frame #33: _PyFunction_Vectorcall + 0x7c (0x575a22bf69fc in /usr/bin/python3)
frame #34: PyObject_Call + 0x122 (0x575a22c05492 in /usr/bin/python3)
frame #35: _PyEval_EvalFrameDefault + 0x2a27 (0x575a22be15d7 in /usr/bin/python3)
frame #36: <unknown function> + 0x1687f1 (0x575a22c047f1 in /usr/bin/python3)
frame #37: _PyEval_EvalFrameDefault + 0x198c (0x575a22be053c in /usr/bin/python3)
frame #38: <unknown function> + 0x13f9c6 (0x575a22bdb9c6 in /usr/bin/python3)
frame #39: PyEval_EvalCode + 0x86 (0x575a22cd1256 in /usr/bin/python3)
frame #40: <unknown function> + 0x23ae2d (0x575a22cd6e2d in /usr/bin/python3)
frame #41: <unknown function> + 0x15ac59 (0x575a22bf6c59 in /usr/bin/python3)
frame #42: _PyEval_EvalFrameDefault + 0x6bd (0x575a22bdf26d in /usr/bin/python3)
frame #43: <unknown function> + 0x177ff0 (0x575a22c13ff0 in /usr/bin/python3)
frame #44: _PyEval_EvalFrameDefault + 0x2568 (0x575a22be1118 in /usr/bin/python3)
frame #45: <unknown function> + 0x177ff0 (0x575a22c13ff0 in /usr/bin/python3)
frame #46: _PyEval_EvalFrameDefault + 0x2568 (0x575a22be1118 in /usr/bin/python3)
frame #47: <unknown function> + 0x177ff0 (0x575a22c13ff0 in /usr/bin/python3)
frame #48: <unknown function> + 0x2557af (0x575a22cf17af in /usr/bin/python3)
frame #49: <unknown function> + 0x1662ca (0x575a22c022ca in /usr/bin/python3)
frame #50: _PyEval_EvalFrameDefault + 0x8ac (0x575a22bdf45c in /usr/bin/python3)
frame #51: _PyFunction_Vectorcall + 0x7c (0x575a22bf69fc in /usr/bin/python3)
frame #52: _PyEval_EvalFrameDefault + 0x6bd (0x575a22bdf26d in /usr/bin/python3)
frame #53: _PyFunction_Vectorcall + 0x7c (0x575a22bf69fc in /usr/bin/python3)
frame #54: _PyEval_EvalFrameDefault + 0x8ac (0x575a22bdf45c in /usr/bin/python3)
frame #55: <unknown function> + 0x1687f1 (0x575a22c047f1 in /usr/bin/python3)
frame #56: PyObject_Call + 0x122 (0x575a22c05492 in /usr/bin/python3)
frame #57: _PyEval_EvalFrameDefault + 0x2a27 (0x575a22be15d7 in /usr/bin/python3)
frame #58: <unknown function> + 0x1687f1 (0x575a22c047f1 in /usr/bin/python3)
frame #59: _PyEval_EvalFrameDefault + 0x198c (0x575a22be053c in /usr/bin/python3)
frame #60: <unknown function> + 0x200175 (0x575a22c9c175 in /usr/bin/python3)
frame #61: <unknown function> + 0x15ac59 (0x575a22bf6c59 in /usr/bin/python3)
frame #62: <unknown function> + 0x236bc5 (0x575a22cd2bc5 in /usr/bin/python3)
frame #63: <unknown function> + 0x2b2572 (0x575a22d4e572 in /usr/bin/python3)
