In [None]:
import torch
from dataclasses import dataclass
from typing import Any, Dict, List, Union
import os
from pathlib import Path
import pandas as pd
from datasets import load_dataset, Dataset, Audio,concatenate_datasets
from transformers import WhisperProcessor, WhisperForConditionalGeneration, pipeline
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
import evaluate
import torchaudio
# Use the first CUDA GPU when torch reports one is available, otherwise the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"



  from .autonotebook import tqdm as notebook_tqdm


## Importing data

In [None]:
def get_audio_file_paths(base_path_str: str) -> dict:
    """Index the audio files stored under ``<base_path_str>/processed data``.

    Expected layout: ``processed data/<language>/<group>/<audio file>``. Only
    files sitting directly inside a group folder are collected (no recursion),
    and only those whose extension (case-insensitive) is a known audio format.

    Args:
        base_path_str: Folder that contains the ``processed data`` directory.

    Returns:
        ``{language: {group: [absolute file path, ...]}}``. Every language
        folder gets an entry (possibly an empty dict) and every group folder
        gets a list (possibly empty). An empty dict is returned when the
        ``processed data`` directory does not exist.

        Languages, groups and file paths are all sorted: directory listings
        come back in arbitrary filesystem order, and an unsorted index would
        make the dataset row order (and any seeded split) differ between runs.
    """
    audio_extensions = {'.wav', '.mp3', '.flac', '.m4a', '.ogg', '.opus'}

    processed_dir = Path(base_path_str) / "processed data"
    if not processed_dir.is_dir():
        return {}

    audio_paths = {}
    for lang_dir in sorted(processed_dir.iterdir()):
        if not lang_dir.is_dir():
            continue

        groups = {}
        for sub_dir in sorted(lang_dir.iterdir()):
            if not sub_dir.is_dir():
                continue

            groups[sub_dir.name] = sorted(
                str(entry.resolve())
                for entry in sub_dir.iterdir()
                if entry.is_file() and entry.suffix.lower() in audio_extensions
            )

        audio_paths[lang_dir.name] = groups

    return audio_paths

In [3]:
# Root folder of the recordings; it must contain the "processed data" directory.
# Set the WHISPER_MATH_DATA_DIR environment variable to run the notebook on
# another machine; without it the original author's local checkout is used.
data_path = os.environ.get(
    "WHISPER_MATH_DATA_DIR",
    r"C:\Users\lucar-work\Documents\GitHub\whisper-math\data",
)
# {language: {group: [audio paths]}} index of everything found under data_path.
todos_os_arquivos = get_audio_file_paths(data_path)

In [4]:
# Reference transcripts: one string per reading prompt (A, B, C) and per language.
# They are looked up by prompt letter through `transcriptions_arabic` /
# `transcriptions_english`, and `dataset_g` attaches the same string to every
# recording listed under that letter.
# NOTE(review): the [CMD] / [CHK] markers look like annotation tags rather than
# spoken words, yet they end up in the training targets — confirm this is intended.
# NOTE(review): text_a_english carries no [CMD] markers, unlike prompts B and C.

# Prompt A
text_a_english = "zero five twelve ninety-nine one hundred and five 2 plus 7 18 minus 4 6 times 3 20 divided by 5 ten plus thirty minus eight negative fifteen plus nine three to the power of two square root of sixteen clear equals repeat"
text_a_arabic = "احسب خمسة زائد اثنين عشرة ناقص ثلاثة ستة ضرب أربعة عشرون قسمة خمسة سالب سبعة زائد واحد خمسة أس اثنين الجذر التربيعي لأربعة وعشرين امسح [CMD] تأكيد [CMD] أعِد [CMD] calculate 37 plus خمسة اطرح twelve من عشرة اضرب ثلاثة في twenty eighty divided by ثمانية اجمع ١٢ و ١٣ سبعة زائد ١٩ 45 minus تسعة 3.5 plus اثنين ونصف واحد فاصلة خمسة ضرب أربعة مية واثنا عشر ناقص ستة 1000 minus 250 999 plus 1 قل اللون: أزرق [CHK]"

# Prompt B
text_b_english = 'one eight seventeen sixty-four one hundred and twenty 4 plus 9 22 minus 7 9 times 5 81 divided by 9 thirty plus fifty negative six minus ten plus three two to the power of five cube root of twenty-seven start [CMD] stop [CMD] undo [CMD]'
text_b_arabic = 'اجمع سبعة و تلاتين مع 12 خمسة وأربعون ناقص عشرين تسعة ضرب ستة أربعة وستون قسمة ثمانية سالب ثلاثة زائد خمسة اثنان أس ثلاثة الجذر التكعيبي لسبعة وعشرين امسح الشاشة [CMD] تم [CMD] كرر آخر عملية [CMD] calculate twelve times خمسة اقسم 36 على ستة اطرح خمسة من twenty fifty plus سبعة اجمع ١٠٠ و ٢٥ مئتان ناقص ٩٩ 14 minus أربعة اثنين فاصلة خمسة زائد 0.5 7.25 divided by خمسة أربع مية وخمسة ناقص عشرة 500 plus 500 1234 minus 234 قل اللون: أخضر [CHK]'

# Prompt C
text_c_english = "two nine eleven seventy-three two hundred and three 8 plus 6 40 minus 12 7 times 7 90 divided by 10 twenty plus fifteen negative nine minus twenty plus eight five to the power of three square root of one hundred confirm [CMD] repeat last [CMD] slower please [CMD]"
text_c_arabic = "احسب 23 زائد 15 سبعة ناقص اثنين ثلاثة ضرب تسعة ستة وثلاثون قسمة أربعة سالب اثنا عشر زائد عشرة عشرة أس اثنين الجذر التربيعي لتسعة افتح [CMD] رجوع [CMD] أعد الحساب [CMD] calculate twenty minus ثلاثة اجمع five و خمسة اضرب 8 في twenty-one thirty divided by ثلاثة اجمع ٧ و ١١ أربعون ناقص ١٨ 16 plus سبعة واحد فاصلة خمسة ناقص 0.25 2.2 times اثنين تسعمية وتسعة وتسعين زائد واحد 1500 minus 300 333 plus 667 قل اللون: أحمر [CHK]"


In [5]:
# Scan the data folder, then keep the {group: [audio paths]} mapping of each
# language. A missing 'arabic' or 'english' folder surfaces here as a KeyError.
all_files = get_audio_file_paths(data_path)
arabic_files = all_files['arabic']
english_files = all_files['english']

In [6]:
# Prompt letter -> reference transcript, one mapping per language. The letters
# are the keys `dataset_g` uses to look up the text for each group of files.
transcriptions_arabic = dict(
    A=text_a_arabic,
    B=text_b_arabic,
    C=text_c_arabic,
)

transcriptions_english = dict(
    A=text_a_english,
    B=text_b_english,
    C=text_c_english,
)

In [7]:
def dataset_g(transcriptions, files, language: str):
    """Build one row per audio file, paired with the transcript of its label.

    Args:
        transcriptions: Mapping ``label -> transcript`` (a single string per label).
        files: Mapping ``label -> list of audio file paths``.
        language: Value written to the ``Language`` column of every row.

    Returns:
        A ``pandas.DataFrame`` with the columns ``audio``, ``transcription`` and
        ``Language`` (in that order). Rows follow the iteration order of
        ``files`` and, within a label, the order of its path list. When there
        are no files at all the frame is empty but still has the three columns.

    Raises:
        KeyError: If a label present in ``files`` has no entry in
            ``transcriptions``.
    """
    columns = ['audio', 'transcription', 'Language']

    rows = []
    for label, paths in files.items():
        # Looked up even when the label has no files, so a missing transcript
        # is reported instead of being silently skipped.
        text = transcriptions[label]
        rows.extend(
            {'audio': file_path, 'transcription': text, 'Language': language}
            for file_path in paths
        )

    # Explicit columns keep the schema stable for an empty `rows` list.
    return pd.DataFrame(rows, columns=columns)


In [8]:
# One table per language: a row for every audio file, carrying its transcript
# and the language name.
df_arabic = dataset_g(transcriptions_arabic, arabic_files, language='arabic')
df_english = dataset_g(transcriptions_english, english_files, language='english')

In [9]:
def generate_audio_dataset(df_arabic, df_english):
    """
    Combine the Arabic and English tables into one Hugging Face ``Dataset``.

    The two frames are stacked row-wise (Arabic rows first) under a fresh
    0..n-1 index, converted with ``Dataset.from_pandas``, and the ``audio``
    column is cast to ``Audio(sampling_rate=16000)``.

    The frames only hold file paths. Per the original author's note, the cast
    is meant to defer audio loading until an example is accessed, to avoid
    exhausting Jupyter's memory.
    """
    combined = pd.concat([df_arabic, df_english], axis=0, ignore_index=True)

    # Build the dataset from paths only, then declare the column as 16 kHz audio.
    audio_feature = Audio(sampling_rate=16000)
    return Dataset.from_pandas(combined).cast_column("audio", audio_feature)

## Generating dataset

In [10]:
# Build the combined Arabic + English dataset; the bare name on the last line
# makes the notebook display its summary (features and row count).
df_final = generate_audio_dataset(df_arabic, df_english)
df_final

Dataset({
    features: ['audio', 'transcription', 'Language'],
    num_rows: 80
})

## Fine-Tuning

### Processing dataset

In [11]:
# Hold out 10% of the examples for evaluation. The fixed seed keeps the split,
# and therefore the reported WER, the same on every Restart & Run All.
# NOTE: this rebinds `df_final` from the single dataset to the train/test split,
# so the cell must be run only once after the cell that builds `df_final`.
df_final = df_final.train_test_split(test_size=0.1, seed=42)

In [None]:
# Checkpoint used for both the processor and the model.
model_name = "openai/whisper-small"

# Processor set up for the "transcribe" task; its `feature_extractor` and
# `tokenizer` are used by the preprocessing, collation and metric cells.
processor = WhisperProcessor.from_pretrained(model_name, task="transcribe")

model = WhisperForConditionalGeneration.from_pretrained(model_name)

# Disable the fixed language (very important): clear the decoder ids forced
# through the model config.
# NOTE(review): newer transformers releases may read forced decoder ids from
# `model.generation_config` instead — confirm this line still has the intended effect.
model.config.forced_decoder_ids = None

In [13]:
def preprocess_function(batch):
    """Turn one dataset example into Whisper model inputs.

    Args:
        batch: A single example holding a decoded ``audio`` entry (with
            ``array`` and ``sampling_rate``) and a ``transcription`` string.

    Returns:
        The same mapping with two entries added: ``input_features`` (first
        item of the feature extractor output for this audio) and ``labels``
        (token ids of the transcription).
    """
    audio = batch["audio"]

    # Extract the audio features. The sampling rate is taken from the decoded
    # audio instead of being hard-coded, so the feature extractor can check it
    # against the rate it expects rather than silently trusting 16 kHz.
    batch["input_features"] = processor.feature_extractor(
        audio["array"],
        sampling_rate=audio["sampling_rate"],
    ).input_features[0]

    # Tokenize the transcription.
    batch["labels"] = processor.tokenizer(batch["transcription"]).input_ids
    return batch

In [14]:
# Apply preprocess_function to every example of every split and drop the
# original columns, keeping what the function adds (`input_features`, `labels`).
dataset = df_final.map(preprocess_function, remove_columns=df_final["train"].column_names)

Map: 100%|██████████| 72/72 [00:03<00:00, 20.58 examples/s]
Map: 100%|██████████| 8/8 [00:00<00:00, 21.18 examples/s]


In [None]:
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate preprocessed examples into a single padded batch of tensors.

    Attributes:
        processor: Object exposing ``feature_extractor.pad`` and ``tokenizer.pad``.
        decoder_start_token_id: Token id removed from the front of the labels
            when every sequence in the batch starts with it.
    """

    processor: Any
    decoder_start_token_id: int

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Pad audio features and label ids separately and merge them.

        Args:
            features: Examples carrying ``input_features`` and ``labels``.

        Returns:
            The padded feature batch with a ``labels`` tensor added, in which
            padded positions are set to -100.
        """
        # Audio inputs and label ids have different lengths and are padded by
        # different components, so each goes through its own `pad` call.
        audio_inputs = [{"input_features": example["input_features"]} for example in features]
        batch = self.processor.feature_extractor.pad(audio_inputs, return_tensors="pt")

        label_inputs = [{"input_ids": example["labels"]} for example in features]
        padded_labels = self.processor.tokenizer.pad(label_inputs, return_tensors="pt")

        # Positions introduced by padding become -100 so the loss ignores them.
        is_padding = padded_labels.attention_mask.ne(1)
        labels = padded_labels["input_ids"].masked_fill(is_padding, -100)

        # If tokenization already put the start token in front of every
        # sequence, drop it here because it is added again later.
        starts_with_bos = labels[:, 0] == self.decoder_start_token_id
        if bool(starts_with_bos.all()):
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch


In [16]:
# Collator instance used for batching; the start-token id comes from the model
# config so the collator can strip it from labels that already begin with it.
data_collator = DataCollatorSpeechSeq2SeqWithPadding(
    processor=processor,
    decoder_start_token_id=model.config.decoder_start_token_id,
)


### Defining evaluation metrics

In [None]:
metric = evaluate.load("wer")

def compute_metrics(pred):
    """Compute the word error rate, in percent, of the generated transcripts.

    Args:
        pred: Evaluation output exposing ``predictions`` (generated token ids)
            and ``label_ids`` (reference token ids, with -100 on ignored
            positions).

    Returns:
        ``{"wer": value}``, where ``value`` is 100 times the WER reported by
        ``metric`` for the decoded predictions against the decoded references.
    """
    import numpy as np  # local import: numpy is not imported at the top of the notebook

    pad_token_id = processor.tokenizer.pad_token_id

    # -100 is not a real token id and cannot be decoded, so swap it for the pad
    # token in both arrays. np.where builds new arrays, leaving the caller's
    # `pred.label_ids` / `pred.predictions` untouched.
    raw_pred_ids = np.asarray(pred.predictions)
    raw_label_ids = np.asarray(pred.label_ids)
    pred_ids = np.where(raw_pred_ids != -100, raw_pred_ids, pad_token_id)
    label_ids = np.where(raw_label_ids != -100, raw_label_ids, pad_token_id)

    # we do not want to group tokens when computing the metrics
    pred_str = processor.tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = processor.tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wer = 100 * metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}


### Training

In [None]:
# Training configuration.
# NOTE(review): the preprocessing output above reports 72 training examples.
# With 2 examples per device and 8 accumulation steps that is about 4-5
# optimizer steps per pass over the data (assuming a single device), so
# max_steps=5000 and warmup_steps=500 imply a very large number of epochs —
# confirm these values are intended for a dataset of this size.
training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-finetuned",  # change to a repo name of your choice
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,  # increase by 2x for every 2x decrease in batch size
    learning_rate=5e-6,
    warmup_steps=500,
    max_steps=5000,
    gradient_checkpointing=True,
    fp16=False,
    eval_strategy="steps",
    per_device_eval_batch_size=2,
    # Evaluate by generating text, so compute_metrics receives token ids to decode.
    predict_with_generate=True,
    generation_max_length=225,
    # Checkpoints are saved on the same schedule as evaluations.
    save_steps=1000,
    eval_steps=1000,
    logging_steps=25,
    # Best checkpoint = lowest "wer", the key returned by compute_metrics.
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    push_to_hub=False,
)


# NOTE(review): the saved output shows a warning that points at this constructor
# call (its message is cut off) — presumably about the `tokenizer=` argument;
# confirm against the installed transformers version.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor.feature_extractor,
)

# Starts the training run (the saved output ends with a KeyboardInterrupt).
trainer.train()


  trainer = Seq2SeqTrainer(
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache = False`...


Step,Training Loss,Validation Loss


KeyboardInterrupt: 