In [None]:
#install torchcodec
#pipenv install "accelerate>=0.26.0"
#pipenv install tensorboard

import json
import os
from typing import Any, List, Dict, Union

import evaluate
import pandas as pd
import torch
from datasets import Dataset, Audio
from datasets import concatenate_datasets, load_from_disk
from transformers import WhisperForConditionalGeneration
from transformers import WhisperProcessor
from transformers import (
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)

# Single checkpoint identifier shared by the model weights and the processor.
MODEL_ID = "openai/whisper-small"

# The processor bundles the feature extractor and the tokenizer used below.
processor = WhisperProcessor.from_pretrained(MODEL_ID, language="English", task="transcribe")
model = WhisperForConditionalGeneration.from_pretrained(MODEL_ID)

# Generation settings: clear any forced decoder prompt ids and set the task.
model.generation_config.forced_decoder_ids = None
model.generation_config.task = 'transcribe'

In [None]:
ROOT_FOLDER = ''

processed_path = os.path.join(ROOT_FOLDER, "data", "Processed_Files")

# Parse the split manifest once; both splits are carved out of this frame.
df_splits = pd.read_csv(os.path.join(processed_path, "file_splits.csv"))

# Mapping of filename -> transcript text.
with open(os.path.join(processed_path, "Transcript.json"), "r", encoding="utf-8") as f:
    transcript_json = json.load(f)


def load_split(split_name):
    """Return the rows of ``df_splits`` belonging to ``split_name``.

    The result is a new frame with a fresh 0..n-1 index (later cells drop rows
    by these index labels) and two added columns:

    * ``text`` - transcript looked up by ``filename`` in ``transcript_json``
      (NaN when the filename has no entry);
    * ``path`` - ``filename`` joined onto ``processed_path``.
    """
    df_split = df_splits[df_splits['split'] == split_name].reset_index(drop=True)
    df_split["text"] = df_split["filename"].map(transcript_json)
    df_split["path"] = df_split["filename"].apply(lambda x: os.path.join(processed_path, x))
    return df_split


df_train = load_split('train')
df_val = load_split('val')

# Uncomment to work on a 10-row subset while debugging.
# df_train = df_train[:10]
# df_val = df_val[:10]

df_train.head(1)

In [None]:
# Index labels of rows excluded from each split. The labels refer to the
# 0..n-1 index assigned when the splits were loaded.
# NOTE(review): the reason these rows are excluded is not recorded here;
# presumably they are the samples with over-long or unusable transcripts - confirm.
TRAIN_ROWS_TO_DROP = [
    53, 151, 159, 239, 253, 263, 299, 448, 480, 483, 627, 716,
    762, 765, 794, 807, 843, 847, 859, 895, 1000, 1010, 1025, 1131,
    1260, 970,
]
VAL_ROWS_TO_DROP = [85]

# In-place removal: the index is not reset afterwards, so running this cell a
# second time raises KeyError because the labels are already gone.
df_train.drop(index=TRAIN_ROWS_TO_DROP, inplace=True)
df_val.drop(index=VAL_ROWS_TO_DROP, inplace=True)



In [None]:
# Display the first validation row as a quick sanity check of the columns.
df_val.head(1)

In [None]:
# # Important preprocessing: the model can only produce sequences up to a maximum length, so we need to
# # remove the labels whose tokenized length exceeds that "maximum sequence length" (1024 in this case).


# df_token_len = df_val[['text']].copy()
# def safe_token_length(x):
#     try:
#         return len(processor.tokenizer(x).input_ids)
#     except Exception:
#         return 0   # if error, set length to 0

# df_token_len['token_length'] = df_token_len['text'].apply(safe_token_length)
# df_token_len[df_token_len['token_length'] == 1024].index
# df_token_len[df_token_len['token_length'] > 1024].index

In [None]:
def prepare_dataset(batch):
    """Add model inputs and label ids to one dataset example.

    Reads the decoded audio stored under ``batch['path']`` (a mapping with
    ``"array"`` and ``"sampling_rate"`` entries) and the transcript under
    ``batch['text']``; writes ``batch['input_features']`` and
    ``batch['labels']`` and returns the same mapping.
    """
    decoded = batch['path']
    waveform = decoded["array"]
    rate = decoded["sampling_rate"]

    # The feature extractor returns a batch; keep the single entry for this clip.
    extracted = processor.feature_extractor(waveform, sampling_rate=rate)
    batch['input_features'] = extracted.input_features[0]

    # Token ids of the transcript become the training labels.
    encoded_text = processor.tokenizer(batch["text"])
    batch['labels'] = encoded_text.input_ids
    return batch

In [None]:
# val_dataset = Dataset.from_pandas(df_val)
# val_dataset = val_dataset.cast_column("path", Audio(sampling_rate=16000))

# val_dataset = val_dataset.map(prepare_dataset, batch_size=4, num_proc=1)

# data_set_path = os.path.join(ROOT_FOLDER, "data", "data_sets",'prepared_val')

# val_dataset.save_to_disk(data_set_path)
# from datasets import load_from_disk
# val_dataset = load_from_disk(data_set_path)

In [None]:
# Display (rows, columns) of the training frame; the row count determines the shard size used below.
df_train.shape

In [None]:
# Shard the training frame, preprocess each shard, and persist it to disk.
df_train_copy = df_train.copy()

# Sharding parameters: the "+ 1" rounds the shard size up so that n_parts
# shards always cover every row (the final shard may be shorter).
n_parts = 14
total_rows = len(df_train_copy)
part_size = total_rows // n_parts + 1

# Every shard is written to its own sub-directory of this folder.
base_out_dir = os.path.join(ROOT_FOLDER, "data", "data_sets")

for i, start in enumerate(range(0, n_parts * part_size, part_size)):
    print(i)
    end = start + part_size
    df_part = df_train_copy.iloc[start:end]

    # DataFrame -> Dataset, decode audio at 16 kHz, then run the preprocessing.
    train_dataset = (
        Dataset.from_pandas(df_part, preserve_index=False)
        .cast_column("path", Audio(sampling_rate=16000))
        .map(prepare_dataset, batch_size=1, num_proc=1)
    )

    # Shard directories are numbered from 1.
    part_name = "prepared_train_part" + str(i + 1)
    data_set_path = os.path.join(base_out_dir, part_name)
    train_dataset.save_to_disk(data_set_path)

    print(f"Saved: {part_name} -> rows [{start}:{end})")

# NOTE: once the loop finishes, `train_dataset` refers to the final shard only.


In [None]:
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate prepared examples into one padded batch.

    Args:
        processor: object exposing ``feature_extractor.pad`` and
            ``tokenizer.pad``.
        decoder_start_token_id: token id that, when it is the first label of
            every example in the batch, is stripped from the labels.
        max_label_length: label sequences are cut to this many tokens before
            padding. Defaults to 448, the value that used to be hard-coded
            (presumably the decoder's position limit - confirm against the
            model config).
    """

    def __init__(self, processor: Any, decoder_start_token_id: int, max_label_length: int = 448):
        self.processor = processor
        self.decoder_start_token_id = decoder_start_token_id
        self.max_label_length = max_label_length

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Pad ``input_features`` and ``labels`` of ``features`` separately.

        Returns the padded audio batch with an added ``'labels'`` tensor in
        which padded positions are set to -100.
        """
        # Audio inputs and text labels are padded by different components.
        input_features = [{'input_features': feature['input_features']} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors='pt')

        label_features = [
            {'input_ids': feature['labels'][:self.max_label_length]} for feature in features
        ]
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors='pt')

        # Mark padded positions with -100.
        labels = labels_batch['input_ids'].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # Strip the leading start token only when every row begins with it.
        # The width check avoids indexing column 0 of an empty label tensor.
        if labels.shape[1] > 0 and (labels[:, 0] == self.decoder_start_token_id).all().item():
            labels = labels[:, 1:]

        batch['labels'] = labels

        return batch


# Metric object used by compute_metrics.
metric = evaluate.load('wer')

# Batch collator bound to the shared processor and the model's decoder start token.
data_collator = DataCollatorSpeechSeq2SeqWithPadding(
    decoder_start_token_id=model.config.decoder_start_token_id,
    processor=processor,
)
 
def compute_metrics(pred):
    """Score generated token ids against the reference label ids.

    Args:
        pred: object with ``predictions`` (generated token ids) and
            ``label_ids`` (reference ids, with -100 at padded positions).

    Returns:
        ``{'wer': value}`` where ``value`` is 100 times the result of the
        ``metric`` computed over the decoded strings.
    """
    # Use the tokenizer owned by the shared processor; no standalone
    # `tokenizer` name is defined anywhere in this notebook.
    tokenizer = processor.tokenizer

    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # -100 is not a real token id, so swap it for the pad token before
    # decoding (this edits pred.label_ids in place).
    label_ids[label_ids == -100] = tokenizer.pad_token_id

    # Special tokens are skipped so only the text itself is compared.
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wer = 100 * metric.compute(predictions=pred_str, references=label_str)

    return {'wer': wer}


In [None]:
# Training hyper-parameters consumed by the Seq2SeqTrainingArguments cell below.
# Directory passed as the trainer's output_dir.
out_dir = os.path.join(ROOT_FOLDER, 'finetuned_model_files')
# Used for both per_device_train_batch_size and per_device_eval_batch_size.
batch_size = 1
# Passed as gradient_accumulation_steps.
gradient_accum = 8
# Passed as num_train_epochs.
epochs = 10
# Passed as dataloader_num_workers.
dataloader_num = 8

In [None]:
training_args = Seq2SeqTrainingArguments(
    # --- output and reporting ---
    output_dir=out_dir,
    report_to=['tensorboard'],
    save_total_limit=2,

    # --- batching and data loading ---
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accum,
    dataloader_num_workers=dataloader_num,

    # --- optimisation ---
    # NOTE(review): the plain 'constant' schedule does not appear to apply a
    # warmup phase, so warmup_steps may have no effect here - confirm whether
    # 'constant_with_warmup' was intended.
    learning_rate=1e-5,
    lr_scheduler_type='constant',
    warmup_steps=1000,
    num_train_epochs=epochs,

    # --- precision ---
    fp16=True,
    bf16=False,

    # --- per-epoch evaluation, logging and checkpointing ---
    eval_strategy='epoch',
    logging_strategy='epoch',
    save_strategy='epoch',

    # --- generation-based evaluation and best-checkpoint selection ---
    predict_with_generate=True,
    generation_max_length=225,
    metric_for_best_model='wer',
    greater_is_better=False,  # a lower 'wer' is treated as better
    load_best_model_at_end=True,

    # --- reproducibility ---
    seed=42,
    data_seed=42,
)

In [None]:
# Rebuild the datasets from the copies saved on disk. After the sharding loop
# `train_dataset` refers to the final shard only, and `val_dataset` is never
# assigned by any active cell, so both are reloaded here explicitly.
train_parts = [
    load_from_disk(os.path.join(base_out_dir, f"prepared_train_part{part}"))
    for part in range(1, n_parts + 1)
]
# Skip zero-row shards before concatenating.
train_dataset = concatenate_datasets([part for part in train_parts if len(part) > 0])

# Assumes the validation set was previously prepared and saved under
# "prepared_val" (the path used by the commented-out preparation cell).
val_dataset = load_from_disk(os.path.join(base_out_dir, "prepared_val"))

# NOTE(review): newer transformers releases appear to prefer
# `processing_class=` over `tokenizer=` - confirm against the installed
# version before switching.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor.feature_extractor,
)

In [None]:
	
# Start fine-tuning with the trainer configured above.
trainer.train()