In [1]:
from datasets import load_dataset, DatasetDict , Audio,Dataset
import warnings
warnings.filterwarnings("ignore")  # Ignore all warnings
from datasets import Dataset , concatenate_datasets
import numpy as np
import torchvision
torchvision.disable_beta_transforms_warning()
from transformers import (
    WhisperProcessor,
    WhisperForConditionalGeneration,
    Trainer,
    TrainingArguments,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    WhisperFeatureExtractor,
    WhisperTokenizer,
    pipeline
)
from evaluate import load
import torch
from torch.utils.data import Dataset
import os
import torchaudio
from dataclasses import dataclass
from typing import Any, Dict, List, Union
import soundfile as sf
import pandas as pd

In [3]:
import torch

# Check if CUDA is available
if torch.cuda.is_available():
    num_gpus = torch.cuda.device_count()
    print(f"Number of GPUs available: {num_gpus}")
    
    for i in range(num_gpus):
        print(f"\nGPU {i}:")
        print(f"  Name: {torch.cuda.get_device_name(i)}")
        print(f"  Total Memory: {torch.cuda.get_device_properties(i).total_memory / (1024 ** 3):.2f} GB")
        print(f"  Capability: {torch.cuda.get_device_capability(i)}")
else:
    print("CUDA is not available. No GPU detected.")


Number of GPUs available: 2

GPU 0:
  Name: NVIDIA GeForce RTX 3060
  Total Memory: 11.64 GB
  Capability: (8, 6)

GPU 1:
  Name: NVIDIA GeForce RTX 3060
  Total Memory: 11.64 GB
  Capability: (8, 6)


In [4]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

Using device: cuda


In [4]:
from datasets import Dataset, DatasetDict, Audio
import pandas as pd
from tqdm import tqdm

def create_dataset_split(split_path):
    audio_dir = os.path.join(split_path, "audio")
    text_dir = os.path.join(split_path, "text")
    
    # Get all WAV files
    audio_files = sorted([f for f in os.listdir(audio_dir) if f.endswith(".wav")])
    
    data = []
    for audio_file in tqdm(audio_files):
        # Get corresponding text file
        text_file = audio_file.replace(".wav", ".txt")
        text_path = os.path.join(text_dir, text_file)
        
        # Read text
        with open(text_path, "r", encoding="utf-8") as f:
            transcription = f.read().strip()
        
        # Add to dataset
        data.append({
            "audio": os.path.join(audio_dir, audio_file),
            "transcription": transcription
        })
    
    return Dataset.from_pandas(pd.DataFrame(data)).cast_column("audio", Audio())

# Create DatasetDict
dataset = DatasetDict({
    # "train": create_dataset_split("D:\STT_OCR_RAG\\train\\data\\common_voice_11_ar\\train"),
    # "validation": create_dataset_split("D:\STT_OCR_RAG\\train\\data\\common_voice_11_ar\\valid")
     # "train": create_dataset_split("/media/nozom/New Volume1/egy imp data/audio_files/train"), #linux
    "validation": create_dataset_split("/media/nozom/New Volume1/egy imp data/dataset_split_2/valid") #linux
    # "test": create_dataset_split("D:\STT_OCR_RAG\\train\\train\\data\\common_voice_11_ar\\test")
                            })

print(dataset)

100%|██████████| 425/425 [00:00<00:00, 6704.80it/s]

DatasetDict({
    validation: Dataset({
        features: ['audio', 'transcription'],
        num_rows: 425
    })
})





In [None]:
from datasets import load_from_disk, concatenate_datasets, DatasetDict

def load_multiple_arrow_datasets(paths):
    all_splits = {"train": [], "validation": []}

    for root_path in paths:
        for split in ["train", "validation"]:
            split_path = os.path.join(root_path, split)
            if not os.path.exists(split_path):
                continue
            chunk_dirs = sorted([
                os.path.join(split_path, d)
                for d in os.listdir(split_path)
                if os.path.isdir(os.path.join(split_path, d))
            ])
            split_chunks = [load_from_disk(chunk_dir) for chunk_dir in chunk_dirs]
            all_splits[split].extend(split_chunks)

    # دمج كل الـ chunks في كل split
    dataset_splits = {}
    for split, chunks in all_splits.items():
        if chunks:  # تأكد إن فيه بيانات
            dataset_splits[split] = concatenate_datasets(chunks)

    return DatasetDict(dataset_splits)

dataset = load_multiple_arrow_datasets([
    "/media/nozom/New Volume1/egy imp data/data_split_3/arrow",
    "/media/nozom/New Volume1/egy imp data/data_split_2/arrow",
    "/media/nozom/New Volume1/egy imp data/audio_files/arrow_data"
])


In [7]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['audio', 'transcription', 'input_features', 'labels'],
        num_rows: 16856
    })
    validation: Dataset({
        features: ['audio', 'transcription', 'input_features', 'labels'],
        num_rows: 1689
    })
})


In [4]:
# local_model_path = "D:/STT_OCR_RAG/models/large-v3"
# model = WhisperForConditionalGeneration.from_pretrained(local_model_path)
# processor = WhisperProcessor.from_pretrained(local_model_path, language="ar", task="transcribe")

In [5]:
# local_model_path_fine = "D:/STT_OCR_RAG/audio_transcription/voice_to_text/results_large/new/checkpoint-600"

In [6]:
# processor.save_pretrained(local_model_path_fine)

In [8]:
# local_model_path_fine = "D:/STT_OCR_RAG/audio_transcription/voice_to_text/results_small/checkpoint-100"
local_model_path_fine = "/media/nozom/New Volume1/STT_OCR_RAG/audio_transcription/results_small/checkpoint-1140" #linux

In [9]:
model_fine = WhisperForConditionalGeneration.from_pretrained(local_model_path_fine)
model_fine.gradient_checkpointing_enable()

processor_fine = WhisperProcessor.from_pretrained(local_model_path_fine, language="ar", task="transcribe")
feature_extractor_fine = WhisperFeatureExtractor.from_pretrained(local_model_path_fine)
tokenizer_fine = WhisperTokenizer.from_pretrained(local_model_path_fine, language="ar", task="transcribe")
model_fine.generation_config.language = "ar"
model_fine.generation_config.task = "transcribe"
model_fine.generation_config.forced_decoder_ids = None


In [10]:
model_fine.to(device)

WhisperForConditionalGeneration(
  (model): WhisperModel(
    (encoder): WhisperEncoder(
      (conv1): Conv1d(80, 768, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv2): Conv1d(768, 768, kernel_size=(3,), stride=(2,), padding=(1,))
      (embed_positions): Embedding(1500, 768)
      (layers): ModuleList(
        (0-11): 12 x WhisperEncoderLayer(
          (self_attn): WhisperSdpaAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=False)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
        

In [16]:
# model_fine.save_pretrained("D:\STT_OCR_RAG\small")

In [13]:
# processor_fine.save_pretrained("D:\STT_OCR_RAG\small")
# feature_extractor_fine.save_pretrained("D:\STT_OCR_RAG\small")
# tokenizer_fine.save_pretrained("D:\STT_OCR_RAG\small")


('D:\\STT_OCR_RAG\\small\\tokenizer_config.json',
 'D:\\STT_OCR_RAG\\small\\special_tokens_map.json',
 'D:\\STT_OCR_RAG\\small\\vocab.json',
 'D:\\STT_OCR_RAG\\small\\merges.txt',
 'D:\\STT_OCR_RAG\\small\\normalizer.json',
 'D:\\STT_OCR_RAG\\small\\added_tokens.json')

In [11]:
feature_extractor = WhisperFeatureExtractor.from_pretrained(local_model_path_fine)
tokenizer = WhisperTokenizer.from_pretrained(local_model_path_fine, language="ar", task="transcribe")

In [11]:
# input_str = dataset["train"][87]["transcription"]
# labels = tokenizer(input_str).input_ids
# decoded_with_special = tokenizer.decode(labels, skip_special_tokens=False)
# decoded_str = tokenizer.decode(labels, skip_special_tokens=True)

# print(f"Input:                 {input_str}")
# print(f"Decoded w/ special:    {decoded_with_special}")
# print(f"Decoded w/out special: {decoded_str}")
# print(f"Are equal:             {input_str == decoded_str}")


In [21]:
print(dataset["validation"]['transcription'][587])

يجي يعمل فرحه في مصر عايز يتجوز واحد يا ريتني كنت انا يا اخي والله العظيم التقرير الخطير جدا عن الجهاز المركزي للتعبئه العامه والاحصاء بحث الدخل والانس


In [13]:
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))

In [14]:
print(dataset["validation"][0])

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [15]:
def prepare_dataset(batch):
    audio = batch["audio"]
    # Compute input features
    batch["input_features"] = feature_extractor_fine(
        audio["array"],
        sampling_rate=audio["sampling_rate"]
    ).input_features[0]
    # Tokenize transcriptions
    batch["labels"] = tokenizer_fine(batch["transcription"]).input_ids
    return batch
# train_subset = dataset['train'].select(range(1000))
# validation_subset = dataset['validation'].select(range(100))

# dataset = DatasetDict({
#     'train': train_subset,
#     'validation': validation_subset
# })
dataset = dataset.map(prepare_dataset, remove_columns=dataset.column_names["validation"])

Map:   0%|          | 0/425 [00:00<?, ? examples/s]

In [None]:
# model.generation_config.language = "ar"
# model.generation_config.task = "transcribe"
# model.generation_config.forced_decoder_ids = None

In [None]:
# model.to(device)

In [16]:
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any
    decoder_start_token_id: int

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")
        
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")
        
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
        
        if (labels[:, 0] == self.decoder_start_token_id).all().cpu().item():
            labels = labels[:, 1:]
        
        batch["labels"] = labels
        return batch


In [17]:
data_collator = DataCollatorSpeechSeq2SeqWithPadding(
    processor=processor_fine,
    decoder_start_token_id=model_fine.config.decoder_start_token_id
)

In [26]:
wer_metric =load("wer")

def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    label_ids[label_ids == -100] = tokenizer_fine.pad_token_id

    # Decode predictions and labels
    pred_str = tokenizer_fine.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer_fine.batch_decode(label_ids, skip_special_tokens=True)

    wer = 100 * wer_metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

In [32]:
# فك التجميد فقط عن آخر طبقتين مثلاً:
for name, param in model_fine.model.encoder.named_parameters():
    if 'layers.10' in name or 'layers.11' in name:
        param.requires_grad = True
    else:
        param.requires_grad = False


In [20]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./results_small/",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,
    learning_rate=1e-5,
    warmup_steps=10,
    # max_steps=2000,
    num_train_epochs=3,
    # gradient_checkpointing=True,
    logging_dir="./results_small/logs",
    fp16=True,
    eval_strategy="steps",
    eval_steps=200,
    predict_with_generate=True, #false
    generation_max_length=400, #225
    save_steps=200,
    logging_steps=25,
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    # save_total_limit=2,
    remove_unused_columns=False,
    # label_smoothing_factor=0.1,#جديد
    
    # lr_scheduler_type="cosine"#جديد 
)

In [19]:
dataset["validation"]

Dataset({
    features: ['input_features', 'labels'],
    num_rows: 425
})

In [22]:
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model_fine,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor_fine.feature_extractor,
)

In [None]:
trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache = False`...


Step,Training Loss,Validation Loss


In [None]:
#66% before training

In [22]:
torch.cuda.empty_cache()

In [23]:
model_fine.eval()

WhisperForConditionalGeneration(
  (model): WhisperModel(
    (encoder): WhisperEncoder(
      (conv1): Conv1d(80, 768, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv2): Conv1d(768, 768, kernel_size=(3,), stride=(2,), padding=(1,))
      (embed_positions): Embedding(1500, 768)
      (layers): ModuleList(
        (0-11): 12 x WhisperEncoderLayer(
          (self_attn): WhisperSdpaAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=False)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
        

In [34]:
from tqdm import tqdm

def compute_werV(model, dataset, processor, device="cuda", batch_size=4):
    """Compute WER on validation set before training."""
    model.eval()
    all_preds = []
    all_labels = []
    # Compute input features
       
    for i in tqdm(range(0, len(dataset), batch_size)):
        batch = dataset[i:i+batch_size]
        
        # Generate prediction
        input_features = torch.stack([torch.tensor(x) for x in batch["input_features"]]).to(device)
        labels = batch["labels"]  # Already tokenized
        with torch.no_grad():
            pred_ids = model.generate(input_features,
                                     language="ar", 
                                    task="transcribe")
        pred_texts = processor.batch_decode(pred_ids, skip_special_tokens=True)
        label_texts = processor.batch_decode(labels, skip_special_tokens=True)
        all_preds.extend(pred_texts)
        all_labels.extend(label_texts)
        
    # Calculate WER
    wer = wer_metric.compute(predictions=all_preds, references=all_labels)
    # wer=0
    return wer

# Usage
base_wer = compute_werV(model, dataset["validation"], processor)
print(f"Baseline WER before training: {base_wer:.2%}")

NameError: name 'model' is not defined

In [28]:
from tqdm import tqdm

def compute_werV(model, dataset, processor, device="cuda", batch_size=4):
    """Compute WER on validation set before training."""
    model.eval()
    all_preds = []
    all_labels = []
    # Compute input features
       
    for i in tqdm(range(0, len(dataset), batch_size)):
        batch = dataset[i:i+batch_size]
        
        # Generate prediction
        input_features = torch.stack([torch.tensor(x) for x in batch["input_features"]]).to(device)
        labels = batch["labels"]  # Already tokenized
        with torch.no_grad():
            pred_ids = model.generate(input_features,
                                     language="ar", 
                                    task="transcribe")
        pred_texts = processor.batch_decode(pred_ids, skip_special_tokens=True)
        label_texts = processor.batch_decode(labels, skip_special_tokens=True)
        all_preds.extend(pred_texts)
        all_labels.extend(label_texts)
        
    # Calculate WER
    wer = wer_metric.compute(predictions=all_preds, references=all_labels)
    # wer=0
    return wer

# Usage
base_wer = compute_werV(model_fine, dataset["validation"], processor_fine)
print(f"Baseline WER before training: {base_wer:.2%}")

100%|██████████| 27/27 [22:02<00:00, 48.99s/it]


Baseline WER before training: 59.77%


In [25]:
def process_audio(path):
    waveform, sr = torchaudio.load(path)
    if waveform.shape[0] > 1:  # If multi-channel
        waveform = waveform.mean(dim=0, keepdim=True)
    if sr != 16000:
        waveform = torchaudio.functional.resample(waveform, sr, 16000)
    return waveform.squeeze().numpy()

In [26]:
# Test on Arabic audio
audio_path = "D:\\STT_OCR_RAG\\data\\12.wav"
audio = process_audio(audio_path)

# Original model
original_result = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    device="cuda" if torch.cuda.is_available() else "cpu"
)(audio, generate_kwargs={
        "language": "ar",
        "task": "transcribe",
        "return_timestamps": True  # Enable timestamps for long audio
    })


In [31]:
# Fine-tuned model
peft_result = pipeline(
    "automatic-speech-recognition",
    model=model_fine,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
     device="cuda" if torch.cuda.is_available() else "cpu"
)(audio,
  generate_kwargs={
        "language": "ar",
        "task": "transcribe",
        "return_timestamps": True  # Enable timestamps for long audio
    })

print("Original Model:", original_result["text"])
print("\nFine-Tuned Model:", peft_result["text"])

Original Model:  عشان نعرف مين اللي ورا الحدثة بتاعتنا وحسام

Fine-Tuned Model: عشان نعرف مين اللي ورا الحد سابتعت أنا و حسام؟
