In [None]:
!pip install datasets transformers soundfile torchaudio

In [None]:
import os

from google.colab import userdata

# Export the token instead of leaving `userdata.get(...)` as the last expression
# of the cell: a bare expression is echoed as cell output, which would store the
# secret inside the saved notebook. Hugging Face libraries read HF_TOKEN from the
# environment when downloading gated datasets and models.
os.environ["HF_TOKEN"] = userdata.get('HF_TOKEN')

In [None]:
from datasets import load_dataset

# Load the "ar" configuration of Common Voice 11.0 without selecting a split.
# NOTE(review): the next cell calls load_dataset again with an explicit split
# and rebinds `dataset`, so the object created here is not used afterwards.
dataset = load_dataset(
    path="mozilla-foundation/common_voice_11_0",
    name="ar",
)

In [None]:
from datasets import load_dataset
from transformers import Wav2Vec2Processor
import torchaudio

# Load the "ar" configuration of Common Voice 11.0, requesting the train and
# validation splits together as a single dataset.
dataset = load_dataset(
    path="mozilla-foundation/common_voice_11_0",
    name="ar",
    split="train+validation",
)

def resample_audio(batch, target_sr=16000):
    """Load one clip from disk and store its first channel at ``target_sr`` Hz.

    Args:
        batch: A single dataset example; ``batch["path"]`` must be a path that
            ``torchaudio.load`` can read.
        target_sr: Sampling rate in Hz that the stored waveform should have.
            Defaults to 16000, the rate passed to the processor elsewhere in
            this notebook.

    Returns:
        The same ``batch`` with ``batch["audio"]`` replaced by the first channel
        of the clip as a NumPy array, resampled when the file's own rate differs
        from ``target_sr``.
    """
    waveform, sample_rate = torchaudio.load(batch["path"])
    # Previously the file was only loaded, never resampled, so the waveform kept
    # its native rate while later cells declare it to be 16 kHz.
    if sample_rate != target_sr:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_sr)
        waveform = resampler(waveform)
    # Keep only the first channel, as before.
    batch["audio"] = waveform[0].numpy()
    return batch

# Apply the loader example by example so every row carries its waveform array.
dataset = dataset.map(function=resample_audio)


In [None]:
import re

# Function to clean and normalize text
def clean_text(batch):
    """Derive a normalized transcript from ``batch["sentence"]``.

    Every character other than a letter in the range ء-ي or a plain space is
    removed. Runs of spaces left behind by the removed characters are collapsed
    to one space and the result is stripped, so deleted punctuation does not
    turn into extra word separators in the training labels.

    Args:
        batch: A single example containing the raw transcript under ``"sentence"``.

    Returns:
        The same ``batch`` with the cleaned transcript stored under ``"text"``.
    """
    kept = re.sub("[^ء-ي ]", "", batch["sentence"])  # Remove non-Arabic chars
    # Only spaces can remain as whitespace here, so split/join both collapses
    # repeated spaces and trims the ends.
    batch["text"] = " ".join(kept.split())
    return batch

# Normalize every transcript, spreading the work over four worker processes.
dataset = dataset.map(
    function=clean_text,
    num_proc=4,
)


In [None]:
import json

from transformers import Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor, Wav2Vec2Processor

# facebook/wav2vec2-base is not an Arabic checkpoint, so it cannot supply a
# tokenizer for the Arabic transcripts. Build a character-level vocabulary from
# the cleaned text and assemble the processor from it instead.
vocab_chars = sorted(set("".join(dataset["text"])) - {" "})
vocab = {char: index for index, char in enumerate(vocab_chars)}
vocab["|"] = len(vocab)      # word delimiter, used in place of the space
vocab["[UNK]"] = len(vocab)
vocab["[PAD]"] = len(vocab)  # also serves as the CTC blank token

with open("vocab.json", "w", encoding="utf-8") as vocab_file:
    json.dump(vocab, vocab_file, ensure_ascii=False)

tokenizer = Wav2Vec2CTCTokenizer(
    "vocab.json",
    unk_token="[UNK]",
    pad_token="[PAD]",
    word_delimiter_token="|",
)
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-base", sampling_rate=16000)
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

# Work on the first half of the data.
small_dataset = dataset.select(range(int(len(dataset)*.5)))

def batch_tokenize(batch):
    """Turn a batch of waveforms and transcripts into model inputs and label ids.

    No padding is applied here. Padding inside ``map`` would pad every example
    to the longest item of the map batch and store the padded data, and label
    padding written with the tokenizer's pad id cannot later be told apart from
    real targets. Padding belongs in the data collator, per training batch.

    Args:
        batch: A batched slice of the dataset with ``"audio"`` (a list of
            waveforms) and ``"text"`` (a list of transcripts).

    Returns:
        The same ``batch`` with ``"input_values"`` and ``"labels"`` added, each
        holding one variable-length sequence per example.
    """
    batch["input_values"] = processor(
        batch["audio"],
        sampling_rate=16000,
    ).input_values
    batch["labels"] = processor.tokenizer(batch["text"]).input_ids
    return batch

# Encode the subset in batches and drop the raw audio and sentence columns.
small_dataset = small_dataset.map(
    function=batch_tokenize,
    batched=True,
    remove_columns=["audio", "sentence"],
)


In [None]:
from transformers import Wav2Vec2ForCTC, TrainingArguments, Trainer

# Load the facebook/wav2vec2-base weights into a CTC model whose vocabulary size
# and padding id are taken from the processor's tokenizer.
model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-base",
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=len(processor.tokenizer),
    ctc_loss_reduction="mean",
)

# Training configuration.
# Periodic evaluation is left at its default (off): this notebook gives the
# Trainer no eval_dataset, and requesting step-wise evaluation without one makes
# the Trainer fail. The test split is evaluated explicitly in a later cell.
training_args = TrainingArguments(
    output_dir="./wav2vec2-ar",
    logging_steps=100,
    save_steps=500,
    per_device_train_batch_size=16,
    gradient_accumulation_steps=2,  # effective batch of 32 examples per device
    num_train_epochs=3,
    learning_rate=1e-4,
    warmup_steps=500,
    save_total_limit=2,
    fp16=True,
)

class CTCPaddingCollator:
    """Pad a list of encoded examples into one training batch for CTC.

    The processor has no ``data_collator`` attribute, so batching is done here.
    Audio and labels are padded separately because they differ in length and
    padding value. Padded label positions are set to -100 so the CTC loss
    ignores them.
    """

    def __init__(self, processor):
        self.processor = processor

    def __call__(self, features):
        audio_features = [{"input_values": item["input_values"]} for item in features]
        label_features = [{"input_ids": item["labels"]} for item in features]

        batch = self.processor.feature_extractor.pad(
            audio_features, padding=True, return_tensors="pt"
        )
        label_batch = self.processor.tokenizer.pad(
            label_features, padding=True, return_tensors="pt"
        )
        batch["labels"] = label_batch["input_ids"].masked_fill(
            label_batch["attention_mask"].ne(1), -100
        )
        return batch


# Train on `small_dataset`, the tokenized subset; `dataset` itself has no
# "input_values" or "labels" columns.
trainer = Trainer(
    model=model,
    data_collator=CTCPaddingCollator(processor),
    args=training_args,
    train_dataset=small_dataset,
    tokenizer=processor.feature_extractor
)



In [None]:
train_result = trainer.train()

# Save the fine-tuned weights and the full processor (feature extractor and
# tokenizer) to "./model", the directory the inference cell loads from.
# Training alone only writes checkpoints under the Trainer's output_dir.
trainer.save_model("./model")
processor.save_pretrained("./model")

train_result


In [None]:
# Prepare the test split with the same steps as the training data.
test_dataset = load_dataset("mozilla-foundation/common_voice_11_0", "ar", split="test")
test_dataset = (
    test_dataset
    .map(resample_audio)
    .map(clean_text)
    # `tokenize` was never defined; the encoder in this notebook is
    # `batch_tokenize`, which works on lists of examples and so needs batched=True.
    .map(batch_tokenize, batched=True, remove_columns=["audio", "sentence"])
)

results = trainer.evaluate(test_dataset)
print(f"Test Results: {results}")


In [None]:
# NOTE(review): this cell repeats the previous evaluation cell and can be
# deleted; it is kept runnable here rather than removed.
test_dataset = load_dataset("mozilla-foundation/common_voice_11_0", "ar", split="test")
test_dataset = (
    test_dataset
    .map(resample_audio)
    .map(clean_text)
    # `tokenize` was never defined; the encoder in this notebook is
    # `batch_tokenize`, which works on lists of examples and so needs batched=True.
    .map(batch_tokenize, batched=True, remove_columns=["audio", "sentence"])
)

results = trainer.evaluate(test_dataset)
print(f"Test Results: {results}")


In [None]:
import torch
import torchaudio
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC

# Load the fine-tuned model and its processor from disk.
model_path = "./model"
processor = Wav2Vec2Processor.from_pretrained(model_path)
model = Wav2Vec2ForCTC.from_pretrained(model_path)

audio_path = "my_voice_test.wav"  # Replace with your .wav file path
waveform, sample_rate = torchaudio.load(audio_path)

# torchaudio returns (channels, frames). Average the channels so that stereo
# recordings become a single 1-D signal; squeeze() only worked for mono files.
waveform = waveform.mean(dim=0)

if sample_rate != 16000:
    waveform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)(waveform)

# Pass a NumPy array, the same input type used when preparing the training data.
input_values = processor(waveform.numpy(), sampling_rate=16000, return_tensors="pt").input_values
model.eval()
with torch.no_grad():
    logits = model(input_values).logits

# Decode prediction: take the most likely token per frame, then let the
# processor turn the id sequence into text.
predicted_ids = torch.argmax(logits, dim=-1)
transcription = processor.batch_decode(predicted_ids)

print("Transcription:", transcription[0])


## Solved by Hussain Yafei