# Arabic GEC Pipeline

This notebook implements the QALB 2015 L2 correction pipeline.

**Steps:**
1.  Data Download & Decompression
2.  M2 Format Parsing
3.  Fine-tuning AraT5
4.  Inference

In [1]:
# Install necessary libraries
!pip install transformers datasets pyarabic gdown sentencepiece

Collecting pyarabic
  Downloading PyArabic-0.6.15-py3-none-any.whl.metadata (10 kB)
Downloading PyArabic-0.6.15-py3-none-any.whl (126 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.4/126.4 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyarabic
Successfully installed pyarabic-0.6.15


## 1. Data Download

In [2]:
import gdown
import zipfile
import os

# Download the file from Google Drive
file_id = '1hvLiiMvvubyCEAZK4KIWgu7qHBNCHOp-'
url = f'https://drive.google.com/uc?id={file_id}'
output_file = 'qalb_dataset.zip'

# Only download if not exists
if not os.path.exists(output_file):
    gdown.download(url, output_file, quiet=False)

# Unzip the file
if os.path.exists(output_file):
    with zipfile.ZipFile(output_file, 'r') as zip_ref:
        zip_ref.extractall("data")
    print("Dataset extracted to 'data' directory.")
else:
    print("Download failed.")

Downloading...
From (original): https://drive.google.com/uc?id=1hvLiiMvvubyCEAZK4KIWgu7qHBNCHOp-
From (redirected): https://drive.google.com/uc?id=1hvLiiMvvubyCEAZK4KIWgu7qHBNCHOp-&confirm=t&uuid=1ce29455-e3e2-4d65-b2ff-09b03afe0cea
To: /content/qalb_dataset.zip
100%|██████████| 94.3M/94.3M [00:01<00:00, 78.6MB/s]


Dataset extracted to 'data' directory.


## 2. Step 1: The M2 Parser

In [3]:
import csv

def parse_m2_and_generate_csv(m2_path, output_csv_path):
    print(f"Processing {m2_path}...")
    if not os.path.exists(m2_path):
        print(f"File not found: {m2_path}")
        return

    sentences = []
    with open(m2_path, 'r', encoding='utf-8') as f:
        m2_data = f.read().strip().split("\n\n")

    processed_data = []

    for entry in m2_data:
        lines = entry.split("\n")
        if not lines:
            continue
        
        # The first line starts with 'S' and contains the original sentence (tokenized)
        source_line = lines[0]
        if not source_line.startswith("S "):
            continue
            
        original_tokens = source_line[2:].split()
        edits = []
        
        # Subsequent lines start with 'A' and contain edits
        for line in lines[1:]:
            if line.startswith("A "):
                parts = line[2:].split("||")
                # Format: A start_off end_off||type||correction||... 
                span = parts[0].split()
                start_off = int(span[0])
                end_off = int(span[1])
                correction = parts[2]
                edits.append((start_off, end_off, correction))
        
        # Critical Reversal Logic: Sort edits by start_off in descending order
        # This prevents index shifting when modifying the token list
        edits.sort(key=lambda x: x[0], reverse=True)
        
        corrected_tokens = list(original_tokens)
        for start, end, subst in edits:
            # Python slice replacement: list[start:end] = [new_tokens]
            # Determine if replacement is empty (deletion) or has content
            if subst == "-NONE-":
                replacement = []
            else:
                replacement = subst.split()
            
            corrected_tokens[start:end] = replacement
            
        original_sent = " ".join(original_tokens)
        corrected_sent = " ".join(corrected_tokens)
        
        processed_data.append([original_sent, corrected_sent])

    # Save to CSV
    with open(output_csv_path, 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(["incorrect", "correct"])
        writer.writerows(processed_data)
    
    print(f"Saved {len(processed_data)} pairs to {output_csv_path}")

# Example usage (adjust paths after extraction)
# Find M2 files in data folder
# for root, dirs, files in os.walk("data"):
#     for file in files:
#         if file.endswith(".m2") and "Dev" in file:
#              parse_m2_and_generate_csv(os.path.join(root, file), "qalb_full_gec.csv")

## 3. Step 2: Model Training (AraT5)

In [4]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
from datasets import load_dataset
import pyarabic.araby as araby

def run_training_step():
    if not os.path.exists('qalb_full_gec.csv'):
        print("Training data 'qalb_full_gec.csv' not found. Please run Step 1 Parser first.")
        return

    # --- Data Loading ---
    # Load the CSV generated in Step 1
    dataset = load_dataset('csv', data_files='qalb_full_gec.csv')
    
    # Split into train/validation (simple split for demo)
    dataset = dataset['train'].train_test_split(test_size=0.1)
    
    model_name = "aubmindlab/arat5-v2-base"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

    # --- Preprocessing ---
    prefix = "gec_arabic: "
    max_input_length = 128
    max_target_length = 128

    def preprocess_function(examples):
        inputs = [prefix + araby.normalize_hamza(ex if ex else "") for ex in examples["incorrect"]]
        targets = [araby.normalize_hamza(ex if ex else "") for ex in examples["correct"]]
        
        model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)
        # Setup the tokenizer for targets
        with tokenizer.as_target_tokenizer():
            labels = tokenizer(targets, max_length=max_target_length, truncation=True)

        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    tokenized_datasets = dataset.map(preprocess_function, batched=True)

    # --- Training Config ---
    batch_size = 16
    args = Seq2SeqTrainingArguments(
        "arat5-gec-checkpoints",
        evaluation_strategy = "steps",
        learning_rate=3e-5,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        weight_decay=0.01,
        save_total_limit=3,
        num_train_epochs=5,
        predict_with_generate=True,
        fp16=True, # faster training on GPU
        push_to_hub=False,
    )

    data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

    # --- Metric (Simplified GLEU/CER placeholder) ---
    def compute_metrics(eval_preds):
        import numpy as np
        preds, labels = eval_preds
        # simple decoding for sanity check
        decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
        # Replace -100 in the labels as we can't decode them.
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
        
        return {"sample_pred": decoded_preds[0], "sample_label": decoded_labels[0]}

    trainer = Seq2SeqTrainer(
        model,
        args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["test"],
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )

    trainer.train()
    
    # Save the final model
    model.save_pretrained("arat5-gec-finetuned")
    tokenizer.save_pretrained("arat5-gec-finetuned")

# Uncomment to run training
# run_training_step()



## 4. Step 3: Inference

In [5]:
def run_inference(input_sentence):
    from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
    import torch

    model_path = "arat5-gec-finetuned"
    
    # Fallback if model isn't trained yet for testing this cell
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_path)
        model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
    except:
        print("Finetuned model not found, loading base model for demo...")
        model_name = "aubmindlab/arat5-v2-base"
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = model.to(device)

    # Preprocessing
    prefix = "gec_arabic: "
    text = prefix + input_sentence
    inputs = tokenizer(text, return_tensors="pt", max_length=128, truncation=True).to(device)

    # Generation
    outputs = model.generate(
        inputs["input_ids"],
        max_length=128,
        num_beams=5,
        early_stopping=True,
        no_repeat_ngram_size=2
    )

    corrected_sentence = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return corrected_sentence

# Example Output Test
test_sentence = "ذهب الولد الى مدرسة"
print(f"Original: {test_sentence}")
print(f"Corrected: {run_inference(test_sentence)}")

Original: ذهب الولد الى مدرسة


Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


Finetuned model not found, loading base model for demo...


OSError: aubmindlab/arat5-v2-base is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `hf auth login` or by passing `token=<your_token>`