<a href="https://colab.research.google.com/github/M0hammadTamimi/1-1-2025-SQuAD-dataset/blob/main/Untitled.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers datasets openpyxl
!pip install datasets
!pip install -q transformers datasets wandb peft torch pandas openpyxl
# Cell 1: Install Requirements
!pip install -q transformers datasets wandb peft torch pandas openpyxl psutil tkseem

In [None]:
# Cell 1: Install Requirements
# Quiet (-q) install of the third-party packages this notebook relies on.
# NOTE(review): tkseem is installed here but is not imported in any cell shown — confirm it is still needed.
!pip install -q transformers datasets wandb peft torch pandas openpyxl psutil tkseem

# Cell 2: Import Libraries (same, plus CUDA optimizations)
import pandas as pd
import wandb
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForQuestionAnswering,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)
from datasets import Dataset, load_dataset
from sklearn.model_selection import train_test_split
from peft import get_peft_model, LoraConfig
import os
from tqdm.auto import tqdm
import subprocess
import psutil

# Enable CUDA optimizations: cuDNN benchmark mode plus TF32 for matmul and cuDNN
torch.backends.cudnn.benchmark = True
torch.backends.cuda.matmul.allow_tf32 = torch.backends.cudnn.allow_tf32 = True

# Cell 3: Mount Drive and Setup
from google.colab import drive

drive.mount('/content/drive', force_remount=True)

# Log in to Weights & Biases, then start the run in the "huggingface" project
wandb.login()
wandb.init(
    entity="mohammadtamimi300-hashmite-tech",
    project="huggingface",
)

# Cell 4: Display System Info (same)
def display_system_info():
    """Print the `nvidia-smi` report followed by total and available system RAM.

    If the `nvidia-smi` executable is not present (e.g. a CPU-only runtime),
    a short notice is printed instead of raising, so the RAM figures are
    still shown and the notebook keeps running.
    """
    try:
        gpu_info = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE)
    except FileNotFoundError:
        # subprocess.run raises when the binary cannot be found on PATH.
        print("nvidia-smi not found: no NVIDIA GPU/driver available in this runtime.")
    else:
        print(gpu_info.stdout.decode())

    ram_info = psutil.virtual_memory()
    # psutil reports bytes; divide by 1024**3 to show GiB-sized figures.
    print(f"Total RAM: {ram_info.total / (1024 ** 3):.2f} GB")
    print(f"Available RAM: {ram_info.available / (1024 ** 3):.2f} GB")

# Report GPU and RAM details before loading any data
display_system_info()

# Cell 5: Load Datasets
print("Loading SQuAD dataset...")
squad_dataset = load_dataset("squad")

print("Loading custom dataset...")
# The preprocessing step reads the "question", "context" and "answer" columns of this sheet
df = pd.read_excel('/content/datasetQA.xlsx')
print(f"Custom dataset loaded with {len(df)} rows")

# Cell 6: Initialize Tokenizer
# A fast tokenizer is requested; preprocessing depends on offset mappings and sequence_ids
tokenizer_options = {"use_fast": True, "model_max_length": 512}
tokenizer = AutoTokenizer.from_pretrained(
    "aubmindlab/bert-base-arabertv2",
    **tokenizer_options,
)

# Cell 7: Preprocessing Functions (same logic, optimized implementation)
def preprocess_squad(examples):
    """Tokenize a batch of SQuAD examples and label every window with answer token positions.

    Each (question, context) pair is tokenized to at most 512 tokens; contexts
    that do not fit are split into overlapping windows (stride 128), so one
    example may yield several features.

    Args:
        examples: Batch dict with "question", "context" and "answers" columns.
            Each answers entry holds parallel "text" and "answer_start" lists;
            only the first answer is used.

    Returns:
        The tokenizer output (offset and sample mappings removed) extended with
        "start_positions" and "end_positions", one entry per window. A window
        that does not fully contain the answer is labelled (0, 0).
    """
    questions = [q.strip() for q in examples["question"]]
    # Contexts are passed through untouched: answer_start is a character offset
    # into the raw context, so stripping leading whitespace would shift it.
    contexts = examples["context"]

    inputs = tokenizer(
        questions,
        contexts,
        max_length=512,
        truncation="only_second",
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
        return_tensors=None
    )

    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = examples["answers"][sample_map[i]]
        sequence_ids = inputs.sequence_ids(i)

        # No annotated answer, or no context tokens in this window: label (0, 0).
        if not answer["answer_start"] or 1 not in sequence_ids:
            start_positions.append(0)
            end_positions.append(0)
            continue

        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])

        # First and LAST token whose sequence id is 1 (i.e. context tokens).
        context_start = sequence_ids.index(1)
        context_end = len(sequence_ids) - 1 - sequence_ids[::-1].index(1)

        # The answer must lie entirely inside this window's context span.
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
            continue

        # Last token starting at or before the answer's first character.
        idx = context_start
        while idx <= context_end and offset[idx][0] <= start_char:
            idx += 1
        start_positions.append(idx - 1)

        # First token (scanning backwards) ending at or after the answer's end.
        idx = context_end
        while idx >= context_start and offset[idx][1] >= end_char:
            idx -= 1
        end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

def preprocess_custom_dataset(examples):
    """Tokenize a batch of custom QA rows and label every window with answer token positions.

    The answer's character span is located with ``str.find`` on the context
    (first occurrence). Long contexts are split into overlapping windows
    (max 512 tokens, stride 128), so one row may yield several features.

    Args:
        examples: Batch dict with "question", "context" and "answer" columns.
            Missing cells are treated as empty strings and non-string cells
            are converted with ``str``.

    Returns:
        The tokenizer output (offset and sample mappings removed) extended with
        "start_positions" and "end_positions", one entry per window. Windows
        are labelled (0, 0) when the answer is empty, is not found in the
        context, or is not fully contained in the window.
    """
    questions = [str(q) if pd.notnull(q) else '' for q in examples["question"]]
    contexts = [str(c) if pd.notnull(c) else '' for c in examples["context"]]
    # Normalise answers like the other columns: str.find() needs a str, and a
    # NaN cell is truthy, so it would otherwise slip past the emptiness check.
    answers = [str(a) if pd.notnull(a) else '' for a in examples['answer']]

    inputs = tokenizer(
        questions,
        contexts,
        max_length=512,
        truncation="only_second",
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
        return_tensors=None
    )

    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        answer = answers[sample_idx]
        sequence_ids = inputs.sequence_ids(i)

        start_char = contexts[sample_idx].find(answer) if answer else -1

        # Empty/unlocatable answer, or no context tokens in this window.
        if start_char == -1 or 1 not in sequence_ids:
            start_positions.append(0)
            end_positions.append(0)
            continue

        end_char = start_char + len(answer)

        # First and LAST token whose sequence id is 1 (i.e. context tokens).
        context_start = sequence_ids.index(1)
        context_end = len(sequence_ids) - 1 - sequence_ids[::-1].index(1)

        # The answer must lie entirely inside this window's context span.
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
            continue

        # Last token starting at or before the answer's first character.
        idx = context_start
        while idx <= context_end and offset[idx][0] <= start_char:
            idx += 1
        start_positions.append(idx - 1)

        # First token (scanning backwards) ending at or after the answer's end.
        idx = context_end
        while idx >= context_start and offset[idx][1] >= end_char:
            idx -= 1
        end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

# Cell 8: Process Datasets
print("Processing SQuAD dataset...")
processed_squad = squad_dataset.map(
    preprocess_squad,
    remove_columns=squad_dataset["train"].column_names,
    batched=True,
    batch_size=1000,
    num_proc=4
)

print("Processing custom dataset...")
custom_dataset = Dataset.from_pandas(df)
processed_custom = custom_dataset.map(
    preprocess_custom_dataset,
    remove_columns=custom_dataset.column_names,
    batched=True,
    batch_size=1000,
    num_proc=4
)

# Split custom dataset 90/10. A fixed seed keeps the validation set identical
# across runs, so eval_loss and best-checkpoint selection stay comparable.
# NOTE(review): the split is applied after windowing, so overlapping windows of
# one row can land on both sides — consider splitting custom_dataset before map.
custom_split = processed_custom.train_test_split(test_size=0.1, seed=42)
custom_train, custom_val = custom_split["train"], custom_split["test"]

# Cell 9: Initialize Model with LoRA
print("Initializing model...")
model = AutoModelForQuestionAnswering.from_pretrained(
    "aubmindlab/bert-base-arabertv2",
    return_dict=True,
)

# LoRA adapters on the attention query/key/value projections; the QA head
# ("qa_outputs") is listed in modules_to_save.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    task_type="QUESTION_ANS",
    target_modules=["query", "key", "value"],
    bias="none",
    modules_to_save=["qa_outputs"]
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Cell 10: Training Configuration (fixed)
def get_training_args(output_dir, name):
    """Build the TrainingArguments shared by both training phases.

    Args:
        output_dir: Directory where checkpoints are written.
        name: Value passed as ``run_name``.

    Returns:
        A TrainingArguments instance that evaluates every 100 steps, saves
        every 200 steps, and reloads the checkpoint with the lowest
        ``eval_loss`` at the end of training.
    """
    import inspect  # local import: only needed for the keyword probe below

    # Newer transformers releases name this argument `eval_strategy`, while
    # older ones only know `evaluation_strategy`. The install is unpinned, so
    # pick whichever keyword the installed TrainingArguments accepts.
    accepted = inspect.signature(TrainingArguments.__init__).parameters
    strategy_key = "eval_strategy" if "eval_strategy" in accepted else "evaluation_strategy"

    return TrainingArguments(
        output_dir=output_dir,
        run_name=name,
        eval_steps=100,
        logging_steps=50,
        learning_rate=5e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        gradient_accumulation_steps=2,
        num_train_epochs=3,
        weight_decay=0.01,
        report_to="wandb",
        fp16=True,
        save_strategy="steps",
        save_steps=200,  # kept a multiple of eval_steps for load_best_model_at_end
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        warmup_ratio=0.1,
        group_by_length=True,
        dataloader_num_workers=2,
        gradient_checkpointing=False,
        optim="adamw_torch",
        **{strategy_key: "steps"},
    )

# Cell 11: Training
# Phase 1: train the LoRA-wrapped model on SQuAD.
print("Training on SQuAD...")
squad_args = get_training_args('/content/squad_model', "squad_pretraining")
squad_trainer = Trainer(
    model=model,
    args=squad_args,
    train_dataset=processed_squad["train"],
    eval_dataset=processed_squad["validation"],
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
)

squad_trainer.train()

# Clear cache between training phases
torch.cuda.empty_cache()

# Phase 2: continue training the same model object on the custom dataset.
# This phase runs exactly once; the accidental second copy of this block
# (which re-ran the full schedule on the same data) was removed.
print("\nFine-tuning on custom dataset...")
custom_args = get_training_args('/content/final_model', "custom_finetuning")
custom_trainer = Trainer(
    model=model,
    args=custom_args,
    train_dataset=custom_train,
    eval_dataset=custom_val,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
)

custom_trainer.train()

In [None]:
# Cell 1: Install Requirements
# Quiet (-q) install of the third-party packages this cell relies on.
# NOTE(review): tkseem is installed here but is not imported in any cell shown — confirm it is still needed.
!pip install -q transformers datasets wandb peft torch pandas openpyxl psutil tkseem

# Cell 2: Import Libraries
import pandas as pd
import wandb
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForQuestionAnswering,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)
from datasets import Dataset, load_dataset
from sklearn.model_selection import train_test_split
from peft import get_peft_model, LoraConfig
import os
from tqdm.auto import tqdm
import subprocess
import psutil

# Enable CUDA optimizations: cuDNN benchmark mode plus TF32 for matmul and cuDNN
torch.backends.cudnn.benchmark = True
torch.backends.cuda.matmul.allow_tf32 = torch.backends.cudnn.allow_tf32 = True

# Cell 3: Mount Drive and Setup
from google.colab import drive

drive.mount('/content/drive', force_remount=True)

# Log in to Weights & Biases, then start the run in the "huggingface" project
wandb.login()
wandb.init(
    entity="mohammadtamimi300-hashmite-tech",
    project="huggingface",
)

# Cell 4: Display System Info
def display_system_info():
    """Print the `nvidia-smi` report followed by total and available system RAM.

    If the `nvidia-smi` executable is not present (e.g. a CPU-only runtime),
    a short notice is printed instead of raising, so the RAM figures are
    still shown and the notebook keeps running.
    """
    try:
        gpu_info = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE)
    except FileNotFoundError:
        # subprocess.run raises when the binary cannot be found on PATH.
        print("nvidia-smi not found: no NVIDIA GPU/driver available in this runtime.")
    else:
        print(gpu_info.stdout.decode())

    ram_info = psutil.virtual_memory()
    # psutil reports bytes; divide by 1024**3 to show GiB-sized figures.
    print(f"Total RAM: {ram_info.total / (1024 ** 3):.2f} GB")
    print(f"Available RAM: {ram_info.available / (1024 ** 3):.2f} GB")

# Report GPU and RAM details before loading any data
display_system_info()

# Cell 5: Load Datasets
print("Loading SQuAD dataset...")
squad_dataset = load_dataset("squad")

print("Loading custom dataset...")
# The preprocessing step reads the "question", "context" and "answer" columns of this sheet
df = pd.read_excel('/content/datasetQA.xlsx')
print(f"Custom dataset loaded with {len(df)} rows")

# Cell 6: Initialize Tokenizer
# A fast tokenizer is requested; preprocessing depends on offset mappings and sequence_ids
tokenizer_options = {"use_fast": True, "model_max_length": 512}
tokenizer = AutoTokenizer.from_pretrained(
    "aubmindlab/bert-base-arabertv2",
    **tokenizer_options,
)

# Cell 7: Preprocessing Functions
def preprocess_squad(examples):
    """Tokenize a batch of SQuAD examples and label every window with answer token positions.

    Each (question, context) pair is tokenized to at most 512 tokens; contexts
    that do not fit are split into overlapping windows (stride 128), so one
    example may yield several features.

    Args:
        examples: Batch dict with "question", "context" and "answers" columns.
            Each answers entry holds parallel "text" and "answer_start" lists;
            only the first answer is used.

    Returns:
        The tokenizer output (offset and sample mappings removed) extended with
        "start_positions" and "end_positions", one entry per window. A window
        that does not fully contain the answer is labelled (0, 0).
    """
    questions = [q.strip() for q in examples["question"]]
    # Contexts are passed through untouched: answer_start is a character offset
    # into the raw context, so stripping leading whitespace would shift it.
    contexts = examples["context"]

    inputs = tokenizer(
        questions,
        contexts,
        max_length=512,
        truncation="only_second",
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
        return_tensors=None
    )

    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = examples["answers"][sample_map[i]]
        sequence_ids = inputs.sequence_ids(i)

        # No annotated answer, or no context tokens in this window: label (0, 0).
        if not answer["answer_start"] or 1 not in sequence_ids:
            start_positions.append(0)
            end_positions.append(0)
            continue

        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])

        # First and LAST token whose sequence id is 1 (i.e. context tokens).
        context_start = sequence_ids.index(1)
        context_end = len(sequence_ids) - 1 - sequence_ids[::-1].index(1)

        # The answer must lie entirely inside this window's context span.
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
            continue

        # Last token starting at or before the answer's first character.
        idx = context_start
        while idx <= context_end and offset[idx][0] <= start_char:
            idx += 1
        start_positions.append(idx - 1)

        # First token (scanning backwards) ending at or after the answer's end.
        idx = context_end
        while idx >= context_start and offset[idx][1] >= end_char:
            idx -= 1
        end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

def preprocess_custom_dataset(examples):
    """Tokenize a batch of custom QA rows and label every window with answer token positions.

    The answer's character span is located with ``str.find`` on the context
    (first occurrence). Long contexts are split into overlapping windows
    (max 512 tokens, stride 128), so one row may yield several features.

    Args:
        examples: Batch dict with "question", "context" and "answer" columns.
            Missing cells are treated as empty strings and non-string cells
            are converted with ``str``.

    Returns:
        The tokenizer output (offset and sample mappings removed) extended with
        "start_positions" and "end_positions", one entry per window. Windows
        are labelled (0, 0) when the answer is empty, is not found in the
        context, or is not fully contained in the window.
    """
    questions = [str(q) if pd.notnull(q) else '' for q in examples["question"]]
    contexts = [str(c) if pd.notnull(c) else '' for c in examples["context"]]
    # Normalise answers like the other columns: str.find() needs a str, and a
    # NaN cell is truthy, so it would otherwise slip past the emptiness check.
    answers = [str(a) if pd.notnull(a) else '' for a in examples['answer']]

    inputs = tokenizer(
        questions,
        contexts,
        max_length=512,
        truncation="only_second",
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
        return_tensors=None
    )

    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        answer = answers[sample_idx]
        sequence_ids = inputs.sequence_ids(i)

        start_char = contexts[sample_idx].find(answer) if answer else -1

        # Empty/unlocatable answer, or no context tokens in this window.
        if start_char == -1 or 1 not in sequence_ids:
            start_positions.append(0)
            end_positions.append(0)
            continue

        end_char = start_char + len(answer)

        # First and LAST token whose sequence id is 1 (i.e. context tokens).
        context_start = sequence_ids.index(1)
        context_end = len(sequence_ids) - 1 - sequence_ids[::-1].index(1)

        # The answer must lie entirely inside this window's context span.
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
            continue

        # Last token starting at or before the answer's first character.
        idx = context_start
        while idx <= context_end and offset[idx][0] <= start_char:
            idx += 1
        start_positions.append(idx - 1)

        # First token (scanning backwards) ending at or after the answer's end.
        idx = context_end
        while idx >= context_start and offset[idx][1] >= end_char:
            idx -= 1
        end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

# Cell 8: Process Datasets
print("Processing SQuAD dataset...")
processed_squad = squad_dataset.map(
    preprocess_squad,
    remove_columns=squad_dataset["train"].column_names,
    batched=True,
    batch_size=1000,
    num_proc=4
)

print("Processing custom dataset...")
custom_dataset = Dataset.from_pandas(df)
processed_custom = custom_dataset.map(
    preprocess_custom_dataset,
    remove_columns=custom_dataset.column_names,
    batched=True,
    batch_size=1000,
    num_proc=4
)

# Split custom dataset 90/10. A fixed seed keeps the validation set identical
# across runs, so eval_loss and best-checkpoint selection stay comparable.
# NOTE(review): the split is applied after windowing, so overlapping windows of
# one row can land on both sides — consider splitting custom_dataset before map.
custom_split = processed_custom.train_test_split(test_size=0.1, seed=42)
custom_train, custom_val = custom_split["train"], custom_split["test"]

# Cell 9: Initialize Model with LoRA
print("Initializing model...")
model = AutoModelForQuestionAnswering.from_pretrained(
    "aubmindlab/bert-base-arabertv2",
    return_dict=True
)

# LoRA adapters on the attention query/key/value projections; the QA head
# ("qa_outputs") is listed in modules_to_save.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    task_type="QUESTION_ANS",
    target_modules=["query", "key", "value"],
    bias="none",
    modules_to_save=["qa_outputs"]
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Cell 10: Training Configuration
def get_training_args(output_dir, name):
    """Build the TrainingArguments shared by both training phases.

    Args:
        output_dir: Directory where checkpoints are written.
        name: Value passed as ``run_name``.

    Returns:
        A TrainingArguments instance that evaluates every 100 steps, saves
        every 200 steps, and reloads the checkpoint with the lowest
        ``eval_loss`` at the end of training.
    """
    import inspect  # local import: only needed for the keyword probe below

    # Newer transformers releases name this argument `eval_strategy`, while
    # older ones only know `evaluation_strategy`. The install is unpinned, so
    # pick whichever keyword the installed TrainingArguments accepts.
    accepted = inspect.signature(TrainingArguments.__init__).parameters
    strategy_key = "eval_strategy" if "eval_strategy" in accepted else "evaluation_strategy"

    return TrainingArguments(
        output_dir=output_dir,
        run_name=name,
        eval_steps=100,
        logging_steps=50,
        learning_rate=5e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        gradient_accumulation_steps=2,
        num_train_epochs=3,
        weight_decay=0.01,
        report_to="wandb",
        fp16=True,
        save_strategy="steps",
        save_steps=200,  # kept a multiple of eval_steps for load_best_model_at_end
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        warmup_ratio=0.1,
        group_by_length=True,
        dataloader_num_workers=2,
        gradient_checkpointing=False,
        optim="adamw_torch",
        **{strategy_key: "steps"},
    )

# Cell 11: Training
# Phase 1: train the LoRA-wrapped model on the processed SQuAD splits.
print("Training on SQuAD...")
squad_args = get_training_args(output_dir='/content/squad_model', name="squad_pretraining")
squad_trainer = Trainer(
    args=squad_args,
    model=model,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    train_dataset=processed_squad["train"],
    eval_dataset=processed_squad["validation"],
)
squad_trainer.train()

# Release cached GPU memory before the second phase starts
torch.cuda.empty_cache()

# Phase 2: continue with the same model object on the custom train/validation split.
print("\nFine-tuning on custom dataset...")
custom_args = get_training_args(output_dir='/content/final_model', name="custom_finetuning")
custom_trainer = Trainer(
    args=custom_args,
    model=model,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    train_dataset=custom_train,
    eval_dataset=custom_val,
)
custom_trainer.train()
