<a href="https://colab.research.google.com/github/M0hammadTamimi/abd/blob/main/notebooks/AraBERTv2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q transformers datasets wandb peft torch pandas openpyxl

In [None]:
# Cell 1: Check GPU and Install Requirements
!nvidia-smi
!pip install -q transformers datasets wandb peft torch pandas openpyxl

# Cell 2: Import Libraries
import pandas as pd
import wandb
from transformers import (
    AutoTokenizer,
    AutoModelForQuestionAnswering,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback,
    default_data_collator
)
from datasets import Dataset
from sklearn.model_selection import train_test_split
from peft import get_peft_model, LoraConfig
import torch
import os
from google.colab import drive

# Cell 3: Initialize Wandb and Mount Drive
def init_wandb():
    """Return the active Weights & Biases run, starting one if none exists.

    Returns:
        The already-active run when there is one; otherwise the run created by
        ``wandb.init`` for the hard-coded project and entity.
    """
    active_run = wandb.run
    if active_run is not None:
        # A run is already open; reuse it instead of starting a second one.
        return active_run
    return wandb.init(project="huggingface", entity="mohammadtamimi300-hashmite-tech")

# Mount Google Drive
# Makes the user's Drive available under /content/drive (Colab prompts for authorization).
drive.mount('/content/drive')
# Start a W&B run, or reuse the one that is already active.
init_wandb()

# Cell 4: Load Dataset
# The preprocessing function below reads the 'context', 'question' and 'answer' columns of each row.
# NOTE(review): this path is on the Colab VM disk (/content), not under the Drive mount — confirm the file is uploaded there.
df = pd.read_excel('/content/datasetQA.xlsx')
print(f"Dataset loaded with {len(df)} rows")

# Cell 5: Initialize Tokenizer and Preprocessing
# Same checkpoint name as the question-answering model loaded later in this notebook.
tokenizer = AutoTokenizer.from_pretrained("aubmindlab/bert-base-arabertv2")

def improved_preprocess_function(examples):
    """Turn one dataset row into a tokenized extractive-QA training example.

    Args:
        examples: A mapping-like row (for example a pandas Series or a dict)
            providing the 'context', 'question' and 'answer' fields.

    Returns:
        A dict with the features of the first tokenizer window plus
        'start_positions' and 'end_positions' (token indices of the answer
        span inside that window), or None when the row cannot be used:
        a field is missing, the answer text does not occur in the context,
        the answer is not fully contained in the first window, or the
        tokenizer raised an error.
    """
    raw_context = examples['context']
    raw_question = examples['question']
    raw_answer = examples['answer']

    # Missing spreadsheet cells arrive as NaN/None. str() would turn them into
    # the literal text "nan"/"None", which would slip past the emptiness check.
    if pd.isna(raw_context) or pd.isna(raw_question) or pd.isna(raw_answer):
        return None

    question = str(raw_question)

    # Normalize whitespace
    context = ' '.join(str(raw_context).split())
    answer = ' '.join(str(raw_answer).split())

    if not context or not answer:
        return None

    # Character span of the answer, relative to the (normalized) context.
    start_position = context.find(answer)
    if start_position == -1:
        return None
    end_position = start_position + len(answer)

    try:
        # Encode the inputs
        encoding = tokenizer(
            question,
            context,
            max_length=512,
            truncation='only_second',
            padding='max_length',
            return_offsets_mapping=True,
            return_overflowing_tokens=True,
            stride=128
        )

        # Only the first window is used for training.
        offsets = encoding['offset_mapping'][0]
        # Offsets restart at 0 for each sequence of the pair, so question
        # tokens can cover the same character positions as the answer.
        # sequence_ids marks which tokens belong to the context (id 1).
        sequence_ids = encoding.sequence_ids(0)

        # Convert character positions to token positions
        start_token = None
        end_token = None

        for idx, (start, end) in enumerate(offsets):
            if sequence_ids[idx] != 1:
                # Special, padding and question tokens can never hold the answer.
                continue
            if start <= start_position < end:
                start_token = idx
            if start < end_position <= end:
                end_token = idx
                break

        # The answer is not (fully) inside the first window.
        if start_token is None or end_token is None:
            return None

        # Get first window and prepare final encoding
        features = {k: v[0] for k, v in encoding.items()}
        features['start_positions'] = start_token
        features['end_positions'] = end_token

        return features
    except Exception as e:
        # Best effort: report the problem and drop the row instead of aborting
        # the whole preprocessing pass.
        print(f"Error in preprocessing: {str(e)}")
        return None

# Cell 6: Process and Split Dataset
print("Processing dataset...")
# Preprocess row by row; rows the preprocessor rejects come back as None and
# are filtered out here. Building the list explicitly also works when the
# frame is empty.
tokenized_datasets = [
    features
    for features in (improved_preprocess_function(row) for _, row in df.iterrows())
    if features is not None
]
print(f"Processed {len(tokenized_datasets)} valid examples")

# train_test_split needs at least one example on each side of the split; fail
# early with a message that points at the likely cause.
if len(tokenized_datasets) < 2:
    raise ValueError(
        f"Only {len(tokenized_datasets)} of {len(df)} rows produced a usable example; "
        "check that each answer appears verbatim in its context."
    )

# Split dataset (fixed random_state keeps the split reproducible)
train_data, val_data = train_test_split(tokenized_datasets, test_size=0.1, random_state=42)
train_dataset = Dataset.from_list(train_data)
val_dataset = Dataset.from_list(val_data)
print(f"Train set: {len(train_dataset)} examples")
print(f"Validation set: {len(val_dataset)} examples")

# Cell 7: Initialize Model and Configure LoRA
print("Initializing model...")
base_model = AutoModelForQuestionAnswering.from_pretrained("aubmindlab/bert-base-arabertv2", return_dict=True)

# LoRA adapters go on the attention query/key/value modules. "qa_outputs" is
# listed in modules_to_save so PEFT keeps that head trainable and stores it
# with the adapter.
lora_config = LoraConfig(
    task_type="QUESTION_ANS",
    target_modules=["query", "key", "value"],
    modules_to_save=["qa_outputs"],
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# Wrap the base model with the adapters and report how many parameters train.
model = get_peft_model(base_model, lora_config)
model.print_trainable_parameters()

# Cell 8: Configure Training
# Colab runs on Linux, where 'D:/...' is not a drive but a relative folder
# name on the VM's temporary disk. Default to the Drive mounted at
# /content/drive so checkpoints survive the session; set TRAINING_OUTPUT_DIR
# to write somewhere else.
output_dir = os.environ.get("TRAINING_OUTPUT_DIR", "/content/drive/MyDrive/training_model_output")
logging_dir = os.path.join(output_dir, "logs")

# Create directories if they don't exist
os.makedirs(output_dir, exist_ok=True)
os.makedirs(logging_dir, exist_ok=True)

training_args = TrainingArguments(
    output_dir=output_dir,
    eval_strategy="steps",  # Updated from evaluation_strategy
    eval_steps=50,
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,  # effective batch size of 16 per device
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir=logging_dir,
    report_to="wandb",
    # fp16 mixed precision needs a CUDA GPU; fall back to full precision on CPU
    # instead of failing when no GPU is attached.
    fp16=torch.cuda.is_available(),
    # Save on the same schedule as evaluation so the best checkpoint can be
    # restored at the end of training.
    save_strategy="steps",
    save_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,  # lower eval_loss is better
    warmup_steps=100,
    logging_steps=10,
    push_to_hub=False,
    remove_unused_columns=True
)

# Cell 9: Initialize Trainer and Start Training
# Early stopping: stop after 3 evaluations without an eval_loss improvement of
# at least 0.01.
early_stopping = EarlyStoppingCallback(
    early_stopping_patience=3,
    early_stopping_threshold=0.01
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=default_data_collator,
    callbacks=[early_stopping],  # registered at construction instead of add_callback
)

# Train with proper error handling
print("Starting training...")
try:
    trainer.train()
    print("Training completed successfully!")
except Exception as e:
    # Deliberate best effort: report the failure and keep whatever weights the
    # model currently holds, so a crash late in training is not a total loss.
    print(f"Training error occurred: {str(e)}")
    print("Saving checkpoint model...")
    # Derive the fallback location from output_dir so it follows any change to
    # the configured output directory.
    model.save_pretrained(os.path.join(output_dir, "checkpoint_model"))

# Cell 10: Evaluate and Save Model
print("\nRunning final evaluation...")
try:
    # Reuse the active W&B run, or start one if none is active
    init_wandb()

    # Evaluate on the validation split
    validation_output = trainer.evaluate()
    print("\nValidation Results:", validation_output)

    # Write the model, then the tokenizer, into the output directory
    print("\nSaving final model...")
    for artifact in (model, tokenizer):
        artifact.save_pretrained(output_dir)
    print(f"Model saved successfully to {output_dir}!")

    # Zip everything under output_dir into a sibling "<output_dir>.zip",
    # storing paths relative to output_dir
    import zipfile
    zip_file_path = output_dir + ".zip"
    with zipfile.ZipFile(zip_file_path, 'w', zipfile.ZIP_DEFLATED) as archive:
        for folder, _subfolders, filenames in os.walk(output_dir):
            for filename in filenames:
                source_path = os.path.join(folder, filename)
                archive.write(source_path, os.path.relpath(source_path, output_dir))
    print("Model zipped successfully!")

except Exception as e:
    # Best effort: report the failure without raising
    print(f"Error in final steps: {str(e)}")
finally:
    # Close the W&B run whether or not the steps above succeeded
    if wandb.run:
        wandb.finish()

# Cell 11: Optional - Save to Drive
# No code follows in this cell: the model is written to `output_dir` in Cell 10, so to keep it on Google Drive, point `output_dir` (Cell 8) at a folder under the '/content/drive' mount.

In [None]:
# Runs a fill-mask pipeline on a piece of text and displays the top predictions.
# NOTE(review): `model_name` and `preprocessed` are not defined in any visible
# cell of this notebook, so this cell raises NameError on a fresh kernel unless
# they are defined elsewhere. The recorded output below mentions the checkpoint
# aubmindlab/bert-base-arabertv2 — TODO confirm and define both names in this cell.
from transformers import pipeline
unmasker = pipeline('fill-mask', model = model_name)
unmasker(preprocessed)

Some weights of the model checkpoint at aubmindlab/bert-base-arabertv2 were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[{'score': 0.375786691904068,
  'token': 1875,
  'token_str': 'بيروت',
  'sequence': 'عاصم لبنان هي بيروت'},
 {'score': 0.08452503383159637,
  'token': 48,
  'token_str': '.',
  'sequence': 'عاصم لبنان هي.'},
 {'score': 0.07384563982486725,
  'token': 2314,
  'token_str': 'دمشق',
  'sequence': 'عاصم لبنان هي دمشق'},
 {'score': 0.05577431991696358,
  'token': 59,
  'token_str': ':',
  'sequence': 'عاصم لبنان هي :'},
 {'score': 0.042271003127098083,
  'token': 2912,
  'token_str': 'باريس',
  'sequence': 'عاصم لبنان هي باريس'}]