<a href="https://colab.research.google.com/github/M0hammadTamimi/abd/blob/main/finaltrainMT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install transformers datasets openpyxl
!pip install datasets
!pip install -q transformers datasets wandb peft torch pandas openpyxl
# Cell 1: Install Requirements
!pip install -q transformers datasets wandb peft torch pandas openpyxl psutil tkseem



In [8]:
# Cell 1: Install Requirements
!pip install -q transformers datasets wandb peft torch pandas openpyxl psutil tkseem

# Cell 2: Import Libraries
import pandas as pd
import wandb
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForQuestionAnswering,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)
from datasets import Dataset, load_dataset
from sklearn.model_selection import train_test_split
from peft import get_peft_model, LoraConfig
import os
from tqdm.auto import tqdm
import subprocess
import psutil

# Cell 3: Mount Drive and Setup
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Initialize wandb
wandb.login()
wandb.init(project="huggingface", entity="mohammadtamimi300-hashmite-tech")

# Cell 4: Display System Info
def display_system_info():
    gpu_info = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE)
    print(gpu_info.stdout.decode())

    ram_info = psutil.virtual_memory()
    print(f"Total RAM: {ram_info.total / (1024 ** 3):.2f} GB")
    print(f"Available RAM: {ram_info.available / (1024 ** 3):.2f} GB")

display_system_info()

# Cell 5: Load Datasets
print("Loading SQuAD dataset...")
squad_dataset = load_dataset("squad")

print("Loading custom dataset...")
df = pd.read_excel('/content/datasetQA.xlsx')
print(f"Custom dataset loaded with {len(df)} rows")

# Cell 6: Initialize Tokenizer
tokenizer = AutoTokenizer.from_pretrained("aubmindlab/bert-base-arabertv2")

# Cell 7: Preprocessing Functions
def preprocess_squad(examples):
    questions = [q.strip() for q in examples["question"]]
    contexts = [c.strip() for c in examples["context"]]

    # Tokenize
    inputs = tokenizer(
        questions,
        contexts,
        max_length=512,
        truncation="only_second",
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Get answer positions
    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        answer = examples["answers"][sample_idx]
        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])

        sequence_ids = inputs.sequence_ids(i)
        context_start = sequence_ids.index(1) if 1 in sequence_ids else -1
        context_end = sequence_ids.index(1, context_start + 1) if 1 in sequence_ids[context_start + 1:] else len(sequence_ids) - 1

        if context_start == -1 or offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

def preprocess_custom_dataset(examples):
    questions = [str(q) if pd.notnull(q) else '' for q in examples["question"]]
    contexts = [str(c) if pd.notnull(c) else '' for c in examples["context"]]
    answers = examples['answer']

    inputs = tokenizer(
        questions,
        contexts,
        max_length=512,
        truncation="only_second",
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        answer = answers[sample_idx]
        if not answer:
            start_positions.append(0)
            end_positions.append(0)
            continue

        context = contexts[sample_idx]
        start_char = context.find(answer)
        end_char = start_char + len(answer)

        sequence_ids = inputs.sequence_ids(i)
        context_start = sequence_ids.index(1) if 1 in sequence_ids else -1
        context_end = sequence_ids.index(1, context_start + 1) if 1 in sequence_ids[context_start + 1:] else len(sequence_ids) - 1

        if start_char == -1 or context_start == -1 or offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

# Cell 8: Process Datasets
print("Processing SQuAD dataset...")
processed_squad = squad_dataset.map(
    preprocess_squad,
    remove_columns=squad_dataset["train"].column_names,
    batched=True
)

print("Processing custom dataset...")
custom_dataset = Dataset.from_pandas(df)
processed_custom = custom_dataset.map(
    preprocess_custom_dataset,
    remove_columns=custom_dataset.column_names,
    batched=True
)

# Split custom dataset
custom_train, custom_val = processed_custom.train_test_split(test_size=0.1).values()

# Cell 9: Initialize Model with LoRA
print("Initializing model...")
model = AutoModelForQuestionAnswering.from_pretrained(
    "aubmindlab/bert-base-arabertv2",
    return_dict=True
)

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    task_type="QUESTION_ANS",
    target_modules=["query", "key", "value"],
    bias="none",
    modules_to_save=["qa_outputs"]
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Cell 10: Training Configuration
def get_training_args(output_dir, name):
    return TrainingArguments(
        output_dir=output_dir,
        run_name=name,
        evaluation_strategy="steps",
        eval_steps=50,
        logging_steps=50,
        learning_rate=5e-5,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        gradient_accumulation_steps=4,
        num_train_epochs=3,
        weight_decay=0.01,
        report_to="wandb",
        fp16=True,
        save_strategy="steps",
        save_steps=100,
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        warmup_ratio=0.1,
        group_by_length=True,
        dataloader_num_workers=2,
        gradient_checkpointing=True
    )

# Cell 11: Training
# First train on SQuAD
print("Training on SQuAD...")
squad_args = get_training_args('/content/squad_model', "squad_pretraining")
squad_trainer = Trainer(
    model=model,
    args=squad_args,
    train_dataset=processed_squad["train"],
    eval_dataset=processed_squad["validation"],
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
)

squad_trainer.train()

# Then fine-tune on custom dataset
print("\nFine-tuning on custom dataset...")
custom_args = get_training_args('/content/final_model', "custom_finetuning")
custom_trainer = Trainer(
    model=model,
    args=custom_args,
    train_dataset=custom_train,
    eval_dataset=custom_val,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
)

custom_trainer.train()




Mounted at /content/drive
Sat Dec 28 13:40:44 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   38C    P8              11W /  70W |      3MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                          

Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

Processing custom dataset...


Map:   0%|          | 0/15323 [00:00<?, ? examples/s]

Initializing model...


model.safetensors:   0%|          | 0.00/543M [00:00<?, ?B/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 886,274 || all params: 135,490,564 || trainable%: 0.6541
Training on SQuAD...




Step,Training Loss,Validation Loss
50,6.2008,6.190061
100,6.1925,6.173124
150,6.1656,6.144519
200,6.1382,6.104175
250,6.0907,6.052039
300,6.0351,5.988513
350,5.9681,5.913317
400,5.8957,5.82645
450,5.8043,5.72787
500,5.7106,5.618447


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

KeyboardInterrupt: 

In [16]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer
import torch

# Load the fine-tuned model and tokenizer
model_name = "/content/final_model"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

# Test data with diverse examples
test_data = [
    {
        "context": "الجامعة الهاشمية تقدم درجات بكالوريوس في مجالات مثل علوم الكمبيوتر والهندسة.",
        "question": "ما هي درجات البكالوريوس التي تقدمها الجامعة الهاشمية؟",
        "expected_answer": "علوم الكمبيوتر والهندسة"
    },
    {
        "context": "تأسست الجامعة الهاشمية في عام 1995 في مدينة الزرقاء، الأردن. وهي جامعة حكومية تضم حالياً أكثر من 25,000 طالب.",
        "question": "متى تأسست الجامعة الهاشمية؟",
        "expected_answer": "1995"
    },
    {
        "context": "تقع كلية تكنولوجيا المعلومات في المبنى الرئيسي للجامعة. تضم الكلية خمسة أقسام أكاديمية وهي: علم الحاسوب، ونظم المعلومات الحاسوبية، وهندسة البرمجيات، والذكاء الاصطناعي، وأمن المعلومات.",
        "question": "كم قسم أكاديمي في كلية تكنولوجيا المعلومات؟",
        "expected_answer": "خمسة"
    },
    {
        "context": "يبلغ عدد الطلاب المسجلين في برنامج علوم الحاسوب 1200 طالب وطالبة، منهم 700 من الإناث و500 من الذكور.",
        "question": "كم عدد الطالبات في برنامج علوم الحاسوب؟",
        "expected_answer": "700"
    }
]

def test_model(model, tokenizer, test_data):
    results = []
    for data in test_data:
        inputs = tokenizer.encode_plus(
            data['question'],
            data['context'],
            return_tensors="pt",
            truncation=True,
            max_length=512,
            padding='max_length'
        )

        with torch.no_grad():
            outputs = model(**inputs)

        # Get the most probable start and end positions
        start_logits = outputs.start_logits
        end_logits = outputs.end_logits

        # Get the top 5 most probable answers
        start_indices = torch.topk(start_logits, 5, dim=1).indices[0]
        end_indices = torch.topk(end_logits, 5, dim=1).indices[0]

        answers = []
        for start_idx in start_indices:
            for end_idx in end_indices:
                if end_idx >= start_idx:  # Valid answer span
                    answer = tokenizer.decode(
                        inputs['input_ids'][0][start_idx:end_idx + 1],
                        skip_special_tokens=True
                    ).strip()
                    if answer and len(answer) > 0:
                        answers.append(answer)

        # Get the best answer (first non-empty answer)
        predicted_answer = answers[0] if answers else ""

        # Calculate exact match and partial match
        exact_match = predicted_answer.strip() == data['expected_answer'].strip()
        partial_match = data['expected_answer'].strip() in predicted_answer or predicted_answer in data['expected_answer'].strip()

        results.append({
            "question": data['question'],
            "context": data['context'],
            "expected_answer": data['expected_answer'],
            "predicted_answer": predicted_answer,
            "exact_match": exact_match,
            "partial_match": partial_match,
            "all_predictions": answers[:3]  # Keep top 3 predictions for analysis
        })
    return results

# Run the test
print("Testing model...")
results = test_model(model, tokenizer, test_data)

# Print detailed results
print("\nDetailed Results:")
print("=" * 80)
for i, result in enumerate(results, 1):
    print(f"\nTest Case {i}:")
    print(f"Context: {result['context']}")
    print(f"Question: {result['question']}")
    print(f"Expected Answer: {result['expected_answer']}")
    print(f"Predicted Answer: {result['predicted_answer']}")
    print(f"Top 3 Predictions: {', '.join(result['all_predictions'])}")
    print(f"Exact Match: {result['exact_match']}")
    print(f"Partial Match: {result['partial_match']}")
    print("-" * 80)

# Calculate and print summary metrics
exact_matches = sum(1 for r in results if r['exact_match'])
partial_matches = sum(1 for r in results if r['partial_match'])
total_cases = len(results)

print("\nSummary Metrics:")
print(f"Total Test Cases: {total_cases}")
print(f"Exact Matches: {exact_matches}/{total_cases} ({(exact_matches/total_cases)*100:.2f}%)")

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Testing model...

Detailed Results:

Test Case 1:
Context: الجامعة الهاشمية تقدم درجات بكالوريوس في مجالات مثل علوم الكمبيوتر والهندسة.
Question: ما هي درجات البكالوريوس التي تقدمها الجامعة الهاشمية؟
Expected Answer: علوم الكمبيوتر والهندسة
Predicted Answer: ما هي درجات البكالوريوس التي تقدمها الجامعة الهاشمية ؟ الجامعة الهاشمية تقدم درجات بكالوريوس في مجالات مثل علوم الكمبيوتر والهندسة.
Top 3 Predictions: ما هي درجات البكالوريوس التي تقدمها الجامعة الهاشمية ؟ الجامعة الهاشمية تقدم درجات بكالوريوس في مجالات مثل علوم الكمبيوتر والهندسة., ما هي درجات البكالوريوس التي تقدمها الجامعة الهاشمية ؟ الجام, ما هي درجات البكالوريوس التي تقدمها الجام
Exact Match: False
Partial Match: True
--------------------------------------------------------------------------------

Test Case 2:
Context: تأسست الجامعة الهاشمية في عام 1995 في مدينة الزرقاء، الأردن. وهي جامعة حكومية تضم حالياً أكثر من 25,000 طالب.
Question: متى تأسست الجامعة الهاشمية؟
Expected Answer: 1995
Predicted Answer: متى تأسست الجامعة الها

In [14]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch

# Load the fine-tuned model and tokenizer from Hugging Face Hub
model_name = "aubmindlab/bert-base-arabertv2"
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Function to answer a question using the fine-tuned model
def answer_question(question, context):
    inputs = tokenizer(question, context, return_tensors="pt", truncation=True, padding=True, max_length=512)

    with torch.no_grad():
        outputs = model(**inputs)

    start_scores = outputs.start_logits
    end_scores = outputs.end_logits

    # Get the most likely start and end of the answer
    start_index = torch.argmax(start_scores)
    end_index = torch.argmax(end_scores)

    # If the start index is after the end index, we return an empty string (no valid answer)
    if start_index > end_index:
        return "No answer found"

    # Convert token indices back to string and remove special tokens
    answer_tokens = inputs.input_ids[0][start_index:end_index + 1]
    answer = tokenizer.decode(answer_tokens, skip_special_tokens=True)

    return answer

# Example usage with your dataset
contexts = [
    "الجامعة الهاشمية تقدم درجات بكالوريوس في مجالات مثل علوم الكمبيوتر والهندسة.",
    "تعليمات منح درجة البكالوريوس في الجامعة الهاشمية تحدد شروط التخرج والمتطلبات.",
    "تم إصدار تعليمات منح درجة البكالوريوس في الجامعة الهاشمية برقم (1084/32/2016) بتاريخ 23/5/2016.",
    "أعلنت الجامعة الهاشمية أن الامتحانات النهائية ستبدأ في بداية يونيو 2024.",
    "الجامعة الهاشمية توفر برامج ماجستير في مجالات متعددة مثل الهندسة والإدارة."
]

questions = [
    "ما هي درجات البكالوريوس التي تقدمها الجامعة الهاشمية؟",
    "ما اسم التعليمات المتعلقة بمنح درجة البكالوريوس في الجامعة الهاشمية؟",
    "ما رقم وتاريخ إصدار تعليمات منح درجة البكالوريوس في الجامعة الهاشمية؟",
    "متى ستبدأ الامتحانات النهائية في الجامعة الهاشمية؟",
    "ما هي برامج الماجستير التي تقدمها الجامعة الهاشمية؟"
]

# Iterate over your dataset and get the answers
for context, question in zip(contexts, questions):
    answer = answer_question(question, context)
    print(f"Question: {question}")
    print(f"Answer: {answer}")
    print("-" * 50)


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Question: ما هي درجات البكالوريوس التي تقدمها الجامعة الهاشمية؟
Answer: No answer found
--------------------------------------------------
Question: ما اسم التعليمات المتعلقة بمنح درجة البكالوريوس في الجامعة الهاشمية؟
Answer: No answer found
--------------------------------------------------
Question: ما رقم وتاريخ إصدار تعليمات منح درجة البكالوريوس في الجامعة الهاشمية؟
Answer: No answer found
--------------------------------------------------
Question: متى ستبدأ الامتحانات النهائية في الجامعة الهاشمية؟
Answer: النهائية في الجامعة الهاشمية ؟ أعلنت الجامعة الهاشمية أن الامتحانات النهائية
--------------------------------------------------
Question: ما هي برامج الماجستير التي تقدمها الجامعة الهاشمية؟
Answer: ##عة الهاشمية توفر برامج ماجستير في مجالات متعددة مثل الهندسة
--------------------------------------------------
