<a href="https://colab.research.google.com/github/M0hammadTamimi/final_model/blob/main/finaltrainMT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers datasets openpyxl
!pip install datasets
!pip install -q transformers datasets wandb peft torch pandas openpyxl
# Cell 1: Install Requirements
!pip install -q transformers datasets wandb peft torch pandas openpyxl psutil tkseem

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [3]:
# Cell 1: Install Requirements
!pip install -q transformers datasets wandb peft torch pandas openpyxl psutil tkseem

# Cell 2: Import Libraries
import pandas as pd
import wandb
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForQuestionAnswering,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)
from datasets import Dataset
from sklearn.model_selection import train_test_split
from peft import get_peft_model, LoraConfig
import os
from tqdm.auto import tqdm
import subprocess
import psutil

# Cell 3: Mount Drive and Setup
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Initialize wandb
wandb.login()  # You'll need to enter your API key
wandb.init(project="huggingface", entity="mohammadtamimi300-hashmite-tech")

# Cell 4: Display GPU and RAM Information
def display_system_info():
    # GPU Info
    gpu_info = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE)
    print(gpu_info.stdout.decode())

    # RAM Info
    ram_info = psutil.virtual_memory()
    print(f"Total RAM: {ram_info.total / (1024 ** 3):.2f} GB")
    print(f"Available RAM: {ram_info.available / (1024 ** 3):.2f} GB")

display_system_info()

# Cell 5: Load and Prepare Dataset
print("Loading dataset...")
df = pd.read_excel('/content/datasetQA.xlsx')
print(f"Dataset loaded with {len(df)} rows")

# Cell 6: Initialize Tokenizer and Define Preprocessing
# Use AutoTokenizer from Hugging Face
tokenizer = AutoTokenizer.from_pretrained("aubmindlab/bert-base-arabertv2")

def preprocess_function(examples):
    # Convert inputs to strings
    questions = [str(q) if pd.notnull(q) else '' for q in examples["question"]]
    contexts = [str(c) if pd.notnull(c) else '' for c in examples["context"]]
    answers = examples['answer']

    # Tokenize using the Hugging Face tokenizer
    inputs = tokenizer(questions, contexts, truncation=True, padding=True, max_length=512, return_tensors="pt")

    # Calculate start and end positions
    start_positions = []
    end_positions = []

    for i in range(len(answers)):
        answer = answers[i]
        if answer:
            answer_start = contexts[i].find(answer)
            if answer_start == -1:  # Handle case where the answer is not found
                start_positions.append(0)
                end_positions.append(0)
            else:
                start_positions.append(inputs.char_to_token(i, answer_start) or 0)  # Ensure valid token index
                end_position = answer_start + len(answer)
                end_positions.append(inputs.char_to_token(i, end_position - 1) or 0)  # Ensure valid token index
        else:
            start_positions.append(0)
            end_positions.append(0)

    inputs.update({"start_positions": start_positions, "end_positions": end_positions})

    return inputs

# Cell 7: Process Dataset
print("Processing dataset...")
dataset = Dataset.from_pandas(df)
processed_dataset = dataset.map(preprocess_function, batched=True, remove_columns=dataset.column_names)

# Split dataset into train and validation sets
train_dataset, val_dataset = processed_dataset.train_test_split(test_size=0.1).values()

print(f"Train set: {len(train_dataset)} examples")
print(f"Validation set: {len(val_dataset)} examples")

# Cell 8: Initialize Model and Configure LoRA
print("Initializing model...")
model = AutoModelForQuestionAnswering.from_pretrained(
    "aubmindlab/bert-base-arabertv2",
    return_dict=True
)

# Configure LoRA
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    task_type="QUESTION_ANS",
    target_modules=["query", "key", "value"],
    bias="none",
    modules_to_save=["qa_outputs"]
)

# Apply LoRA
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Cell 9: Display Model Info (Trainable Parameters)
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
total_params = sum(p.numel() for p in model.parameters())

print(f"\nTrainable params: {trainable_params} ||")
print(f"All params: {total_params} ||")
print(f"Trainable%: {trainable_params / total_params * 100:.4f}")

# Cell 10: Configure Training
output_dir = '/content/training_model_output'
logging_dir = '/content/training_model_output/logs'

os.makedirs(output_dir, exist_ok=True)
os.makedirs(logging_dir, exist_ok=True)

# Set up training arguments
training_args = TrainingArguments(
    output_dir=output_dir,
    run_name="lora_qa_full_dataset",  # Unique name for this run
    evaluation_strategy="steps",  # Enable evaluation
    eval_steps=50,  # Evaluate every 50 steps
    logging_steps=50,  # Log every 50 steps
    learning_rate=5e-5,
    per_device_train_batch_size=4,  # Lower batch size for memory efficiency
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,  # Accumulate gradients to simulate larger batch size
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir=logging_dir,
    report_to="wandb",
    fp16=True,  # Enable mixed-precision training to save memory
    save_strategy="steps",
    save_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    warmup_ratio=0.1,
    push_to_hub=False,
    remove_unused_columns=True,
    group_by_length=True,  # Improves training speed
    dataloader_num_workers=2,  # Parallel data loading
    gradient_checkpointing=True  # Save memory by checkpointing gradients
)

# Use DataCollatorWithPadding to handle padding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Cell 11: Initialize Trainer and Start Training
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
)

# Train with proper error handling
print("Starting training...")
try:
    trainer.train()
    print("Training completed successfully!")
except Exception as e:
    print(f"Training error occurred: {str(e)}")
    print("Saving checkpoint model...")
    model.save_pretrained("/content/checkpoint_model")
finally:
    wandb.finish()

# Cell 12: Save Model
print("\nSaving final model...")
try:
    # Save the model
    model.save_pretrained("/content/final_model")
    # Save the tokenizer using the appropriate method
    tokenizer.save_pretrained('/content/final_model')
    print("Model saved successfully!")

    # Create zip file
    !zip -r /content/final_model.zip /content/final_model
    print("Model zipped successfully! ")

    # Save to Drive
    drive_path = "/content/drive/MyDrive/models/final_model.zip"
    os.makedirs(os.path.dirname(drive_path), exist_ok=True)
    !cp /content/final_model.zip "{drive_path}"
    print(f"\nModel saved to Drive at: {drive_path}")

except Exception as e:
    print(f"Error in saving: {str(e)}")

print("Process completed!")




Mounted at /content/drive
Fri Dec 27 11:16:04 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   37C    P8               9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                          

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/611 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/384 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/720k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Processing dataset...


Map:   0%|          | 0/15323 [00:00<?, ? examples/s]

Train set: 13790 examples
Validation set: 1533 examples
Initializing model...


model.safetensors:   0%|          | 0.00/543M [00:00<?, ?B/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 886,274 || all params: 135,490,564 || trainable%: 0.6541

Trainable params: 886274 ||
All params: 135490564 ||
Trainable%: 0.6541
Starting training...




Step,Training Loss,Validation Loss
50,4.6194,4.604257
100,4.5589,4.496155
150,4.407,4.310198
200,4.2051,4.052137
250,3.9346,3.721172
300,3.596,3.347563
350,3.2382,2.984883
400,2.9,2.642831
450,2.5875,2.319569
500,2.2932,2.015377


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

Training completed successfully!


0,1
eval/loss,███▇▇▅▅▄▄▃▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
eval/runtime,▂▆▃▁▂▃▁▃▃▄▂▄▃▃▁▄▃▂▃▃▂▃▃▁▃▃▃▁▃▃▁▂▃▃▂▃█▄▃▂
eval/samples_per_second,▆▂▅█▆▄▅█▆▅▇▆▄▅▅▅▅▇▆▅▇▆▅▅█▄▅▅▅▅█▅▅▆█▁▇▅▅▇
eval/steps_per_second,▆▂▅█▆▄▅█▆▅▇▆▄▅▅▅▅▅▇▆▇▆▅▅█▄▅▅█▅▅█▅▅█▁▇▅▅▇
train/epoch,▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇██
train/grad_norm,██████▇▇▇▇▆▅▅▅▃▂▂▂▁▂▁▂▂▂▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁
train/learning_rate,▂▄▅▇█████▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▄▄▄▃▃▃▃▃▂▂▂▂▂▁▁▁
train/loss,██▇▇▆▅▅▄▄▃▃▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
eval/loss,0.18651
eval/runtime,10.314
eval/samples_per_second,148.633
eval/steps_per_second,37.231
total_flos,2466387670980912.0
train/epoch,3.0
train/global_step,2586.0
train/grad_norm,1.4329
train/learning_rate,0.0
train/loss,0.2677



Saving final model...
Model saved successfully!
  adding: content/final_model/ (stored 0%)
  adding: content/final_model/vocab.txt (deflated 62%)
  adding: content/final_model/adapter_config.json (deflated 54%)
  adding: content/final_model/tokenizer.json (deflated 73%)
  adding: content/final_model/README.md (deflated 66%)
  adding: content/final_model/tokenizer_config.json (deflated 90%)
  adding: content/final_model/special_tokens_map.json (deflated 80%)
  adding: content/final_model/adapter_model.safetensors (deflated 54%)
Model zipped successfully! 

Model saved to Drive at: /content/drive/MyDrive/models/final_model.zip
Process completed!


In [15]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer
import torch

# Load the fine-tuned model and tokenizer
model_name = "/content/final_model"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

# Test data in the format: context, question, expected answer
test_data = [
    {
        "context": "الجامعة الهاشمية تقدم درجات بكالوريوس في مجالات مثل علوم الكمبيوتر والهندسة.",
        "question": "ما هي درجات البكالوريوس التي تقدمها الجامعة الهاشمية؟",
        "expected_answer": "الجامعة الهاشمية تقدم درجات بكالوريوس في مجالات مثل علوم الكمبيوتر والهندسة."
    },
    # Add more test examples here
]

# Function to test the model
def test_model(model, tokenizer, test_data):
    results = []
    for data in test_data:
        inputs = tokenizer.encode_plus(
            data['question'],
            data['context'],
            return_tensors="pt",
            truncation=True,
            max_length=512
        )

        with torch.no_grad():
            outputs = model(**inputs)

        # Get the start and end logits
        start_logits = outputs.start_logits
        end_logits = outputs.end_logits

        # Convert logits to token indices
        start_idx = torch.argmax(start_logits)
        end_idx = torch.argmax(end_logits)

        # Decode the predicted answer
        predicted_answer = tokenizer.decode(
            inputs['input_ids'][0][start_idx:end_idx + 1],
            skip_special_tokens=True
        )

        # Compare with expected answer
        is_correct = predicted_answer.strip() == data['expected_answer'].strip()
        results.append({
            "question": data['question'],
            "expected_answer": data['expected_answer'],
            "predicted_answer": predicted_answer,
            "is_correct": is_correct
        })
    return results

# Run the test
results = test_model(model, tokenizer, test_data)

# Print results
for result in results:
    print(f"Question: {result['question']}")
    print(f"Expected Answer: {result['expected_answer']}")
    print(f"Predicted Answer: {result['predicted_answer']}")
    print(f"Correct: {result['is_correct']}")
    print("-" * 50)

# Summary
correct_count = sum(1 for result in results if result['is_correct'])
print(f"Accuracy: {correct_count}/{len(results)} ({(correct_count / len(results)) * 100:.2f}%)")


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Question: ما هي درجات البكالوريوس التي تقدمها الجامعة الهاشمية؟
Expected Answer: الجامعة الهاشمية تقدم درجات بكالوريوس في مجالات مثل علوم الكمبيوتر والهندسة.
Predicted Answer: 
Correct: False
--------------------------------------------------
Accuracy: 0/1 (0.00%)


In [14]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch

# Load the fine-tuned model and tokenizer from Hugging Face Hub
model_name = "aubmindlab/bert-base-arabertv2"
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Function to answer a question using the fine-tuned model
def answer_question(question, context):
    inputs = tokenizer(question, context, return_tensors="pt", truncation=True, padding=True, max_length=512)

    with torch.no_grad():
        outputs = model(**inputs)

    start_scores = outputs.start_logits
    end_scores = outputs.end_logits

    # Get the most likely start and end of the answer
    start_index = torch.argmax(start_scores)
    end_index = torch.argmax(end_scores)

    # If the start index is after the end index, we return an empty string (no valid answer)
    if start_index > end_index:
        return "No answer found"

    # Convert token indices back to string and remove special tokens
    answer_tokens = inputs.input_ids[0][start_index:end_index + 1]
    answer = tokenizer.decode(answer_tokens, skip_special_tokens=True)

    return answer

# Example usage with your dataset
contexts = [
    "الجامعة الهاشمية تقدم درجات بكالوريوس في مجالات مثل علوم الكمبيوتر والهندسة.",
    "تعليمات منح درجة البكالوريوس في الجامعة الهاشمية تحدد شروط التخرج والمتطلبات.",
    "تم إصدار تعليمات منح درجة البكالوريوس في الجامعة الهاشمية برقم (1084/32/2016) بتاريخ 23/5/2016.",
    "أعلنت الجامعة الهاشمية أن الامتحانات النهائية ستبدأ في بداية يونيو 2024.",
    "الجامعة الهاشمية توفر برامج ماجستير في مجالات متعددة مثل الهندسة والإدارة."
]

questions = [
    "ما هي درجات البكالوريوس التي تقدمها الجامعة الهاشمية؟",
    "ما اسم التعليمات المتعلقة بمنح درجة البكالوريوس في الجامعة الهاشمية؟",
    "ما رقم وتاريخ إصدار تعليمات منح درجة البكالوريوس في الجامعة الهاشمية؟",
    "متى ستبدأ الامتحانات النهائية في الجامعة الهاشمية؟",
    "ما هي برامج الماجستير التي تقدمها الجامعة الهاشمية؟"
]

# Iterate over your dataset and get the answers
for context, question in zip(contexts, questions):
    answer = answer_question(question, context)
    print(f"Question: {question}")
    print(f"Answer: {answer}")
    print("-" * 50)


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Question: ما هي درجات البكالوريوس التي تقدمها الجامعة الهاشمية؟
Answer: No answer found
--------------------------------------------------
Question: ما اسم التعليمات المتعلقة بمنح درجة البكالوريوس في الجامعة الهاشمية؟
Answer: No answer found
--------------------------------------------------
Question: ما رقم وتاريخ إصدار تعليمات منح درجة البكالوريوس في الجامعة الهاشمية؟
Answer: No answer found
--------------------------------------------------
Question: متى ستبدأ الامتحانات النهائية في الجامعة الهاشمية؟
Answer: النهائية في الجامعة الهاشمية ؟ أعلنت الجامعة الهاشمية أن الامتحانات النهائية
--------------------------------------------------
Question: ما هي برامج الماجستير التي تقدمها الجامعة الهاشمية؟
Answer: ##عة الهاشمية توفر برامج ماجستير في مجالات متعددة مثل الهندسة
--------------------------------------------------
