In [None]:
# Shallow-clone LLaMA-Factory (latest commit only) and install it in editable mode.
!git clone --depth 1 https://github.com/hiyouga/LLaMA-Factory.git
!cd LLaMA-Factory && pip install -e .

# Shallow-clone the HadithsDorr repository.
# NOTE(review): presumably the source of the hadith data used later — confirm.
!git clone --depth 1 https://github.com/MO7AMED3TWAN/HadithsDorr.git

In [None]:
# Mount Google Drive at /content/drive; later cells read and write files
# under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
ls

In [None]:
# Install pinned versions of transformers and datasets (-q quiet, -U upgrade).
!pip install -qU transformers==4.51.3 datasets==3.2.0
# Optional extras, currently disabled.
# NOTE(review): json_repair (referenced by parse_json) is only installed by
# one of the disabled lines below.
# !pip install -qU openai==1.61.0 wandb
# !pip install -qU json-repair==0.29.1
# !pip install -qU vllm==0.7.2 optimum==1.24.0

In [None]:
# Read the Weights & Biases key and the Hugging Face token from Colab's
# userdata store (keys 'wandb' and 'huggingface') and log in to both services.
from google.colab import userdata
import wandb

wandb.login(key=userdata.get('wandb'))
hf_token = userdata.get('huggingface')
# NOTE(review): the token is interpolated into a shell command line; make sure
# this cell's output is cleared before sharing the notebook.
!huggingface-cli login --token {hf_token}

In [None]:
# !pip install numpy --upgrade --ignore-installed
# !pip install --upgrade transformers
# Upgrade chardet to the latest available release.
# NOTE(review): this install is unpinned — consider pinning a version for reproducibility.
!pip install --upgrade chardet

In [None]:
import json
import os
from os.path import join
import random
from tqdm.auto import tqdm
import requests

from pydantic import BaseModel, Field
from typing import List, Optional, Literal
from datetime import datetime

# import json_repair
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

In [None]:
# Root folder on Google Drive for this project's artifacts. A later cell
# builds the validation-split path from it, so it has to be defined.
data_dir = "/content/drive/MyDrive/youtube-resources/llm-finetuning"

# Hugging Face Hub id of the base checkpoint loaded for inference.
base_model_id = "Qwen/Qwen3-0.6B"

# Device that tokenized inputs are moved to before generation.
device = "cuda"

def parse_json(text):
    """Parse ``text`` as JSON, returning ``None`` when it cannot be parsed.

    ``json_repair.loads`` is used when that optional package is installed;
    otherwise the standard-library ``json.loads`` is used, so the function
    still works when ``json_repair`` is not available.

    Args:
        text: The JSON document to parse.

    Returns:
        The decoded Python object, or ``None`` if parsing failed.
    """
    try:
        import json_repair  # optional dependency, not imported at file level
    except ImportError:
        loads = json.loads
    else:
        loads = json_repair.loads

    try:
        return loads(text)
    except Exception:
        # Best-effort parser: malformed input yields None instead of raising.
        # `Exception` (not a bare except) lets KeyboardInterrupt propagate.
        return None

In [None]:
question_text= """
أنَّ الحارِثَ بنَ هِشامٍ رَضِيَ اللَّهُ عنْه سَأَلَ رَسولَ اللَّهِ صَلَّى اللهُ عليه وسلَّمَ فقالَ: يا رَسولَ اللَّهِ، كيفَ يَأْتِيكَ الوَحْيُ؟ فقالَ رَسولُ اللَّهِ صَلَّى اللهُ عليه وسلَّمَ: أحْيانًا يَأْتِينِي مِثْلَ صَلْصَلَةِ الجَرَسِ، وهو أشَدُّهُ عَلَيَّ، فيُفْصَمُ عَنِّي وقدْ وعَيْتُ عنْه ما قالَ، وأَحْيانًا يَتَمَثَّلُ لِيَ المَلَكُ رَجُلًا فيُكَلِّمُنِي فأعِي ما يقولُ.\nقَالَتْ عَائِشَةُ رضي الله عنها: وَلَقَدْ رَأَيْتُهُ يَنْزِلُ عَلَيْهِ الْوَحْيُ فِي الْيَوْمِ الشَّدِيدِ الْبَرْدِ، فَيَفْصِمُ عَنْهُ وَإِنَّ جَبِينَهُ لَيَتَفَصَّدُ عَرَقًا
 """

## Create The Schema For The Model Interaction

### Schema Of Hadiths and Sharh

In [None]:
class HadithResponse(BaseModel):
    """Structured answer expected from the model for a single hadith.

    Every field is a required string (each one is declared with ``Field(...)``).
    The field names are the JSON keys a reply must use to validate.
    """
    hadith_sharh: str = Field(..., description="Explanation of the hadith")
    hadith_narrator: str = Field(..., description="Narrator of the hadith")
    hadith_grade: str  = Field(..., description="Grade of the hadith")
    hadith_lessons: str = Field(..., description="Lessons of the hadith")
    hadith_applications: str = Field(..., description="Applications of the hadith")

# System prompt for the hadith-explanation demo.
# The JSON template uses the same keys as the HadithResponse fields, so a reply
# that follows the template can be validated against that schema, and the
# template itself is a complete (closed) JSON object.
system_prompt = """
You are an expert in explaining the hadiths of the Prophet. I will give you a hadith, and you will respond to it with a single JSON object in exactly the following format:

{
"hadith_sharh": "Detailed explanation of the hadith",
"hadith_narrator": "Narrator's name",
"hadith_grade": "Grade of the hadith",
"hadith_lessons": "Lessons learned from hadith",
"hadith_applications": "Practical applications from hadith"
}

Rules:
1. Explain the hadith in classical Arabic.
2. Specify the narrator and the hadith grade accurately.
3. Mention the lessons.
4. Mention the practical applications.
5. Adhere to the structure above without any modification.
6. Do not include any introduction or conclusion.
"""

In [None]:
# Load the base checkpoint named by base_model_id. device_map="auto" leaves
# device placement to the library, and torch_dtype=None requests no dtype override.
model = AutoModelForCausalLM.from_pretrained(base_model_id, device_map="auto", torch_dtype=None)

# Tokenizer for the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(base_model_id)

In [None]:
def explain_hadith(hadith_text: str) -> "Optional[HadithResponse]":
    """Ask the loaded model to explain a hadith and parse its JSON answer.

    Args:
        hadith_text: The hadith text to explain.

    Returns:
        A ``HadithResponse`` built from the JSON object found in the model's
        reply, or ``None`` when no valid object could be extracted (the reason
        is printed).
    """
    # Prepare messages
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f"اشرح الحديث التالي:\n{hadith_text}"}
    ]

    # Tokenize with chat template
    inputs = tokenizer.apply_chat_template(
        messages,
        return_tensors="pt",
        add_generation_prompt=True
    ).to(model.device)

    # Generate response
    outputs = model.generate(
        inputs,
        max_new_tokens=1024,
        temperature=0.7,
        do_sample=True
    )

    # Decode only the newly generated tokens. The generated sequence starts
    # with the prompt, and the prompt contains the system message's own JSON
    # template; decoding everything would make the brace search below match
    # the template instead of the model's answer.
    prompt_length = inputs.shape[-1]
    response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    # Extract the JSON part of the response: first '{' through last '}'.
    json_start = response.find('{')
    json_end = response.rfind('}') + 1
    if json_start == -1 or json_end <= json_start:
        print("Error parsing response: no JSON object found in model output")
        return None

    try:
        return HadithResponse.parse_raw(response[json_start:json_end])
    except Exception as e:
        # Invalid JSON or schema validation failure: report it and signal with None.
        print(f"Error parsing response: {e}")
        return None

# Example usage
hadith = "من كان يؤمن بالله واليوم الآخر فليقل خيراً أو ليصمت"
result = explain_hadith(hadith)
if result is None:
    # explain_hadith() returns None (after printing the reason) when the
    # model's reply could not be parsed, so there is nothing to serialize.
    print("No structured explanation could be produced for this hadith.")
else:
    # Serialize via the standard json module: pydantic v2 rejects json.dumps
    # keyword arguments such as ensure_ascii on BaseModel.json().
    # model_dump() is the v2 name; dict() is the v1 equivalent.
    to_dict = getattr(result, "model_dump", None) or result.dict
    print(json.dumps(to_dict(), indent=2, ensure_ascii=False))

## Load The Model

In [None]:
# Tokenizer and weights for the checkpoint named by base_model_id.
tokenizer = AutoTokenizer.from_pretrained(base_model_id)

# device_map="auto" leaves device placement to the library; torch_dtype=None
# requests no dtype override.
model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path=base_model_id,
    torch_dtype=None,
    device_map="auto",
)

**Show The Model Architecture**

In [None]:
model

**Apply My Schema to the Model's Chat Template**

In [None]:
# Conversation to render: the hadith system prompt followed by the demo hadith
# as the user turn. It is built here because nothing else in the notebook
# defines qa_messages, which made this cell fail on a fresh kernel.
qa_messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": question_text},
]

# Render the conversation to a single prompt string (no tokenization yet),
# ending with the marker that starts the assistant's turn.
text = tokenizer.apply_chat_template(
    qa_messages,
    tokenize=False,
    add_generation_prompt=True
)
# text

**Encode the Prompt into Token IDs**

In [None]:
# Tokenize the rendered prompt as a batch of one and move the tensors to the
# device the model actually lives on. The model is loaded with
# device_map="auto", so a hard-coded device name may not match its placement.
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
# model_inputs

**Generate the Response as Token IDs**

In [None]:
# Deterministic generation: sampling is disabled and the sampling parameters
# are explicitly reset to None.
generated_ids = model.generate(
    inputs=model_inputs.input_ids,
    do_sample=False,
    temperature=None,
    top_k=None,
    top_p=None,
    max_new_tokens=2048,
)
# generated_ids

**Keep Only the Model's Response Token IDs (Drop the Prompt Tokens)**

In [None]:
# Each generated sequence begins with its prompt tokens; slice them off so
# only the newly generated ids remain.
generated_ids = [
    full_sequence[prompt_ids.shape[0]:]
    for prompt_ids, full_sequence in zip(model_inputs.input_ids, generated_ids)
]
# generated_ids

**Last Step: Decode the Token IDs into Text**

In [None]:
# Decode the first (and only) sequence of the batch, dropping special tokens.
response = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

In [None]:
print(response)

## Now Let Us Prepare Our Data

In [None]:
# sft_data_path = join(data_dir, "dataset", "DATA.jsonl")
# sft_data_path

In [None]:
import json
import random
import os

# System message attached to every training example.
system_message = "\n".join([
    "You are an AI model specialized in understanding and analyzing Islamic Hadith.",
    "You will be given a hadith in Arabic.",
    "Your task is to extract the name of the scholar (mohdith), the narrator (rawi), and provide a clear explanation (sharh) of the hadith.",
    "You must return the result strictly following the provided Pydantic Schema."
])

# Keys every source record must provide to become a training example.
REQUIRED_HADITH_KEYS = ("hadith", "mohdith", "rawi", "sharh")


def _build_sft_example(rec, system_text):
    """Convert one source record into a system/instruction/input/output/history example."""
    return {
        "system": system_text,
        "instruction": "\n".join([
            "# Hadith:",
            rec["hadith"],  # the hadith text
            "# Output JSON:",
            "```json"
        ]),
        "input": "",  # no separate input; the hadith is carried by the instruction
        "output": json.dumps({
            "mohdith": rec["mohdith"],
            "rawi": rec["rawi"],
            "sharh": rec["sharh"]
        }, ensure_ascii=False, indent=2),
        "history": []
    }


# Converted training examples.
llm_finetunning_data = []

# Source data file and destination JSONL file.
input_path = "/content/drive/MyDrive/merged_hadith_data.json"
output_path = "/content/drive/MyDrive/fine_tune_hadith_data2.jsonl"

# Fail early with a clear message if the input file is missing.
if not os.path.exists(input_path):
    raise FileNotFoundError(f"Input file not found: {input_path}")

# Read and convert the data. The file is parsed one JSON document per line.
# NOTE(review): the input has a .json extension; confirm it really is
# line-delimited JSON rather than a single JSON array.
skipped_records = 0
with open(input_path, 'r', encoding='utf-8') as f:
    for line_no, line in enumerate(f, start=1):
        line = line.strip()
        if line == "":
            continue  # ignore blank lines

        try:
            rec = json.loads(line)
        except json.JSONDecodeError:
            print(f"Skipping invalid JSON line: {line}")
            continue  # ignore lines that are not valid JSON

        # A line can be valid JSON without being a usable record; skip it
        # instead of crashing the whole conversion with KeyError/TypeError.
        if not isinstance(rec, dict):
            print(f"Skipping line {line_no}: expected a JSON object")
            skipped_records += 1
            continue
        missing_keys = [key for key in REQUIRED_HADITH_KEYS if key not in rec]
        if missing_keys or not isinstance(rec["hadith"], str):
            print(f"Skipping line {line_no}: missing or invalid fields {missing_keys or ['hadith']}")
            skipped_records += 1
            continue

        llm_finetunning_data.append(_build_sft_example(rec, system_message))

if skipped_records:
    print(f"Skipped {skipped_records} incomplete record(s)")
if not llm_finetunning_data:
    print("Warning: no training examples were loaded from the input file")

# Shuffle with a fixed seed so the order is reproducible.
random.Random(101).shuffle(llm_finetunning_data)

# Make sure the destination folder exists (no error if it already does).
output_dir = os.path.dirname(output_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# Write one JSON object per line.
with open(output_path, 'w', encoding='utf-8') as out_f:
    for entry in llm_finetunning_data:
        out_f.write(json.dumps(entry, ensure_ascii=False) + "\n")

print(f"✅ Data saved to {output_path}")


In [None]:
# len(llm_finetunning_data)

In [None]:
# llm_finetunning_data[1]

In [None]:
# train_sample_sz = 100

# train_ds = llm_finetunning_data[:train_sample_sz]
# eval_ds = llm_finetunning_data[train_sample_sz:]

# os.makedirs(join(data_dir, "dataset", "llamafactory-finetune-data"), exist_ok=True)

# with open(join(data_dir, "dataset", "llamafactory-finetune-data", "train.json"), "w") as dest:
#     json.dump(train_ds, dest, ensure_ascii=False, default=str)

# with open(join(data_dir, "dataset", "llamafactory-finetune-data", "val.json"), "w", encoding="utf8") as dest:
#     json.dump(eval_ds, dest, ensure_ascii=False, default=str)

In [None]:
# Display the path of the validation split file.
# Depends on data_dir being defined earlier in the notebook.
join(data_dir, "dataset", "llamafactory-finetune-data", "val.json")

In [None]:
# # # Configure LLaMA-Factory for the new datasets

# # # update /content/LLaMA-Factory/data/dataset_info.json and append
# # ```
#    "QAtrain": {
#         "file_name": "/gdrive/MyDrive/youtube-resources/llm-finetuning/datasets/llamafactory-finetune-data/train.json",
#         "columns": {
#             "prompt": "instruction",
#             "query": "input",
#             "response": "output",
#             "system": "system",
#             "history": "history"
#         }
#     },
#     "QAval": {
#         "file_name": "/gdrive/MyDrive/youtube-resources/llm-finetuning/datasets/llamafactory-finetune-data/val.json",
#         "columns": {
#             "prompt": "instruction",
#             "query": "input",
#             "response": "output",
#             "system": "system",
#             "history": "history"
#         }
#     }
# # ```

# # https://wandb.ai/mr-bakrianoo/llamafactory/runs/apwbkni9
# # https://wandb.ai/mr-bakrianoo/llamafactory/runs/c5tf0q90

In [None]:
%%writefile /content/LLaMA-Factory/examples/train_lora/QA.yaml

# LLaMA-Factory configuration for LoRA supervised fine-tuning (stage: sft),
# written into the cloned repository by the %%writefile magic above.

### model
# NOTE(review): the notebook's base_model_id is Qwen/Qwen3-0.6B; a LoRA adapter
# trained here can only be loaded onto this same base model — confirm which
# base is intended.
model_name_or_path: Qwen/Qwen2.5-0.5B-Instruct
trust_remote_code: true

### method
stage: sft
do_train: true
finetuning_type: lora
lora_rank: 64
lora_target: all

### dataset
# QAtrain / QAval must be registered in LLaMA-Factory's data/dataset_info.json
# (see the commented-out instructions cell earlier in the notebook).
dataset: QAtrain
eval_dataset: QAval
template: qwen
cutoff_len: 4096
# max_samples: 50
overwrite_cache: true
preprocessing_num_workers: 16

### output
# NOTE(review): resume_from_checkpoint points at output_dir itself rather than
# a checkpoint sub-folder; confirm this resolves on a first run, when no
# checkpoint exists yet.
resume_from_checkpoint: /content/drive/MyDrive/youtube-resources/llm-finetuning/Models
output_dir: /content/drive/MyDrive/youtube-resources/llm-finetuning/Models
logging_steps: 10
save_steps: 40
plot_loss: true
# overwrite_output_dir: true

### train
# Effective batch size per device = 1 x 4 accumulation steps.
per_device_train_batch_size: 1
gradient_accumulation_steps: 4
learning_rate: 1.0e-4
num_train_epochs: 3.0
lr_scheduler_type: cosine
warmup_ratio: 0.1
# NOTE(review): bf16 needs hardware support — confirm the runtime's GPU provides it.
bf16: true # full
ddp_timeout: 180000000

### eval
# val_size: 0.1
per_device_eval_batch_size: 1
eval_strategy: steps
eval_steps: 100

# Log metrics to Weights & Biases under this run name.
report_to: wandb
run_name: Qwennn

# NOTE(review): Hugging Face repo ids are restricted to alphanumerics, '-', '_'
# and '.', so the '&' below is likely rejected. Also confirm whether
# training-time push_to_hub reads export_hub_model_id or expects hub_model_id.
push_to_hub: true
export_hub_model_id: "Mo7amed3twan/QWEN_Arabic_Q&A"
hub_private_repo: true
hub_strategy: checkpoint

In [None]:
# Launch training with the QA.yaml config written to examples/train_lora/.
!cd LLaMA-Factory/ && llamafactory-cli train /content/LLaMA-Factory/examples/train_lora/QA.yaml

## Evaluate After Fine-Tuning

### Solo Model

In [None]:
# The LoRA adapter loaded in the next cell is trained by QA.yaml on
# Qwen/Qwen2.5-0.5B-Instruct. An adapter only fits the base model it was
# trained on, so load that checkpoint here instead of base_model_id.
finetuned_base_model_id = "Qwen/Qwen2.5-0.5B-Instruct"

model = AutoModelForCausalLM.from_pretrained(
    finetuned_base_model_id,
    device_map="auto",
    torch_dtype=None,
)

# Tokenizer matching the checkpoint the adapter was trained on.
tokenizer = AutoTokenizer.from_pretrained(finetuned_base_model_id)

In [None]:
# Folder used as output_dir in QA.yaml; the fine-tuned adapter is loaded from
# it onto the model created in the previous cell.
finetuned_model_id = "/content/drive/MyDrive/youtube-resources/llm-finetuning/Models/"
model.load_adapter(finetuned_model_id)

In [None]:
def generate_resp(messages):
    """Generate a deterministic completion for a chat conversation.

    Args:
        messages: Either a list of ``{"role": ..., "content": ...}`` chat
            messages, or a plain string, which is sent as a single user turn.

    Returns:
        The decoded text of the newly generated tokens only (the prompt
        tokens are stripped before decoding).
    """
    # Accept a bare question string as shorthand for a one-message conversation.
    if isinstance(messages, str):
        messages = [{"role": "user", "content": messages}]

    # Render the conversation passed by the caller (not a notebook global).
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    # Follow the model's actual placement; it is loaded with device_map="auto".
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

    generated_ids = model.generate(
        model_inputs.input_ids,
        max_new_tokens=1024,
        do_sample=False, top_k=None, temperature=None, top_p=None,
    )

    # Each output sequence starts with its prompt ids; keep only what follows.
    generated_ids = [
        output_ids[len(input_ids):]
        for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]

    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

    return response

In [None]:
question_text="ش"

In [None]:
response = generate_resp(question_text)

In [None]:
response