In [40]:
import json
import os
from tqdm import tqdm
from pathlib import Path
from tqdm.auto import tqdm
import random

In [37]:
def restructure_json(input_path):
    # Read JSON file
    with open(input_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    
    # Restructure each hadith entry
    restructured_data = []
    for hadith in data:
        restructured_hadith = {
            "hadith_id": int(hadith["hadith_id"]),  # Convert to int
            "hadith": hadith["hadith"],
            "rawi": hadith["rawi"],
            "mohdith": hadith["mohdith"],
            "book": hadith["book"],
            "hadith_number": int(hadith["numberOrPage"]),  # Rename and convert to int
            "grade": hadith["grade"],
            "sharh": hadith["sharh"],
            # Convert lists to strings by taking first element
            "hadith_lessons": hadith["hadith_lessons"][0] if hadith["hadith_lessons"] else "",
            "hadith_application": hadith["hadith_application"][0] if hadith["hadith_application"] else "",
            "FT_Pairs": hadith["FT_Pairs"]
        }
        restructured_data.append(restructured_hadith)
    
    return restructured_data

def process_all_folders(base_path):
    # Create output directory
    output_base = Path(base_path) / "restructured_data"
    output_base.mkdir(exist_ok=True)
    
    # Get all folders that might contain JSON files
    folders = [f for f in Path(base_path).iterdir() if f.is_dir() and not f.name.startswith('.')]
    
    total_files = 0
    # Count total JSON files first
    for folder in folders:
        json_files = list(folder.glob('*.json'))
        total_files += len(json_files)
    
    print(f"Found {len(folders)} folders with {total_files} JSON files total")
    
    # Process each folder
    with tqdm(total=total_files, desc="Processing files") as pbar:
        for folder in folders:
            if folder == output_base:  # Skip the output directory
                continue
                
            # Create corresponding output folder
            output_folder = output_base / folder.name
            output_folder.mkdir(exist_ok=True)
            
            # Process each JSON file in the folder
            for json_file in folder.glob('*.json'):
                try:
                    # Restructure the data
                    restructured_data = restructure_json(json_file)
                    
                    # Save to new location
                    output_file = output_folder / json_file.name
                    with open(output_file, 'w', encoding='utf-8') as f:
                        json.dump(restructured_data, f, ensure_ascii=False, indent=2)
                    
                    pbar.update(1)
                except Exception as e:
                    print(f"\nError processing {json_file}: {str(e)}")


In [21]:
# Usage
folder_path = "../new/"  # Update this path
process_all_folders(folder_path)

Found 3 folders with 7548 JSON files total


Processing files:   0%|          | 0/7548 [00:00<?, ?it/s]

In [22]:
def merge_json_files(folder_path):
    """
    Merges all JSON files in a folder using hadith_id as the key
    """
    merged_data = []  # نستخدم قائمة بدلاً من dictionary
    json_files = list(Path(folder_path).glob('*.json'))
    
    for json_file in tqdm(json_files, desc=f"Processing {Path(folder_path).name}"):
        try:
            with open(json_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
                if isinstance(data, list):
                    merged_data.extend(data)
                else:
                    merged_data.append(data)
        except Exception as e:
            print(f"Error processing {json_file}: {str(e)}")
    
    # نقوم بترتيب البيانات حسب hadith_id
    merged_data.sort(key=lambda x: x['hadith_id'])
    
    # نحول القائمة إلى dictionary
    final_data = {}
    for hadith in merged_data:
        hadith_id = str(hadith.pop('hadith_id'))  # نحول الـ ID إلى string
        final_data[f"hadith_{hadith_id}"] = hadith
    
    return [final_data]  # نرجع القاموس داخل قائمة

def process_all_folders(base_path):
    """
    Process all folders and create a single JSON file for each
    """
    folders = [f for f in Path(base_path).iterdir() if f.is_dir() and not f.name.startswith('.')]
    print(f"Found {len(folders)} folders")
    
    for folder in folders:
        try:
            merged_data = merge_json_files(folder)
            output_file = folder.parent / f"{folder.name}_merged.json"
            
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(merged_data[0], f, ensure_ascii=False, indent=2)
            
            print(f"✅ Created {output_file} with {len(merged_data[0])} entries")
            
        except Exception as e:
            print(f"❌ Error processing folder {folder}: {str(e)}")

In [28]:
# Usage
base_path = "/home/mohamed/Desktop/JUPYTER/Graduation/new/restructured_data/"
process_all_folders(base_path)

Found 2 folders


Processing Sahih_muslim:   0%|          | 0/2239 [00:00<?, ?it/s]

✅ Created /home/mohamed/Desktop/JUPYTER/Graduation/new/restructured_data/Sahih_muslim_merged.json with 2239 entries


Processing Sahih_bukhari:   0%|          | 0/5309 [00:00<?, ?it/s]

✅ Created /home/mohamed/Desktop/JUPYTER/Graduation/new/restructured_data/Sahih_bukhari_merged.json with 5309 entries


In [None]:
# Path to the merged JSON file
merged_file_path = f"./Sahih_muslim_merged.json"

# Load the merged JSON file and count the hadiths
with open(merged_file_path, 'r', encoding='utf-8') as f:
    merged_data = json.load(f)
    hadith_count = len(merged_data)
    print(f"Total number of hadiths in the merged file: {hadith_count}")

Total number of hadiths in the merged file: 2239


In [30]:
merged_file_path = f"./Sahih_bukhari_merged.json"

with open(merged_file_path, 'r', encoding='utf-8') as f:
    merged_data = json.load(f)
    hadith_count = len(merged_data)
    print(f"Total number of hadiths in the merged file: {hadith_count}")

Total number of hadiths in the merged file: 5309


# Data handling for fine tuning

In [38]:
def extract_merged_hadith_data(file_path):
    """Extracts 'hadith', 'rawi', 'book', and 'sharh' from a merged JSON file"""
    extracted_data = []
    with open(file_path, 'r', encoding='utf-8') as f:
        try:
            merged_data = json.load(f)
            # Extract data from each hadith in the merged file
            for hadith_key, hadith_value in tqdm(merged_data.items(), desc="Extracting hadiths"):
                extracted_data.append({
                    'hadith': hadith_value.get('hadith'),
                    'rawi': hadith_value.get('rawi'),
                    'book': hadith_value.get('book'),
                    'sharh': hadith_value.get('sharh'),
                    'hadith_number': hadith_value.get('hadith_number'),
                    'grade': hadith_value.get('grade'),
                    # 'FT_Pairs': hadith_value.get('FT_Pairs')
                })
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON from {file_path}: {str(e)}")
    return extracted_data

def process_merged_files(input_files, output_file):
    """Process merged hadith files and save extracted data"""
    all_hadith_data = []
    
    # Process each merged file
    for file_path in input_files:
        print(f"Processing {Path(file_path).name}...")
        extracted_data = extract_merged_hadith_data(file_path)
        all_hadith_data.extend(extracted_data)
    
    # Save the extracted data
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(all_hadith_data, f, ensure_ascii=False, indent=4)
        
    return len(all_hadith_data)

In [39]:
# Input and output paths
input_files = [
    "./Sahih_bukhari_merged.json",
    "./Sahih_muslim_merged.json"
]
output_file = './extracted_hadith_data.json'

# Process the files
total_hadiths = process_merged_files(input_files, output_file)
print(f"✅ Extracted data saved to {output_file}")
print(f"Total hadiths processed: {total_hadiths}")

Processing Sahih_bukhari_merged.json...


Extracting hadiths:   0%|          | 0/5309 [00:00<?, ?it/s]

Processing Sahih_muslim_merged.json...


Extracting hadiths:   0%|          | 0/2239 [00:00<?, ?it/s]

✅ Extracted data saved to ./extracted_hadith_data.json
Total hadiths processed: 7548


In [41]:
# 1. قراءة ملف JSON الأصلي
with open('./extracted_hadith_data.json', 'r', encoding='utf-8') as f:
    data = json.load(f)  # يجب أن يكون ملفك عبارة عن قائمة من الأوبجيكتس

# 2. تحويل كل حديث إلى زوج (سؤال-إجابة)
formatted_data = []
for item in data:
    question = f'ما هي تفاصيل الحديث "{item["hadith"]}" وما هو الشرح الكافي له؟'
    answer = (
        f'ورد هذا الحديث فى كتاب {item["book"]} برقم {item["hadith_number"]} '
        f'ورواه {item["rawi"]} ودرجته {item["grade"]}.\n'
        f'أما شرح الحديث: {item["sharh"]}'
    )
    formatted_data.append({"question": question, "answer": answer})

# 3. خلط البيانات عشوائيًا
random.shuffle(formatted_data)

# 4. حفظ البيانات في ملف JSON واحد
with open('formatted_hadiths.json', 'w', encoding='utf-8') as f:
    json.dump(formatted_data, f, ensure_ascii=False, indent=2)

print(f"تم تحويل {len(formatted_data)} حديثًا إلى ملف واحد (formatted_hadiths.json).")

تم تحويل 7548 حديثًا إلى ملف واحد (formatted_hadiths.json).


In [None]:
# def extract_hadith_data(file_path):
#     """Extracts 'hadith', 'rawi', 'book', and 'sharh' from a JSON file."""
#     data = []
#     with open(file_path, 'r', encoding='utf-8') as f:
#         try:
#             file_data = json.load(f)
#             for item in file_data:
#                 extracted_data = {
#                     'hadith': item.get('hadith'),
#                     'rawi': item.get('rawi'),
#                     'book': item.get('book'),
#                     'sharh': item.get('sharh')
#                 }
#                 data.append(extracted_data)
#         except json.JSONDecodeError:
#             print(f"Error decoding JSON from {file_path}")
#             pass  # Skip invalid JSON files
#     return data

# def merge_hadith_files(bukhari_dir, muslim_dir, output_file):
#     """
#     Iterates through JSON files in Bukhari and Muslim directories,
#     extracts specified fields, and merges them into a single JSON file
#     with a progress bar.
#     """
#     all_hadith_data = []

#     # Get list of files to process
#     bukhari_files = [os.path.join(bukhari_dir, f) for f in os.listdir(bukhari_dir) if f.endswith('.json')]
#     muslim_files = [os.path.join(muslim_dir, f) for f in os.listdir(muslim_dir) if f.endswith('.json')]
#     all_files = bukhari_files + muslim_files

#     # Process files with a progress bar
#     for file_path in tqdm(all_files, desc="Processing Hadith Files"):
#         all_hadith_data.extend(extract_hadith_data(file_path))

#     # Save the merged data to a new JSON file
#     with open(output_file, 'w', encoding='utf-8') as f:
#         json.dump(all_hadith_data, f, ensure_ascii=False, indent=4)

# # Define the directories for Bukhari and Muslim hadith data
# bukhari_directory = '/content/HadithsDorr/DATA/DistilitionOutput/Sahih_bukhari/'
# muslim_directory = '/content/HadithsDorr/DATA/DistilitionOutput/Sahih_muslim/'

# # Define the output file path
# output_json_file = '/content/drive/MyDrive/merged_hadith_data.json'

# # Merge the hadith files with a progress bar
# merge_hadith_files(bukhari_directory, muslim_directory, output_json_file)

# print(f"Merged hadith data saved to {output_json_file}")

In [42]:
import json
import random

# بيانات النظام (system message)
system_message = "\n".join([
    "You are an AI assistant specialized in understanding and explaining Islamic Hadith.",
    "You will be given a question related to a Hadith in Arabic.",
    "Your task is to answer the question clearly, concisely, and accurately based on Islamic scholarship.",
    "Respond in Arabic and follow the same tone and depth as classical Islamic explanations.",
    "If the question is not related to Islamic Hadiths, respond with 'لا أستطيع المساعدة في هذا الموضوع'.",
    "Don't provide any additional information or context that is not directly related to the Hadith.",
])

In [46]:
llm_finetunning_data = []

# المسار الخاص بملف البيانات
input_path = "./formatted_hadiths.json"
# تأكد من أن الملف موجود
if not os.path.exists(input_path):
    raise FileNotFoundError(f"File not found: {input_path}")

# قراءة البيانات وتحويلها
with open(input_path, 'r', encoding='utf-8') as f:
    data = json.load(f)  # Load the entire data as a list of dictionaries

In [51]:
for rec in data:
    llm_finetunning_data.append({
        "system": system_message,
        "instruction": "\n".join([
            "# question:",
            rec["question"]
        ]),
        "input": "",  
        "output": "/n".join([
            rec["answer"]
        ]),
        "history": []
    })

In [52]:
# خلط البيانات
random.Random(101).shuffle(llm_finetunning_data)

# حفظ البيانات في ملف
output_path = "./llm_finetunning_data.jsonl"
with open(output_path, 'w', encoding='utf-8') as out_f:
    for entry in llm_finetunning_data:
        out_f.write(json.dumps(entry, ensure_ascii=False) + "\n")

print(f"✅ Data saved to {output_path}")

✅ Data saved to ./llm_finetunning_data.jsonl


In [50]:
import json
import os
from os.path import join

# تحديد حجم التدريب
train_sample_sz = int(len(llm_finetunning_data) * 0.8)  # 80% للتدريب

# تقسيم البيانات إلى تدريب واختبار
train_ds = llm_finetunning_data[:train_sample_sz]
eval_ds = llm_finetunning_data[train_sample_sz:]

# مسار الدليل الخاص بالبيانات
data_dir = "./"  # يمكنك تعديل المسار هنا حسب احتياجك

# إنشاء المجلدات إذا لم تكن موجودة
os.makedirs(join(data_dir, "dataset", "llamafactory-finetune-data"), exist_ok=True)

# حفظ بيانات التدريب
with open(join(data_dir, "dataset", "llamafactory-finetune-data", "train.jsonl"), "w", encoding="utf8") as dest:
    json.dump(train_ds, dest, ensure_ascii=False, default=str)

# حفظ بيانات الاختبار
with open(join(data_dir, "dataset", "llamafactory-finetune-data", "val.jsonl"), "w", encoding="utf8") as dest:
    json.dump(eval_ds, dest, ensure_ascii=False, default=str)

print(f"✅ Train and evaluation datasets saved successfully.")


✅ Train and evaluation datasets saved successfully.
