In [2]:
import os
import glob
import os.path
from langchain_text_splitters import MarkdownHeaderTextSplitter

# Upper bound, in characters, for generated file names (default limit of truncate_filename).
MAX_FILENAME_LENGTH = 250

In [3]:
def split_file_into_chunks(file_path: str, UnicodeEncoding: str = "utf-8"):
    """Read a Markdown file and split it into chunks on its "#", "##" and "###" headers.

    Args:
        file_path: Path of the Markdown file to read.
        UnicodeEncoding: Text encoding used to decode the file.

    Returns:
        Whatever ``MarkdownHeaderTextSplitter.split_text`` returns for the file
        content, or an empty list (after printing a message) when the file does
        not exist or cannot be decoded with ``UnicodeEncoding``.
    """
    try:
        with open(file_path, "r", encoding=UnicodeEncoding) as source:
            markdown_text = source.read()
    except FileNotFoundError:
        print(f"The file {file_path} does not exist.")
        return []
    except UnicodeDecodeError:
        print(f"The file {file_path} is not encoded in {UnicodeEncoding}.")
        return []

    # strip_headers=False: the heading lines must stay in the chunk text.
    splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=[
            ("#", "Header 1"),
            ("##", "Header 2"),
            ("###", "Header 3"),
        ],
        strip_headers=False,
    )
    return splitter.split_text(markdown_text)

In [4]:
def truncate_filename(filename, max_length=MAX_FILENAME_LENGTH):
    """Shorten ``filename`` so it is no longer than ``max_length`` characters.

    Files are named after document titles, and some titles exceed the maximum
    number of characters allowed in a file name. A name that already fits is
    returned unchanged. A longer name is cut at ``max_length`` and then trimmed
    back to just before the last space inside the cut (if there is one).
    """
    if len(filename) > max_length:
        clipped = filename[:max_length]
        last_space = clipped.rfind(" ")
        # No space in the clipped text: keep the hard cut as it is.
        return clipped if last_space == -1 else clipped[:last_space]
    return filename


def create_directories(new_base_dir, sub_dir, dir_name):
    """Create ``new_base_dir/sub_dir/dir_name`` and return that path.

    Args:
        new_base_dir: Root directory of the processed output.
        sub_dir: Sub-directory used for the language split (e.g. "English");
            surrounding whitespace is stripped.
        dir_name: Name of the innermost directory that will hold the data;
            surrounding whitespace is stripped.

    Returns:
        The joined path. Missing directories are created; existing ones are
        accepted as they are (``exist_ok=True``).
    """
    final_path = os.path.join(new_base_dir, sub_dir.strip(), dir_name.strip())
    os.makedirs(final_path, exist_ok=True)
    return final_path

In [24]:
# Join the path components portably: a literal backslash separator only works
# on Windows and is also parsed by Python as an invalid escape sequence.
data = split_file_into_chunks(
    os.path.join(
        "Labor English",
        "Executive Regulations for Regulating and Organizing Labor Inspection Work.md",
    )
)

In [30]:
# Inspect the header metadata of the third chunk.
data[2].metadata

{'Header 1': 'Executive Regulations for Regulating and Organizing Labor Inspection Work',
 'Header 3': 'Chapter 3: Powers of Labor Inspectors`'}

In [5]:
import os
import re


def save_chunks_to_files(chunks, final_path):
    """
    Save every chunk as its own Markdown file below ``final_path``.

    Args:
        chunks: Iterable of objects exposing ``metadata`` (a dict that may hold
            "Header 1", "Header 2" and "Header 3") and ``page_content`` (str),
            e.g. LangChain Document objects.
        final_path: Directory that receives the files.

    Behaviour:
        * Chunks with empty metadata are skipped.
        * A chunk with a "Header 2" is stored in a sub-directory named after it.
        * The file is named after the most specific heading available:
          Header 3 (backticks removed, ":" replaced by "-"), else Header 2,
          else Header 1.
        * Text that precedes the first "## Part" (chunks with a Header 2) or
          the first "### Chapter" (all other chunks) is removed; the heading
          itself is kept unchanged.
        * An existing file with the same name is overwritten.

    NOTE(review): Header 1 and Header 2 are used in paths without any
    sanitising, so characters that are invalid in file names are not handled.
    """
    for chunk in chunks:
        if not chunk.metadata:
            continue

        header_1 = chunk.metadata.get("Header 1", "")
        header_2 = chunk.metadata.get("Header 2", "")
        header_3 = (
            chunk.metadata.get("Header 3", "").replace("`", "").replace(":", "-")
        )

        directory_path = os.path.join(final_path, header_2) if header_2 else final_path
        os.makedirs(directory_path, exist_ok=True)

        # Fall back to Header 2 before Header 1 so a part-level chunk is not
        # written to a nameless ".md" file.
        file_name = f"{header_3 or header_2 or header_1}.md"
        file_path = os.path.join(directory_path, file_name)

        # Drop whatever precedes the heading, re-inserting the SAME heading
        # marker that was matched ("## Part" must not turn into "### Chapter").
        if header_2:
            pattern, heading = r"^.*?## Part", "## Part"
        else:
            pattern, heading = r"^.*?### Chapter", "### Chapter"
        doc_content = re.sub(
            pattern, heading, chunk.page_content, count=1, flags=re.DOTALL
        )

        with open(file_path, "w", encoding="utf-8") as f:
            f.write(doc_content)

In [6]:
def process_file(
    file_path, sub_dir: str = "English", new_base_dir: str = "Labor Law Processed"
):
    """Split one Markdown file and store its chunks in a directory of its own.

    The output directory is ``new_base_dir/sub_dir/<file name without
    extension>``; its path is printed. When the file yields no chunks a message
    is printed and nothing is saved.
    """
    # The stripped file stem becomes the name of the output directory.
    stem, _extension = os.path.splitext(os.path.basename(file_path))
    final_path = create_directories(new_base_dir, sub_dir, stem.strip())
    print(final_path)

    chunks = split_file_into_chunks(file_path)
    if chunks:
        save_chunks_to_files(chunks, final_path)
    else:
        print("No chunks were created.")

In [35]:
# Source documents to process. All entries use forward slashes: backslash
# separators are Windows-only and form invalid escape sequences in a normal
# string literal. The space before ".md" in some names is part of the file name.
paths = [
    "./Data/Data English/Anti-Money Laundering Law.md",
    "./Data/Data English/Bankruptcy Law.md",
    "./Data/Data English/Family Law.md",
    "./Data/Data English/Law of Real Estate Ownership and Investment by Non-Saudis .md",
    "./Data/Data English/Statute of Real Estate Ownership by GCC Nationals.md",
    "./Data/Data English/Enforcement Law.md",
    "./Data/Data English/Juveniles Law .md",
    "./Data/Data English/The Code of Law Practice.md",
]

In [37]:
# for path in paths:
#   process_file(path)

In [4]:
from tqdm import tqdm


def process_documents(directory, recursive=True):
    """Run ``process_file`` on every file found under ``directory``.

    Args:
        directory: Folder to scan.
        recursive: When true, sub-folders are scanned as well ("**/*").

    The ``sub_dir`` handed to ``process_file`` is the second space-separated
    word of the name of the file's parent folder (e.g. "Labor English" gives
    "English"). A parent folder name without a space raises ``IndexError``.
    """
    glob_pattern = os.path.join(directory, "**/*" if recursive else "*")
    candidates = glob.glob(glob_pattern, recursive=recursive)

    for candidate in tqdm(candidates, desc="Loading documents", unit="file"):
        if not os.path.isfile(candidate):
            continue
        parent_name = os.path.basename(os.path.dirname(candidate))
        process_file(candidate, sub_dir=parent_name.split(" ")[1])

In [68]:
# file_path = "Labor English\Executive Regulations for Regulating and Organizing Labor Inspection Work.md"
# language = os.path.basename(os.path.dirname(file_path)).split(" ")[1]
# process_file(file_path, sub_dir="English")

In [12]:
# Process the Arabic folder first, then the English one.
for source_dir in ("Labor Arabic", "Labor English"):
    process_documents(source_dir)

Loading documents: 100%|██████████| 8/8 [00:00<00:00, 136.41file/s]


Labor Law Processed\Arabic\اللائحة التنظيمية لتأشيرات الأعمال المؤقتة والموسمية
Labor Law Processed\Arabic\اللائحة التنظيمية لعمل الأسر المنتجة
Labor Law Processed\Arabic\اللائحة التنفيذية لضبط أعمال تفتيش العمل وتنظيمها
Labor Law Processed\Arabic\تنظيم إعانة البحث عن عمل
Labor Law Processed\Arabic\تنظيم المخصص المالي لصعوبة الحصول على عمل
Labor Law Processed\Arabic\لائحة عمال الخدمة المنزلية ومن في حكمهم
Labor Law Processed\Arabic\نظام التأمين ضد التعطل عن العمل
Labor Law Processed\Arabic\نظام الضمان الصحي التعاوني


Loading documents: 100%|██████████| 8/8 [00:00<00:00, 183.86file/s]

Labor Law Processed\English\Cooperative Health Insurance System
Labor Law Processed\English\Executive Regulations for Regulating and Organizing Labor Inspection Work
Labor Law Processed\English\List of domestic workers and those in similar positions
Labor Law Processed\English\Organizing job search assistance
Labor Law Processed\English\Organizing the financial allocation for difficulty in obtaining work
Labor Law Processed\English\Regulations for the work of productive families
Labor Law Processed\English\RegulationsforTemporaryandSeasonalBusinessVisas
Labor Law Processed\English\Unemployment Insurance System





In [32]:
# Splitter for "###" headings, recorded under the "Chapter" metadata key;
# strip_headers=False so the heading line stays in the chunk text.
headers_to_split2 = [("###", "Chapter")]
markdown_splitter2 = MarkdownHeaderTextSplitter(
    headers_to_split_on=headers_to_split2,
    strip_headers=False,
)

In [45]:
import re
import os
from tqdm import tqdm
import glob
import os.path
from langchain_text_splitters import MarkdownHeaderTextSplitter

# Same value as the MAX_FILENAME_LENGTH constant defined at the top of the notebook.
MAX_FILENAME_LENGTH = 250

# Root directory under which the part/chapter splitting cell writes its output.
BASE = "Checked Data/Arabic/نظام العمل"
os.makedirs(BASE, exist_ok=True)

In [46]:
def sanitize_filename(filename):
    """Return ``filename`` with each of < > : " / \\ | ? * replaced by one space."""
    forbidden = '<>:"/\\|?*'
    return "".join(" " if char in forbidden else char for char in filename)


# Load the Arabic Labor Law markdown source.
with open("Work System/نظام العمل.md", "r", encoding="utf-8") as f:
    markdown = f.read()

# First pass: split the law on "#" / "##" headings. The part title is read
# from metadata["Part"] below.
headers_to_split = [
    ("#", "Title"),
    ("##", "Part"),
]

markdown_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=headers_to_split, strip_headers=True
)
parts = markdown_splitter.split_text(markdown)

print("Parts length ", len(parts))

# Second pass: split each part on "###" headings, keeping the heading line.
headers_to_split2 = [
    ("###", "Chapter"),
]

markdown_splitter2 = MarkdownHeaderTextSplitter(
    headers_to_split_on=headers_to_split2, strip_headers=False
)

for part in tqdm(parts, "Parts"):
    print(part.metadata)
    sanitized_part_name = sanitize_filename(part.metadata["Part"])
    part_path = os.path.join(BASE, sanitized_part_name)
    os.makedirs(part_path, exist_ok=True)
    print(part_path)
    print("----")

    # Normalise "#####"/"####" chapter headings to "###" so the chapter
    # splitter recognises them.
    text = part.page_content.replace("##### Chapter", "### Chapter")
    text = text.replace("#### Chapter", "### Chapter")
    # Split the NORMALISED text; splitting part.page_content would discard the
    # two replacements above.
    chapters = markdown_splitter2.split_text(text)
    for chapter in tqdm(chapters, "Chapters"):
        if "Chapter" in chapter.metadata:
            # <part>/<chapter>/<chapter>.md
            sanitized_chapter_name = sanitize_filename(chapter.metadata["Chapter"])
            chapter_path = os.path.join(part_path, sanitized_chapter_name)
            os.makedirs(chapter_path, exist_ok=True)

            file_path = os.path.join(chapter_path, sanitized_chapter_name + ".md")
            with open(file_path, "w+", encoding="utf-8") as f:
                f.write(chapter.page_content)
        else:
            # Content outside any chapter goes to <part>/<part>/<part>.md
            final_part_path = os.path.join(part_path, sanitized_part_name)
            os.makedirs(final_part_path, exist_ok=True)
            file_path = os.path.join(final_part_path, sanitized_part_name + ".md")
            with open(file_path, "w+", encoding="utf-8") as f:
                f.write(chapter.page_content)

Parts length  15


Parts:   0%|          | 0/15 [00:00<?, ?it/s]

{'Part': 'الباب 1: التعريفات والأحكام العامة'}
Checked Data/Arabic/نظام العمل/الباب 1  التعريفات والأحكام العامة
----


Chapters: 100%|██████████| 2/2 [00:00<00:00, 1196.83it/s]


{'Part': 'الباب 2: تنظيم عمليات التوظيف'}
Checked Data/Arabic/نظام العمل/الباب 2  تنظيم عمليات التوظيف
----


Chapters: 100%|██████████| 3/3 [00:00<00:00, 601.33it/s]


{'Part': 'الباب 3: توظيف غير السعوديين'}
Checked Data/Arabic/نظام العمل/الباب 3  توظيف غير السعوديين
----


Chapters: 100%|██████████| 1/1 [00:00<00:00, 898.33it/s]


{'Part': 'الباب 4: التدريب والتأهيل'}
Checked Data/Arabic/نظام العمل/الباب 4  التدريب والتأهيل
----


Chapters: 100%|██████████| 2/2 [00:00<00:00, 1605.17it/s]


{'Part': 'الباب 5: علاقات العمل'}
Checked Data/Arabic/نظام العمل/الباب 5  علاقات العمل
----


Chapters: 100%|██████████| 4/4 [00:00<00:00, 1515.15it/s]


{'Part': 'الباب 6: شروط العمل وظروفه'}
Checked Data/Arabic/نظام العمل/الباب 6  شروط العمل وظروفه
----


Chapters: 100%|██████████| 4/4 [00:00<00:00, 2607.59it/s]


{'Part': 'الباب 7: العمل لبعض الوقت'}
Checked Data/Arabic/نظام العمل/الباب 7  العمل لبعض الوقت
----


Chapters: 100%|██████████| 1/1 [00:00<00:00, 377.93it/s]


{'Part': 'الباب 8: الوقاية من مخاطر العمل والوقاية من الحوادث الصناعية الكبرى وإصابات العمل والخدمات الصحية واالجتماعية'}
Checked Data/Arabic/نظام العمل/الباب 8  الوقاية من مخاطر العمل والوقاية من الحوادث الصناعية الكبرى وإصابات العمل والخدمات الصحية واالجتماعية
----


Chapters: 100%|██████████| 4/4 [00:00<00:00, 630.51it/s]


{'Part': 'الباب 9 : تشغيل النساء'}
Checked Data/Arabic/نظام العمل/الباب 9   تشغيل النساء
----


Chapters: 100%|██████████| 1/1 [00:00<00:00, 443.04it/s]


{'Part': 'الباب 10 : تشغيل الأحداث'}
Checked Data/Arabic/نظام العمل/الباب 10   تشغيل الأحداث
----


Chapters: 100%|██████████| 1/1 [00:00<00:00, 2126.93it/s]


{'Part': 'الباب 11 : عقد العمل البحري'}
Checked Data/Arabic/نظام العمل/الباب 11   عقد العمل البحري
----


Chapters: 100%|██████████| 1/1 [00:00<00:00, 400.99it/s]


{'Part': 'الباب 12 : العمل في المناجم والمحاجر'}
Checked Data/Arabic/نظام العمل/الباب 12   العمل في المناجم والمحاجر
----


Chapters: 100%|██████████| 1/1 [00:00<00:00, 1783.29it/s]


{'Part': 'الباب 13: تفتيش العمل'}
Checked Data/Arabic/نظام العمل/الباب 13  تفتيش العمل
----


Chapters: 100%|██████████| 1/1 [00:00<00:00, 1269.08it/s]


{'Part': 'الباب 15 : العقوبات'}
Checked Data/Arabic/نظام العمل/الباب 15   العقوبات
----


Chapters: 100%|██████████| 1/1 [00:00<00:00, 895.84it/s]


{'Part': 'الباب 16 : أحكام ختامية'}
Checked Data/Arabic/نظام العمل/الباب 16   أحكام ختامية
----


Chapters: 100%|██████████| 1/1 [00:00<00:00, 1798.59it/s]
Parts: 100%|██████████| 15/15 [00:00<00:00, 153.25it/s]


In [69]:
from llama_index.core import Document


def split_document_in_sections(text, heading="####"):
    """Split ``text`` into sections that each begin with ``heading``.

    Args:
        text: Full document text.
        heading: Marker that starts a section.

    Returns:
        A list of strings, one per occurrence of ``heading``, each starting
        with ``heading``. Text before the first occurrence is dropped; a text
        without any occurrence yields an empty list.
    """
    # Re-attach the marker that was actually split on, not a fixed "####",
    # so a non-default ``heading`` round-trips correctly.
    return [heading + section for section in text.split(heading)[1:]]

In [8]:
# Recursively collect every path below "Checked Data"; the next cell keeps only regular files.
directory = "Checked Data"
file_paths = glob.glob(os.path.join(directory, "**/*"), recursive=True)

In [9]:
# Keep only the entries of file_paths that are regular files.
processed = list(filter(os.path.isfile, file_paths))

In [87]:
def getArticle(text: str) -> "str | None":
    """Extract the article label from the first "#### ..." heading in ``text``.

    If ``text`` contains the word "Article" the English form is expected
    ("#### Article <number>", optionally followed by one more word); otherwise
    the Arabic form ("#### المادة ...") is expected.

    Returns:
        The label without the leading "#### " (e.g. "Article 132"), or ``None``
        after printing the offending text when no heading matches.
    """
    english_pattern = r"#### (Article \d+(?: \w+)?)"
    # Only spaces/tabs are allowed inside the label. "\s" would also match
    # newlines and let the capture run on into the article body whenever the
    # heading is not terminated by a colon.
    arabic_pattern = r"#### (المادة [\w \t]+)"

    pattern = english_pattern if "Article" in text else arabic_pattern

    match = re.search(pattern, text)
    if match is None:
        print("Text ", text)
        print("Article ", "no article heading found")
        return None
    return match.group(1).strip()

In [88]:
def get_directory_names(file_path, levels=4):
    """Return the names of the ``levels`` nearest ancestor directories of ``file_path``.

    Args:
        file_path: Path whose ancestors are inspected.
        levels: How many ancestor levels to return (default 4).

    Returns:
        A list of ``levels`` names, nearest ancestor first. Once the path runs
        out of ancestors the remaining entries are empty strings.
    """
    directories = []
    current_path = file_path
    for _ in range(levels):
        current_path = os.path.dirname(current_path)
        directories.append(os.path.basename(current_path))
    return directories

In [89]:
def get_metadata(file_path):
    """Derive law / language / part / chapter metadata from a file's location.

    Two layouts are recognised, based on the four nearest ancestor folders:

    * ``<language>/<law>/<file>`` (the second-nearest folder is "English" or
      "Arabic"): the file stem is the chapter and there is no part.
    * ``<language>/<law>/<part>/<folder>/<file>``: the nearest folder is the
      chapter, unless it has the same name as the part folder, in which case
      there is no chapter.

    Returns:
        A dict with "law", "language", "file_path" and "file_name", plus "Part"
        and/or "Chapter" when they are non-empty. On any exception a message is
        printed and ``None`` is returned.
    """
    try:
        file_name = os.path.basename(file_path)
        (
            current_dir,
            parent_dir,
            grandparent_dir,
            great_grandparent_dir,
        ) = get_directory_names(file_path)

        if parent_dir in ("English", "Arabic"):
            law, language = current_dir, parent_dir
            part = None
            chapter = os.path.splitext(file_name)[0]
        else:
            law, language = grandparent_dir, great_grandparent_dir
            part = parent_dir
            chapter = current_dir if current_dir != parent_dir else None

        _metadata = {
            "law": law,
            "language": language,
            "file_path": file_path,
            "file_name": file_name,
        }
        # Optional keys are added only when they carry a value.
        for key, value in (("Part", part), ("Chapter", chapter)):
            if value:
                _metadata[key] = value

        return _metadata
    except Exception as e:
        print("Metadata Exception : ", e)

In [94]:
def get_documents(directory, recursive=True):
    """Load every file under ``directory`` as one ``Document`` per article.

    Each file is read as UTF-8, split with ``split_document_in_sections`` and
    every section becomes a ``Document`` whose metadata is the file's metadata
    (``get_metadata``) plus an "Article" entry (``getArticle``). "file_path"
    and "file_name" are excluded from the embed and LLM metadata.

    Args:
        directory: Folder to scan.
        recursive: When true, sub-folders are scanned as well ("**/*").

    Returns:
        A list of ``Document`` objects. A file that cannot be read, or whose
        metadata cannot be derived, is reported and skipped; the documents of
        all other files are still returned.
    """
    documents = []
    pattern = "**/*" if recursive else "*"
    file_paths = glob.glob(os.path.join(directory, pattern), recursive=recursive)
    _excluded_llm_metadata_keys = ["file_path", "file_name"]

    for file_path in tqdm(file_paths, desc="Loading documents", unit="file"):
        if not os.path.isfile(file_path):
            continue

        # Handle failures per file so one bad file does not discard the rest.
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                text = f.read()
        except (OSError, UnicodeDecodeError) as e:
            print(f"Skipping {file_path}: {e}")
            continue

        file_metadata = get_metadata(file_path=file_path)
        if file_metadata is None:
            # get_metadata has already printed the reason.
            continue

        for split in split_document_in_sections(text):
            documents.append(
                Document(
                    text=split,
                    # Fresh dict per document, so no two documents can share
                    # (and overwrite) the same "Article" entry.
                    metadata={**file_metadata, "Article": getArticle(split)},
                    excluded_embed_metadata_keys=list(_excluded_llm_metadata_keys),
                    excluded_llm_metadata_keys=list(_excluded_llm_metadata_keys),
                )
            )
    return documents

In [95]:
# Load the per-article documents from everything under "Checked Data".
docs = get_documents("Checked Data")

Loading documents:   0%|          | 0/149 [00:00<?, ?file/s]

Loading documents: 100%|██████████| 149/149 [00:00<00:00, 5389.18file/s]


In [81]:
# NOTE(review): this displays the complete list of documents; slice it (e.g. docs[:3]) to keep the output short.
docs[:]

[Document(id_='eb518390-cb72-4ef4-9536-8cbf1ee0d993', embedding=None, metadata={'law': 'Labor Law', 'language': 'English', 'file_path': 'Checked Data/English/Labor Law/Part 8  Protection Against Occupational Hazards, Major Industrial Accidents and WORK Injuries, and Health and Social Services/Chapter 3  Work Injuries/Chapter 3  Work Injuries.md', 'file_name': 'Chapter 3  Work Injuries.md', 'Part': 'Part 8  Protection Against Occupational Hazards, Major Industrial Accidents and WORK Injuries, and Health and Social Services', 'Chapter': 'Chapter 3  Work Injuries', 'Article': 'Article 132'}, excluded_embed_metadata_keys=['file_path', 'file_name'], excluded_llm_metadata_keys=['file_path', 'file_name'], relationships={}, text='#### Article 132\nThe provisions of this Chapter shall not apply to the firms subject to the Occupational Hazards Branch of the Social Insurance Law.\n', mimetype='text/plain', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metada