In [43]:
# Extracting the chapters, sections and hadiths from .docx files and writing them to an .xlsx file using Python scripts

In [44]:
# Mount Google Drive (where the .docx files are saved) at /content/drive.

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [45]:
#installing the necessary libraries

!pip install python-docx pandas openpyxl




In [46]:
#importing the libraries required for scripts

import re
import unicodedata
from pathlib import Path

import docx
import pandas as pd

In [47]:
# Regex used to recognise a hadith paragraph: optional leading whitespace followed
# by a number written in Bengali digits (০-৯) inside square brackets, e.g. "[১২]".

hadith_pattern = re.compile(r"^\s*\[[০-৯]+\]")

In [48]:
#Function to extract the chapters, sections and hadiths from a .docx document

def parse_docx(path):
    """Split the paragraphs of a .docx file into chapters, sections and hadiths.

    Every non-empty paragraph is stripped and tested against three rules in
    order; the first rule that matches wins, and a paragraph that matches none
    of them is dropped:

    1. Chapter - the text starts with "অধ্যায়".
    2. Hadith  - the text matches the module-level ``hadith_pattern``.
    3. Section - at least one non-blank run of the paragraph is bold.

    Args:
        path: Location of the .docx file; passed straight to ``docx.Document``.

    Returns:
        A ``(chapters, sections, hadiths)`` tuple of lists containing the
        stripped paragraph texts in document order.
    """
    doc = docx.Document(path)
    chapters, sections, hadiths = [], [], []

    # The last letter of the chapter keyword can be stored either precomposed
    # (U+09DF) or as U+09AF + U+09BC. Both look identical on screen but compare
    # unequal, so both sides are NFC-normalised before the prefix comparison.
    chapter_prefix = unicodedata.normalize("NFC", "অধ্যায়")

    for p in doc.paragraphs:
        text = p.text.strip()
        if not text:
            continue

        # Rule 1: Chapter (compare on the normalised text, store the original)
        if unicodedata.normalize("NFC", text).startswith(chapter_prefix):
            chapters.append(text)
            continue

        # Rule 2: Hadith. Tested before the bold rule because that rule fires on
        # *any* bold run: a hadith whose number or a single phrase is bold would
        # otherwise be filed as a section and go missing from the hadith list.
        if hadith_pattern.match(text):
            hadiths.append(text)
            continue

        # Rule 3: Section (bold). Only runs with visible text are considered;
        # a run whose `bold` is None or False does not count as bold.
        if any(r.bold for r in p.runs if r.text.strip()):
            sections.append(text)

    return chapters, sections, hadiths


In [49]:
# Folder holding the .docx files, and the sorted list of documents to process.
# "*.docx" also matches Word lock files ("~$name.docx") and macOS AppleDouble
# files ("._name.docx"); these are not readable documents, so they are skipped.

folder = Path("/content/drive/MyDrive/pythonInt/Docs/docx")
files = sorted(
    p for p in folder.glob("*.docx")
    if not p.name.startswith(("~$", "._"))
)

In [50]:
# One accumulator list per paragraph kind, shared across all processed files.

all_chapters = []
all_sections = []
all_hadiths = []


In [51]:
# Run parse_docx on every file and accumulate the results.
# The accumulators are (re)created here so that re-running this cell starts from
# empty lists instead of appending a second copy of every entry.

all_chapters, all_sections, all_hadiths = [], [], []

for f in files:
    chapters, sections, hadiths = parse_docx(f)
    all_chapters.extend(chapters)
    all_sections.extend(sections)
    all_hadiths.extend(hadiths)

In [52]:
# Report how many entries of each kind were collected.
for label, items in (
    ("Total Chapters:", all_chapters),
    ("Total Sections:", all_sections),
    ("Total Hadiths:", all_hadiths),
):
    print(label, len(items))

Total Chapters: 1
Total Sections: 17
Total Hadiths: 35


In [54]:
# Path of the Excel workbook that the export cell writes to.
output_file = "/content/drive/MyDrive/pythonInt/final_output_ADM_by_PythonScript.xlsx"


In [55]:
# Build one two-column frame per kind: a 1-based running "id" and the text in "name".
df_chapters, df_sections, df_hadiths = (
    pd.DataFrame({"id": range(1, len(entries) + 1), "name": entries})
    for entries in (all_chapters, all_sections, all_hadiths)
)


In [56]:
# Write the three frames to separate sheets of one workbook, without the index column.

with pd.ExcelWriter(output_file, engine="openpyxl") as writer:
    for sheet_name, frame in (
        ("Chapters", df_chapters),
        ("Sections", df_sections),
        ("Hadiths", df_hadiths),
    ):
        frame.to_excel(writer, sheet_name=sheet_name, index=False)

print("Final Excel created at:", output_file)

Final Excel created at: /content/drive/MyDrive/pythonInt/final_output_ADM_by_PythonScript.xlsx
