In [8]:
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# Target URL: the index page whose anchors are scanned for PDF links
url = "https://cma.org.sa/RulesRegulations/Regulations/Pages/default.aspx"

# Create folder to store PDFs
folder = "cma_pdfs"
os.makedirs(folder, exist_ok=True)

# Seconds to wait for the server to connect / send data. requests has no
# default timeout, so without this a stalled server would hang the cell.
REQUEST_TIMEOUT = 30

# Fetch and parse the page. Fail loudly on an HTTP error instead of
# silently parsing an error page and reporting zero PDFs.
res = requests.get(url, timeout=REQUEST_TIMEOUT)
res.raise_for_status()
soup = BeautifulSoup(res.text, "html.parser")

# Collect PDF links (a set, so the same absolute URL is fetched only once)
pdf_links = set()
for a in soup.find_all("a", href=True):
    href = a["href"]
    if href.lower().endswith(".pdf"):
        # Resolve relative hrefs against the index page URL
        full_url = urljoin(url, href)
        pdf_links.add(full_url)

# Download PDFs. Iterate in sorted order so reruns process links
# deterministically (set iteration order is not stable across runs).
# NOTE(review): two different URLs sharing the same last path segment would
# overwrite each other on disk while both being counted.
count = 0
for link in sorted(pdf_links):
    # The local file name is the last path segment of the URL
    filename = os.path.join(folder, link.split("/")[-1])
    try:
        response = requests.get(link, timeout=REQUEST_TIMEOUT)
        if response.status_code == 200:
            with open(filename, "wb") as f:
                f.write(response.content)
            count += 1
        else:
            # Report skipped files so a shortfall in `count` is explainable
            print(f"Failed to download {link}: HTTP {response.status_code}")
    except Exception as e:
        # Best effort: one bad link must not abort the remaining downloads
        print(f"Failed to download {link}: {e}")

print(f"\n✅ Downloaded {count} PDF file(s) to '{folder}/'")



✅ Downloaded 35 PDF file(s) to 'cma_pdfs/'


In [9]:
import fitz  # PyMuPDF
from docx import Document
import os

def clean_text(text):
    """Remove NULL bytes and control characters from ``text``.

    Dropped: C0 controls (U+0000-U+001F) other than tab, line feed and
    carriage return; DEL and C1 controls (U+007F-U+009F); surrogates
    (U+D800-U+DFFF); and the noncharacters U+FFFE / U+FFFF. Every other
    character is kept, including no-break spaces and zero-width / bidi
    format characters, so adjacent words are not merged and right-to-left
    text keeps its joiners and direction marks.

    Args:
        text: The string to sanitise.

    Returns:
        A new string with the characters listed above removed.
    """
    def _is_allowed(ch):
        # Tab, LF and CR are the only control characters that are kept.
        if ch in "\t\n\r":
            return True
        code = ord(ch)
        # C0 controls (incl. NULL), DEL and C1 controls.
        if code < 0x20 or 0x7F <= code <= 0x9F:
            return False
        # Surrogates and U+FFFE/U+FFFF are outside the XML 1.0 Char range.
        if 0xD800 <= code <= 0xDFFF or code in (0xFFFE, 0xFFFF):
            return False
        return True

    return ''.join(ch for ch in text if _is_allowed(ch))

# Input and output directories; the output directory is created on demand
pdf_folder = "cma_pdfs"
docx_folder = "cma_docx"
os.makedirs(docx_folder, exist_ok=True)

converted = 0
for filename in os.listdir(pdf_folder):
    # Anything that is not a PDF (case-insensitive extension) is ignored
    if not filename.lower().endswith(".pdf"):
        continue

    pdf_path = os.path.join(pdf_folder, filename)
    docx_name = os.path.splitext(filename)[0] + ".docx"
    docx_path = os.path.join(docx_folder, docx_name)

    doc = Document()
    try:
        with fitz.open(pdf_path) as pdf:
            # One paragraph per page; pages whose cleaned text is empty
            # contribute nothing to the document
            page_texts = (
                clean_text(page.get_text("text")).strip() for page in pdf
            )
            for page_text in page_texts:
                if not page_text:
                    continue
                doc.add_paragraph(page_text)

        doc.save(docx_path)
    except Exception as e:
        # A failing file is reported and skipped; the loop carries on
        print(f"❌ Failed to convert {filename}: {e}")
    else:
        # Counted only once the DOCX has been saved without error
        converted += 1

print(f"\n✅ Converted {converted} PDF(s) to DOCX in '{docx_folder}/'")



✅ Converted 35 PDF(s) to DOCX in 'cma_docx/'
