In [None]:
# Install required packages for proper Arabic text handling
!pip install arabic-reshaper python-bidi ocrmypdf pytesseract

In [None]:
!pip install beautifulsoup4 requests


In [11]:
import os
import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import time
from docx import Document
from docx.oxml.ns import qn
from docx.enum.text import WD_ALIGN_PARAGRAPH
import fitz  # PyMuPDF

# Your ScraperAPI Key
SCRAPER_API_KEY = "2ef897558ee673a53e6c66518eac104d"  # Replace with your actual API key

# Target website URL
base_url = "https://misa.gov.sa/ar/activities/laws-regulations/ar"
scraper_url = f"http://api.scraperapi.com?api_key={SCRAPER_API_KEY}&url={base_url}"

# Folders to store the PDFs and Word documents (Windows path in current directory)
download_folder = r"C:\Users\ali-d\Desktop\KSA\MoI_pdfs"
docx_folder = r"C:\Users\ali-d\Desktop\KSA\MoI_docx"
os.makedirs(download_folder, exist_ok=True)
os.makedirs(docx_folder, exist_ok=True)

# Arabic text processing constants
ARABIC_LETTERS = r"[\u0600-\u06FF]"

def clean_arabic_text(text):
    """Clean and format Arabic text"""
    if not text:
        return ""
    text = re.sub(r"[\u202A-\u202E\u200E\u200F\u2066-\u2069]", "", text)
    text = re.sub(r'\s*[\(\)]\s*', '', text)
    replacements = {
        "ÿ£ŸÑŸä": "ÿ£Ÿä", "ŸÑÿ£ŸÑŸä": "ŸÑŸÑÿ£", "ÿ°ÿß": "ÿßÿ°", "ÿ§": "ÿ°", "Ÿâÿ°": "ÿ¶Ÿä",
        "ÿ©ÿßŸÑ": "ÿ© ÿßŸÑ", "ÿßÿßÿ°": "ÿßÿ°", "ÿßŸÑŸÖÿ°ÿ≥ÿ≥ÿßÿ™": "ÿßŸÑŸÖÿ§ÿ≥ÿ≥ÿßÿ™",
        "ŸÖÿ≥ÿ°ŸàŸÑŸäÿ©": "ŸÖÿ≥ÿ§ŸàŸÑŸäÿ©", "ÿ™ÿ°ÿØŸä": "ÿ™ÿ§ÿØŸä", "ÿßŸà": "ÿ£Ÿà",
        "ŸàÿßŸÑÿ•ÿ¨ÿ±ÿßÿ°ÿ™": "ŸàÿßŸÑÿ•ÿ¨ÿ±ÿßÿ°ÿßÿ™", "ÿ•ÿ¨ÿ±ÿßÿ°ÿ™": "ÿ•ÿ¨ÿ±ÿßÿ°ÿßÿ™"
    }
    for old, new in replacements.items():
        text = text.replace(old, new)
    text = re.sub(r'\s*([.,:;!?ÿåÿõÿü])\s*', r'\1', text)
    text = re.sub(r'([.,:;!?ÿåÿõÿü])([%s])' % ARABIC_LETTERS, r'\1 \2', text)
    text = re.sub(r"([%s])([A-Z])" % ARABIC_LETTERS, r"\1 \2", text)
    text = re.sub(r"([a-zA-Z])([%s])" % ARABIC_LETTERS, r"\1 \2", text)
    arabic_digits = str.maketrans("0123456789", "Ÿ†Ÿ°Ÿ¢Ÿ£Ÿ§Ÿ•Ÿ¶ŸßŸ®Ÿ©")
    text = text.translate(arabic_digits)
    if re.fullmatch(r"^\s*\d+\s*$", text) or re.fullmatch(r"^\s*[^\u0600-\u06FF]{1,5}\s*$", text):
        return ""
    text = re.sub(r"\s{2,}", " ", text)
    return text.strip()

def set_rtl_paragraph(paragraph):
    """Set paragraph to Right-To-Left alignment for Arabic"""
    paragraph.paragraph_format.alignment = WD_ALIGN_PARAGRAPH.RIGHT
    p = paragraph._element
    p.set(qn("w:rtl"), "1")

def convert_pdf_to_docx(pdf_path, docx_path):
    """Convert PDF to Word document with Arabic formatting and proper UTF-8 encoding"""
    try:
        doc = Document()
        
        # Open PDF - PyMuPDF handles UTF-8 automatically
        pdf_document = fitz.open(pdf_path)
        
        for page_num in range(pdf_document.page_count):
            page = pdf_document.load_page(page_num)
            
            # Extract text blocks with positioning info to preserve RTL order
            blocks = page.get_text("blocks")
            
            # Sort blocks by vertical position (top to bottom)
            blocks = sorted(blocks, key=lambda b: b[1])
            
            current_paragraph_lines = []
            
            for block in blocks:
                # block format: (x0, y0, x1, y1, "text", block_no, block_type)
                if len(block) >= 5:
                    text = block[4].strip()
                    
                    if not text:
                        if current_paragraph_lines:
                            paragraph_text = " ".join(current_paragraph_lines)
                            cleaned_paragraph = clean_arabic_text(paragraph_text)
                            if cleaned_paragraph:
                                p = doc.add_paragraph(cleaned_paragraph)
                                set_rtl_paragraph(p)
                                # Set font that supports Arabic
                                for run in p.runs:
                                    run.font.name = 'Arial'
                                    run.font.size = 304800  # 12pt in EMUs
                            current_paragraph_lines = []
                        continue
                    
                    # Split into lines
                    lines = text.splitlines()
                    
                    for line in lines:
                        line = line.strip()
                        if not line:
                            continue
                        
                        # Check if it's a new list item or heading
                        is_new_list_item = False
                        if re.match(r"^\s*([Ÿ†-Ÿ©0-9]+\.|\*|‚Ä¢)\s*", line):
                            is_new_list_item = True
                        elif re.match(r"^\s*([ÿ£-Ÿä] -|-)\s*", line):
                            is_new_list_item = True
                        
                        if is_new_list_item and current_paragraph_lines:
                            paragraph_text = " ".join(current_paragraph_lines)
                            cleaned_paragraph = clean_arabic_text(paragraph_text)
                            if cleaned_paragraph:
                                p = doc.add_paragraph(cleaned_paragraph)
                                set_rtl_paragraph(p)
                                for run in p.runs:
                                    run.font.name = 'Arial'
                                    run.font.size = 304800
                            current_paragraph_lines = []
                        
                        current_paragraph_lines.append(line)
            
            # Add any remaining text from this page
            if current_paragraph_lines:
                paragraph_text = " ".join(current_paragraph_lines)
                cleaned_paragraph = clean_arabic_text(paragraph_text)
                if cleaned_paragraph:
                    p = doc.add_paragraph(cleaned_paragraph)
                    set_rtl_paragraph(p)
                    for run in p.runs:
                        run.font.name = 'Arial'
                        run.font.size = 304800
        
        pdf_document.close()
        doc.save(docx_path)
        return True
    except Exception as e:
        print(f"   ‚ùå Conversion error: {e}")
        import traceback
        traceback.print_exc()
        return False

# Request the page via ScraperAPI
response = requests.get(scraper_url)
soup = BeautifulSoup(response.text, "html.parser")

# Find all PDF links
pdf_links = []
for link in soup.find_all("a", href=True):
    href = link["href"]
    if href.lower().endswith(".pdf"):  # Ensure it's a PDF link
        pdf_links.append(urljoin(base_url, href))

# Function to download PDFs and convert to Word immediately
def download_pdf(url, retries=3, delay=5):
    filename = os.path.join(download_folder, url.split("/")[-1])
    docx_filename = os.path.join(docx_folder, url.split("/")[-1].replace(".pdf", ".docx"))
    
    # Skip if Word document already exists
    if os.path.exists(docx_filename):
        print(f"‚è≠Ô∏è  Already converted: {os.path.basename(docx_filename)}")
        return
    
    for i in range(retries):
        try:
            # Request with proper encoding headers for Arabic content
            response = requests.get(url, headers={'Accept-Charset': 'utf-8'})
            if response.status_code == 200:
                # Save PDF with binary mode (preserves encoding)
                with open(filename, "wb") as f:
                    f.write(response.content)
                print(f"üì• Downloaded: {os.path.basename(filename)}")
                
                # Convert to Word immediately with UTF-8 support
                print(f"üîÑ Converting to Word with UTF-8 encoding...")
                if convert_pdf_to_docx(filename, docx_filename):
                    print(f"‚úÖ Converted: {os.path.basename(docx_filename)}")
                else:
                    print(f"‚ö†Ô∏è  Conversion failed for {os.path.basename(filename)}")
                
                return  # Successfully downloaded and converted
            else:
                print(f"Attempt {i+1} failed for {url} with status code: {response.status_code}")
        except requests.exceptions.ConnectionError as e:
            print(f"Attempt {i+1} failed for {url} with connection error: {e}")
        except Exception as e:
            print(f"Attempt {i+1} failed with error: {e}")
        time.sleep(delay) # Wait before retrying
    print(f"Failed to download {url} after {retries} attempts.")


# Download all PDFs and convert to Word
print("üöÄ Starting download and conversion process...")
print("=" * 60)

for i, pdf_url in enumerate(pdf_links, 1):
    print(f"\n[{i}/{len(pdf_links)}] Processing: {pdf_url}")
    download_pdf(pdf_url)

print("\n" + "=" * 60)
print("üéâ All PDFs downloaded and converted successfully!")
print(f"üìÅ PDFs saved to: {download_folder}")
print(f"üìÅ Word documents saved to: {docx_folder}")

üöÄ Starting download and conversion process...

[1/6] Processing: https://misa.gov.sa/app/uploads/2025/07/Investor-Guide_12-04-ar.pdf
üì• Downloaded: Investor-Guide_12-04-ar.pdf
üîÑ Converting to Word with UTF-8 encoding...
‚úÖ Converted: Investor-Guide_12-04-ar.docx

[2/6] Processing: https://misa.gov.sa/app/uploads/2025/07/Investor-Guide_12-04-ar.pdf
‚è≠Ô∏è  Already converted: Investor-Guide_12-04-ar.docx

[3/6] Processing: https://misa.gov.sa/app/uploads/2025/05/investor-guide-12-03-ar.pdf
üì• Downloaded: Investor-Guide_12-04-ar.pdf
üîÑ Converting to Word with UTF-8 encoding...
‚úÖ Converted: Investor-Guide_12-04-ar.docx

[2/6] Processing: https://misa.gov.sa/app/uploads/2025/07/Investor-Guide_12-04-ar.pdf
‚è≠Ô∏è  Already converted: Investor-Guide_12-04-ar.docx

[3/6] Processing: https://misa.gov.sa/app/uploads/2025/05/investor-guide-12-03-ar.pdf
üì• Downloaded: investor-guide-12-03-ar.pdf
üîÑ Converting to Word with UTF-8 encoding...
üì• Downloaded: investor-guide-12-03-ar

In [2]:
# Advanced PDF Text Extraction with Format Preservation
# Supports Arabic & English, excludes images, preserves structure
# Enhanced with Arabic shaping, BiDi, and OCR fallback

import os
import re
import fitz  # PyMuPDF
from docx import Document
from docx.shared import Pt, RGBColor
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.oxml.ns import qn
from docx.oxml import OxmlElement
import subprocess
import tempfile
import shutil

# Arabic text handling
from bidi.algorithm import get_display
import arabic_reshaper

# Initialize Arabic reshaper for proper letter joining
RESHAPER = arabic_reshaper.ArabicReshaper()

# Configuration
pdf_folder = r"C:\Users\ali-d\Desktop\KSA\MoI_pdfs"
output_folder = r"C:\Users\ali-d\Desktop\KSA\MoI_extracted_text"
os.makedirs(output_folder, exist_ok=True)

# Extraction rules
EXTRACTION_RULES = {
    'include_images': False,  # Do not include image URLs/references
    'include_links': False,   # Do not include hyperlinks
    'preserve_formatting': True,  # Preserve bold, italic, font sizes
    'preserve_structure': True,   # Preserve paragraphs, lists, spacing
    'detect_language': True,      # Detect Arabic vs English
    'remove_headers_footers': True,  # Remove page headers/footers
    'min_text_length': 3,  # Minimum characters to consider as text
}

def is_arabic(text):
    """Check if text contains Arabic characters"""
    arabic_pattern = re.compile(r'[\u0600-\u06FF]')
    return bool(arabic_pattern.search(text))

def fix_arabic_shaping(text: str) -> str:
    """Shape Arabic letters and apply BiDi so words display correctly."""
    if not text:
        return text
    # Reshape (joins letters into contextual forms) then reorder for display
    try:
        reshaped = RESHAPER.reshape(text)
        return get_display(reshaped)
    except Exception as e:
        # Fallback to original text if shaping fails
        return text

def is_header_footer(block, page_height, threshold=50):
    """Detect if text block is likely a header or footer"""
    y0 = block[1]  # Top position
    y1 = block[3]  # Bottom position
    
    # Check if in top or bottom margin
    if y0 < threshold or y1 > (page_height - threshold):
        return True
    return False

def clean_extracted_text(text):
    """Clean extracted text with rules"""
    if not text:
        return ""
    
    # Remove control characters
    text = re.sub(r'[\x00-\x08\x0b-\x0c\x0e-\x1f\x7f-\x9f]', '', text)
    
    # Remove SOME invisible Unicode characters but keep RTL marks (U+200F, U+202B)
    # that help with rendering
    text = re.sub(r'[\u200b-\u200e\u2010-\u202a\u202c-\u202e\u2060-\u206f\ufeff]', '', text)
    
    # Fix common Arabic OCR issues
    replacements = {
        "ÿ£ŸÑŸä": "ÿ£Ÿä", "ŸÑÿ£ŸÑŸä": "ŸÑŸÑÿ£", "ÿ°ÿß": "ÿßÿ°", "Ÿâÿ°": "ÿ¶Ÿä",
        "ÿ©ÿßŸÑ": "ÿ© ÿßŸÑ", "ÿßÿßÿ°": "ÿßÿ°", "ÿßŸÑŸÖÿ°ÿ≥ÿ≥ÿßÿ™": "ÿßŸÑŸÖÿ§ÿ≥ÿ≥ÿßÿ™",
        "ŸÖÿ≥ÿ°ŸàŸÑŸäÿ©": "ŸÖÿ≥ÿ§ŸàŸÑŸäÿ©", "ÿ™ÿ°ÿØŸä": "ÿ™ÿ§ÿØŸä", "ÿßŸà": "ÿ£Ÿà",
        "ŸàÿßŸÑÿ•ÿ¨ÿ±ÿßÿ°ÿ™": "ŸàÿßŸÑÿ•ÿ¨ÿ±ÿßÿ°ÿßÿ™", "ÿ•ÿ¨ÿ±ÿßÿ°ÿ™": "ÿ•ÿ¨ÿ±ÿßÿ°ÿßÿ™"
    }
    
    for old, new in replacements.items():
        text = text.replace(old, new)
    
    # Normalize whitespace but preserve single spaces
    text = re.sub(r'[ \t]+', ' ', text)
    text = re.sub(r'\n\s*\n', '\n\n', text)  # Max 2 newlines
    
    return text.strip()

def is_pdf_text_based(pdf_path, sample_pages=3):
    """
    Check if PDF is text-based or image-based (scanned).
    Returns: (is_text_based: bool, text_ratio: float, details: str)
    """
    try:
        doc = fitz.open(pdf_path)
        total_pages = len(doc)
        pages_to_check = min(sample_pages, total_pages)
        
        text_chars = 0
        image_count = 0
        
        for page_num in range(pages_to_check):
            page = doc.load_page(page_num)
            
            # Count text characters
            text = page.get_text("text")
            text_chars += len(text.strip())
            
            # Count images
            image_list = page.get_images()
            image_count += len(image_list)
        
        doc.close()
        
        # Calculate average per page
        avg_text_per_page = text_chars / pages_to_check
        avg_images_per_page = image_count / pages_to_check
        
        # Determine if text-based
        # Text-based: has significant text (>100 chars/page) OR has text and few images
        is_text_based = avg_text_per_page > 100
        
        if is_text_based:
            details = f"‚úÖ Text-based PDF: ~{int(avg_text_per_page)} chars/page, {int(avg_images_per_page)} images/page"
        else:
            details = f"‚ùå Image-based PDF (scanned): ~{int(avg_text_per_page)} chars/page, {int(avg_images_per_page)} images/page"
        
        return is_text_based, avg_text_per_page, details
        
    except Exception as e:
        return False, 0, f"‚ùå Error checking PDF type: {e}"

def extract_text_from_pdf(pdf_path, rules=EXTRACTION_RULES):
    """
    Extract text from PDF with advanced formatting and rules
    Returns structured data with text blocks and metadata
    """
    print(f"\nüìñ Processing: {os.path.basename(pdf_path)}")
    
    try:
        doc = fitz.open(pdf_path)
        extracted_data = {
            'filename': os.path.basename(pdf_path),
            'pages': [],
            'total_pages': len(doc),
            'languages': set(),
            'statistics': {'arabic_blocks': 0, 'english_blocks': 0, 'mixed_blocks': 0}
        }
        
        for page_num in range(len(doc)):
            page = doc.load_page(page_num)
            page_data = {
                'page_number': page_num + 1,
                'blocks': []
            }
            
            # Get text blocks with position and formatting info
            # sort=True ‚Üí better logical reading order
            blocks = page.get_text("dict", flags=fitz.TEXT_PRESERVE_WHITESPACE, sort=True)
            
            # Get page dimensions for header/footer detection
            page_height = page.rect.height
            
            for block in blocks.get("blocks", []):
                # Skip images if rule is set
                if block.get("type") == 1 and not rules['include_images']:
                    continue
                
                # Process text blocks
                if block.get("type") == 0:  # Text block
                    block_bbox = block.get("bbox", [0, 0, 0, 0])
                    
                    # Skip headers/footers if rule is set
                    if rules['remove_headers_footers'] and is_header_footer(block_bbox, page_height):
                        continue
                    
                    # Extract text from lines - keep in LOGICAL order for storage
                    block_text = ""
                    for line in block.get("lines", []):
                        line_parts = []
                        for span in line.get("spans", []):
                            text = span.get("text", "")
                            
                            # Apply minimum length rule
                            if len(text.strip()) < rules['min_text_length']:
                                continue
                            
                            # Clean text first
                            text = clean_extracted_text(text)
                            
                            # DO NOT shape here - keep logical order
                            # We'll apply shaping when saving to TXT/DOCX
                            
                            if text:
                                line_parts.append(text)
                        
                        # Join the span texts back for that visual line
                        line_text = " ".join(p for p in line_parts if p.strip())
                        if line_text:
                            block_text += line_text + "\n"
                    
                    block_text = block_text.strip()
                    
                    if block_text:
                        # Detect language
                        has_arabic = is_arabic(block_text)
                        has_english = bool(re.search(r'[a-zA-Z]', block_text))
                        
                        if has_arabic and has_english:
                            language = "mixed"
                            extracted_data['statistics']['mixed_blocks'] += 1
                        elif has_arabic:
                            language = "arabic"
                            extracted_data['statistics']['arabic_blocks'] += 1
                        elif has_english:
                            language = "english"
                            extracted_data['statistics']['english_blocks'] += 1
                        else:
                            language = "unknown"
                        
                        extracted_data['languages'].add(language)
                        
                        page_data['blocks'].append({
                            'text': block_text,
                            'language': language,
                            'position': block_bbox
                        })
            
            if page_data['blocks']:
                extracted_data['pages'].append(page_data)
        
        doc.close()
        
        print(f"  ‚úÖ Extracted {len(extracted_data['pages'])} pages")
        print(f"  üìä Arabic blocks: {extracted_data['statistics']['arabic_blocks']}")
        print(f"  üìä English blocks: {extracted_data['statistics']['english_blocks']}")
        print(f"  üìä Mixed blocks: {extracted_data['statistics']['mixed_blocks']}")
        
        return extracted_data
        
    except Exception as e:
        print(f"  ‚ùå Error: {e}")
        return None

def save_to_txt(extracted_data, output_path):
    """Save extracted text to plain text file with visual Arabic shaping"""
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(f"Document: {extracted_data['filename']}\n")
        f.write(f"Total Pages: {extracted_data['total_pages']}\n")
        f.write(f"Languages: {', '.join(extracted_data['languages'])}\n")
        f.write("=" * 80 + "\n\n")
        
        for page_data in extracted_data['pages']:
            f.write(f"\n--- Page {page_data['page_number']} ---\n\n")
            
            for block in page_data['blocks']:
                # Apply visual reshaping for TXT output
                text = block['text']
                if block['language'] in ['arabic', 'mixed'] and is_arabic(text):
                    text = fix_arabic_shaping(text)
                f.write(text + "\n\n")

def set_run_rtl(run):
    """Set run-level RTL for proper Arabic rendering"""
    rPr = run._element.get_or_add_rPr()
    rtl = OxmlElement('w:rtl')
    rtl.set(qn('w:val'), '1')
    rPr.append(rtl)

def set_run_rtl_and_font(run, font_name="Arial"):
    """Set run-level RTL and complex-script fonts"""
    rPr = run._element.get_or_add_rPr()
    
    # Set RTL
    rtl = OxmlElement('w:rtl')
    rtl.set(qn('w:val'), '1')
    rPr.append(rtl)
    
    # Set complex-script fonts
    rFonts = rPr.find(qn('w:rFonts'))
    if rFonts is None:
        rFonts = OxmlElement('w:rFonts')
        rPr.append(rFonts)
    
    rFonts.set(qn('w:cs'), font_name)
    rFonts.set(qn('w:ascii'), font_name)
    rFonts.set(qn('w:hAnsi'), font_name)
    run.font.name = font_name

def set_paragraph_rtl(p):
    """Set paragraph-level RTL"""
    p.paragraph_format.alignment = WD_ALIGN_PARAGRAPH.RIGHT
    pPr = p._element.get_or_add_pPr()
    bidi = OxmlElement('w:bidi')
    bidi.set(qn('w:val'), '1')
    pPr.append(bidi)

def to_arabic_indic_digits(text):
    """Convert Western digits (0-9) to Arabic-Indic (Ÿ†-Ÿ©)"""
    trans = str.maketrans("0123456789", "Ÿ†Ÿ°Ÿ¢Ÿ£Ÿ§Ÿ•Ÿ¶ŸßŸ®Ÿ©")
    return text.translate(trans)

def save_to_docx(extracted_data, output_path):
    """
    Save extracted text to Word with proper RTL formatting.
    Uses same visual text as TXT (with arabic reshaping) + RTL formatting + Arabic digits.
    """
    doc = Document()
    
    # Add title
    title = doc.add_paragraph(extracted_data['filename'])
    title.style = 'Title'
    
    # Add metadata
    meta = doc.add_paragraph(f"Total Pages: {extracted_data['total_pages']} | Languages: {', '.join(extracted_data['languages'])}")
    meta.style = 'Subtitle'
    
    for page_data in extracted_data['pages']:
        # Page header
        page_header = doc.add_paragraph(f"Page {page_data['page_number']}")
        page_header.style = 'Heading 1'
        
        for block in page_data['blocks']:
            txt = block['text']
            
            if block['language'] in ['arabic', 'mixed']:
                # Apply visual reshaping (same as TXT)
                if is_arabic(txt):
                    txt = fix_arabic_shaping(txt)
                
                # Convert numbers to Arabic-Indic for proper alignment
                txt = to_arabic_indic_digits(txt)
                
                # Add paragraph with RTL formatting
                p = doc.add_paragraph(txt)
                set_paragraph_rtl(p)
                
                for run in p.runs:
                    set_run_rtl_and_font(run, 'Arial')
                    run.font.size = Pt(12)
            else:
                # English text: LTR formatting
                p = doc.add_paragraph(txt)
                p.paragraph_format.alignment = WD_ALIGN_PARAGRAPH.LEFT
                for run in p.runs:
                    run.font.name = 'Calibri'
                    run.font.size = Pt(11)
    
    doc.save(output_path)

def needs_ocr(extracted_data):
    """Check if PDF needs OCR fallback based on text quality"""
    arabic_chars = sum(
        ch >= '\u0600' and ch <= '\u06FF'
        for page in extracted_data['pages']
        for b in page['blocks'] 
        for ch in b['text']
    )
    total_chars = sum(
        len(b['text']) 
        for page in extracted_data['pages'] 
        for b in page['blocks']
    )
    
    # Heuristic: lots of Arabic but very little usable text ‚Üí try OCR
    return arabic_chars > 200 and total_chars < 500

def ocr_pdf(in_path):
    """Run OCR on PDF using OCRmyPDF with Arabic and English"""
    tmp = tempfile.mkdtemp()
    out_path = os.path.join(tmp, "ocr.pdf")
    
    try:
        cmd = [
            "ocrmypdf", 
            "-l", "ara+eng",  # Arabic + English
            "--rotate-pages",  # Auto-rotate pages
            "--deskew",        # Fix skewed scans
            "--clean",         # Clean background
            "--force-ocr",     # Force OCR even if text exists
            in_path, 
            out_path
        ]
        subprocess.run(cmd, check=True, capture_output=True, text=True)
        return out_path, tmp
    except subprocess.CalledProcessError as e:
        print(f"    ‚ö†Ô∏è  OCR failed: {e.stderr}")
        shutil.rmtree(tmp, ignore_errors=True)
        return None, None
    except FileNotFoundError:
        print("    ‚ö†Ô∏è  OCRmyPDF not found. Install with: pip install ocrmypdf")
        print("    ‚ö†Ô∏è  Also requires Tesseract: https://github.com/tesseract-ocr/tesseract")
        shutil.rmtree(tmp, ignore_errors=True)
        return None, None

# Process all PDFs in the folder
print("üöÄ Starting Advanced PDF Text Extraction")
print("=" * 80)
print(f"üìÅ Source folder: {pdf_folder}")
print(f"üìÅ Output folder: {output_folder}")
print("\n‚öôÔ∏è Extraction Rules:")
for key, value in EXTRACTION_RULES.items():
    print(f"   ‚Ä¢ {key}: {value}")
print("\nüìã Processing Steps:")
print("   1. Check if PDF is text-based or image-based (scanned)")
print("   2. Skip image-based PDFs (no extractable text)")
print("   3. Extract text from text-based PDFs")
print("   4. Apply OCR fallback if text quality is poor")
print("   5. Save as TXT and DOCX with proper Arabic formatting")
print("=" * 80)

pdf_files = [f for f in os.listdir(pdf_folder) if f.lower().endswith('.pdf')]

if not pdf_files:
    print("\n‚ùå No PDF files found in the folder!")
else:
    print(f"\nüìö Found {len(pdf_files)} PDF files\n")
    
    # Statistics tracking
    processed_count = 0
    skipped_count = 0
    failed_count = 0
    
    for i, pdf_file in enumerate(pdf_files, 1):
        print(f"\n[{i}/{len(pdf_files)}] {pdf_file}")
        
        pdf_path = os.path.join(pdf_folder, pdf_file)
        base_name = os.path.splitext(pdf_file)[0]
        
        # Step 1: Check if PDF is text-based or image-based
        print("  üîç Checking PDF type...")
        is_text_based, text_ratio, details = is_pdf_text_based(pdf_path, sample_pages=3)
        print(f"  {details}")
        
        # Skip image-based PDFs (scanned documents)
        if not is_text_based:
            print("  ‚è≠Ô∏è  Skipping: Image-based PDF (scanned document - no extractable text)")
            print("  üí° Tip: Use OCR tools to convert scanned PDFs to text-based format first")
            skipped_count += 1
            continue
        
        # Step 2: Extract text from text-based PDF
        extracted_data = extract_text_from_pdf(pdf_path, EXTRACTION_RULES)
        
        # Step 3: Check if OCR fallback is needed (for poor quality text)
        if extracted_data and needs_ocr(extracted_data):
            print("  üîÅ Text looks broken ‚Üí running OCR fallback (ara+eng)...")
            ocr_path, tmpdir = ocr_pdf(pdf_path)
            
            if ocr_path:
                # Re-extract from OCR'd PDF
                extracted_data = extract_text_from_pdf(ocr_path, EXTRACTION_RULES)
                shutil.rmtree(tmpdir, ignore_errors=True)
                print("  ‚úÖ OCR completed successfully")
        
        # Step 4: Save extracted text
        if extracted_data:
            try:
                # Save as TXT
                txt_path = os.path.join(output_folder, f"{base_name}.txt")
                save_to_txt(extracted_data, txt_path)
                print(f"  üíæ Saved TXT: {base_name}.txt")
                
                # Save as DOCX
                docx_path = os.path.join(output_folder, f"{base_name}.docx")
                save_to_docx(extracted_data, docx_path)
                print(f"  üíæ Saved DOCX: {base_name}.docx")
                
                processed_count += 1
            except Exception as e:
                print(f"  ‚ùå Failed to save: {e}")
                failed_count += 1
        else:
            print(f"  ‚ùå No text extracted")
            failed_count += 1

print("\n" + "=" * 80)
print("üéâ Extraction Complete!")
print(f"üìä Summary:")
print(f"   ‚Ä¢ Total PDFs found: {len(pdf_files)}")
print(f"   ‚Ä¢ Successfully processed: {processed_count}")
print(f"   ‚Ä¢ Skipped (image-based): {skipped_count}")
print(f"   ‚Ä¢ Failed: {failed_count}")
print(f"üìÅ All extracted files saved to: {output_folder}")

üöÄ Starting Advanced PDF Text Extraction
üìÅ Source folder: C:\Users\ali-d\Desktop\KSA\MoI_pdfs
üìÅ Output folder: C:\Users\ali-d\Desktop\KSA\MoI_extracted_text

‚öôÔ∏è Extraction Rules:
   ‚Ä¢ include_images: False
   ‚Ä¢ include_links: False
   ‚Ä¢ preserve_formatting: True
   ‚Ä¢ preserve_structure: True
   ‚Ä¢ detect_language: True
   ‚Ä¢ remove_headers_footers: True
   ‚Ä¢ min_text_length: 3

üìã Processing Steps:
   1. Check if PDF is text-based or image-based (scanned)
   2. Skip image-based PDFs (no extractable text)
   3. Extract text from text-based PDFs
   4. Apply OCR fallback if text quality is poor
   5. Save as TXT and DOCX with proper Arabic formatting

üìö Found 5 PDF files


[1/5] investor-guide-12-03-ar.pdf
  üîç Checking PDF type...
  ‚úÖ Text-based PDF: ~527 chars/page, 4 images/page

üìñ Processing: investor-guide-12-03-ar.pdf
  ‚úÖ Extracted 58 pages
  üìä Arabic blocks: 1242
  üìä English blocks: 0
  üìä Mixed blocks: 8
  üíæ Saved TXT: investor-guid