## 1. Setup: Clone Repository & Install Dependencies

In [None]:
import os
import subprocess
from pathlib import Path

# Create working directory
WORK_DIR = "/content/traditional_medicine_ocr"
Path(WORK_DIR).mkdir(exist_ok=True)
os.chdir(WORK_DIR)

print(f"Working directory: {WORK_DIR}")
print("\n" + "="*60)
print("CLONING REPOSITORY...")
print("="*60)

# Clone the repository
!git clone https://github.com/HuyTran28/23CLCT2_TraditionalMedicineChatbot.git repo

print("\nRepository cloned successfully!")

In [None]:
print("\n" + "="*60)
print("INSTALLING DEPENDENCIES (GPU OPTIMIZED)...")
print("="*60)

# Install pip packages with GPU support
!pip install --upgrade pip setuptools wheel -q

# Install core dependencies
!pip install -q numpy>=1.26.4
!pip install -q Pillow>=10.1.0,<11.0.0
!pip install -q matplotlib>=3.8.0
!pip install -q PyMuPDF>=1.23.8
!pip install -q python-docx>=1.1.0
!pip install -q pdf2docx>=0.5.8

# Install OCR and NLP dependencies
!pip install -q underthesea>=6.0.0
!pip install -q pyvi>=0.1.1
!pip install -q opencv-contrib-python>=4.8.0.74
!pip install -q opencv-python>=4.8.0.74

# Install marker-pdf (GPU enabled on Colab)
print("\nInstalling marker-pdf (this may take a few minutes)...")
!pip install -q marker-pdf>=0.2.0

# Verify GPU availability for marker-pdf
print("\nVerifying GPU availability...")
!nvidia-smi --query-gpu=name --format=csv,noheader

print("\nAll dependencies installed successfully!")

## 2. Setup Environment & Import Modules

In [None]:
import sys
import logging
from pathlib import Path
from typing import Optional, List, Dict, Any
import shutil
import json
from datetime import datetime
import concurrent.futures
from multiprocessing import cpu_count
import zipfile
import time

# Make the cloned OCR code importable. The repository root ends up first on
# sys.path, followed by its "ocr" sub-directory.
ocr_module_path = Path(WORK_DIR) / "repo" / "ocr"
sys.path[:0] = [str(ocr_module_path.parent), str(ocr_module_path)]

# Pipeline entry point provided by the cloned repository
from modules.pipeline import OCRPipeline

# Timestamped INFO-level logging for the whole notebook
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Working sub-directories under WORK_DIR; created if they do not exist yet
INPUT_DIR = Path(WORK_DIR) / "input"
OUTPUT_DIR = Path(WORK_DIR) / "output"
TEMP_DIR = Path(WORK_DIR) / "temp"

for _work_subdir in (INPUT_DIR, OUTPUT_DIR, TEMP_DIR):
    _work_subdir.mkdir(exist_ok=True)

# Report the resulting environment
print("Environment setup complete")
for _label, _value in (
    ("Input directory", INPUT_DIR),
    ("Output directory", OUTPUT_DIR),
    ("CPU Count", cpu_count()),
):
    print(f"{_label}: {_value}")

## 3. Upload Input Files

In [None]:
from google.colab import files

print("="*60)
print("UPLOAD PDF FILES")
print("="*60)
print("\nClick 'Choose Files' to select PDF(s) from your computer.")
print("You can upload multiple PDFs at once.")
print("\nSupported: Single or batch processing of PDFs (including ~300+ pages)\n")

# Mapping of uploaded file name -> file content (bytes)
uploaded_files = files.upload()

# Save uploaded PDFs into the input directory; anything else is skipped
uploaded_count = 0
for filename, data in uploaded_files.items():
    if not filename.lower().endswith('.pdf'):
        print(f"Skipped {filename} (not a PDF)")
        continue

    # The check above is case-insensitive, but later cells discover inputs
    # with the case-sensitive pattern "*.pdf". Store the file with a lowercase
    # extension so uploads such as "scan.PDF" are not silently ignored.
    input_path = INPUT_DIR / (filename[:-len('.pdf')] + '.pdf')
    input_path.write_bytes(data)

    file_size_mb = input_path.stat().st_size / (1024*1024)
    print(f"{filename} ({file_size_mb:.2f} MB) uploaded successfully")
    uploaded_count += 1

print(f"\nTotal PDFs uploaded: {uploaded_count}")

# List every PDF currently in the input directory (including earlier uploads)
pdf_files = sorted(INPUT_DIR.glob("*.pdf"))
print(f"\nPDF files ready for processing:")
for pdf in pdf_files:
    print(f"  - {pdf.name} ({pdf.stat().st_size / (1024*1024):.2f} MB)")

## 4. Configure Processing Parameters

In [None]:
# ============================================================
# PROCESSING CONFIGURATION
# ============================================================

# Worker count handed to the pipeline. Fixed at 4 here; cpu_count() is the
# alternative if the value should follow the machine.
NUM_WORKERS = 4

# Processing mode passed to the pipeline ("auto" enables auto-detection)
PROCESSING_MODE = "scan"

# PDF conversion settings
DPI = 300                    # resolution for PDF to image conversion
EXTRACT_IMAGES = True        # extract images from PDF
PRESERVE_LAYOUT = True       # preserve document layout/structure
EXTRACT_TABLES = True        # extract and process tables
ENABLE_PREPROCESSING = True  # preprocess scanned pages

# LLM correction
USE_LLM_CORRECTION = True

# Echo the effective configuration, one "label: value" line per setting
print("="*60)
print("PROCESSING CONFIGURATION")
print("="*60)
for _label, _value in (
    ("Number of Workers (CPU cores)", NUM_WORKERS),
    ("Processing Mode", PROCESSING_MODE),
    ("DPI", DPI),
    ("Extract Images", EXTRACT_IMAGES),
    ("Preserve Layout", PRESERVE_LAYOUT),
    ("Extract Tables", EXTRACT_TABLES),
    ("Enable Preprocessing", ENABLE_PREPROCESSING),
    ("LLM Correction", USE_LLM_CORRECTION),
):
    print(f"{_label}: {_value}")
print("="*60)

## 5. Initialize OCR Pipeline with Parallelism

In [None]:
print("\nInitializing OCR Pipeline...\n")

import os

# Progress-refresh intervals, read again when the progress helper is called.
# Values already present in the environment are kept.
os.environ.setdefault("TQDM_MININTERVAL", "1.0")   # seconds between refreshes
os.environ.setdefault("TQDM_MAX_INTERVAL", "5.0")  # cap refresh interval

class SimpleProgress:
    """Minimal progress-bar stand-in used when tqdm cannot be imported.

    Iterating yields the wrapped items unchanged and prints "<desc>: done"
    once the iterable is exhausted (only when a description was given).
    ``update`` and ``close`` exist so the object can be used where a progress
    bar is expected; both do nothing.

    Args:
        iterable: Items to iterate over; ``None`` is treated as an empty list.
        total: Expected item count. When omitted it is taken from
            ``len(iterable)`` if the iterable has a length, else ``None``.
        desc: Optional label printed when iteration finishes.
        **kwargs: Any further progress-bar options (for example ``leave`` or
            ``mininterval``). They are accepted and ignored so that call
            sites written for the tqdm-backed helper keep working when this
            fallback is active.
    """

    def __init__(self, iterable=None, total=None, desc=None, **kwargs):
        self.iterable = iterable if iterable is not None else []
        if total is None and hasattr(self.iterable, "__len__"):
            total = len(self.iterable)
        self.total = total
        self.desc = desc

    def __iter__(self):
        yield from self.iterable
        if self.desc:
            print(f"{self.desc}: done")

    def update(self, n=1):
        """No-op, present for progress-bar API compatibility."""

    def close(self):
        """No-op, present for progress-bar API compatibility."""

# Use tqdm when it can be imported; on any import-time failure fall back to
# the plain SimpleProgress wrapper.
try:
    from tqdm.auto import tqdm as _tqdm
except Exception:
    progress = SimpleProgress
else:
    def progress(iterable, **kwargs):
        """Wrap *iterable* in a tqdm bar using the notebook's default options.

        The refresh intervals are read from the TQDM_MININTERVAL and
        TQDM_MAX_INTERVAL environment variables on every call. Keyword
        arguments supplied by the caller override the defaults.
        """
        options = {
            "mininterval": float(os.environ.get("TQDM_MININTERVAL", "1.0")),
            "maxinterval": float(os.environ.get("TQDM_MAX_INTERVAL", "5.0")),
            "leave": False,
            "dynamic_ncols": True,
            "position": 0,
            "smoothing": 0.0,
            **kwargs,
        }
        return _tqdm(iterable, **options)

# Collect the constructor options from the configuration cell, then build
# the pipeline from them.
pipeline_options = dict(
    output_dir=str(OUTPUT_DIR),
    temp_dir=str(TEMP_DIR),
    dpi=DPI,
    enable_preprocessing=ENABLE_PREPROCESSING,
    auto_detect=(PROCESSING_MODE == "auto"),
    extract_images=EXTRACT_IMAGES,
    analyze_layout=PRESERVE_LAYOUT,
    extract_tables=EXTRACT_TABLES,
    use_llm_correction=USE_LLM_CORRECTION,
)
pipeline = OCRPipeline(**pipeline_options)

# The worker count is set as an attribute after construction
pipeline.max_workers = NUM_WORKERS

for _line in (
    f"Pipeline initialized with {NUM_WORKERS} workers",
    f"Output directory: {OUTPUT_DIR}",
    f"Temporary directory: {TEMP_DIR}",
    "\nReady to process PDFs!",
):
    print(_line)

## 6. Process PDFs with Progress Tracking

In [None]:
print("\n" + "="*60)
print("STARTING OCR PROCESSING")
print("="*60)

# Match the extension case-insensitively: the upload step accepts names such
# as "scan.PDF", which a plain "*.pdf" glob would skip on a case-sensitive
# filesystem.
pdf_files = sorted(
    p for p in INPUT_DIR.glob("*")
    if p.is_file() and p.suffix.lower() == ".pdf"
)

# Define every summary variable up front so the results-JSON cell can run
# even when there was nothing to process (previously a NameError).
total_files = len(pdf_files)
results = []
successful = 0
failed = 0
total_duration = 0.0

if not pdf_files:
    print("\nNo PDF files found in input directory!")
    print("Please upload PDFs first (see section 3).")
else:
    print(f"\nFound {total_files} PDF(s) to process")
    print(f"Processing Mode: {PROCESSING_MODE}")
    print(f"Using {NUM_WORKERS} parallel workers\n")

    # "auto" is expressed to the pipeline as mode=None
    mode = PROCESSING_MODE if PROCESSING_MODE != "auto" else None

    # Process with a single progress iterator to avoid duplicate bars
    start_time = time.time()

    for pdf_path in progress(pdf_files, total=total_files, desc="Processing PDFs"):
        file_start = time.time()
        try:
            # Path() accepts both str and Path, so either return style works
            # for the .stat()/.name accesses below.
            output_path = Path(pipeline.process_pdf(pdf_path, mode=mode))
            file_duration = time.time() - file_start
            output_size = output_path.stat().st_size / (1024*1024)
        except Exception as e:
            # One bad file must not abort the batch; log the traceback and
            # record the failure so it shows up in the summary.
            logger.exception("Failed to process %s", pdf_path.name)
            results.append({
                "input": pdf_path.name,
                "status": "failed",
                "error": str(e)
            })
        else:
            results.append({
                "input": pdf_path.name,
                "output": output_path.name,
                "status": "success",
                "duration_seconds": file_duration,
                "output_size_mb": output_size
            })

    total_duration = time.time() - start_time

    # Print summary
    print(f"\n\n" + "="*60)
    print("PROCESSING COMPLETE - SUMMARY")
    print("="*60)

    successful = sum(1 for r in results if r["status"] == "success")
    failed = sum(1 for r in results if r["status"] == "failed")

    print(f"\nTotal Files: {total_files}")
    print(f"Successful: {successful}")
    print(f"Failed: {failed}")
    print(f"Total Processing Time: {total_duration:.2f} seconds ({total_duration/60:.2f} minutes)")
    print(f"Average Time per File: {total_duration/total_files:.2f} seconds")

    # Show why each failed file failed instead of only counting it
    if failed:
        print("\nFailed files:")
        for r in results:
            if r["status"] == "failed":
                print(f"  - {r['input']}: {r['error']}")

In [None]:
def convert_datetime_to_iso(obj):
    """Return a copy of *obj* with every ``datetime`` replaced by its ISO string.

    Dicts, lists and tuples are traversed recursively (dict keys are left
    untouched) and rebuilt as the same container kind, so the result can be
    handed to ``json.dump``. Any other value is returned unchanged.

    Args:
        obj: A ``datetime``, a dict/list/tuple possibly containing datetimes,
            or any other value.

    Returns:
        The converted structure; ``datetime`` values become the string from
        ``datetime.isoformat()``.
    """
    if isinstance(obj, datetime):
        return obj.isoformat()
    if isinstance(obj, dict):
        return {k: convert_datetime_to_iso(v) for k, v in obj.items()}
    if isinstance(obj, list):
        return [convert_datetime_to_iso(elem) for elem in obj]
    if isinstance(obj, tuple):
        # Tuples are serialized by json as arrays too, so a datetime hidden
        # inside one would otherwise make json.dump raise TypeError.
        return tuple(convert_datetime_to_iso(elem) for elem in obj)
    return obj

# Pull the metrics summary from the pipeline when it exposes one
if hasattr(pipeline, 'metrics'):
    metrics = pipeline.metrics.get_metrics_summary()
else:
    metrics = {}

# datetime values are not JSON-serializable; turn them into ISO strings first
metrics_serializable = convert_datetime_to_iso(metrics)

# Write the run report next to the other outputs
results_json = OUTPUT_DIR / "processing_results.json"
with open(results_json, 'w', encoding='utf-8') as f:
    report = {
        "timestamp": datetime.now().isoformat(),
        "total_files": total_files,
        "successful": successful,
        "failed": failed,
        "total_duration_seconds": total_duration,
        "workers_used": NUM_WORKERS,
        "results": results,
        "metrics": metrics_serializable,
    }
    json.dump(report, f, indent=2, ensure_ascii=False)

print("\nResults saved to: processing_results.json")

## 7. Process Manually Corrected Markdown: Add Section Breaks & Convert to Word

After manually editing the markdown file, use this section to:
1. Add `</break>` tags at the end of each level-2 heading section
2. Convert the processed markdown to a Word document with proper formatting

In [None]:
from google.colab import files
import shutil

print("="*60)
print("UPLOAD MANUALLY CORRECTED MARKDOWN FILE")
print("="*60)
print("\nUpload your manually corrected .md file to:")
print("1. Add </break> tags at the end of level-2 heading sections")
print("2. Convert to Word document with proper formatting\n")

# Mapping of uploaded file name -> file content (bytes)
uploaded_md_manual = files.upload()

# Directory that holds the uploaded markdown files
MANUAL_MD_DIR = Path(WORK_DIR) / "manual_markdown"
MANUAL_MD_DIR.mkdir(exist_ok=True)

# Keep only ".md" uploads (case-insensitive); report everything else as skipped
manual_md_files = []
for filename, data in uploaded_md_manual.items():
    if not filename.lower().endswith('.md'):
        print(f"Skipped {filename} (not a markdown file)")
        continue

    md_path = MANUAL_MD_DIR / filename
    md_path.write_bytes(data)

    file_size_kb = md_path.stat().st_size / 1024
    print(f"{filename} ({file_size_kb:.2f} KB) uploaded successfully")
    manual_md_files.append(md_path)

print(f"\nTotal markdown files uploaded: {len(manual_md_files)}")

# Two passes over the uploaded markdown files:
#   1. insert section breaks and save "<name>_with_breaks.md"
#   2. convert each processed file to a Word document
if manual_md_files:
    # Initialize markdown processor
    from modules.markdown_processor import MarkdownProcessor
    md_processor = MarkdownProcessor(use_llm_correction=False)  # No LLM correction, just add breaks

    print("\n" + "="*60)
    print("PROCESSING MARKDOWN: ADDING SECTION BREAKS")
    print("="*60)

    # Create output directory
    PROCESSED_MD_DIR = OUTPUT_DIR / "processed_markdown"
    PROCESSED_MD_DIR.mkdir(exist_ok=True)

    processed_files = []

    for md_file in manual_md_files:
        print(f"\nProcessing: {md_file.name}")

        try:
            markdown_content = md_file.read_text(encoding='utf-8')

            # Insert section breaks
            processed_content = md_processor.insert_section_breaks(markdown_content)

            # Save processed markdown
            output_filename = md_file.stem + "_with_breaks.md"
            output_md_path = PROCESSED_MD_DIR / output_filename
            output_md_path.write_text(processed_content, encoding='utf-8')

            print(f"Added </break> tags")
            print(f"  Saved to: {output_filename}")

            processed_files.append(output_md_path)

        except Exception as e:
            # Keep going with the remaining files; this one is simply not
            # passed on to the Word conversion below.
            print(f"Error processing {md_file.name}: {e}")

    print(f"\n{'='*60}")
    print(f"Processed {len(processed_files)} markdown file(s)")
    print(f"Output directory: {PROCESSED_MD_DIR}")

    # Now convert processed markdown to Word
    print("\n" + "="*60)
    print("CONVERTING PROCESSED MARKDOWN TO WORD")
    print("="*60)

    from modules.exporter import WordExporter
    exporter = WordExporter()

    # Create output directory for Word files
    WORD_OUTPUT_DIR = OUTPUT_DIR / "manual_word_output"
    WORD_OUTPUT_DIR.mkdir(exist_ok=True)

    conversion_results = []

    for md_file in processed_files:
        print(f"\nConverting: {md_file.name}")

        try:
            markdown_content = md_file.read_text(encoding='utf-8')

            # Output path: same base name without the "_with_breaks" marker
            output_filename = md_file.stem.replace('_with_breaks', '') + ".docx"
            output_path = WORD_OUTPUT_DIR / output_filename

            # Look for associated images, matched by the original PDF name
            # ("<pdf_name>_img_*.png" in the extracted-images directory)
            pdf_name = md_file.stem.replace('_ocr_results_with_breaks', '')
            possible_image_dirs = [
                OUTPUT_DIR / "extracted_images",
            ]

            images_list = []
            for img_dir in possible_image_dirs:
                if img_dir.exists():
                    print(f"  Found image directory: {img_dir}")
                    for img_file in sorted(img_dir.glob(f"{pdf_name}_img_*.png")):
                        # NOTE(review): width/height are fixed placeholders,
                        # not read from the image; confirm the exporter does
                        # not rely on them being accurate.
                        images_list.append({
                            'image_id': img_file.stem,
                            'file_path': str(img_file),
                            'width': 800,
                            'height': 600
                        })
                    if images_list:
                        print(f"  Loaded {len(images_list)} images")
                        break

            # Convert markdown to Word
            exporter.markdown_to_word(
                markdown_content,
                output_path=str(output_path),
                images=images_list if images_list else None
            )

            output_size_kb = output_path.stat().st_size / 1024

            print(f"SUCCESS: {output_filename}")
            print(f"  Size: {output_size_kb:.2f} KB")
            print(f"  Images: {len(images_list)}")

            conversion_results.append({
                "input": md_file.name,
                "output": output_filename,
                "status": "success",
                "size_kb": output_size_kb,
                "images_count": len(images_list)
            })

        except Exception as e:
            # The message previously started with mis-encoded bytes; plain
            # ASCII matches the "SUCCESS:" line above.
            print(f"ERROR: {e}")
            conversion_results.append({
                "input": md_file.name,
                "status": "failed",
                "error": str(e)
            })

    # Print final summary
    print(f"\n\n{'='*60}")
    print("PROCESSING COMPLETE - SUMMARY")
    print("="*60)

    # Dedicated names so the OCR run's `successful` / `failed` counters, which
    # the results-JSON cell reads, are not overwritten by this section.
    word_successful = sum(1 for r in conversion_results if r["status"] == "success")
    word_failed = sum(1 for r in conversion_results if r["status"] == "failed")

    print(f"\nMarkdown Files Processed: {len(manual_md_files)}")
    print(f"Word Conversions Successful: {word_successful}")
    print(f"Word Conversions Failed: {word_failed}")

    if word_successful > 0:
        print(f"\nOutput files:")
        print(f"  - Processed Markdown: {PROCESSED_MD_DIR}")
        print(f"  - Word Documents: {WORD_OUTPUT_DIR}")
        print("\nGenerated files:")
        for result in conversion_results:
            if result["status"] == "success":
                print(f"  - {result['output']} ({result['size_kb']:.1f} KB, {result['images_count']} images)")
else:
    print("\nNo markdown files uploaded. Please upload files first.")

## 9. Download All Output Files as ZIP

Download the entire output directory (including all results, converted files, images, etc.) as a single ZIP archive.

In [None]:
from google.colab import files
import zipfile
import os

print("\n" + "="*60)
print("ZIPPING OUTPUT DIRECTORY")
print("="*60)

# The archive sits next to the output directory, not inside it
zip_path = f"{OUTPUT_DIR}.zip"

# Start from a clean slate if an earlier archive is still around
if os.path.exists(zip_path):
    os.remove(zip_path)

# Add every file below OUTPUT_DIR, stored under its path relative to OUTPUT_DIR
with zipfile.ZipFile(zip_path, mode='w', compression=zipfile.ZIP_DEFLATED) as zipf:
    for folder, _subdirs, names in os.walk(OUTPUT_DIR):
        for name in names:
            source = os.path.join(folder, name)
            member = os.path.relpath(source, OUTPUT_DIR)
            zipf.write(source, arcname=member)
            print(f"Added: {member}")

zip_size_mb = os.path.getsize(zip_path) / (1024*1024)
print(f"\nZIP archive created: {zip_path} ({zip_size_mb:.2f} MB)")
print("Downloading...")
files.download(zip_path)
print("Download started! Check your browser's download folder.")

## 10. Cleanup (Optional)

In [None]:
print("\n" + "="*60)
print("CLEANUP OPTIONS")
print("="*60)

def cleanup_directory(directory: Path, description: str):
    """Delete *directory* recursively and report how much space was freed.

    Args:
        directory: Directory to remove.
        description: Human-readable label used in the printed message.

    Prints "<description> not found" and does nothing else when the
    directory does not exist.
    """
    if not directory.exists():
        print(f"{description} not found")
        return

    # Measure before deleting: add up the size of every regular file below
    # the directory.
    total_size = 0
    for entry in directory.rglob('*'):
        if entry.is_file():
            total_size += entry.stat().st_size

    shutil.rmtree(directory)
    size_mb = total_size / (1024*1024)
    print(f"Cleaned up {description}: freed {size_mb:.2f} MB")

# Removes temporary files. This call is active: comment it out to keep them.
cleanup_directory(TEMP_DIR, "Temporary files")

# Removes the uploaded input files. This call is active: comment it out to
# keep them (e.g. if the PDFs should be processed again).
cleanup_directory(INPUT_DIR, "Input files")

# Uncomment to cleanup everything (keep this commented unless you're sure!)
# cleanup_directory(OUTPUT_DIR, "Output files")
# shutil.rmtree(Path(WORK_DIR) / "repo")