## 1. Setup: Clone Repository & Install Dependencies

In [None]:
import os
import subprocess
from pathlib import Path

# Create working directory
WORK_DIR = "/content/traditional_medicine_ocr"
Path(WORK_DIR).mkdir(exist_ok=True)
os.chdir(WORK_DIR)

print(f"Working directory: {WORK_DIR}")
print("\n" + "="*60)
print("CLONING REPOSITORY...")
print("="*60)

# Clone the repository
!git clone https://github.com/HuyTran28/23CLCT2_TraditionalMedicineChatbot.git repo

print("\nRepository cloned successfully!")

In [None]:
print("\n" + "="*60)
print("INSTALLING DEPENDENCIES (GPU OPTIMIZED)...")
print("="*60)

# Install pip packages with GPU support
!pip install --upgrade pip setuptools wheel -q

# Install core dependencies
!pip install -q numpy>=1.26.4
!pip install -q Pillow>=10.1.0,<11.0.0
!pip install -q matplotlib>=3.8.0
!pip install -q PyMuPDF>=1.23.8
!pip install -q python-docx>=1.1.0
!pip install -q pdf2docx>=0.5.8

# Install OCR and NLP dependencies
!pip install -q underthesea>=6.0.0
!pip install -q pyvi>=0.1.1
!pip install -q opencv-contrib-python>=4.8.0.74
!pip install -q opencv-python>=4.8.0.74

# Install marker-pdf (GPU enabled on Colab)
print("\nInstalling marker-pdf (this may take a few minutes)...")
!pip install -q marker-pdf>=0.2.0

# Verify GPU availability for marker-pdf
print("\nVerifying GPU availability...")
!nvidia-smi --query-gpu=name --format=csv,noheader

print("\nAll dependencies installed successfully!")

## 2. Setup Environment & Import Modules

In [None]:
import sys
import logging
from pathlib import Path
from typing import Optional, List, Dict, Any
import shutil
import json
from datetime import datetime
import concurrent.futures
from multiprocessing import cpu_count
import zipfile
import time

# Make the cloned project importable. Each path is inserted at index 0, so the
# repository root (inserted last) ends up ahead of the `ocr` folder.
ocr_module_path = Path(WORK_DIR) / "repo" / "ocr"
for import_root in (ocr_module_path, ocr_module_path.parent):
    sys.path.insert(0, str(import_root))

# Must run after the sys.path changes above (presumably `modules` lives in repo/ocr)
from modules.pipeline import OCRPipeline

# Timestamped INFO-level logging for the rest of the notebook
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Working folders for uploads, results, and scratch files
INPUT_DIR, OUTPUT_DIR, TEMP_DIR = (
    Path(WORK_DIR) / folder_name for folder_name in ("input", "output", "temp")
)
for work_subdir in (INPUT_DIR, OUTPUT_DIR, TEMP_DIR):
    work_subdir.mkdir(exist_ok=True)

print("Environment setup complete")
print(f"Input directory: {INPUT_DIR}")
print(f"Output directory: {OUTPUT_DIR}")
print(f"CPU Count: {cpu_count()}")

## 3. Upload Input Files

In [None]:
from google.colab import files

upload_rule = "="*60
print(upload_rule)
print("UPLOAD PDF FILES")
print(upload_rule)
print("\nClick 'Choose Files' to select PDF(s) from your computer.")
print("You can upload multiple PDFs at once.")
print("\nSupported: Single or batch processing of PDFs (including ~300+ pages)\n")

# Mapping of uploaded file name -> file content
uploaded_files = files.upload()

# Write every uploaded PDF into the input directory; anything else is skipped
uploaded_count = 0
for filename, data in uploaded_files.items():
    if not filename.lower().endswith('.pdf'):
        print(f"Skipped {filename} (not a PDF)")
        continue

    input_path = INPUT_DIR / filename
    input_path.write_bytes(data)
    file_size_mb = input_path.stat().st_size / (1024*1024)
    print(f"{filename} ({file_size_mb:.2f} MB) uploaded successfully")
    uploaded_count += 1

print(f"\nTotal PDFs uploaded: {uploaded_count}")

# Report everything now sitting in the input directory (including earlier uploads)
pdf_files = sorted(INPUT_DIR.glob("*.pdf"))
print("\nPDF files ready for processing:")
for pdf in pdf_files:
    size_mb = pdf.stat().st_size / (1024*1024)
    print(f"  - {pdf.name} ({size_mb:.2f} MB)")

## 4. Configure Processing Parameters

In [None]:
# ============================================================
# PROCESSING CONFIGURATION
# ============================================================

# Worker count handed to the pipeline in the initialization cell.
# Alternative: NUM_WORKERS = cpu_count()
NUM_WORKERS = 4

# "auto" enables the pipeline's auto-detection; any other value is passed
# through as the explicit processing mode.
PROCESSING_MODE = "scan"

# PDF conversion settings
DPI = 300                    # Resolution for PDF to image conversion
EXTRACT_IMAGES = True        # Extract images from PDF
PRESERVE_LAYOUT = True       # Preserve document layout/structure
EXTRACT_TABLES = True        # Extract and process tables
ENABLE_PREPROCESSING = True  # Preprocess scanned pages

# LLM correction
USE_LLM_CORRECTION = True

# Echo the effective configuration, one "label: value" line per setting
config_rule = "="*60
print(config_rule)
print("PROCESSING CONFIGURATION")
print(config_rule)
for setting_label, setting_value in (
    ("Number of Workers (CPU cores)", NUM_WORKERS),
    ("Processing Mode", PROCESSING_MODE),
    ("DPI", DPI),
    ("Extract Images", EXTRACT_IMAGES),
    ("Preserve Layout", PRESERVE_LAYOUT),
    ("Extract Tables", EXTRACT_TABLES),
    ("Enable Preprocessing", ENABLE_PREPROCESSING),
    ("LLM Correction", USE_LLM_CORRECTION),
):
    print(f"{setting_label}: {setting_value}")
print(config_rule)

## 5. Initialize OCR Pipeline with Parallelism

In [None]:
print("\nInitializing OCR Pipeline...\n")

# Collect the constructor arguments from the configuration cell
pipeline_options = {
    "output_dir": str(OUTPUT_DIR),
    "temp_dir": str(TEMP_DIR),
    "dpi": DPI,
    "enable_preprocessing": ENABLE_PREPROCESSING,
    # Auto-detection is only switched on for the "auto" processing mode
    "auto_detect": PROCESSING_MODE == "auto",
    "extract_images": EXTRACT_IMAGES,
    "analyze_layout": PRESERVE_LAYOUT,
    "extract_tables": EXTRACT_TABLES,
    "use_llm_correction": USE_LLM_CORRECTION,
}
pipeline = OCRPipeline(**pipeline_options)

# The worker count is set as an attribute after construction
pipeline.max_workers = NUM_WORKERS

print(f"Pipeline initialized with {NUM_WORKERS} workers")
print(f"Output directory: {OUTPUT_DIR}")
print(f"Temporary directory: {TEMP_DIR}")
print("\nReady to process PDFs!")

## 6. Process PDFs with Progress Tracking

In [None]:
rule = "="*60
print("\n" + rule)
print("STARTING OCR PROCESSING")
print(rule)

pdf_files = sorted(INPUT_DIR.glob("*.pdf"))

if pdf_files:
    total_files = len(pdf_files)
    print(f"\nFound {total_files} PDF(s) to process")
    print(f"Processing mode: {PROCESSING_MODE}")
    print(f"Using {NUM_WORKERS} parallel workers\n")

    # "auto" means: pass no explicit mode to the pipeline
    mode_override = None if PROCESSING_MODE == "auto" else PROCESSING_MODE

    # Files are handled one after another in this loop; one result record
    # (success or failure) is collected per input file.
    start_time = time.time()
    results = []

    for idx, pdf_path in enumerate(pdf_files, 1):
        print("\n" + rule)
        print(f"Processing file {idx}/{total_files}: {pdf_path.name}")
        print(rule)
        print(f"File size: {pdf_path.stat().st_size / (1024*1024):.2f} MB")

        try:
            file_start = time.time()
            output_path = pipeline.process_pdf(pdf_path, mode=mode_override)
            file_duration = time.time() - file_start

            # Size of the produced file, in MB
            output_size = output_path.stat().st_size / (1024*1024)

            print(f"\nSUCCESS: {output_path.name}")
            print(f"Output size: {output_size:.2f} MB")
            print(f"Processing time: {file_duration:.2f} seconds")

            results.append({
                "input": pdf_path.name,
                "output": output_path.name,
                "status": "success",
                "duration_seconds": file_duration,
                "output_size_mb": output_size
            })

        except Exception as e:
            # A failing file is recorded and the batch continues with the next one
            error_msg = str(e)
            print(f"\nERROR: {error_msg}")

            results.append({
                "input": pdf_path.name,
                "status": "failed",
                "error": error_msg
            })

    total_duration = time.time() - start_time

    # Summary
    print("\n\n" + rule)
    print("PROCESSING COMPLETE - SUMMARY")
    print(rule)

    successful = sum(record["status"] == "success" for record in results)
    failed = sum(record["status"] == "failed" for record in results)

    print(f"\nTotal Files: {total_files}")
    print(f"Successful: {successful}")
    print(f"Failed: {failed}")
    print(f"Total Processing Time: {total_duration:.2f} seconds ({total_duration/60:.2f} minutes)")
    print(f"Average Time per File: {total_duration/total_files:.2f} seconds")

    # Pipeline metrics are optional: fall back to an empty dict when absent
    if hasattr(pipeline, 'metrics'):
        metrics = pipeline.metrics.get_metrics_summary()
    else:
        metrics = {}

    # Persist the run report next to the outputs
    results_json = OUTPUT_DIR / "processing_results.json"
    with open(results_json, 'w', encoding='utf-8') as report_file:
        run_report = {
            "timestamp": datetime.now().isoformat(),
            "total_files": total_files,
            "successful": successful,
            "failed": failed,
            "total_duration_seconds": total_duration,
            "workers_used": NUM_WORKERS,
            "results": results,
            "metrics": metrics
        }
        json.dump(run_report, report_file, indent=2, ensure_ascii=False)

    print("\nResults saved to: processing_results.json")
else:
    print("\nNo PDF files found in input directory!")
    print("Please upload PDFs first (see section 3).")

## 7. Manual Post-Processing: Upload Corrected Markdown

After reviewing and manually correcting the markdown output, upload your corrected `.md` file here to convert it to a Word document.

In [None]:
from google.colab import files
import shutil

md_upload_rule = "="*60
print(md_upload_rule)
print("UPLOAD CORRECTED MARKDOWN FILE")
print(md_upload_rule)
print("\nUpload your manually corrected .md file for conversion to Word.\n")

# Mapping of uploaded file name -> file content
uploaded_md = files.upload()

# Folder that keeps the uploaded markdown files
MARKDOWN_DIR = Path(WORK_DIR) / "markdown_uploads"
MARKDOWN_DIR.mkdir(exist_ok=True)

# Paths of the accepted .md files; consumed by the conversion cell
md_files = []
for filename, data in uploaded_md.items():
    if not filename.lower().endswith('.md'):
        print(f"Skipped {filename} (not a markdown file)")
        continue

    md_path = MARKDOWN_DIR / filename
    md_path.write_bytes(data)
    file_size_kb = md_path.stat().st_size / 1024
    print(f"{filename} ({file_size_kb:.2f} KB) uploaded successfully")
    md_files.append(md_path)

print(f"\nTotal markdown files uploaded: {len(md_files)}")

## 8. Convert Markdown to Word Document

Convert the uploaded markdown files to professional Word documents with proper formatting.

In [None]:
conv_rule = "="*60
print("\n" + conv_rule)
print("CONVERTING MARKDOWN TO WORD")
print(conv_rule)

if md_files:
    # Converted documents go into their own folder under the output directory
    CONVERTED_DIR = OUTPUT_DIR / "converted_markdown"
    CONVERTED_DIR.mkdir(exist_ok=True)

    # Imported here so the exporter is only loaded when there is work to do
    from modules.exporter import WordExporter
    exporter = WordExporter()

    conversion_results = []

    for md_file in md_files:
        print("\n" + conv_rule)
        print(f"Converting: {md_file.name}")
        print(conv_rule)

        try:
            markdown_content = md_file.read_text(encoding='utf-8')

            output_filename = md_file.stem + ".docx"
            output_path = CONVERTED_DIR / output_filename

            # Candidate image folders from the original OCR run: first a folder
            # named after the document (editing suffixes stripped), then the
            # shared extracted_images folder.
            pdf_name = md_file.stem.replace('_corrected', '').replace('_edited', '')
            possible_image_dirs = [
                OUTPUT_DIR / "extracted_images" / pdf_name,
                OUTPUT_DIR / "extracted_images",
            ]

            # Use the first candidate folder that actually yields images
            images_list = []
            for img_dir in possible_image_dirs:
                if not img_dir.exists():
                    continue
                print(f"Found image directory: {img_dir}")
                images_list.extend(
                    {
                        'image_id': img_file.stem,
                        'file_path': str(img_file),
                        'width': 800,   # Default width
                        'height': 600,  # Default height
                    }
                    for img_file in sorted(img_dir.glob("img_*.png"))
                )
                if images_list:
                    print(f"Loaded {len(images_list)} images")
                    break

            print("Converting to Word document...")
            exporter.markdown_to_word(
                markdown_content,
                output_path=str(output_path),
                images=images_list or None,  # None when no images were found
            )

            output_size_kb = output_path.stat().st_size / 1024

            print(f"\nSUCCESS: {output_filename}")
            print(f"  Output: {output_path}")
            print(f"  Size: {output_size_kb:.2f} KB")
            print(f"  Images included: {len(images_list)}")

            conversion_results.append({
                "input": md_file.name,
                "output": output_filename,
                "status": "success",
                "size_kb": output_size_kb,
                "images_count": len(images_list)
            })

        except Exception as e:
            # Record the failure and carry on with the remaining files
            error_msg = str(e)
            print(f"\nERROR: {error_msg}")
            conversion_results.append({
                "input": md_file.name,
                "status": "failed",
                "error": error_msg
            })

    # Summary
    print("\n\n" + conv_rule)
    print("CONVERSION COMPLETE - SUMMARY")
    print(conv_rule)

    successful = sum(entry["status"] == "success" for entry in conversion_results)
    failed = sum(entry["status"] == "failed" for entry in conversion_results)

    print(f"\nTotal Files: {len(md_files)}")
    print(f"Successful: {successful}")
    print(f"Failed: {failed}")

    if successful > 0:
        print(f"\nConverted files saved to: {CONVERTED_DIR}")
        print("\nConverted files:")
        for result in conversion_results:
            if result["status"] != "success":
                continue
            print(f"  - {result['output']} ({result['size_kb']:.1f} KB, {result['images_count']} images)")

    # Persist the conversion report alongside the converted documents
    results_json = CONVERTED_DIR / "conversion_results.json"
    with open(results_json, 'w', encoding='utf-8') as report_file:
        conversion_report = {
            "timestamp": datetime.now().isoformat(),
            "total_files": len(md_files),
            "successful": successful,
            "failed": failed,
            "results": conversion_results
        }
        json.dump(conversion_report, report_file, indent=2, ensure_ascii=False)
else:
    print("\nNo markdown files to convert. Please upload files first (see section 7).")

## 9. Download All Output Files as ZIP

Download the entire output directory (including all results, converted files, images, etc.) as a single ZIP archive.

In [None]:
from google.colab import files
import zipfile
import os

zip_rule = "="*60
print("\n" + zip_rule)
print("ZIPPING OUTPUT DIRECTORY")
print(zip_rule)

# The archive is written next to the output directory, not inside it
zip_path = f"{OUTPUT_DIR}.zip"

# Start from a clean slate when an earlier archive is still around
if os.path.exists(zip_path):
    os.remove(zip_path)

with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
    for folder, _subfolders, files_in_dir in os.walk(OUTPUT_DIR):
        for entry_name in files_in_dir:
            abs_path = os.path.join(folder, entry_name)
            # Store entries relative to the output directory root
            rel_path = os.path.relpath(abs_path, OUTPUT_DIR)
            zipf.write(abs_path, arcname=rel_path)
            print(f"Added: {rel_path}")

zip_size_mb = os.path.getsize(zip_path) / (1024*1024)
print(f"\nZIP archive created: {zip_path} ({zip_size_mb:.2f} MB)")
print("Downloading...")
files.download(zip_path)
print("Download started! Check your browser's download folder.")

## 10. Cleanup (Optional)

In [None]:
cleanup_rule = "="*60
print("\n" + cleanup_rule)
print("CLEANUP OPTIONS")
print(cleanup_rule)

def cleanup_directory(directory: Path, description: str):
    """Delete *directory* recursively and report how much space was freed.

    Args:
        directory: Folder to remove, including everything below it.
        description: Human-readable label used in the printed message.

    Prints the freed size in MB, or a "not found" notice when the path does
    not exist. Returns nothing.
    """
    if not directory.exists():
        print(f"{description} not found")
        return

    # Add up the regular files before the tree is removed
    freed_bytes = 0
    for entry in directory.rglob('*'):
        if entry.is_file():
            freed_bytes += entry.stat().st_size

    shutil.rmtree(directory)
    print(f"Cleaned up {description}: freed {freed_bytes / (1024*1024):.2f} MB")

# Deletes the temporary working directory.
# NOTE: this call is ACTIVE - comment it out if you want to keep the temporary files.
cleanup_directory(TEMP_DIR, "Temporary files")

# Deletes the input directory, i.e. the uploaded PDFs (run only after processing).
# NOTE: this call is ACTIVE - comment it out if you want to keep the input files.
cleanup_directory(INPUT_DIR, "Input files")

# Uncomment to cleanup everything (keep this commented unless you're sure!)
# cleanup_directory(OUTPUT_DIR, "Output files")
# shutil.rmtree(Path(WORK_DIR) / "repo")