## 1. Setup Environment

Check the runtime (Colab vs. local, GPU availability) and install the required dependencies. Run these cells once per session.

In [None]:
# Detect whether this notebook is running on Google Colab.
# Only ImportError means "not on Colab"; a bare `except:` would also hide
# KeyboardInterrupt and unrelated failures raised while importing.
try:
    import google.colab  # imported only to probe for the Colab runtime
    IN_COLAB = True
    print("Running on Google Colab")
except ImportError:
    IN_COLAB = False
    print("Running locally")

# Report GPU availability.
# torch may not be installed yet on a fresh local kernel (the install cell
# comes after this one), so a missing package is reported instead of raising.
try:
    import torch
except ImportError:
    print("PyTorch is not installed yet - run the install cell below, then re-run this cell")
else:
    if torch.cuda.is_available():
        print(f"GPU available: {torch.cuda.get_device_name(0)}")
        print(f"  CUDA version: {torch.version.cuda}")
    else:
        print("No GPU detected - will use CPU (slower)")

In [None]:
%%capture
import sys
def is_colab():
    try:
        import google.colab
        return True
    except ImportError:
        return False
if is_colab():
    import os
    if 'COLAB_GPU' in os.environ:
        !pip install -q paddlepaddle-gpu==2.6.2.post120 paddleocr==2.6.1.3
    else:
        !pip install -q paddlepaddle==2.6.2 paddleocr==2.6.1.3
    !pip install -q torch torchvision numpy Pillow lmdb vietocr
    !pip install -q pdf2docx PyMuPDF python-docx
    !pip install -q opencv-python opencv-contrib-python transformers matplotlib craft-text-detector
else:
    !pip install -r requirements.txt

## 2. Clone Repository (Colab only)

If running on Colab, clone the repository to access the modules

In [None]:
if IN_COLAB:
    # Clone repository
    !git clone https://github.com/HuyTran28/23CLCT2_TraditionalMedicineChatbot.git
    
    # Change to OCR directory
    import os
    os.chdir('/content/23CLCT2_TraditionalMedicineChatbot/ocr')
    print("Repository cloned and working directory set")
else:
    print("Skipping - not on Colab")
    print("   Place your PDF files in the 'input' folder")

## 3. Upload Input Files (Colab only)

Upload the PDF files you want to process.

In [None]:
if IN_COLAB:
    # Imports are grouped at the top of the branch instead of being repeated
    # on every loop iteration.
    import shutil
    from pathlib import Path
    from google.colab import files

    # Create input directory (no error if it already exists)
    input_dir = Path('./input')
    input_dir.mkdir(exist_ok=True)

    print("Upload your PDF files:")
    # files.upload() returns a mapping keyed by uploaded file name
    uploaded = files.upload()

    # Move each uploaded file from the working directory into the input folder
    for filename in uploaded:
        shutil.move(filename, str(input_dir / filename))
else:
    print("Skipping - not on Colab")
    print("   Place your PDF files in the 'input' folder")

## 4. Initialize Pipeline

Set up the OCR pipeline with your preferred settings

In [None]:
import sys
import logging
from pathlib import Path

# Add modules to path; guarded so re-running the cell does not keep
# prepending duplicate entries.
if './modules' not in sys.path:
    sys.path.insert(0, './modules')

# Setup logging.
# force=True replaces any handlers already attached to the root logger;
# without it basicConfig() silently does nothing when a handler already exists
# (as notebook front-ends may have installed one).
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    force=True,
)

# Project import: requires the working directory to be the repository's OCR
# folder so that the `modules` package is importable.
from modules.pipeline import OCRPipeline

print("Modules imported successfully")

In [None]:
# Settings a reader may want to tune before running the pipeline.
CONFIG = dict(
    input_dir='./input',            # Input folder containing PDFs
    output_dir='./output',          # Output folder for Word files
    temp_dir='./temp',              # Temporary files directory
    dpi=300,                        # Resolution for image conversion
    enable_preprocessing=True,      # Enable image enhancement
    auto_detect=True,               # Auto-detect digital vs scanned
    max_workers=None,               # Number of worker threads for pages (None=CPU count)
)

# Build the pipeline from the subset of settings its constructor is given.
pipeline = OCRPipeline(**{
    option: CONFIG[option]
    for option in ('output_dir', 'temp_dir', 'dpi', 'enable_preprocessing', 'auto_detect')
})
# The worker count is assigned as an attribute after construction
# (keeps notebook compatibility).
pipeline.max_workers = CONFIG.get('max_workers')

# Echo the effective configuration, one setting per line.
print("Pipeline initialized with configuration:")
print("\n".join(f"  {name}: {setting}" for name, setting in CONFIG.items()))

## 5. Process Single File

Process a single PDF file

In [None]:
# Specify the PDF file to process
pdf_file = "sample.pdf"  # Change this to your filename

# Resolve the file inside the configured input folder
input_path = Path(CONFIG['input_dir']) / pdf_file

if not input_path.exists():
    # Help the user pick a valid name by listing the PDFs that are present.
    print(f"File not found: {input_path}")
    print(f"Available PDF files in {CONFIG['input_dir']}:")
    for f in Path(CONFIG['input_dir']).glob('*.pdf'):
        print(f"  - {f.name}")
else:
    print(f"Processing: {pdf_file}")
    print("=" * 60)

    try:
        # NOTE(review): mode=None presumably defers the choice of processing
        # mode to the pipeline -- confirm against OCRPipeline.process_pdf.
        output_path = pipeline.process_pdf(input_path, mode=None)
        print("\n" + "=" * 60)
        print(f"Success! Output saved to: {output_path}")
    except Exception as e:
        # Report the failure with a full traceback instead of aborting the
        # notebook run.
        print(f"\nError: {e}")
        import traceback
        traceback.print_exc()

## 7. Download Results (Colab only)

Download the converted Word files

In [None]:
if not IN_COLAB:
    # Nothing to download outside Colab; point at the local output folder.
    print("Skipping - not on Colab")
    print(f"   Output files are in: {CONFIG['output_dir']}")
else:
    from google.colab import files
    import zipfile
    from pathlib import Path

    # Collect every Word document the pipeline wrote to the output folder.
    output_dir = Path(CONFIG['output_dir'])
    docx_files = [document for document in output_dir.glob('*.docx')]
    found = len(docx_files)

    if found == 0:
        print("No output files found")
    elif found == 1:
        # A single result is downloaded directly.
        single_result = docx_files[0]
        print(f"Downloading: {single_result.name}")
        files.download(str(single_result))
    else:
        # Several results are bundled into one archive, stored under their
        # bare file names, and downloaded together.
        zip_path = 'ocr_results.zip'
        with zipfile.ZipFile(zip_path, 'w') as archive:
            for document in docx_files:
                archive.write(document, document.name)

        print(f"Downloading {found} files as {zip_path}")
        files.download(zip_path)

## 8. Cleanup (Optional)

Remove temporary files to free up space

In [None]:
import shutil
from pathlib import Path

# Remove the pipeline's temporary directory (and everything in it) if present.
temp_dir = Path(CONFIG['temp_dir'])
if not temp_dir.exists():
    print("No temporary files to clean")
else:
    shutil.rmtree(temp_dir)
    print(f"Cleaned up temporary files in {temp_dir}")