# 1. Setup Google Colab GPU Environment
Configure the runtime to use GPU acceleration in Google Colab. Check GPU availability and display GPU information using `nvidia-smi`.

In [None]:
# Check GPU availability
import os
import shutil
import subprocess

# Capture nvidia-smi's output and print it from Python so the report goes
# through the cell's own stdout, and so a missing tool or a failing call is
# reported explicitly (os.system only returned an exit status, which was
# never inspected).
if shutil.which('nvidia-smi') is None:
    gpu_report = (
        "nvidia-smi not found on PATH; this runtime does not appear to "
        "have an NVIDIA GPU attached."
    )
else:
    _smi = subprocess.run(
        ['nvidia-smi'], capture_output=True, text=True, check=False
    )
    # Fall back to stderr / the exit status when nothing was written to stdout.
    gpu_report = (
        _smi.stdout
        or _smi.stderr
        or f"nvidia-smi exited with status {_smi.returncode}"
    )
print(gpu_report)

# 2. Install Required Libraries
Install necessary packages for PDF processing, GPU computation, and parallelism such as PyPDF2, pdfplumber, CUDA-compatible libraries, and parallel processing tools.

In [None]:
# Clone the repository and install dependencies
!git clone https://github.com/HuyTran28/23CLCT2_TraditionalMedicineChatbot.git /content/repo
%cd /content/repo/ocr

# Install required libraries
!pip install PyPDF2 pdfplumber ray torch tensorflow tqdm pytesseract pillow

# 3. Configure GPU Access and Memory
Set up TensorFlow and PyTorch to use the GPU, configure memory growth to prevent OOM errors, and verify that the GPU is accessible to the pipeline.

In [None]:
# --- TensorFlow: turn on memory growth for every visible GPU ---
import tensorflow as tf
try:
    gpus = tf.config.list_physical_devices('GPU')
    # Looping over an empty list is a no-op, so the "no GPU" case needs no
    # separate guard around the memory-growth calls.
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
    print(
        f"TensorFlow is using GPU: {gpus}"
        if gpus
        else "No GPU found for TensorFlow."
    )
except Exception as err:
    # Any failure during device discovery/configuration is reported, not raised,
    # so the PyTorch check below still runs.
    print(f"TensorFlow GPU setup error: {err}")

# --- PyTorch: report the name of CUDA device 0 when CUDA is available ---
import torch
print(
    f"PyTorch is using GPU: {torch.cuda.get_device_name(0)}"
    if torch.cuda.is_available()
    else "No GPU found for PyTorch."
)

# 4. Load and Preprocess PDF
Load the 300-page PDF document, split into chunks or pages, and prepare data structures for parallel processing.

In [None]:
# Add the repo modules to path
import sys
REPO_OCR_DIR = '/content/repo/ocr'
# Guard against stacking duplicate entries when this cell is re-run.
if REPO_OCR_DIR not in sys.path:
    sys.path.insert(0, REPO_OCR_DIR)

# Upload PDF file to Colab
from google.colab import files
uploaded = files.upload()
# files.upload() returns a {filename: contents} mapping; it is empty when the
# dialog is cancelled, which previously surfaced as a bare StopIteration.
if not uploaded:
    raise ValueError("No file was uploaded; re-run this cell and choose a PDF.")
# Only the first uploaded file is processed.
pdf_path = next(iter(uploaded))

# Load PDF and split into pages
import pdfplumber

# Keep the document open instead of using a `with` block: `pages` is consumed
# by later cells, and leaving the `with` block would close the underlying file
# before those cells touch the Page objects. Call pdf.close() once OCR is done.
# NOTE(review): later cells hand these Page objects to Ray tasks -- confirm
# they can be serialized, or render each page to an image here instead.
pdf = pdfplumber.open(pdf_path)
pages = list(pdf.pages)
print(f"Loaded {len(pages)} pages from PDF.")

# 5. Implement Parallel Processing with GPU
Use multiprocessing, concurrent.futures, or Ray to run PDF processing tasks in parallel worker processes, using the GPU where the OCR engine supports it. Distribute pages or chunks across available compute resources.

In [None]:
# Import modules from the cloned repository
# (these resolve because /content/repo/ocr was inserted into sys.path in an
# earlier cell).
# NOTE(review): Pipeline and Exporter are imported but not referenced in the
# visible cells -- confirm whether they are still needed.
from modules.ocr_engine import OCREngine
from modules.pipeline import Pipeline
from modules.exporter import Exporter
import ray

# Start Ray; ignore_reinit_error=True lets this cell be re-run without an
# error when Ray has already been initialized in this session.
ray.init(ignore_reinit_error=True)

# Initialize OCR engine with GPU support
# This single driver-side instance is the global that the remote task below
# refers to by name.
ocr_engine = OCREngine(use_gpu=True)

@ray.remote
def process_page_with_pipeline(page_num, page):
    """Run OCR on a single page as a Ray task.

    Args:
        page_num: Index of the page; returned unchanged so the caller can
            restore page order after parallel execution.
        page: Page payload passed straight to ``ocr_engine.process_image``.

    Returns:
        A ``(page_num, text)`` tuple. When OCR raises, the error is printed
        and ``text`` is the empty string, so one bad page does not abort the
        whole run.
    """
    # NOTE(review): `ocr_engine` is a module-level global referenced from
    # inside a remote task -- confirm it can be shipped to Ray workers and
    # that the workers can actually see the GPU.
    try:
        extracted = ocr_engine.process_image(page)
    except Exception as err:
        print(f"Error processing page {page_num}: {err}")
        return page_num, ""
    return page_num, extracted

# Submit one Ray task per page, then block until every task has finished
pending = [
    process_page_with_pipeline.remote(index, page_obj)
    for index, page_obj in enumerate(pages)
]
results = ray.get(pending)
# Each result is (page_num, text); order by page number before dropping it.
results.sort(key=lambda pair: pair[0])
texts = [pair[1] for pair in results]
print(f"Processed {len(texts)} pages in parallel using the pipeline.")

# 6. Optimize Memory Usage for Large Files
Implement batch processing, streaming techniques, and memory-efficient data structures to handle large PDFs without exceeding Colab's memory limits.

In [None]:
# Batch processing example for memory efficiency
# Only one batch of tasks is submitted and awaited at a time, which bounds the
# number of in-flight tasks and pending results.
batch_size = 50
# Ceiling division: the previous `len(pages)//batch_size + 1` over-counted by
# one whenever the page count was an exact multiple of batch_size.
total_batches = (len(pages) + batch_size - 1) // batch_size
batched_results = []
for batch_num, start in enumerate(range(0, len(pages), batch_size), start=1):
    batch = pages[start:start + batch_size]
    # The remote task defined in this notebook is process_page_with_pipeline;
    # the name `process_page` used before is not defined anywhere (NameError).
    # `start + offset` is the page's global index.
    batch_results = ray.get([
        process_page_with_pipeline.remote(start + offset, page)
        for offset, page in enumerate(batch)
    ])
    batch_results.sort(key=lambda item: item[0])
    batched_results.extend(batch_results)
    print(f"Processed batch {batch_num} of {total_batches}")
texts = [text for _, text in batched_results]

# 7. Monitor GPU Performance
Track GPU utilization, memory consumption, and processing speed using GPU monitoring tools and custom metrics during pipeline execution.

In [None]:
# Monitor GPU usage and processing speed
import shutil
import subprocess
import time

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()

# Snapshot GPU state. The output is captured and printed from Python, and a
# missing tool or failing call is reported instead of silently ignored. This
# also removes the cell's dependence on `os` having been imported elsewhere.
if shutil.which('nvidia-smi') is None:
    print("nvidia-smi not found on PATH; skipping GPU snapshot.")
else:
    _smi = subprocess.run(
        ['nvidia-smi'], capture_output=True, text=True, check=False
    )
    print(
        _smi.stdout
        or _smi.stderr
        or f"nvidia-smi exited with status {_smi.returncode}"
    )

# (Run your pipeline here)
end_time = time.perf_counter()
print(f"Total processing time: {end_time - start_time:.2f} seconds")

# 8. Run Pipeline and Benchmark Results
Execute the complete pipeline, measure execution time, compare CPU vs GPU performance, and document optimization improvements.

In [None]:
# Run the pipeline and benchmark
import time

# Use the monotonic perf_counter() clock for interval timing; time.time()
# follows the wall clock and can move backwards if the clock is adjusted.
start = time.perf_counter()
# Run batch processing
# NOTE: nothing executes between the two clock reads yet, so the reported
# duration is ~0 until the pipeline call is placed here.
end = time.perf_counter()
print(f"GPU parallel processing time: {end-start:.2f} seconds")

# Persist the OCR text: one "Page N:" section per entry of `texts`,
# numbered from 1, each followed by a blank line
with open('ocr_results.txt', 'w', encoding='utf-8') as f:
    f.writelines(
        f"Page {page_no}:\n{page_text}\n\n"
        for page_no, page_text in enumerate(texts, start=1)
    )
print("OCR results saved to ocr_results.txt")