# PaddleOCR

In [None]:
# Mount Google Drive when running inside Google Colab; skip quietly elsewhere.
try:
    from google.colab import drive
except ImportError:
    # `google.colab` is only importable on Colab runtimes, so other environments land here.
    print("google.colab not available; skipping Google Drive mount.")
else:
    drive.mount('/content/drive')

In [None]:
%cd drive/MyDrive/TraditionalMedicineChatbot/

In [None]:
!pip install paddlepaddle paddlepaddle-gpu
!pip install paddleocr
!pip install "langchain==0.0.353"

In [None]:
# --- PaddleOCR Initialization ---
from paddleocr import PaddleOCR

# Build the OCR engine once; the cells below reuse the module-level `ocr` object.
# NOTE(review): 'vi' is presumably the Vietnamese language pack -- confirm against PaddleOCR docs.
ocr = PaddleOCR(
    lang='vi',
    use_angle_cls=True,
)

In [None]:
# --- OCR Usage: Process Image ---
import os

img_filename = 'sample.png'

# Resolve the image to an absolute path, failing early with a helpful message.
if os.path.exists(img_filename):
    img_path = os.path.abspath(img_filename)
else:
    img_path = None
    raise FileNotFoundError("Could not find 'sample.png' locally. Upload it or mount Google Drive and set img_path accordingly.")
print(f'Using image: {img_path}')

# Choose the API by capability rather than by catching every exception: a genuine
# failure inside `predict` should surface instead of being masked by a second call.
if hasattr(ocr, 'predict'):
    result = ocr.predict(img_path)
else:
    # Older PaddleOCR objects without `predict` expose `ocr(...)` instead.
    result = ocr.ocr(img_path, cls=True)

In [None]:
import json
import numpy as np

# 1. Prepare data container
output_data = []

# `result` comes from the OCR cell. Guard the first-page lookup: the list may be
# empty, or its first entry may be None when nothing was detected.
ocr_data = result[0] if result else None

if ocr_data is None:
    rec_texts, rec_scores, rec_polys = [], [], []
elif hasattr(ocr_data, 'get'):
    # Mapping-style page result (the shape this notebook expects from `ocr.predict`).
    rec_texts = ocr_data.get('rec_texts', [])
    rec_scores = ocr_data.get('rec_scores', [])
    rec_polys = ocr_data.get('rec_polys', [])
else:
    # List-style page result from the `ocr.ocr(...)` fallback.
    # NOTE(review): assumes each entry is [box, (text, score)] -- confirm for the installed version.
    rec_polys = [entry[0] for entry in ocr_data]
    rec_texts = [entry[1][0] for entry in ocr_data]
    rec_scores = [entry[1][1] for entry in ocr_data]

# 2. Convert data to standard Python types (to avoid JSON errors)
for det_id, (text, score, box) in enumerate(zip(rec_texts, rec_scores, rec_polys), start=1):
    # numpy arrays and numpy floats are not JSON serializable as-is
    if isinstance(box, np.ndarray):
        box = box.tolist()
    if isinstance(score, np.floating):
        score = float(score)

    # One structured record per detection
    output_data.append({
        "id": det_id,
        "text": text,
        "confidence": score,
        "box": box
    })

# 3. Write to JSON file
output_filename = 'ocr_result.json'
try:
    with open(output_filename, 'w', encoding='utf-8') as f:
        # ensure_ascii=False keeps non-ASCII characters readable in the output file
        json.dump(output_data, f, ensure_ascii=False, indent=4)
    print(f"Successfully saved {len(output_data)} detections to '{output_filename}'")
except Exception as e:
    # Best-effort export: report the problem but keep the notebook running.
    print(f"Error saving JSON: {e}")

In [None]:
import json
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Pick the torch device: CUDA when torch reports it as available, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

In [None]:
# Load the text-correction checkpoint and its tokenizer from the Hugging Face hub id below.
print("Loading Text Correction model...")
model_path = "protonx-models/protonx-legal-tc"

tokenizer = AutoTokenizer.from_pretrained(model_path)

# Place the model on the selected `device`, then switch it to evaluation mode.
model = AutoModelForSeq2SeqLM.from_pretrained(model_path).to(device)
model.eval()

print("Model loaded successfully.")

In [None]:
def correct_text_with_hf(raw_text):
    """
    Run one OCR text line through the seq2seq correction model and return the decoded output.

    Uses the module-level `tokenizer`, `model` and `device` created in earlier cells.

    Args:
        raw_text: Text of a single OCR line. Falsy or whitespace-only values
            short-circuit to "" without calling the model.

    Returns:
        The decoded first generated sequence with special tokens stripped, or ""
        for blank input.
    """
    is_blank = (not raw_text) or (not str(raw_text).strip())
    if is_blank:
        return ""

    # Tokenize to PyTorch tensors (inputs longer than 128 tokens are truncated)
    # and move them to the model's device.
    encoded = tokenizer(
        raw_text, return_tensors="pt", truncation=True, max_length=128
    ).to(device)

    # Beam-search settings, gathered in one place for readability.
    generation_options = dict(
        num_beams=10,
        max_new_tokens=128,
        length_penalty=1.0,
        early_stopping=True,
        repetition_penalty=1.2,
        no_repeat_ngram_size=2,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )

    # Inference only: no gradients needed.
    with torch.no_grad():
        generated = model.generate(**encoded, **generation_options)

    best_sequence = generated[0]
    return tokenizer.decode(best_sequence, skip_special_tokens=True)

In [None]:
# Pull the parallel text / score / polygon lists out of the PaddleOCR `result`.
# The `str` check protects against `result` having been overwritten with plain text
# by a later cell when cells are re-run out of order.
if (
    'result' in globals()
    and result
    and not isinstance(result, str)
    and result[0] is not None
):
    ocr_data = result[0]

    if hasattr(ocr_data, 'get'):
        # Mapping-style page result (the shape this notebook expects from `ocr.predict`).
        rec_texts = ocr_data.get('rec_texts', [])
        rec_scores = ocr_data.get('rec_scores', [])
        rec_polys = ocr_data.get('rec_polys', [])
    else:
        # List-style page result from the `ocr.ocr(...)` fallback.
        # NOTE(review): assumes each entry is [box, (text, score)] -- confirm for the installed version.
        rec_polys = [entry[0] for entry in ocr_data]
        rec_texts = [entry[1][0] for entry in ocr_data]
        rec_scores = [entry[1][1] for entry in ocr_data]

    print(f"Loaded {len(rec_texts)} detected text lines.")
else:
    print("Variable 'result' is empty or not defined. Please run PaddleOCR first.")
    rec_texts, rec_scores, rec_polys = [], [], []

In [None]:
# Run every detected line through the correction model and collect JSON-ready records.
output_data = []

if not rec_texts:
    print("No text to process.")
else:
    total = len(rec_texts)
    for i, raw_text in enumerate(rec_texts):
        # Progress message on every fifth line
        if i % 5 == 0:
            print(f"Processing line {i+1}/{total}...")

        corrected = correct_text_with_hf(raw_text)

        # numpy arrays / numpy floats must become plain Python values for json.dump
        box = rec_polys[i]
        box = box.tolist() if isinstance(box, np.ndarray) else box

        score = rec_scores[i]
        score = float(score) if isinstance(score, (np.float32, np.float64)) else score

        output_data.append({
            "id": i + 1,
            "original_text": raw_text,
            "corrected_text": corrected,
            "confidence": score,
            "box": box
        })

    print("Processing complete.")

In [None]:
# Persist the corrected detections; failures are reported rather than raised.
output_filename = 'ocr_result_corrected.json'

try:
    with open(output_filename, 'w', encoding='utf-8') as f:
        # ensure_ascii=False keeps non-ASCII characters readable in the file
        json.dump(output_data, f, ensure_ascii=False, indent=4)
except Exception as e:
    print(f"Error saving JSON: {e}")
else:
    print(f"Successfully saved {len(output_data)} detections to '{output_filename}'")

In [None]:
import os
import matplotlib.pyplot as plt
import cv2
import numpy as np
from PIL import Image, ImageDraw, ImageFont

In [None]:
if not os.path.exists('Roboto-Regular.ttf'):
    !wget -q -O Roboto-Regular.ttf https://github.com/googlefonts/roboto/raw/main/src/hinted/Roboto-Regular.ttf
    print("Font downloaded.")
else:
    print("Font already exists.")

In [None]:
def visualize_ocr_in_colab(image_path, ocr_results, font_path="Roboto-Regular.ttf", font_size=8):
    """
    Draw OCR boxes and their text labels on an image and display it with matplotlib.

    Args:
        image_path: Path of the image the detections belong to.
        ocr_results: Iterable of dicts with 'id' and 'box' keys plus 'corrected_text'
            (or 'text' as a fallback). 'box' is a sequence of (x, y) points.
        font_path: TrueType font used for labels; PIL's default font is used when it
            cannot be loaded.
        font_size: Label font size passed to `ImageFont.truetype`.

    Returns:
        None. Prints a message and returns early when the image is missing or unreadable.
    """
    if not os.path.exists(image_path):
        print(f"ERROR: Image not found at {image_path}")
        print("Tip: Drag and drop your image into the 'Files' folder on the left sidebar.")
        return

    # Load Font (fall back to PIL's built-in font when the file cannot be opened)
    try:
        font = ImageFont.truetype(font_path, font_size)
    except OSError:
        font = ImageFont.load_default()

    # Load Image
    img_cv2 = cv2.imread(image_path)
    if img_cv2 is None:
        print("Could not read image. Check file format.")
        return

    # OpenCV loads BGR; PIL / matplotlib expect RGB
    pil_img = Image.fromarray(cv2.cvtColor(img_cv2, cv2.COLOR_BGR2RGB))
    draw = ImageDraw.Draw(pil_img)

    print(f"Visualizing {len(ocr_results)} detections...")

    for item in ocr_results:
        box = np.array(item['box'], dtype=np.int32).reshape(-1, 2)
        if box.size == 0:
            # Nothing to draw for an empty polygon
            continue

        # Accept records from either export cell: corrected text if present, raw text otherwise
        text_label = item.get('corrected_text', item.get('text', ''))
        id_num = item['id']

        # 1. Draw Box (Green) -- PIL wants a list of plain (x, y) tuples
        outline_points = [(int(px), int(py)) for px, py in box]
        draw.polygon(outline_points, outline="#00FF00", width=3)

        # 2. Draw Text (Red on White BG)
        label = f"[{id_num}] {text_label}"

        # Position: above the top-left corner, clamped so the label stays inside the image
        txt_x = int(box[:, 0].min())
        txt_y = max(0, int(box[:, 1].min()) - 30)

        # White background behind the label for readability
        try:
            left, top, right, bottom = draw.textbbox((txt_x, txt_y), label, font=font)
            draw.rectangle((left - 5, top - 5, right + 5, bottom + 5), fill="white")
        except AttributeError:
            # Older Pillow versions have no `textbbox`; draw the text without a background
            pass

        draw.text((txt_x, txt_y), label, font=font, fill="#FF0000")

    # Display using Matplotlib (Safe for Colab)
    plt.figure(figsize=(20, 20))
    plt.imshow(np.array(pil_img))
    plt.axis('off')
    plt.show()


# Printed once when the cell defines the function, not on every call.
print("Visualization function ready.")


# Visualize only when both the detections and the image path from earlier cells exist.
# Checking for 'img_path' first avoids a NameError when the OCR cell has not been run.
if (
    'output_data' in globals()
    and 'img_path' in globals()
    and img_path
    and os.path.exists(img_path)
):
    visualize_ocr_in_colab(img_path, output_data)
else:
    print("Cannot visualize: Make sure 'output_data' exists and 'img_path' is correct.")

In [None]:
# 1. Install
! pip install --quiet vietocr

In [None]:
import cv2
import numpy as np
import matplotlib.pyplot as plt
import torch
from PIL import Image
from vietocr.tool.predictor import Predictor
from vietocr.tool.config import Cfg

# --- Setup VietOCR ---
config = Cfg.load_config_from_name('vgg_transformer')
config['cnn']['pretrained'] = False
# Use the first GPU when torch can see one; otherwise fall back to CPU so the
# notebook still runs on CPU-only runtimes.
config['device'] = 'cuda:0' if torch.cuda.is_available() else 'cpu'
config['predictor']['beamsearch'] = False
recognizer = Predictor(config)

def debug_and_read(image_path, target_width=1500):
    """
    Detect text lines with OpenCV morphology, read each one with VietOCR, and show a debug image.

    Uses the module-level `recognizer` created in the setup cell.

    Args:
        image_path: Path of the image to process.
        target_width: Width the image is resized to before detection, so the fixed
            dilation kernel behaves consistently across inputs.

    Returns:
        The recognized lines joined with newlines, ordered top to bottom.

    Raises:
        FileNotFoundError: If OpenCV cannot load `image_path`.
    """
    # 1. Load and Resize
    img_cv = cv2.imread(image_path)
    if img_cv is None:
        raise FileNotFoundError(f"Image not found: {image_path}")

    img_h, img_w = img_cv.shape[:2]
    scale = target_width / img_w
    # Keep at least one row so cv2.resize never receives a zero height
    new_h = max(1, int(img_h * scale))
    img_cv = cv2.resize(img_cv, (target_width, new_h))

    # 2. Convert to Black/White for detection
    gray = cv2.cvtColor(img_cv, cv2.COLOR_BGR2GRAY)

    # Adaptive thresholding copes with uneven lighting better than a global threshold
    binary = cv2.adaptiveThreshold(
        gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY_INV, 21, 10
    )

    # 3. Dilate to connect words into lines
    # Kernel is wide and short so neighbouring words merge horizontally
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (25, 3))
    dilated = cv2.dilate(binary, kernel, iterations=2)

    # 4. Find Contours
    contours, _ = cv2.findContours(dilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    # Sort top to bottom, breaking ties left to right
    bounding_boxes = [cv2.boundingRect(c) for c in contours]
    bounding_boxes.sort(key=lambda rect: (rect[1], rect[0]))

    # 5. Draw Boxes & Read Text
    output_image = img_cv.copy()
    full_text = []
    page_area = new_h * target_width
    pad = 5  # pixels of context kept around each crop

    print(f"Detected {len(bounding_boxes)} potential lines...")

    # Distinct names here so the image dimensions above are not shadowed
    for x, y, box_w, box_h in bounding_boxes:
        # Filter noise: Box must be reasonable size
        if box_h < 10 or box_w < 20:
            continue
        # Filter "Whole Page" borders: Ignore if box is > 90% of image area
        if (box_w * box_h) > (0.9 * page_area):
            continue

        # Draw red box for debugging
        cv2.rectangle(output_image, (x, y), (x + box_w, y + box_h), (0, 0, 255), 2)

        # Crop with padding, clamped to the image bounds
        crop = img_cv[
            max(0, y - pad):min(new_h, y + box_h + pad),
            max(0, x - pad):min(target_width, x + box_w + pad),
        ]

        # Convert to PIL for VietOCR
        crop_pil = Image.fromarray(cv2.cvtColor(crop, cv2.COLOR_BGR2RGB))

        try:
            full_text.append(recognizer.predict(crop_pil))
        except Exception as exc:
            # Best-effort: skip this region but say why, instead of dropping it silently
            print(f"Skipping region at (x={x}, y={y}): {exc}")
            continue

    # 6. Show the Debug Image
    plt.figure(figsize=(10, 15))
    plt.imshow(cv2.cvtColor(output_image, cv2.COLOR_BGR2RGB))
    plt.axis('off')
    plt.title("Red boxes = Detected Lines")
    plt.show()

    return "\n".join(full_text)

# Run it
# Stored under its own name so the PaddleOCR `result` used by earlier cells is not
# overwritten with a plain string (which would break re-running those cells).
vietocr_text = debug_and_read('sample.png')
print("\n--- Extracted Text ---\n")
print(vietocr_text)