In [None]:
import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pdf2image import convert_from_path
from PIL import Image
import pytesseract
from pathlib import Path

# Workflow for this notebook:
# 1. Load the PDF with pdf2image.
# 2. Apply image-processing techniques from cv2 (documented under opencv-python).
# 3. Run OCR with pytesseract and compare the results using the returned bounding boxes.

In [None]:
# Use the grayscale image for OCR
# NOTE(review): `gray` is not defined in any cell visible here; it is assumed to be
# produced by an earlier preprocessing cell - confirm before Restart & Run All.
# TODO: experiment with different processed versions
ocr_image = gray.copy()

# Get detailed OCR data including bounding boxes.
# image_to_data emits one row per layout element (page, block, paragraph, line, word);
# level 5 = word level. Non-word rows carry empty text and a confidence of -1.
ocr_data = pytesseract.image_to_data(ocr_image, output_type=pytesseract.Output.DICT)

# Keep the unfiltered table around so the cell stays idempotent and inspectable.
df_ocr_raw = pd.DataFrame(ocr_data)

# Confidence may arrive as str, int or float depending on the installed
# pytesseract/Tesseract versions (assumption - verify locally), so coerce it once.
ocr_conf = pd.to_numeric(df_ocr_raw['conf'], errors='coerce')
has_text = df_ocr_raw['text'].fillna('').astype(str).str.strip().ne('')

# Filter out empty text and invalid confidence scores (-1 / non-numeric) so that
# only real word detections remain; NaN fails the `>= 0` test and is dropped too.
df_ocr = (
    df_ocr_raw
    .assign(conf=ocr_conf)
    .loc[has_text & ocr_conf.ge(0)]
    .reset_index(drop=True)
)

print(f"Total words detected: {len(df_ocr)}")
print(f"\nFirst 10 detected words:")
display(df_ocr[['text', 'conf', 'left', 'top', 'width', 'height']].head(10))

# Statistics (computed on valid words only, so -1 sentinels no longer skew them)
print(f"\nOCR Statistics:")
if df_ocr.empty:
    print("No valid words detected - confidence statistics unavailable.")
else:
    print(f"Average confidence: {df_ocr['conf'].mean():.2f}%")
    print(f"Min confidence: {df_ocr['conf'].min():.2f}%")
    print(f"Max confidence: {df_ocr['conf'].max():.2f}%")

In [None]:

# Create a 3-channel (BGR) copy of the grayscale image so coloured boxes can be drawn.
# NOTE(review): `gray` is assumed to come from an earlier preprocessing cell.
# TODO: change this based on what pictures you want to use.
image_with_boxes = cv2.cvtColor(gray, cv2.COLOR_GRAY2BGR)

# Only draw real word detections: rows with empty text or a negative/non-numeric
# confidence are layout containers (page/block/paragraph/line), not words.
word_conf = pd.to_numeric(df_ocr['conf'], errors='coerce')
word_has_text = df_ocr['text'].fillna('').astype(str).str.strip().ne('')
word_boxes = df_ocr.assign(conf=word_conf).loc[word_has_text & word_conf.ge(0)]

# Draw bounding boxes for each word
for word in word_boxes.itertuples(index=False):
    # Cast to plain ints: OpenCV drawing functions expect integer pixel coordinates.
    x, y, w, h = int(word.left), int(word.top), int(word.width), int(word.height)
    conf = float(word.conf)

    # Color based on confidence: green (high) to red (low); colours are BGR tuples
    if conf > 80:
        color = (0, 255, 0)  # Green
    elif conf > 60:
        color = (0, 255, 255)  # Yellow
    else:
        color = (0, 0, 255)  # Red

    # Draw rectangle
    cv2.rectangle(image_with_boxes, (x, y), (x + w, y + h), color, 2)

    # Optionally add confidence score
    # cv2.putText(image_with_boxes, f"{conf:.0f}", (x, y-5),
    #             cv2.FONT_HERSHEY_SIMPLEX, 0.3, color, 1)

# Display the result (matplotlib expects RGB, OpenCV stores BGR)
plt.figure(figsize=(15, 10))
plt.imshow(cv2.cvtColor(image_with_boxes, cv2.COLOR_BGR2RGB))
plt.title("Text Detection with Bounding Boxes\n(Green: High Confidence, Yellow: Medium, Red: Low)", 
          fontsize=14, fontweight='bold')
plt.axis('off')
plt.show()

print("\nColor Legend:")
print("🟢 Green: Confidence > 80%")
print("🟡 Yellow: Confidence 60-80%")
print("🔴 Red: Confidence < 60%")

In [None]:
# Get line-level OCR data by merging the word boxes that belong to the same line.
# Tesseract restarts `line_num` inside every block/paragraph, so a line is only
# identified uniquely by the (block_num, par_num, line_num) triple.
LINE_KEYS = ['block_num', 'par_num', 'line_num']

# Work on valid words only: layout rows (empty text, conf == -1) would otherwise
# inflate the boxes, pollute the joined text and drag the mean confidence down.
line_conf = pd.to_numeric(df_ocr['conf'], errors='coerce')
line_has_text = df_ocr['text'].fillna('').astype(str).str.strip().ne('')
line_words = (
    df_ocr
    .assign(
        text=df_ocr['text'].fillna('').astype(str),
        conf=line_conf,
        right=df_ocr['left'] + df_ocr['width'],    # right edge of each word
        bottom=df_ocr['top'] + df_ocr['height'],   # bottom edge of each word
    )
    .loc[line_has_text & line_conf.ge(0)]
)

df_lines = (
    line_words
    .groupby(LINE_KEYS)
    .agg(
        text=('text', ' '.join),
        left=('left', 'min'),
        top=('top', 'min'),
        right=('right', 'max'),
        bottom=('bottom', 'max'),
        conf=('conf', 'mean'),
    )
    .reset_index()
)

# The line box spans from the left-most/top-most word edge to the
# right-most/bottom-most one (the tallest single word is not the line height).
df_lines = df_lines.assign(
    width=df_lines['right'] - df_lines['left'],
    height=df_lines['bottom'] - df_lines['top'],
)

print(f"Total lines detected: {len(df_lines)}")
print("\nFirst 10 lines:")
display(df_lines[LINE_KEYS + ['text', 'conf']].head(10))

# Create image with line-level bounding boxes
# NOTE(review): `gray` is assumed to come from an earlier preprocessing cell.
image_with_lines = cv2.cvtColor(gray, cv2.COLOR_GRAY2BGR)

for line in df_lines.itertuples(index=False):
    x, y, w, h = int(line.left), int(line.top), int(line.width), int(line.height)

    # Draw thicker boxes for lines
    cv2.rectangle(image_with_lines, (x, y), (x + w, y + h), (255, 0, 0), 3)

    # Add line number (numbering restarts within each block/paragraph)
    cv2.putText(image_with_lines, f"L{line.line_num}", (x, y-10), 
                cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 0, 0), 2)

# Display (matplotlib expects RGB, OpenCV stores BGR)
plt.figure(figsize=(15, 10))
plt.imshow(cv2.cvtColor(image_with_lines, cv2.COLOR_BGR2RGB))
plt.title("Line-Level Text Detection", fontsize=14, fontweight='bold')
plt.axis('off')
plt.show()

In [None]:
# Run plain-text OCR over the image chosen for the word-level pass.
# `ocr_image` is set by the earlier OCR cell in this notebook.
# TODO: see how well this does based on the filters that you put in
full_text = pytesseract.image_to_string(ocr_image)

# Frame the extracted text between 80-character rules so it stands out in the output.
banner = "=" * 80
for output_line in (banner, "EXTRACTED TEXT FROM IMAGE", banner, full_text, banner):
    print(output_line)