In [15]:
import cv2
import matplotlib.pyplot as plt
from ultralytics import YOLO
from paddleocr import PaddleOCR
import numpy as np
import re
import os
import json
import math
import base64
from scipy.ndimage import interpolation as inter
from typing import List, Tuple
import matplotlib.pyplot as plt


# Side length (pixels) of the square image that process_image() resizes to
# before running the detector; box coordinates are scaled back with it.
imgsz = 640

# Detector weights loaded from a local training run (path is relative to the
# notebook's working directory). process_image() reads `.boxes` and `.masks`
# from its results, so this is presumably a segmentation checkpoint.
model = YOLO('./TRAINING/runs/segment/train3/weights/best.pt')
# OCR pipeline. Per the log output below, constructing it creates (and, if
# needed, downloads) the doc-orientation, textline-orientation, detection and
# recognition models. Document unwarping is switched off.
ocr = PaddleOCR(
    ocr_version='PP-OCRv5',
    use_doc_orientation_classify=True, 
    use_doc_unwarping=False, 
    use_textline_orientation=True
    )

[32mCreating model: ('PP-LCNet_x1_0_doc_ori', None)[0m
[32mUsing official model (PP-LCNet_x1_0_doc_ori), the model files will be automatically downloaded and saved in /home/coco/.paddlex/official_models.[0m


Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

[32mCreating model: ('PP-LCNet_x1_0_textline_ori', None)[0m
[32mUsing official model (PP-LCNet_x1_0_textline_ori), the model files will be automatically downloaded and saved in /home/coco/.paddlex/official_models.[0m


Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

[32mCreating model: ('PP-OCRv5_server_det', None)[0m
[32mUsing official model (PP-OCRv5_server_det), the model files will be automatically downloaded and saved in /home/coco/.paddlex/official_models.[0m


Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

[32mCreating model: ('PP-OCRv5_server_rec', None)[0m
[32mUsing official model (PP-OCRv5_server_rec), the model files will be automatically downloaded and saved in /home/coco/.paddlex/official_models.[0m


Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

In [2]:
def correct_skew(image, delta=1, limit=5):
    """Estimate and remove a small skew using a projection-profile search.

    The image is binarised (inverted Otsu threshold) and rotated by every
    candidate angle produced by ``np.arange(-limit, limit + delta, delta)``.
    Each candidate is scored by the sum of squared differences between
    adjacent entries of the row-sum profile; the highest-scoring angle wins
    (the first one on ties).

    Args:
        image: BGR image as a numpy array, or an already single-channel
            (2-D) image, which is used as the grayscale input directly.
        delta: Step between candidate angles, in degrees.
        limit: Largest absolute candidate angle, in degrees.

    Returns:
        Tuple ``(best_angle, corrected)`` where ``corrected`` is ``image``
        rotated by ``best_angle`` about its centre (same size, cubic
        interpolation, replicated borders).
    """
    # Local import of the public API: the ``scipy.ndimage.interpolation``
    # namespace is deprecated and emits a DeprecationWarning on use.
    from scipy.ndimage import rotate as _nd_rotate

    def determine_score(arr, angle):
        # order=0 (nearest neighbour) keeps the rotated image binary.
        data = _nd_rotate(arr, angle, reshape=False, order=0)
        histogram = np.sum(data, axis=1, dtype=float)
        score = np.sum((histogram[1:] - histogram[:-1]) ** 2, dtype=float)
        return histogram, score

    # A 2-D input is already grayscale; cvtColor(BGR2GRAY) would raise on it.
    gray = image if image.ndim == 2 else cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]

    angles = np.arange(-limit, limit + delta, delta)
    scores = [determine_score(thresh, angle)[1] for angle in angles]
    # argmax returns the first maximum, matching list.index(max(...)).
    best_angle = angles[int(np.argmax(scores))]

    (h, w) = image.shape[:2]
    center = (w // 2, h // 2)
    M = cv2.getRotationMatrix2D(center, best_angle, 1.0)
    corrected = cv2.warpAffine(image, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)
    return best_angle, corrected

def calculate_aspect_ratio(width, height):
    """Reduce a width/height pair to its simplest integer ratio.

    Both inputs are truncated to ``int`` before the ratio is reduced by
    their greatest common divisor.

    Returns:
        Tuple ``(label, value)``: ``label`` is the reduced ratio formatted as
        ``"W:H"`` and ``value`` is the reduced width divided by the reduced
        height.
    """
    w_int, h_int = int(width), int(height)
    divisor = math.gcd(w_int, h_int)
    simplified_width, simplified_height = w_int // divisor, h_int // divisor
    label = f"{simplified_width}:{simplified_height}"
    value = simplified_width / simplified_height
    return label, value

def aspect_ratio_distance(ratio, target_ratio=82/57):
    """Return the absolute difference between ``ratio`` and ``target_ratio``.

    ``target_ratio`` defaults to 82/57.
    """
    deviation = ratio - target_ratio
    return abs(deviation)

def find_mask_corners(mask):
    """Return four corner points for the largest external contour of ``mask``.

    The largest contour (by area) is simplified with ``approxPolyDP`` using a
    tolerance of 2% of its perimeter. If that yields exactly four vertices
    they are returned; otherwise the corners of the minimum-area rotated
    rectangle around the contour are returned instead.

    Returns:
        A ``(4, 2)`` float32 array of points, or ``None`` when the mask
        contains no contour.
    """
    outlines, _ = cv2.findContours(mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if not outlines:
        return None

    biggest = max(outlines, key=cv2.contourArea)
    tolerance = 0.02 * cv2.arcLength(biggest, True)
    polygon = cv2.approxPolyDP(biggest, tolerance, True)

    if len(polygon) != 4:
        # Not a clean quadrilateral: fall back to the rotated bounding box.
        return cv2.boxPoints(cv2.minAreaRect(biggest)).astype(np.float32)
    return polygon.reshape(4, 2).astype(np.float32)

def evaluate_mask_shape(mask):
    """Score the largest external contour of ``mask``.

    The score is the sum of three terms:
      * 10 if the contour simplifies to exactly four vertices
        (``approxPolyDP`` with a tolerance of 2% of the perimeter);
      * 5 x solidity (contour area / convex-hull area), when the hull area
        is positive;
      * 3 x (1 - circularity), where circularity is
        ``4*pi*area / perimeter**2``, when the perimeter is positive.

    Returns:
        The accumulated score, or 0 when the mask has no contour.
    """
    outlines, _ = cv2.findContours(mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if not outlines:
        return 0

    biggest = max(outlines, key=cv2.contourArea)
    outline_length = cv2.arcLength(biggest, True)
    polygon = cv2.approxPolyDP(biggest, 0.02 * outline_length, True)

    # Quadrilateral bonus.
    total = 10 if len(polygon) == 4 else 0

    # Solidity term.
    contour_area = cv2.contourArea(biggest)
    convex_area = cv2.contourArea(cv2.convexHull(biggest))
    if convex_area > 0:
        total += (contour_area / convex_area) * 5

    # "Rectangularity" term: the complement of circularity.
    if outline_length > 0:
        roundness = 4 * np.pi * contour_area / (outline_length * outline_length)
        total += (1 - roundness) * 3

    return total

def calculate_mask_area(mask):
    """Return the sum of all entries of ``mask``.

    For a boolean mask this is the number of ``True`` pixels.
    """
    pixel_total = np.sum(mask)
    return pixel_total

def is_four_sided_shape(mask):
    """Tell whether the largest external contour of ``mask`` is a quadrilateral.

    The contour is simplified with ``approxPolyDP`` (tolerance: 2% of its
    perimeter) and counts as four-sided when exactly four vertices remain.
    A mask without any contour is reported as not four-sided.
    """
    outlines, _ = cv2.findContours(mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if outlines:
        biggest = max(outlines, key=cv2.contourArea)
        tolerance = 0.02 * cv2.arcLength(biggest, True)
        vertex_count = len(cv2.approxPolyDP(biggest, tolerance, True))
        return vertex_count == 4
    return False

def check_minimum_width(mask, min_width=20):
    """Check that the set pixels of ``mask`` span at least ``min_width`` columns.

    The width is the inclusive distance between the left-most and right-most
    columns that contain a truthy pixel. An empty mask never passes.
    """
    hit_rows_cols = np.where(mask)
    if len(hit_rows_cols[0]) == 0:
        return False

    columns = hit_rows_cols[1]
    span = np.max(columns) - np.min(columns) + 1
    return span >= min_width

def meets_mandatory_requirements(mask, max_aspect_distance=0.5):
    """Run the hard accept/reject checks on a candidate mask.

    Checks, in order: the mask is non-empty, its largest contour is
    four-sided, and it is at least as wide as ``check_minimum_width``'s
    default minimum.

    Args:
        mask: 2-D boolean (or 0/1) numpy array.
        max_aspect_distance: Currently unused. The aspect-ratio check it
            belonged to (reject when ``aspect_ratio_distance`` of the mask's
            bounding extent exceeds this value) is disabled. The parameter
            is kept so existing callers keep working.

    Returns:
        Tuple ``(is_valid, reason)`` where ``reason`` is one of
        ``"Empty mask"``, ``"Not 4-sided"``, ``"Width < 20px"`` or
        ``"Valid"``.
    """
    # Test emptiness first. Both checks below also reject an empty mask, so
    # when this test ran last the "Empty mask" reason could never be returned.
    if not np.any(mask):
        return False, "Empty mask"
    if not is_four_sided_shape(mask):
        return False, "Not 4-sided"
    if not check_minimum_width(mask):
        return False, "Width < 20px"
    return True, "Valid"

def order_points(pts):
    """Arrange corner points using coordinate sums and differences.

    Output rows, in order:
      0. point with the smallest ``x + y``
      1. point with the smallest ``y - x``
      2. point with the largest ``x + y``
      3. point with the largest ``y - x``

    For an axis-aligned rectangle in image coordinates this is
    top-left, top-right, bottom-right, bottom-left.

    Args:
        pts: numpy array of shape ``(N, 2)`` holding ``(x, y)`` points.

    Returns:
        A ``(4, 2)`` float32 array.
    """
    coord_sum = pts.sum(axis=1)
    coord_diff = np.diff(pts, axis=1)  # y - x for every point

    picks = (
        np.argmin(coord_sum),
        np.argmin(coord_diff),
        np.argmax(coord_sum),
        np.argmax(coord_diff),
    )
    return np.array([pts[idx] for idx in picks], dtype=np.float32)


In [50]:
def _ocr_field_at(result: dict, key: str, index: int, default):
    """Return ``result[key][index]``, or ``default`` if the key or index is missing."""
    values = result.get(key)
    if values is None or index >= len(values):
        return default
    return values[index]


def _polygon_to_entry(poly, text, score, box_type: str) -> dict:
    """Build one output entry from a polygon, using its axis-aligned bounds as bbox."""
    points = np.asarray(poly)
    xs, ys = points[:, 0], points[:, 1]
    return {
        # .item() yields plain Python numbers so the entry is JSON-serialisable
        # and consistent with the 'rec_box' entries.
        'bbox': [xs.min().item(), ys.min().item(), xs.max().item(), ys.max().item()],
        'polygon': points.tolist(),
        'text': text,
        'score': score,
        'type': box_type,
    }


def extract_bounding_boxes(ocr_results: List[dict]) -> List[dict]:
    """
    Extract bounding boxes from OCR results

    For every result the first non-empty source is used, in this order:
    ``rec_boxes`` (taken as-is), ``rec_polys``, then ``dt_polys`` (polygons
    reduced to their axis-aligned bounds). Text and score are looked up by
    index in ``rec_texts`` / ``rec_scores`` and default to ``''`` / ``0`` when
    those keys or indices are missing.

    Args:
        ocr_results: List of OCR result dictionaries

    Returns:
        Flat list with one dictionary per box across all results. Each has
        ``bbox`` (``[x1, y1, x2, y2]``), ``text``, ``score`` and ``type``
        (``'rec_box'``, ``'polygon'`` or ``'detection'``); polygon-based
        entries additionally carry ``polygon``.
    """
    all_boxes = []

    for result in ocr_results:
        # Method 1: rectangular recognition boxes, used as-is.
        rec_boxes = result.get('rec_boxes')
        if rec_boxes is not None and len(rec_boxes) > 0:
            for i, box in enumerate(rec_boxes):
                all_boxes.append({
                    'bbox': np.asarray(box).tolist(),
                    'text': _ocr_field_at(result, 'rec_texts', i, ''),
                    'score': _ocr_field_at(result, 'rec_scores', i, 0),
                    'type': 'rec_box',
                })
            continue

        # Methods 2 and 3: polygon sources; only the first non-empty one is used.
        # NOTE(review): 'dt_polys' is matched to 'rec_texts' purely by index,
        # as before. Confirm the two lists are aligned in the OCR output.
        for key, box_type in (('rec_polys', 'polygon'), ('dt_polys', 'detection')):
            polys = result.get(key)
            if polys is None or len(polys) == 0:
                continue
            for i, poly in enumerate(polys):
                all_boxes.append(_polygon_to_entry(
                    poly,
                    _ocr_field_at(result, 'rec_texts', i, ''),
                    _ocr_field_at(result, 'rec_scores', i, 0),
                    box_type,
                ))
            break

    return all_boxes

In [58]:
def process_image(img_data, conf=0.4, device='cuda'):
    """Detect regions in a base64-encoded image and OCR each of them.

    Pipeline: decode the image, run the detector on a copy resized to
    ``imgsz`` x ``imgsz``, then for every detected box: scale it back to the
    original resolution, crop, deskew (``correct_skew``), binarise with an
    Otsu threshold and pass the result to the OCR engine.

    Args:
        img_data: Base64-encoded bytes/str of an encoded image file.
        conf: Detector confidence threshold.
        device: Device passed to the detector.

    Returns:
        On success ``{'success': True, 'results': [...]}`` where ``results``
        is the concatenated output of ``extract_bounding_boxes`` for every
        crop that produced text. The ``bbox`` values are the ones reported by
        the OCR engine for the cropped, deskewed image; they are not mapped
        back to the original image. If OCR raised for any crop, an extra
        ``'errors'`` list (``{'bbox': [x1, y1, x2, y2], 'error': str}``, with
        the crop's box in original-image pixels) is included.
        On failure ``{'success': False, 'error': str}``.
    """
    try:
        raw_bytes = np.frombuffer(base64.b64decode(img_data), np.uint8)
        img = cv2.imdecode(raw_bytes, cv2.IMREAD_COLOR)
        if img is None:
            # imdecode signals an undecodable buffer by returning None.
            return {'success': False, 'error': 'Could not decode image data'}
        h, w = img.shape[:2]

        results = model(cv2.resize(img, (imgsz, imgsz)), verbose=False, conf=conf, device=device)
        detections = results[0]
        ocr_results = []
        ocr_errors = []

        # As before, nothing is processed when the detector returned no masks.
        if detections.masks is not None:
            # NOTE: the masks themselves are not consumed. The earlier mask
            # validation only selected between two identical code paths (the
            # perspective warp is disabled), while converting the whole mask
            # tensor to Python lists once per box. It also matched masks to
            # boxes by sorting on mask pixel (0, 0), which is not a meaningful
            # key. If a mask-based warp is reinstated (find_mask_corners +
            # order_points), take the i-th mask for the i-th box. Ultralytics
            # is expected to return them in the same order, so confirm that
            # against its Results docs.
            scale_x, scale_y = w / imgsz, h / imgsz

            for row in detections.boxes.data.tolist():
                # Truncate first, then scale, to keep the crop bounds
                # identical to the previous implementation.
                bx1, by1, bx2, by2 = (int(v) for v in row[:4])
                x1, y1 = int(bx1 * scale_x), int(by1 * scale_y)
                x2, y2 = int(bx2 * scale_x), int(by2 * scale_y)

                crop_bgr = img[y1:y2, x1:x2]
                if crop_bgr.size == 0:
                    continue

                _, corrected_bgr = correct_skew(crop_bgr.copy())
                final_gray = cv2.cvtColor(corrected_bgr, cv2.COLOR_BGR2GRAY)
                _, thresholded = cv2.threshold(final_gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

                try:
                    final_img = cv2.cvtColor(thresholded, cv2.COLOR_GRAY2RGB)
                    result = ocr.predict(final_img)
                    if result and result[0] and result[0].get('rec_texts', []):
                        ocr_results.extend(extract_bounding_boxes(result))
                except Exception as e:
                    # Best effort per crop: keep going, but record the failure
                    # instead of discarding it.
                    ocr_errors.append({'bbox': [x1, y1, x2, y2], 'error': str(e)})

        response = {'success': True, 'results': ocr_results}
        if ocr_errors:
            response['errors'] = ocr_errors
        return response

    except Exception as e:
        return {'success': False, 'error': str(e)}

In [59]:
# img_path = './datasets/valid/images/ds1_0000_jpg.rf.4eae7fa15bbb753677b864a8c8437835.jpg'
# img_path = './6163206443046652006.jpg'
img_path = './TRAINING/sg-11134207-7rcda-lri9ci2lu1wo0e.jpg'

# Read through a context manager so the file handle is closed right away
# instead of being left to garbage collection.
with open(img_path, 'rb') as image_file:
    img = base64.b64encode(image_file.read()).decode('utf-8')

process_image(img)


  data = inter.rotate(arr, angle, reshape=False, order=0)


{'success': True,
 'results': [{'bbox': [64, 15, 73, 25],
   'text': '￥',
   'score': 0.528009831905365,
   'type': 'rec_box'},
  {'bbox': [46, 33, 55, 42],
   'text': '★',
   'score': 0.3508351147174835,
   'type': 'rec_box'},
  {'bbox': [81, 32, 91, 42],
   'text': '★',
   'score': 0.8847569227218628,
   'type': 'rec_box'},
  {'bbox': [52, 56, 61, 66],
   'text': '★',
   'score': 0.6082736253738403,
   'type': 'rec_box'},
  {'bbox': [77, 58, 84, 65],
   'text': '1',
   'score': 0.10120319575071335,
   'type': 'rec_box'},
  {'bbox': [18, 232, 207, 272],
   'text': 'NAME SURNAME',
   'score': 0.98024582862854,
   'type': 'rec_box'},
  {'bbox': [20, 267, 166, 291],
   'text': 'JOB POSITION',
   'score': 0.9643695950508118,
   'type': 'rec_box'},
  {'bbox': [17, 302, 134, 322],
   'text': '+0123 456 789',
   'score': 0.9298632740974426,
   'type': 'rec_box'},
  {'bbox': [16, 343, 220, 360],
   'text': 'name@companyname.com',
   'score': 0.982434868812561,
   'type': 'rec_box'},
  {'bbox'

In [57]:
# image_path = './sg-11134207-7rcda-lri9ci2lu1wo0e.jpg'
# results = model(image_path)

# img = cv2.imread(image_path)
# img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

# plt.figure(figsize=(15, 10))
# plt.subplot(1, 2, 1)
# plt.imshow(img_rgb)
# plt.title('Original Image')
# plt.axis('off')

# plt.subplot(1, 2, 2)
# annotated_img = results[0].plot()
# annotated_rgb = cv2.cvtColor(annotated_img, cv2.COLOR_BGR2RGB)
# plt.imshow(annotated_rgb)
# plt.title('YOLO Segmentation Results')
# plt.axis('off')

# plt.tight_layout()
# plt.show()

# for result in results:
#     print(f"Detected {len(result.boxes)} objects")
#     if result.masks is not None:
#         print(f"Generated {len(result.masks)} masks")
#     print(f"Confidence scores: {result.boxes.conf.tolist()}")
#     print(f"Class IDs: {result.boxes.cls.tolist()}")