# Setup

In [None]:
!sudo apt install tesseract-ocr
!pip install pytesseract
!pip install matplotlib --upgrade
!pip install -U pillow

from google.colab import drive
drive.mount('/content/gdrive')

import math
import time
import cv2
import os
import pytesseract
from pytesseract import Output
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
#from deslant_img import deslant_img
import imutils
import fnmatch
import pandas as pd
from datetime import datetime

from skimage.transform import hough_line, hough_line_peaks, probabilistic_hough_line
from skimage.feature import canny
from skimage.draw import line

# Hyperparameters
# Every input page is resized to img_width x img_height (pixels) before any processing.
img_width = 2100
img_height = 1650
# Detected lines whose center intercepts differ by at most this many pixels are merged.
line_dedup_threshold = 10
# The crop window handed to line detection starts crop_left_margin px to the left of and
# crop_top_margin px above the current cell center, and is crop_dim x crop_dim pixels.
crop_left_margin = 100
crop_top_margin = 40
crop_dim = 200
# NOTE(review): min_horizontal_gap is not referenced by any code visible in this notebook.
min_horizontal_gap = 35
# Horizontal lines whose center intercept is below this value are discarded.
min_horizontal_intercept = 22
# If the first remaining horizontal line lies below this intercept, a flat synthetic line is
# inserted at default_first_horizontal_intercept to act as the first line.
max_first_horizontal_intercept = 50
default_first_horizontal_intercept = 15

# Size of each extracted segment image, and its corners listed as
# bottom-left, bottom-right, top-right, top-left (same order as Box.polygon_float).
segment_width, segment_height = 100, 25
segment_corners = np.array([(0, segment_height), (segment_width, segment_height), (segment_width, 0), (0, 0)]).astype(np.float32)

# Parameters
# All paths live under the Google Drive mount created by drive.mount above.
root_dir = "/content/gdrive/MyDrive/ErukaTraining/OC/"
# Source images read by the batch loop.
input_dir = f"{root_dir}raw/"
# Destination of the segmented images (written under the same file name as the input).
output_dir = f"{root_dir}images/"
# Timestamped CSV logs of files that failed segmentation.
error_dir = f"{root_dir}processing_logs/segmentation_errors/"

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following additional packages will be installed:
  tesseract-ocr-eng tesseract-ocr-osd
The following NEW packages will be installed:
  tesseract-ocr tesseract-ocr-eng tesseract-ocr-osd
0 upgraded, 3 newly installed, 0 to remove and 22 not upgraded.
Need to get 4,850 kB of archives.
After this operation, 16.3 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu focal/universe amd64 tesseract-ocr-eng all 1:4.00~git30-7274cfa-1 [1,598 kB]
Get:2 http://archive.ubuntu.com/ubuntu focal/universe amd64 tesseract-ocr-osd all 1:4.00~git30-7274cfa-1 [2,990 kB]
Get:3 http://archive.ubuntu.com/ubuntu focal/universe amd64 tesseract-ocr amd64 4.1.1-2build2 [262 kB]
Fetched 4,850 kB in 0s (12.5 MB/s)
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/Fron

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pillow
  Downloading Pillow-9.4.0-cp39-cp39-manylinux_2_28_x86_64.whl (3.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m75.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pillow
  Attempting uninstall: pillow
    Found existing installation: Pillow 8.4.0
    Uninstalling Pillow-8.4.0:
      Successfully uninstalled Pillow-8.4.0
Successfully installed pillow-9.4.0


Mounted at /content/gdrive


# Data Structures

In [None]:
class Point:
    """A 2-D position exposing its coordinates in several convenient forms.

    Attributes:
        x, y: the coordinates exactly as given.
        coords_float: ``[x, y]`` unmodified.
        coords_int: ``[x, y]`` with each value passed through the built-in ``round``.
    """

    def __init__(self, x, y):
        self.x, self.y = x, y
        self.coords_float = [x, y]
        self.coords_int = [round(value) for value in (x, y)]

class Line:
    """A straight line described by one point on it and its slope.

    Attributes:
        x, y: a point lying on the line.
        slope: dy/dx of the line.
        is_horizontal: True when ``abs(slope) < 1``; every other line is treated as vertical.
        intercept: where the line crosses the central axis of the crop window
            (the window center is at ``(crop_dim/2, crop_dim/2)``): the y value at
            ``x = crop_dim/2`` for horizontal lines, the x value at ``y = crop_dim/2``
            for vertical ones.
    """

    def __init__(self, x, y, slope):
        self.x, self.y, self.slope = x, y, slope
        self.is_horizontal = abs(slope) < 1

        if not self.is_horizontal:
            # Walk along the line until y reaches the vertical center; record x there.
            self.intercept = x - ((y - crop_dim/2) / slope)
        else:
            # Walk along the line until x reaches the horizontal center; record y there.
            self.intercept = y - ((x - crop_dim/2) * slope)

class Box:
    """A quadrilateral given by its four corner points.

    Attributes:
        tl, tr, bl, br: the corner points (top-left, top-right, bottom-left, bottom-right).
        width, height: extent of the axis-aligned bounding rectangle of the corners.
        polygon_int: corners as rounded coordinates, ordered bl, br, tr, tl.
        polygon_float: same order, unrounded, stored as float32.
        center: center of the axis-aligned bounding rectangle.
    """

    def __init__(self, tl, tr, bl, br):
        self.tl, self.tr, self.bl, self.br = tl, tr, bl, br

        corners = (tl, tr, bl, br)
        xs = [corner.x for corner in corners]
        ys = [corner.y for corner in corners]
        x_lo, x_hi = min(xs), max(xs)
        y_lo, y_hi = min(ys), max(ys)

        self.width = x_hi - x_lo
        self.height = y_hi - y_lo

        # Polygon order differs from the constructor order: bl, br, tr, tl.
        ring = (bl, br, tr, tl)
        self.polygon_int = np.array([corner.coords_int for corner in ring])
        self.polygon_float = np.array([corner.coords_float for corner in ring], dtype=np.float32)

        self.center = Point((x_lo+x_hi)/2, (y_lo+y_hi)/2)

    def __str__(self):
        """Return the four corners as text, in tl, tr, bl, br order."""
        return f"tl: ({self.tl.x},{self.tl.y}) tr: ({self.tr.x},{self.tr.y}) bl: ({self.bl.x},{self.bl.y}) br: ({self.br.x},{self.br.y})"

TODO:
- Cases where building is not detected
- Test stability for more images

# Line Detection

In [None]:
# Merge horizontal lines that are close together
def dedup_horizontal_lines(raw_lines):
    """Collapse runs of horizontal lines with nearly equal intercepts into single lines.

    ``raw_lines`` is sorted in place by intercept. A line joins the current group when its
    intercept is within ``line_dedup_threshold`` of the group's representative; the
    representative is then replaced by a new ``Line`` anchored at ``x = crop_dim/2`` whose
    intercept and slope are the running averages over the group.

    Returns:
        list of Line, ordered by intercept.
    """
    raw_lines.sort(key=lambda l: l.intercept)

    merged = []
    group_size = 1
    for candidate in raw_lines:
        starts_new_group = len(merged) == 0 or abs(candidate.intercept - merged[-1].intercept) > line_dedup_threshold
        if starts_new_group:
            merged.append(candidate)
            group_size = 1
            continue

        representative = merged[-1]
        avg_intercept = (representative.intercept*group_size + candidate.intercept)/(group_size+1)
        avg_slope = (representative.slope*group_size + candidate.slope)/(group_size+1)
        merged[-1] = Line(crop_dim/2, avg_intercept, avg_slope)
        group_size += 1

    return merged

# Ensure the first horizontal line is valid
def filter_horizontal_lines(raw_lines):
    """Drop horizontal lines above the top margin and guarantee a usable first line.

    Args:
        raw_lines: horizontal ``Line`` objects, expected in ascending intercept order
            (the first surviving element is treated as the topmost line).

    Returns:
        A new list containing every line whose intercept is not below
        ``min_horizontal_intercept``. When the first surviving line lies below
        ``max_first_horizontal_intercept``, a flat synthetic line at
        ``default_first_horizontal_intercept`` is inserted in front of it.
        An empty list is returned when no line survives, so the caller can report
        the shortage itself instead of hitting an ``IndexError`` here.
    """
    # Remove lines outside of margin
    lines = [line for line in raw_lines if not (line.intercept < min_horizontal_intercept)]

    # Guard against an empty result before peeking at the first line.
    if lines and lines[0].intercept > max_first_horizontal_intercept:
        lines.insert(0, Line(crop_dim/2, default_first_horizontal_intercept, 0))

    return lines

def dedup_vertical_lines(raw_lines):
    """Collapse runs of vertical lines with nearly equal intercepts into single lines.

    ``raw_lines`` is sorted in place by intercept. A line joins the current group when its
    intercept is within ``line_dedup_threshold`` of the group's representative; the
    representative is then replaced by a new ``Line`` anchored at ``y = crop_dim/2`` whose
    intercept is the running average over the group. Unlike the horizontal variant the
    slope is not averaged: the most recently merged line's slope is kept.

    Returns:
        list of Line, ordered by intercept.
    """
    raw_lines.sort(key=lambda l: l.intercept)

    merged = []
    group_size = 1
    for candidate in raw_lines:
        starts_new_group = len(merged) == 0 or abs(candidate.intercept - merged[-1].intercept) > line_dedup_threshold
        if starts_new_group:
            merged.append(candidate)
            group_size = 1
            continue

        avg_intercept = (merged[-1].intercept*group_size + candidate.intercept)/(group_size+1)
        merged[-1] = Line(avg_intercept, crop_dim/2, candidate.slope) # Don't average the slopes
        group_size += 1

    return merged

def detect_lines(img_cropped):
    """Detect the near-horizontal and near-vertical lines in a cropped BGR image.

    The crop is binarized, edge-detected, and run through a Hough line transform.
    Peaks whose direction is more than pi/8 away from both axes are discarded; the
    rest are converted to ``Line`` objects, split by orientation and de-duplicated.

    Args:
        img_cropped: BGR image array (it is passed to ``cv2.cvtColor`` with
            ``COLOR_BGR2GRAY``).

    Returns:
        ``(h_lines, v_lines)``: horizontal lines after de-duplication and
        ``filter_horizontal_lines``, and vertical lines after de-duplication.
    """
    # Preprocessing for Hough: binarize, then extract edges
    gray = cv2.cvtColor(img_cropped, cv2.COLOR_BGR2GRAY)
    _, thresh_image = cv2.threshold(gray, 165, 255, cv2.THRESH_BINARY)
    binary = cv2.convertScaleAbs(thresh_image) # converting the scale
    edges = cv2.Canny(binary, 0, 200)

    # Peak-picking parameters. These two values were previously passed positionally,
    # which binds them to `min_distance` and `min_angle` of hough_line_peaks; the
    # keywords below keep exactly that binding and leave `threshold` at the library
    # default.
    # NOTE(review): if an accumulator threshold of 10 was intended, it has to be
    # passed as `threshold=` - confirm before changing, the other constants were
    # tuned against the current behaviour.
    peak_min_distance = 10
    peak_min_angle = 15

    # Perform hough transformation
    # TODO: only scan near horizontal and near vertical lines
    tested_angles = np.linspace(-np.pi, np.pi, 360, endpoint=False)

    # Apply hough lines to retrieve all possible lines
    h, theta, d = hough_line(edges, theta=tested_angles)
    _, angles, dists = hough_line_peaks(h, theta, d, min_distance=peak_min_distance, min_angle=peak_min_angle)

    lines = []
    for angle, dist in zip(angles, dists):
        # We are only looking for nearly vertical and horizontal lines so use a pi/8 margin.
        # Angles span [-pi, pi), so fold them onto [0, pi/2) first; testing abs(angle)
        # alone would let diagonals with |angle| in (5pi/8, 7pi/8) slip through.
        off_axis = abs(angle) % (np.pi/2)
        if np.pi/8 < off_axis < 3*np.pi/8:
            continue
        # (x0, y0) is the point on the line closest to the origin; the line itself is
        # perpendicular to the direction `angle`.
        (x0, y0) = dist * np.array([np.cos(angle), np.sin(angle)])
        slope = np.tan(angle + np.pi/2)
        lines.append(Line(x0, y0, slope))

    h_lines = [line for line in lines if line.is_horizontal]
    v_lines = [line for line in lines if not line.is_horizontal]

    deduped = dedup_horizontal_lines(h_lines)
    deduped = filter_horizontal_lines(deduped)

    return deduped, dedup_vertical_lines(v_lines)


# Resolve bounding boxes

In [None]:
## Finding the intersection points

# To avoid some precision bugs
def bound_slope(slope):
    """Clamp ``slope`` to the closed range [-1000, 1000]; in-range values pass through unchanged."""
    if slope < -1000:
        return -1000
    if slope > 1000:
        return 1000
    return slope

def intersection(line1, line2):
    """Return the ``Point`` where two lines cross.

    Each line, given as a point ``(x0, y0)`` and slope ``m``, satisfies
    ``-m*x + y = y0 - m*x0``. The two equations are stacked into ``A @ [x, y] = b``
    and solved in the least-squares sense. Slopes are clamped with ``bound_slope``
    before they enter the system.
    """
    pair = (line1, line2)
    coefficients = np.array([[bound_slope(-ln.slope), 1] for ln in pair])
    rhs = np.array([[ln.y - bound_slope(ln.slope)*ln.x] for ln in pair])

    # use lstsq to solve Ax = b, not inv() which is unstable
    solution = np.linalg.lstsq(coefficients, rhs, rcond=-1)[0]
    cross_x, cross_y = solution[0][0], solution[1][0]
    return Point(cross_x, cross_y)

# Function sourced from this answer: https://stackoverflow.com/a/70371736

def hough_inter(theta1, rho1, theta2, rho2):
    """Intersect two lines given in Hough normal form.

    Each line is the set of points with ``x*cos(theta) + y*sin(theta) = rho``.

    Args:
        theta1, rho1: angle (radians) and signed distance of the first line.
        theta2, rho2: angle (radians) and signed distance of the second line.

    Returns:
        numpy array ``[x, y]``, the least-squares solution of the 2x2 system.
    """
    A = np.array([[np.cos(theta1), np.sin(theta1)],
                  [np.cos(theta2), np.sin(theta2)]])
    b = np.array([rho1, rho2])

    # use lstsq to solve Ax = b, not inv() which is unstable.
    # rcond is passed explicitly (same value as in `intersection`): leaving it out
    # emits a FutureWarning on NumPy < 2 and selects a different cutoff on NumPy >= 2.
    return np.linalg.lstsq(A, b, rcond=-1)[0]

def resolve_bounding_boxes(h_lines, v_lines):
    """Build the cells enclosed by consecutive horizontal lines and two vertical lines.

    Args:
        h_lines: horizontal lines, in the order the boxes should be stacked.
        v_lines: vertical lines; only the first two are used as the left and right
            box edges.

    Returns:
        ``(intersections, boxes)`` where ``intersections`` holds the crossing point of
        every horizontal line with every vertical line, and ``boxes`` holds one ``Box``
        per pair of consecutive horizontal lines.
    """
    # Every horizontal/vertical combination, horizontal-major order
    intersections = [intersection(h_line, v_line) for h_line in h_lines for v_line in v_lines]

    boxes = []
    upper_left = upper_right = None
    for h_line in h_lines:
        lower_left = intersection(h_line, v_lines[0])
        lower_right = intersection(h_line, v_lines[1])
        # The first horizontal line only provides the top edge of the first box.
        if upper_left is not None:
            boxes.append(Box(upper_left, upper_right, lower_left, lower_right))
        # The bottom edge of this box becomes the top edge of the next one.
        upper_left = Point(lower_left.x, lower_left.y)
        upper_right = Point(lower_right.x, lower_right.y)

    return intersections, boxes



# Batch process

In [None]:
# Get initial position, this should be the center of the "Buildings" cell
def get_initial_position(img):
    """Locate the word "buildings" (case-insensitive) via OCR and return its center.

    Args:
        img: image passed to ``pytesseract.image_to_data``.

    Returns:
        ``Point`` at the center of the word's OCR bounding box. When the word occurs
        more than once, the last occurrence in the OCR output is used.

    Raises:
        RuntimeError: if the word is not found.
    """
    ocr = pytesseract.image_to_data(img, output_type=Output.DICT)

    matches = [idx for idx, word in enumerate(ocr['text']) if word.lower() == "buildings"]
    if not matches:
        raise RuntimeError("could not find buildings cell")

    hit = matches[-1]
    center_x = ocr['left'][hit] + ocr['width'][hit]/2
    center_y = ocr['top'][hit] + ocr['height'][hit]/2
    return Point(center_x, center_y)

def interpolate_row_coordinate(a, b, rows):
    """Return the point lying ``1/rows`` of the way from ``a`` to ``b``.

    The result is the weighted mean of the two points with weight ``rows-1`` on ``a``
    and weight 1 on ``b``.
    """
    weight_a = rows - 1
    return Point((a.x * weight_a + b.x)/rows, (a.y * weight_a + b.y)/rows)

def get_next_position(center, img, out_dir, i):
    """Segment the cell below the one centered at ``center`` and return it.

    A ``crop_dim`` x ``crop_dim`` window is cut from ``img`` starting
    ``crop_left_margin`` px left of and ``crop_top_margin`` px above ``center``.
    Lines are detected in that window, turned into boxes, and the second box from the
    top (with corrections for an extra or a missing horizontal line) is warped to a
    ``segment_width`` x ``segment_height`` image.

    Args:
        center: object with ``x`` and ``y`` attributes, in ``img`` pixel coordinates.
        img: full page image array.
        out_dir, i: only referenced by the commented-out debug ``savefig`` calls;
            kept so existing callers keep working.

    Returns:
        ``(next_center, segment)``: the center of the selected box as a ``Point`` in
        ``img`` coordinates, and the warped segment image.

    Raises:
        RuntimeError: if the crop window starts outside the image, fewer than three
            horizontal lines are found, or the selected box is not roughly 180x50.
        IndexError: if an extra-line correction needs a third box that does not exist.
    """
    top = int(center.y - crop_top_margin)
    left = int(center.x - crop_left_margin)

    # A negative start index would be read by NumPy as "count from the far edge",
    # yielding an empty or unrelated crop and an OpenCV error that the batch loop
    # does not catch. Report it as a regular segmentation failure instead.
    if top < 0 or left < 0:
        raise RuntimeError(f"crop window origin ({left},{top}) is outside the image")

    img_cropped = img[top:top+crop_dim, left:left+crop_dim]
    h_lines, v_lines = detect_lines(img_cropped)

    # Debug overlay: horizontal lines in red, vertical lines in blue
    plt.clf()
    plt.imshow(img_cropped)
    for line in h_lines:
        plt.axline((line.x, line.y), slope=line.slope, color='red')
    for line in v_lines:
        plt.axline((line.x, line.y), slope=line.slope, color='blue')

    if len(h_lines) < 3:
        # plt.savefig(f"{out_dir}{i}_lines.jpg")
        raise RuntimeError("could not detect at least 3 horizontal lines")

    if len(v_lines) != 2:
        # plt.savefig(f"{out_dir}{i}_lines.jpg")
        print("Warning: could not detect exactly 2 vertical lines, using defaults")
        # Two steep (slope 1000) lines near the left and right edges of the crop
        v_lines = [Line(10, crop_dim/2, 1000), Line(190, crop_dim/2, 1000)]

    _, boxes = resolve_bounding_boxes(h_lines, v_lines)

    next_box = boxes[1]

    if next_box.height < 35:
        print(f"Warning: box of height {next_box.height} is likely due to a extra horizontal line")
        next_box = boxes[2]

    if next_box.height > 70:
        print(f"Warning: box of height {next_box.height} is likely due to a missing horizontal line")
        # Estimate how many ~50 px rows were fused and keep only the first one
        rows = round(next_box.height/50)
        next_box = Box(next_box.tl, next_box.tr, interpolate_row_coordinate(next_box.tl, next_box.bl, rows), interpolate_row_coordinate(next_box.tr, next_box.br, rows))

    if not (math.isclose(next_box.height, 50, rel_tol=0.2) and math.isclose(next_box.width, 180, rel_tol=0.2)):
        # plt.savefig(f"{out_dir}{i}_lines.jpg")
        raise RuntimeError(f"box dimensions {next_box.width}x{next_box.height} is unexpected")

    # Rectify the (possibly skewed) box into an axis-aligned segment image
    M = cv2.getPerspectiveTransform(next_box.polygon_float, segment_corners)
    out = cv2.warpPerspective(img_cropped, M, (segment_width, segment_height))

    # Translate the box center from crop coordinates back to page coordinates
    return Point(center.x - crop_left_margin + next_box.center.x, center.y - crop_top_margin + next_box.center.y), out

all_files = set(os.listdir(input_dir))
print(f"Input files: {len(all_files)}")

existing_files = set(os.listdir(output_dir))
print(f"Existing files: {len(existing_files)}")

# Gather every file name already recorded as failed in an earlier run's CSV log
error_files = fnmatch.filter(os.listdir(error_dir), "*.csv")
error_set = set()
for error_file in error_files:
    error_file_df = pd.read_csv(f'{error_dir}{error_file}', converters={'file': str})
    error_file_set = set(error_file_df['file'].values.flatten())
    error_set.update(error_file_set)
print(f"Errors: {len(error_set)}")

# Sorted so that the batch slices below are identical across sessions; a list built
# straight from a set has no stable order, which makes batch/batch_total unreliable.
missing_files = sorted(all_files - existing_files - error_set)
print(f"Files to process: {len(missing_files)}")

# This run handles slice number `batch` out of `batch_total` equal slices
batch = 0
batch_total = 1
batch_files = missing_files[batch*len(missing_files)//batch_total:(batch+1)*len(missing_files)//batch_total]
errors = []
box_count = 0


def _write_error_log(error_rows):
    """Write the accumulated [file, message] rows to a timestamped CSV in error_dir and return the frame."""
    frame = pd.DataFrame(error_rows, columns=['file', 'message'])
    frame.to_csv(f"{error_dir}{datetime.now().strftime('%Y-%m-%d-%H-%M')}.csv", index=False)
    return frame


try:
    for file_i, file in enumerate(batch_files):
        if not file.endswith(".jpg"):
            continue

        print(f"Processing file ({file_i}/{len(batch_files)}): {file}")

        # Periodic checkpoint of the (cumulative) error list
        if (file_i + 1) % 1000 == 0:
            errors_df = _write_error_log(errors)

        img = cv2.imread(f'{input_dir}{file}')
        try:
            img = cv2.resize(img, (img_width, img_height))
        except Exception:
            # Exception (not a bare except) so that KeyboardInterrupt still stops the run
            print(f"Processing failed for {file}: resize error\n")
            errors.append([file, "resize error"])
            continue

        try:
            center = get_initial_position(img)
            print(f"Center: ({center.x},{center.y})")
        except RuntimeError as error:
            print(f"Processing failed for {file}: {str(error)}\n")
            errors.append([file, str(error)])
            continue

        try:
            center, segmented_img = get_next_position(center, img, output_dir, 0)
            box_count += 1
        except (RuntimeError, IndexError) as error:
            print(f"Processing failed for {file}: {str(error)}\n")
            errors.append([file, str(error)])
            continue

        cv2.imwrite(f"{output_dir}{file}", segmented_img)
finally:
    # Always persist the errors collected so far, even when the loop is interrupted,
    # so that the next run can skip the files that already failed.
    errors_df = _write_error_log(errors)

print(f"Box count: {box_count}")



KeyboardInterrupt: ignored