# <font color="black">Assignment 2: Reading Order of Documents using TesseractOCR and OpenCV</font>

In [1]:
import cv2
import pytesseract
# Tell pytesseract which Tesseract executable to launch. This is an absolute,
# Windows-specific path, so it must be adjusted for any other installation.
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'



### <font color="black">Cropped the original document to remove unnecessary noise</font>

In [2]:
image = cv2.imread('Assignment02-Test-Page-Edited.jpg')
# cv2.imread reports a missing or unreadable file by returning None instead of
# raising, so fail here with a clear message rather than in a later cell.
if image is None:
    raise FileNotFoundError("Could not read 'Assignment02-Test-Page-Edited.jpg'")


###  <font color="black">Convert to a gray image for removal of colors if any</font>

In [3]:
# Collapse the BGR page into a single-channel grayscale image.
gray_image = cv2.cvtColor(src=image, code=cv2.COLOR_BGR2GRAY)


###  <font color="black"> Used a Gaussian Blur to help in creating vertical and horizontal contours </font>

In [4]:
# Smooth the grayscale page with an 11x11 Gaussian kernel (sigma derived from
# the kernel size because sigmaX is 0), then save the result for inspection.
blur = cv2.GaussianBlur(src=gray_image, ksize=(11, 11), sigmaX=0)
cv2.imwrite(filename='blur.jpg', img=blur)

True

### <font color="black"> Inverted Binarized Image </font>

In [5]:
# Inverted binary threshold with the level chosen by Otsu's method; cv2.threshold
# returns (level, image) and only the image (index 1) is kept.
threshold_image = cv2.threshold(
    src=blur,
    thresh=0,
    maxval=255,
    type=cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU,
)[1]
cv2.imwrite(filename='thresh.jpg', img=threshold_image)

True

### <font color="black"> Dilate the image with a rectangular kernel to fill in gaps and isolate contours </font>

In [6]:
# Rectangular structuring element of size (10, 1), saved for reference.
kernel = cv2.getStructuringElement(shape=cv2.MORPH_RECT, ksize=(10, 1))
cv2.imwrite(filename='kernel.jpg', img=kernel)

# Three dilation passes with that kernel over the binarized page.
dilated_image = cv2.dilate(src=threshold_image, kernel=kernel, iterations=3)
cv2.imwrite(filename='dilate.jpg', img=dilated_image)

True

### <font color="black"> Find Contours Coordinates</font>

In [7]:
# Outermost contours only, with straight runs compressed to their end points.
# At this point `contours` still holds the raw findContours return tuple.
contours = cv2.findContours(
    image=dilated_image,
    mode=cv2.RETR_EXTERNAL,
    method=cv2.CHAIN_APPROX_SIMPLE,
)

In [8]:
# Extract the contour list from the findContours result: element 0 when the
# result has two items, element 1 otherwise (presumably to cope with OpenCV
# versions that return three items -- TODO confirm).
contours = contours[1] if len(contours) != 2 else contours[0]

### <font color="black">Convert to coordinates for rectangle creation through opencv</font>

In [9]:
# Outline the bounding rectangle of every contour in green on the page image,
# then save the annotated page.
for contour in contours:
    left, top, width, height = cv2.boundingRect(contour)
    top_left = (left, top)
    bottom_right = (left + width, top + height)
    cv2.rectangle(image, top_left, bottom_right, (0, 255, 0), 2)
cv2.imwrite('final.jpg', image)

True

## <font color="black">Note: Signatures could not be removed because their ink weight is close to that of the typed characters</font>

## <font color="black">Q2: Creation of DAG </font>

In [10]:
class LineSegmentNode:
    """Bounding box of one detected segment, usable as a graph node.

    Attributes:
        x, y: Coordinates the box was created with.
        w, h: Width and height of the box.
        x_range: ``(x, x + w)`` horizontal extent.
        y_range: ``(y, y + h)`` vertical extent.
        children: Nodes this node has an outgoing edge to.
    """

    def __init__(self, x, y, w, h):
        self.x, self.y = x, y
        self.w, self.h = w, h
        # Extents are stored as (start, end) pairs along each axis.
        self.x_range = (x, x + w)
        self.y_range = (y, y + h)
        self.children = list()

    def add_child(self, child):
        """Record an outgoing edge from this node to ``child``."""
        self.children.extend((child,))


### <font color="black">Define conditions for edge creation </font>

In [11]:
def is_above(a, b):
    """Return True when ``a.y`` is strictly smaller than ``b.y``."""
    return b.y > a.y

def is_left_of(a, b):
    """Return True when the right edge of ``a`` (``a.x + a.w``) does not pass ``b.x``."""
    right_edge = a.x + a.w
    return right_edge <= b.x

def is_overlapping(a, b):
    """Return True when the ``x_range`` intervals of ``a`` and ``b`` intersect.

    Intervals that merely touch at an end point count as overlapping.
    """
    a_left, a_right = a.x_range[0], a.x_range[1]
    b_left, b_right = b.x_range[0], b.x_range[1]
    # Disjoint means one interval ends strictly before the other begins.
    return not (a_right < b_left or a_left > b_right)

def is_between(a, b, c):
    """Return True when ``c.y`` lies within ``[a.y_range[0], b.y_range[1]]`` (inclusive)."""
    lower_bound = a.y_range[0]
    upper_bound = b.y_range[1]
    probe = c.y
    return lower_bound <= probe and probe <= upper_bound

### <font color="black">Building DAG</font>

In [12]:
# One node per contour, built from the contour's bounding rectangle (x, y, w, h).
line_segments = [LineSegmentNode(*cv2.boundingRect(contour)) for contour in contours]

# Edges always run from a later node `a` to an earlier node `b` in the list.
for i, a in enumerate(line_segments):
    for j, b in enumerate(line_segments[:i]):
        # Case 1: horizontally overlapping, with `a` higher on the page.
        if is_overlapping(a, b) and is_above(a, b):
            a.add_child(b)
            continue

        if not is_left_of(a, b):
            continue

        # Case 2: `a` is left of `b`. Link them unless some node listed between
        # them passes the is_between test and overlaps both horizontally.
        blocked = any(
            is_between(a, b, c) and is_overlapping(a, c) and is_overlapping(b, c)
            for c in line_segments[j+1:i]
        )
        if not blocked:
            a.add_child(b)


## <font color="black"> Drawing Line Segments through Graph Traversal </font>

In [13]:
def draw_line_segments(node, image):
    """Draw the bounding box of ``node`` and of every node reachable from it.

    The graph is a DAG, so a descendant can be reachable through several
    parents. Each node is drawn exactly once per call: visited nodes are
    tracked, and the traversal uses an explicit stack instead of recursion so
    long chains cannot exhaust Python's recursion limit.

    Args:
        node: Start node; must expose ``x``, ``y``, ``w``, ``h`` and ``children``.
        image: Image passed to ``cv2.rectangle``; it is modified in place.
    """
    seen = {node}
    pending = [node]
    while pending:
        current = pending.pop()
        cv2.rectangle(image, (current.x, current.y), (current.x + current.w, current.y + current.h), (0, 255, 0), 2)
        # Push in reverse so children are handled in their listed order.
        for child in reversed(current.children):
            if child not in seen:
                seen.add(child)
                pending.append(child)
# Start a drawing pass from every node that has no children, then save the page.
for node in (segment for segment in line_segments if not segment.children):
    draw_line_segments(node, image)
cv2.imwrite('finalgraphed.jpg', image)

True

## <font color="black">Sorted By Width with OrderNumber </font>

In [14]:
# Rank the segments by width in a new list. Sorting `line_segments` in place
# would change the node order seen by every cell that runs afterwards.
segments_by_width = sorted(line_segments, key=lambda segment: segment.w)

# Annotate a copy of the page so these rank labels do not end up in images
# written by other cells that draw on `image`.
width_annotated = image.copy()
for i, node in enumerate(segments_by_width):
    cv2.rectangle(width_annotated, (node.x, node.y), (node.x + node.w, node.y + node.h), (0, 255, 0), 2)
    cv2.putText(width_annotated, str(i+1), (node.x-5, node.y+15), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 2)

cv2.imwrite('sorted_line_segments.jpg', width_annotated)

True

## <font color="black">After Topological Sort</font>

In [13]:

def topological_sort(node, stack, visited):
    """Append ``node`` and its unvisited descendants to ``stack`` in DFS post-order.

    Every node is appended after all of its children, so reading ``stack`` in
    reverse lists parents before children. The traversal is iterative: a chain
    of nodes longer than Python's recursion limit no longer raises
    ``RecursionError``, while the resulting order is the same as the recursive
    formulation.

    Args:
        node: Start node; it and its descendants must expose ``children``.
        stack: List that receives the finished nodes (modified in place).
        visited: Set of nodes already entered (modified in place).
    """
    visited.add(node)
    # Each frame pairs a node with an iterator over its not-yet-examined children.
    pending = [(node, iter(node.children))]
    while pending:
        current, remaining_children = pending[-1]
        for child in remaining_children:
            if child not in visited:
                visited.add(child)
                pending.append((child, iter(child.children)))
                break  # descend into this child before examining its siblings
        else:
            # All children are finished, so the node itself is finished.
            pending.pop()
            stack.append(current)


# Shallow copy of the node list; not referenced again in this cell.
all_nodes = line_segments.copy()

# Roots are the nodes that never appear in any node's `children`. Gathering all
# children into a set once avoids rescanning every child list for every node.
nodes_with_parent = {child for parent in line_segments for child in parent.children}
no_incoming_edges = [node for node in line_segments if node not in nodes_with_parent]

# Depth-first post-order from every root; reversing it puts parents first.
stack = []
visited = set()
for node in no_incoming_edges:
    topological_sort(node, stack, visited)

# Label a copy of the page so annotations from other cells that draw on
# `image` are not mixed with (and do not receive) these order numbers.
topo_annotated = image.copy()
for order_number, node in enumerate(reversed(stack), start=1):
    cv2.rectangle(topo_annotated, (node.x, node.y), (node.x + node.w, node.y + node.h), (0, 255, 0), 2)
    cv2.putText(topo_annotated, str(order_number), (node.x, node.y - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 2)

cv2.imwrite('topological_sorted.jpg', topo_annotated)


True

# <font color="black">Generic Function for Images</font>

In [2]:
class LineSegmentNode:
    """Graph node wrapping the bounding box of one detected segment.

    Keeps the box position (``x``, ``y``) and size (``w``, ``h``), the derived
    extents ``x_range = (x, x + w)`` and ``y_range = (y, y + h)``, and the list
    of ``children`` this node links to.
    """

    def __init__(self, x, y, w, h):
        right = x + w
        bottom = y + h
        self.x, self.y, self.w, self.h = x, y, w, h
        self.x_range = (x, right)
        self.y_range = (y, bottom)
        self.children = list()

    def add_child(self, child):
        """Add ``child`` to the end of this node's child list."""
        self.children.extend((child,))

    
        
def is_above(a, b):
    """Tell whether ``a`` starts at a strictly smaller ``y`` than ``b``."""
    a_top, b_top = a.y, b.y
    return a_top < b_top

def is_left_of(a, b):
    """Tell whether ``a`` ends (``a.x_range[1]``) at or before ``b.x``."""
    _, a_right = a.x_range
    return b.x >= a_right

def is_overlapping(a, b):
    """Tell whether the horizontal extents of ``a`` and ``b`` intersect.

    The comparison is inclusive, so extents sharing only an end point overlap.
    """
    a_left, a_right = a.x_range
    b_left, b_right = b.x_range
    if a_right < b_left:
        return False
    return a_left <= b_right

def is_between(a, b, c):
    """Tell whether the end of ``c.y_range`` lies in ``[a.y_range[0], b.y_range[1]]``.

    Both bounds are inclusive. Unlike a test on ``c.y``, this looks at the
    second element of ``c.y_range``.
    """
    lower_bound = a.y_range[0]
    upper_bound = b.y_range[1]
    c_end = c.y_range[1]
    return lower_bound <= c_end and c_end <= upper_bound




def OCR(input_image,blur,kw,diterate):
    """Detect segments in ``<input_image>.jpg`` and link them into a graph.

    Pipeline: grayscale -> Gaussian blur -> inverted Otsu threshold ->
    dilation with a ``(kw, 1)`` rectangular kernel -> external contours ->
    one ``LineSegmentNode`` per contour bounding box -> child edges.
    Each intermediate image is written to disk using ``input_image`` as the
    filename prefix. Despite the name, no text recognition happens here.

    Results are published through module-level globals rather than returned:
    ``image`` (the loaded page with the bounding boxes drawn on it) and
    ``line_segments`` (the nodes, with ``children`` populated).

    Args:
        input_image: Image path without the ``.jpg`` extension.
        blur: Side length of the square Gaussian kernel passed to
            ``cv2.GaussianBlur``.
        kw: Width of the one-row rectangular dilation kernel.
        diterate: Number of dilation iterations.

    Raises:
        FileNotFoundError: If ``<input_image>.jpg`` cannot be read.
    """
    global image, line_segments

    source_path = input_image + '.jpg'
    loaded = cv2.imread(source_path)
    # cv2.imread returns None instead of raising on a missing/unreadable file.
    # Check before touching the global so a failed load leaves it untouched.
    if loaded is None:
        raise FileNotFoundError("Could not read image file: " + source_path)
    image = loaded

    gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # Keep the blurred image under its own name instead of overwriting `blur`.
    blurred = cv2.GaussianBlur(gray_image, (blur, blur), 0)
    cv2.imwrite(input_image+'_blur'+'.jpg', blurred)
    threshold_image = cv2.threshold(blurred, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
    cv2.imwrite(input_image+'_thresh'+'.jpg', threshold_image)
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (kw, 1))
    cv2.imwrite(input_image+'_kernel.jpg', kernel)
    dilated_image = cv2.dilate(threshold_image, kernel, iterations=diterate)
    cv2.imwrite(input_image+'_dilate.jpg', dilated_image)

    contours = cv2.findContours(dilated_image, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    # The contour list is element 0 of a 2-item result, element 1 otherwise.
    contours = contours[0] if len(contours) == 2 else contours[1]

    # One node per contour; its bounding box is also drawn on the page.
    line_segments = []
    for contour in contours:
        x, y, w, h = cv2.boundingRect(contour)
        cv2.rectangle(image, (x, y), (x+w, y+h), (0, 255, 0), 1)
        line_segments.append(LineSegmentNode(x, y, w, h))
    cv2.imwrite(input_image+'_bounded.jpg', image)

    # Edges always run from a later node `a` to an earlier node `b`.
    for i, a in enumerate(line_segments):
        for j in range(i):
            b = line_segments[j]
            if is_left_of(a, b):
                # Link unless a node listed between them passes is_between and
                # overlaps both horizontally.
                blocked = any(
                    is_between(a, b, c) and is_overlapping(a, c) and is_overlapping(b, c)
                    for c in line_segments[j+1:i]
                )
                if not blocked:
                    a.add_child(b)
            elif is_overlapping(a, b) and is_above(a, b):
                a.add_child(b)

    
# Alternative inputs with their (blur, kw, diterate) settings, kept for reference:
#OCR('Assignment02-Test-Page-Edited',11,10,3)
#OCR('ResearchPaper',1,13,2)
#OCR('Ad',13,7,3)
OCR(input_image='Columnx2', blur=13, kw=8, diterate=2)


    
    


## <font color="black">Sorted By Width Draw</font>

In [4]:
def SortByWidthDraw(input_image):
    """Save a copy of the page with every segment numbered by ascending width.

    Reads the module-level ``line_segments`` and ``image`` without modifying
    either: ranking uses a sorted copy of the list and labels are drawn on a
    copy of the image. Sorting the list in place, or drawing on the shared
    image, would change the node order and the picture seen by any code that
    runs afterwards.

    Args:
        input_image: Prefix of the output file; the result is written to
            ``input_image + 'sorted_line_segments.jpg'``.
    """
    segments_by_width = sorted(line_segments, key=lambda segment: segment.w)

    annotated = image.copy()
    for rank, node in enumerate(segments_by_width, start=1):
        cv2.rectangle(annotated, (node.x, node.y), (node.x + node.w, node.y + node.h), (0, 255, 0), 1)
        cv2.putText(annotated, str(rank), (node.x+5, node.y+15), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 2)

    cv2.imwrite(input_image+'sorted_line_segments.jpg', annotated)
# Alternative inputs, kept for reference:
#SortByWidthDraw('Assignment02-Test-Page-Edited')
#SortByWidthDraw('ResearchPaper')
#SortByWidthDraw('Ad')
SortByWidthDraw(input_image='Columnx2')

## <font color="black">Q4: Topological Sort Draw</font> 

In [10]:
def topological_sort(node, stack, visited):
    """Depth-first post-order walk from ``node``, appending finished nodes to ``stack``.

    A node is appended only after all of its children, so ``reversed(stack)``
    yields parents before children. Implemented with an explicit work list so
    that deep chains of nodes cannot exceed Python's recursion limit; the
    order produced matches the recursive version.

    Args:
        node: Start node; it and its descendants must expose ``children``.
        stack: Output list, extended in place.
        visited: Set of nodes already entered, updated in place. Nodes already
            in it are skipped.
    """
    visited.add(node)
    # Work list of (node, iterator over the children still to examine).
    work = [(node, iter(node.children))]
    while work:
        current, child_iter = work[-1]
        for child in child_iter:
            if child not in visited:
                visited.add(child)
                work.append((child, iter(child.children)))
                break  # handle this child fully before its siblings
        else:
            # No unexamined children remain: the node is finished.
            work.pop()
            stack.append(current)


# Shallow copy of the node list; not referenced again in this cell.
all_nodes = line_segments.copy()

# Roots are the nodes that never appear in any node's `children`. Collecting
# all children into a set once replaces a scan of every child list per node.
nodes_with_parent = {child for parent in line_segments for child in parent.children}
no_incoming_edges = [node for node in line_segments if node not in nodes_with_parent]

# Depth-first post-order from every root; reversing it puts parents first.
stack = []
visited = set()
for node in no_incoming_edges:
    topological_sort(node, stack, visited)

# Number the boxes in that order. Labels go on the shared `image` because
# TopologicalSortSave writes that global to disk.
for order_number, node in enumerate(reversed(stack), start=1):
    cv2.rectangle(image, (node.x, node.y), (node.x + node.w, node.y + node.h), (0, 255, 0), 1)
    cv2.putText(image, str(order_number), (node.x, node.y + 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 2)

def TopologicalSortSave(input_image):
    """Write the module-level ``image`` to ``input_image + 'topological_sorted.jpg'``."""
    output_path = input_image + 'topological_sorted.jpg'
    cv2.imwrite(output_path, image)
# Alternative inputs, kept for reference:
#TopologicalSortSave('Assignment02-Test-Page-Edited')
#TopologicalSortSave('ResearchPaper')
#TopologicalSortSave('Ad')
TopologicalSortSave(input_image='Columnx2')