Analysis of the paper: [Text and non-text segmentation based on connected component features](https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=7333930) (ICDAR 2015)

# Get data

In [1]:
# Colab-only: mount Google Drive at /content/drive so files stored on Drive
# can be read through that path by the cells below.
from google.colab import drive 
drive.mount("/content/drive")

Mounted at /content/drive


In [2]:
!ls /content/drive/MyDrive/BTP/100Docbank | wc -l
DIR = "/content/drive/MyDrive/BTP/100Docbank"

100


# Implement the steps of the algorithm on randomly selected images

## Imports and image selection

In [3]:
import numpy as np
import cv2
import os
from google.colab.patches import cv2_imshow

In [4]:
# Fix the seed so the same images are picked on every run.
np.random.seed(69)
# Sort the listing: os.listdir gives no ordering guarantee, and the sampled
# indices only map to the same files if the order is stable.
l = sorted(os.listdir(DIR))

# Draw ten indices one at a time (with replacement) and turn each into a full path.
imgp = [os.path.join(DIR, l[np.random.randint(len(l))]) for _ in range(10)]
imgp

['/content/drive/MyDrive/BTP/100Docbank/23.tar_1402.5330.gz_fusion_1_ori.jpg',
 '/content/drive/MyDrive/BTP/100Docbank/44.tar_1503.06300.gz_dodona_ijhcs_revised_round2_6_ori.jpg',
 '/content/drive/MyDrive/BTP/100Docbank/39.tar_1802.04452.gz_ms_18_ori.jpg',
 '/content/drive/MyDrive/BTP/100Docbank/80.tar_1605.00521.gz_323_3_ori.jpg',
 '/content/drive/MyDrive/BTP/100Docbank/230.tar_1611.08510.gz_DPTG_PA_ABM_004_4_ori.jpg',
 '/content/drive/MyDrive/BTP/100Docbank/132.tar_1410.2655.gz_CRBTSM_parizot_final_7_ori.jpg',
 '/content/drive/MyDrive/BTP/100Docbank/212.tar_1807.09084.gz_pollicott-dimaff-arxiv_66_ori.jpg',
 '/content/drive/MyDrive/BTP/100Docbank/135.tar_1805.05760.gz_cataracts_3_ori.jpg',
 '/content/drive/MyDrive/BTP/100Docbank/111.tar_1804.08410.gz_Asymptotic_analysis_5_ori.jpg',
 '/content/drive/MyDrive/BTP/100Docbank/232.tar_1808.04097.gz_ep_LHC_submit_22_ori.jpg']

## 1) Preprocess image

In [7]:
# Structuring-element sizes (width, height) used to isolate ruling lines.
HORIZONTAL_KERNEL_SIZE = (15, 1)
VERTICAL_KERNEL_SIZE = (1, 30)


def _erase_lines(binary, canvas, kernel_size, iterations=2, thickness=3):
    """Paint over straight strokes of ``binary`` in white on ``canvas``.

    ``binary`` is opened with a rectangular structuring element of
    ``kernel_size`` so that only strokes long enough to survive the opening
    remain; the external contours of what remains are drawn in white on
    ``canvas``, which is modified in place.
    """
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, kernel_size)
    strokes = cv2.morphologyEx(binary, cv2.MORPH_OPEN, kernel, iterations=iterations)
    found = cv2.findContours(strokes, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    # findContours returns either 2 or 3 values depending on the OpenCV
    # version; the contour list is first in the 2-tuple, second in the 3-tuple.
    contours = found[0] if len(found) == 2 else found[1]
    for contour in contours:
        cv2.drawContours(canvas, [contour], -1, (255, 255, 255), thickness)


for img_path in imgp:
    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread returns None instead of raising when a file cannot be read.
        print("[WARN] could not read {}".format(img_path))
        continue
    cv2_imshow(img)

    # Binarise with Otsu; inverted so that ink is white (255) on black.
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU)[1]
    output = img.copy()

    # Remove horizontal and vertical lines from the working copy. Both passes
    # use the same initial binarisation, as before.
    _erase_lines(thresh, output, HORIZONTAL_KERNEL_SIZE)
    _erase_lines(thresh, output, VERTICAL_KERNEL_SIZE)

    # Re-binarise the cleaned page and label its 4-connected components.
    gray = cv2.cvtColor(output, cv2.COLOR_BGR2GRAY)
    thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU)[1]
    cc_output = cv2.connectedComponentsWithStats(thresh, 4, cv2.CV_32S)
    (numLabels, labels, stats, centroids) = cc_output
    print(img.shape, labels.shape, stats.shape, centroids.shape)

    # Mask of every non-background pixel, built in one vectorised step rather
    # than one full-image comparison per component. Stored as uint8 so it can
    # be shown directly with cv2_imshow.
    componentMask = np.where(labels > 0, 255, 0).astype("uint8")

    # Label 0 is the background; start at 1 so that no box is drawn around
    # the whole page.
    for label in range(1, numLabels):
        # Bounding box and centroid of the current component.
        x = int(stats[label, cv2.CC_STAT_LEFT])
        y = int(stats[label, cv2.CC_STAT_TOP])
        w = int(stats[label, cv2.CC_STAT_WIDTH])
        h = int(stats[label, cv2.CC_STAT_HEIGHT])
        (cX, cY) = centroids[label]

        cv2.rectangle(output, (x, y), (x + w, y + h), (0, 255, 0), 3)
        cv2.circle(output, (int(cX), int(cY)), 4, (0, 0, 255), -1)

    # Show the page with every component's box and centroid overlaid.
    cv2_imshow(output)

Output hidden; open in https://colab.research.google.com to view.

What we can do next:


*   Remove equation-only elements (`=`, `>=`, `(`, integral signs, etc.), but only within equation regions (apply EquationNet?).
*   Try to remove fraction lines where possible (long ones are already handled by the global horizontal kernel width of 15).
*   Apply TableNet to extract the tables, then run connected-component (CC) analysis inside them.
*   Detect figures and charts, and run connected-component analysis only on those regions.

