# Newspaper Segmentation

#### This script is recommended for use in both the pre-processing and post-processing steps

The following functions and methods are implemented to segment newspaper pages; further improvements will be added in the future.

Thresholding(Otsu's, Basic Segmentation)
Denoising
Dilation
Run Length Smoothing Algorithm

In [56]:
import cv2
import re
import os
import numpy as np
import math
import copy
from tqdm import tqdm
from pythonRLSA import rlsa

In [57]:
def read_all_images(filepath:str, mode:str, write=(False,False)):
    """Load every file of a directory as an image, ordered by the digits in its name.

    Parameters
    ----------
    filepath : str
        Directory to read from; a trailing path separator is optional.
    mode : str
        ``'binary'`` loads each file as a single-channel grayscale image.
        Any other value uses ``cv2.imread`` with its default flags.
    write : tuple
        ``(enabled, directory)``. When ``enabled`` is truthy every loaded
        image is re-saved into ``directory`` (created if missing) as
        ``img_<position>.png``, with positions starting at 1.

    Returns
    -------
    tuple
        ``(files, data)``: the sorted file names and the loaded images, in
        matching order. ``cv2.imread`` yields ``None`` for files it cannot
        decode, so ``data`` may contain ``None`` entries.
    """
    def _digits_key(name):
        # All digits of the name are concatenated and read as one integer.
        # Names without any digit sort first instead of crashing on int('').
        digits = re.sub(r'\D', '', name)
        return int(digits) if digits else -1

    files = sorted(os.listdir(filepath), key=_digits_key)
    print("Total_nos_images ==>",len(files))

    # os.path.join keeps the call working whether or not the directory
    # string ends with a separator.
    if mode == 'binary':
        data = [cv2.imread(os.path.join(filepath, name), cv2.IMREAD_GRAYSCALE) for name in files]
    else:
        data = [cv2.imread(os.path.join(filepath, name)) for name in files]

    if write[0]:
        out_dir = write[1]
        os.makedirs(out_dir, exist_ok=True)
        for position, image in enumerate(data, start=1):
            cv2.imwrite(os.path.join(out_dir, 'img_'+str(position)+'.png'), image)
    return files,data

In [58]:
def prepare_dataset(orgList, filepath:str, filename:None, custom_resolution:tuple):
    """Resize every image to one resolution and stack the results in a float32 array.

    Parameters
    ----------
    orgList : sequence of image arrays
        Images to resize. Each resized image is stored in a slot of shape
        ``(height, width, 3)``.
    filepath : str
        Output directory; created if it does not exist.
    filename
        Only its truthiness is used: when truthy, each resized image is
        saved as ``resized_<index>.png`` inside ``filepath``.
    custom_resolution : tuple
        Target size as ``(height, width)``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(orgList), height, width, 3)`` and dtype float32.
    """
    if not os.path.exists(filepath):
        os.makedirs(filepath)
        print('\nCreating directory ->',filepath)

    custom_height, custom_width = custom_resolution

    # The output array is built for every resolution. Previously it was only
    # created when height > width, so landscape or square targets crashed
    # on the return statement.
    resizedArr = np.empty((len(orgList), custom_height, custom_width, 3), dtype='float32')

    for ind, image in enumerate(orgList):
        # cv2.resize expects the target size as (width, height).
        resized = cv2.resize(image.astype('float32'), (custom_width, custom_height))
        resizedArr[ind] = resized
        if filename:
            cv2.imwrite(os.path.join(filepath, 'resized_'+str(ind)+".png"), resized)

    return resizedArr

In [59]:
def apply_threshold(imgArr, filepath:str, filename:str, params:tuple):
    """Threshold an image with one of OpenCV's thresholding modes.

    Parameters
    ----------
    imgArr : numpy.ndarray
        Input image. A 2-D array is used as-is; anything else is converted
        with ``cv2.COLOR_BGR2GRAY`` first.
    filepath : str
        Output directory; created if it does not exist.
    filename : str
        File name to write inside ``filepath``. A falsy value skips saving.
    params : tuple
        ``(threshold, max_value, method)`` where ``method`` is one of
        ``'binary'``, ``'binary_inv'`` or ``'otsu'``.

    Returns
    -------
    numpy.ndarray
        The thresholded image returned by ``cv2.threshold``.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    if not os.path.exists(filepath):
        os.makedirs(filepath)
        print('\nCreating directory ->',filepath)

    methods = {'binary': cv2.THRESH_BINARY,
               'binary_inv': cv2.THRESH_BINARY_INV,
               'otsu': cv2.THRESH_OTSU}

    threshold, max_value, method = params[0], params[1], params[2]
    if method not in methods:
        # Fail with a clear message instead of printing and then crashing on
        # an unbound result variable.
        raise ValueError('Thresholding method not found from '+str(list(methods)))

    # Already single-channel input needs no colour conversion.
    if imgArr.ndim == 2:
        gray = imgArr
    else:
        gray = cv2.cvtColor(imgArr, cv2.COLOR_BGR2GRAY) #grayscale conversion

    _, thres = cv2.threshold(gray, threshold, max_value, methods[method])

    if filename:
        cv2.imwrite(os.path.join(filepath, filename), thres)

    return thres

In [60]:
def make_denoised(imgArr, filepath:str, filename:str, params:tuple):
    """Denoise an image with OpenCV's non-local means filter.

    Parameters
    ----------
    imgArr : numpy.ndarray
        Image passed to ``cv2.fastNlMeansDenoising`` unchanged.
    filepath : str
        Output directory; created if it does not exist.
    filename : str
        File name to write inside ``filepath``. A falsy value skips saving.
    params : tuple
        Three values forwarded positionally after ``dst=None``: filter
        strength, template window size and search window size.

    Returns
    -------
    numpy.ndarray
        The denoised image.
    """
    if not os.path.exists(filepath):
        os.makedirs(filepath)
        print('\nCreating directory ->',filepath)

    # The former unused copy and grayscale conversion were dropped: their
    # results were never read, and the conversion failed on single-channel input.
    denoised = cv2.fastNlMeansDenoising(imgArr,None,
                                        params[0],params[1],params[2])

    if filename:
        cv2.imwrite(os.path.join(filepath, filename), denoised)

    return denoised

In [61]:
def apply_morphology(imgArr, filepath:str, filename:str, params:tuple):
    """Dilate or erode an image with a rectangular structuring element.

    Parameters
    ----------
    imgArr : numpy.ndarray
        Image to process.
    filepath : str or None
        Output directory, created if missing. May be ``None`` when nothing
        is saved.
    filename : str
        File name to write inside ``filepath``. A falsy value skips saving.
    params : tuple
        ``(operation, kernel_size[, iterations])``. ``operation`` is
        ``'dilation'`` or ``'erosion'``. ``kernel_size`` is either the size
        tuple handed to ``cv2.getStructuringElement`` or a single integer
        for a square kernel. ``iterations`` defaults to 1. A falsy
        ``params`` means "no morphology": the input is returned as-is.

    Returns
    -------
    numpy.ndarray
        The processed image, or ``imgArr`` itself when ``params`` is falsy.

    Raises
    ------
    ValueError
        If ``operation`` is neither ``'dilation'`` nor ``'erosion'``.
    """
    if filepath is not None and not os.path.exists(filepath):
        os.makedirs(filepath)
        print('\nCreating directory ->',filepath)

    if not params:
        # Lets callers whose morphology settings default to False pass them
        # straight through.
        output = imgArr
    else:
        operation, size = params[0], params[1]
        iterations = params[2] if len(params) > 2 else 1

        if operation not in ('dilation', 'erosion'):
            # Previously an unknown operation surfaced as an unbound-variable error.
            raise ValueError('Unknown morphology operation: '+repr(operation))

        # Both operations now accept an int or a tuple for the kernel size.
        if isinstance(size, (int, np.integer)):
            size = (size, size)
        kernel = cv2.getStructuringElement(cv2.MORPH_RECT, size)

        if operation == 'dilation':
            output = cv2.dilate(imgArr, kernel, iterations=iterations)
        else:
            output = cv2.erode(imgArr, kernel, iterations=iterations)

    if filename:
        cv2.imwrite(os.path.join(filepath, filename), output)

    return output

In [62]:
def perform_RLSA_hor(imgArr, val:int, filepath:str, filename:str):
    """Run ``rlsa.rlsa`` on an image with the flags ``(True, False)``.

    The smoothing value is ``val`` plus one hundredth (rounded up) of the
    larger of the first two image dimensions. The output directory is
    created when missing, and the result is written to ``filepath+filename``
    when ``filename`` is truthy.

    Returns the array produced by ``rlsa.rlsa``.
    """
    directory_missing = not os.path.exists(filepath)
    if directory_missing:
        os.makedirs(filepath)
        print('\nCreating directory ->',filepath)

    dims = imgArr.shape
    longest_side = max(dims[0], dims[1])
    # ceil is monotonic, so taking the maximum first gives the same value.
    smoothing = math.ceil(longest_side/100) + val

    # presumably the two flags select horizontal / vertical smoothing -- see pythonRLSA docs
    smoothed = rlsa.rlsa(imgArr, True, False, smoothing)

    if filename:
        target = filepath+filename
        cv2.imwrite(target, smoothed)

    return smoothed

def perform_RLSA_ver(imgArr, val:int, filepath:str, filename:str):
    """Run ``rlsa.rlsa`` on an image with the flags ``(False, True)``.

    The smoothing value is ``val`` plus one hundredth (rounded up) of the
    larger of the first two image dimensions. The output directory is
    created when missing, and the result is written to ``filepath+filename``
    when ``filename`` is truthy.

    Returns the array produced by ``rlsa.rlsa``.
    """
    if os.path.exists(filepath) is False:
        os.makedirs(filepath)
        print('\nCreating directory ->',filepath)

    n_rows = imgArr.shape[0]
    n_cols = imgArr.shape[1]
    per_hundred = [math.ceil(n_rows/100), math.ceil(n_cols/100)]
    smoothing = max(per_hundred) + val

    # presumably the two flags select horizontal / vertical smoothing -- see pythonRLSA docs
    smoothed = rlsa.rlsa(imgArr, False, True, smoothing)

    if filename:
        cv2.imwrite(filepath+filename, smoothed)

    return smoothed

def get_RLSA_final(imgArr1, imgArr2, filepath:str, filename:str):
    """Combine two images with a bitwise OR and invert the result.

    The output directory is created when missing, and the combined image is
    written to ``filepath+filename`` when ``filename`` is truthy.

    Returns the inverted OR of ``imgArr1`` and ``imgArr2``.
    """
    if os.path.exists(filepath) is False:
        os.makedirs(filepath)
        print('\nCreating directory ->',filepath)

    combined = cv2.bitwise_not(cv2.bitwise_or(imgArr1,imgArr2))

    if filename:
        destination = filepath+filename
        cv2.imwrite(destination, combined)

    return combined

In [63]:
def get_title_masks(imgList, orgList, filepath:str, filter_height = (True, [0,0]), filter_width = (False, [0,0]), morph_params = False, no_fill = False, bbox = True):
    """Build one mask per image from the contours that pass the size filters.

    For each image, external contours are found and those whose bounding
    height exceeds ``filter_height[1][0]`` become candidates. A candidate is
    drawn on a blank mask (and its bounding box recorded) when it passes an
    enabled filter:

    * height filter (``filter_height[0]`` truthy): bounding height is below
      ``filter_height[1][1]``;
    * width filter (``filter_width[0]`` truthy): bounding width exceeds
      ``filter_width[1][1]`` times the average contour width.

    A candidate that passes both enabled filters is recorded twice, as before.
    The finished mask is written to ``result_<index>.png`` in ``filepath``.

    Parameters
    ----------
    imgList : sequence of numpy.ndarray
        Images passed to ``cv2.findContours``.
    orgList : sequence
        Per-image arrays whose ``shape`` sizes the blank mask.
    filepath : str
        Output directory; created if it does not exist.
    filter_height, filter_width : tuple
        ``(enabled, [lower_bound, limit])`` as described above.
    morph_params : tuple or False
        Parameters for ``apply_morphology``, applied once to the finished
        mask when ``bbox`` is true. A falsy value skips morphology.
    no_fill : bool
        Draw outlines (thickness 1) instead of filled shapes.
    bbox : bool
        Draw bounding rectangles when true, approximated contours otherwise.

    Returns
    -------
    dict
        Image index -> list of ``(x, y, w, h)`` bounding boxes that were drawn.
    """
    if not os.path.exists(filepath):
        os.makedirs(filepath)
        print('\nCreating directory ->',filepath)

    thickness = 1 if no_fill else -1
    white = (255, 255, 255)

    use_height = filter_height[0]
    min_height, max_height = filter_height[1][0], filter_height[1][1]
    use_width = filter_width[0]
    width_floor, width_factor = filter_width[1][0], filter_width[1][1]

    final_bbox = dict()

    for ind, image in enumerate(imgList):
        temp_bbox = []
        contours, _ = cv2.findContours(image, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
        canvas = np.zeros(orgList[ind].shape, dtype="uint8")

        rects = [cv2.boundingRect(contour) for contour in contours]
        candidates = [contour for contour, rect in zip(contours, rects) if rect[3] > min_height]
        # NOTE(review): the width sample is selected by comparing each
        # contour's HEIGHT (index 3) with the width filter's lower bound, as
        # in the original code -- confirm whether index 2 was intended.
        widths = [rect[2] for rect in rects if rect[3] > width_floor]

        # An image without contours used to raise ZeroDivisionError here.
        avgwidth = sum(widths)/len(widths) if widths else 0

        for cnts in candidates:
            epsilon = 0.001*cv2.arcLength(cnts, True)
            approx = cv2.approxPolyDP(cnts, epsilon, True)
            bbox_rect = cv2.boundingRect(approx)
            x, y, w, h = bbox_rect

            checks = (use_height and h < max_height,
                      use_width and w > width_factor*avgwidth)
            for passed in checks:
                if not passed:
                    continue
                if bbox:
                    cv2.rectangle(canvas, (x,y),(x+w,y+h), white, thickness)
                else:
                    cv2.drawContours(canvas, [approx], -1, white, thickness)
                temp_bbox.append(bbox_rect)

        # Morphology runs once on the finished mask. This equals the former
        # "after every rectangle" result, and the mask is now always defined
        # for the current image, even when nothing was drawn or bbox is False.
        if bbox and morph_params:
            mask = apply_morphology(canvas, None, False, morph_params)
        else:
            mask = canvas

        cv2.imwrite(os.path.join(filepath, 'result_'+str(ind)+'.png'), mask)
        final_bbox[ind] = temp_bbox

    return final_bbox

In [64]:
def get_content_masks(imgList, filepath:str, title_bboxes, filter_height = (True, 0), filter_width = (False, 0), morph_params = False, no_fill = False, bbox = True, avgheight = 26.54, avgwidth = 54.64):
    """Build one content mask per image after blanking out the title boxes.

    For each image the title bounding boxes are zeroed on a working copy,
    optional morphology is applied, and external contours are found. A
    contour is drawn (and its bounding box recorded) when its height exceeds
    ``filter_height[1]*avgheight`` and its width exceeds
    ``filter_width[1]*avgwidth``. The mask is written to
    ``result_<index>.png`` in ``filepath``.

    Parameters
    ----------
    imgList : sequence of numpy.ndarray
        Source images; they are no longer modified in place.
    filepath : str
        Output directory; created if it does not exist.
    title_bboxes : mapping
        Image index -> iterable of ``(x, y, w, h)`` regions to blank out.
    filter_height, filter_width : tuple
        ``(enabled, factor)``.
        NOTE(review): contours are only selected when BOTH filters are
        enabled, as in the original code; with the default
        ``filter_width=(False, 0)`` nothing is selected. Confirm the intent.
    morph_params : tuple or False
        Parameters for ``apply_morphology``. A falsy value skips morphology.
    no_fill : bool
        Draw outlines (thickness 1) instead of filled shapes.
    bbox : bool
        Draw bounding rectangles when true, approximated contours otherwise.
    avgheight, avgwidth : float
        Reference contour height and width that the factors are multiplied
        with. The defaults are the values previously hard-coded here
        (calculated on a sample as the mean contour height and width).

    Returns
    -------
    dict
        Image index -> list of ``(x, y, w, h)`` bounding boxes that were drawn.
    """
    if not os.path.exists(filepath):
        os.makedirs(filepath)
        print('\nCreating directory ->',filepath)

    thickness = 1 if no_fill else -1
    white = (255, 255, 255)

    both_enabled = filter_height[0] and filter_width[0]
    min_height = filter_height[1]*avgheight
    min_width = filter_width[1]*avgwidth

    final_bbox = dict()

    for ind, source in enumerate(imgList):
        temp_bbox = []

        # Work on a copy so the caller's image keeps its title regions.
        work = source.copy()
        for (x, y, w, h) in title_bboxes[ind]:
            work[y: y+h, x: x+w] = 0

        dilated = apply_morphology(work, None, False, morph_params) if morph_params else work
        mask1 = np.zeros(work.shape, dtype="uint8")

        contours, _ = cv2.findContours(dilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

        if both_enabled:
            for cnts in contours:
                epsilon = 0.001*cv2.arcLength(cnts, True)
                approx = cv2.approxPolyDP(cnts, epsilon, True)
                bbox_rect = cv2.boundingRect(approx)
                x, y, w, h = bbox_rect

                if h > min_height and w > min_width:
                    if bbox:
                        cv2.rectangle(mask1, (x,y),(x+w,y+h), white, thickness)
                    else:
                        cv2.drawContours(mask1, [approx], -1, white, thickness)
                    temp_bbox.append(bbox_rect)

        cv2.imwrite(os.path.join(filepath, 'result_'+str(ind)+'.png'), mask1)
        final_bbox[ind] = temp_bbox

    return final_bbox

In [65]:
def extract_bboxes(dstImg, contour_params=(cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)):
    """Collect the bounding boxes of the contours found in each image.

    ``contour_params`` holds the retrieval mode and approximation method
    passed to ``cv2.findContours``. Every contour is simplified with
    ``cv2.approxPolyDP`` (tolerance: 0.1% of its closed perimeter) before
    its bounding rectangle is taken.

    Returns a dict mapping each image's position in ``dstImg`` to the list
    of its bounding rectangles.
    """
    def _boxes_of(mask):
        # Bounding rectangles of all simplified contours in one image.
        found, _hierarchy = cv2.findContours(mask, contour_params[0], contour_params[1])
        rectangles = []
        for outline in found:
            tolerance = 0.001*cv2.arcLength(outline, True)
            simplified = cv2.approxPolyDP(outline, tolerance, True)
            rectangles.append(cv2.boundingRect(simplified))
        return rectangles

    return {position: _boxes_of(dstImg[position]) for position in range(len(dstImg))}

In [66]:
def apply_bbox(img, filepath:str, filename:str, bboxes, color=(180, 180, 180)):
    """Paint each bounding box onto ``img`` and save the image.

    Every ``(x, y, w, h)`` box is drawn as a rectangle filled with ``color``
    and then outlined in black with a 1-pixel border. Drawing happens
    directly on ``img``. The output directory is created when missing and
    the image is written to ``filepath+filename``.

    Returns the same ``img`` object that was passed in.
    """
    if os.path.exists(filepath) is False:
        os.makedirs(filepath)
        print('\nCreating directory ->',filepath)

    outline_color = (0, 0, 0)
    for box in bboxes:
        left, top, width, height = box
        top_left = (left, top)
        bottom_right = (left+width, top+height)
        cv2.rectangle(img, top_left, bottom_right, color, -1)
        cv2.rectangle(img, top_left, bottom_right, outline_color, 1)

    destination = filepath+filename
    cv2.imwrite(destination, img)
    return img

In [67]:
# Load every file in Images/. mode=False is not 'binary', so images are read
# with cv2.imread's default flags. Then list the file names in sorted order.
path = 'Images/'
fl,data = read_all_images(path, mode=False) #write=(True,path)
print(fl)

Total_nos_images ==> 21
['resized_17.png', 'resized_18.png', 'resized_19.png', 'resized_20.png', 'resized_21.png', 'resized_22.png', 'resized_23.png', 'resized_24.png', 'resized_25.png', 'resized_26.png', 'resized_27.png', 'resized_28.png', 'resized_29.png', 'resized_30.png', 'img_2018-02-02-10-C.png', 'img_2018-02-02-26-C.png', 'img_2018-02-02-37-C.png', 'Financial_Exp_30_Sep_2020_page-0005.jpg', 'Financial_Exp_30_Sep_2020_page-0007.jpg', 'Financial_Exp_30_Sep_2020_page-0010.jpg', 'Financial_Exp_30_Sep_2020_page-0014.jpg']


In [68]:
# Resize all loaded images to height 1507 x width 960. The truthy third
# argument makes prepare_dataset save each one as resized_<index>.png.
filepath_r = "Outputs/resized/"
org = prepare_dataset(data, filepath_r, True, (1507,960))

### Extracting RLSA final output

In [69]:
# Output folders for the denoised and the thresholded images.
filepath_1 = "Outputs/denoised/"
filepath_2 = "Outputs/threshold/"

params_denoise = (10,7,21) # filter strength, templateWindowSize, searchWindowSize
# (threshold, max value, method name) as read by apply_threshold.
params_thres = (200, 255, 'otsu')

# Re-read the resized images from disk (default imread flags).
filepath_r = "Outputs/resized/"
_,res_data = read_all_images(filepath_r, mode=None)

# NOTE(review): both steps run on the same resized image; the denoised
# result is saved but is not the input of the thresholding step -- confirm
# this is intended.
for ind,imgArr in tqdm(enumerate(res_data)):
    filename_1 = 'denoised_'+str(ind)+".png"
    filename_2 = 'thres_'+str(ind)+".png"
    
    denoised = make_denoised(imgArr,filepath_1,filename_1,params_denoise)
    thres = apply_threshold(imgArr,filepath_2,filename_2,params_thres)

Total_nos_images ==> 21


21it [01:26,  4.10s/it]


RLSA Horizontal - Title/Content

In [70]:
# Horizontal RLSA on every thresholded image, run twice per image: with
# val=10 (saved under title_RLSA/RLSA_h/) and with val=100 (saved under
# RLSA_h/). Both outputs share the same file name.
filepath_2 = "Outputs/threshold/"
filepath_3 = "Outputs/title_RLSA/RLSA_h/"
# read_all_images returns (file names, images); index 1 holds the images.
thres_data = read_all_images(filepath_2, mode='binary')

# NOTE(review): the same imgArr object is passed to both calls; if rlsa.rlsa
# modifies its input in place, the second call sees the first call's result
# -- confirm against the pythonRLSA implementation.
for ind,imgArr in tqdm(enumerate(thres_data[1])):
    filename_3 = 'rlsa_h_'+str(ind)+".png"
    rlsa_h = perform_RLSA_hor(imgArr,10,filepath_3,filename_3)
    rlsa_h = perform_RLSA_hor(imgArr,100,"Outputs/RLSA_h/",filename_3)

Total_nos_images ==> 21


21it [03:43, 10.64s/it]


RLSA Vertical - Title/Content

In [71]:
# Vertical RLSA on every thresholded image, run twice per image: with
# val=10 (saved under title_RLSA/RLSA_v/) and with val=100 (saved under
# RLSA_v/). Both outputs share the same file name.
filepath_2 = "Outputs/threshold/"
filepath_4 = "Outputs/title_RLSA/RLSA_v/"
# read_all_images returns (file names, images); index 1 holds the images.
thres_data = read_all_images(filepath_2, mode='binary')

# NOTE(review): the same imgArr object is passed to both calls; if rlsa.rlsa
# modifies its input in place, the second call sees the first call's result
# -- confirm against the pythonRLSA implementation.
for ind,imgArr in tqdm(enumerate(thres_data[1])):
    filename_4 = 'rlsa_v_'+str(ind)+".png"
    rlsa_v = perform_RLSA_ver(imgArr,10,filepath_4,filename_4)
    rlsa_v = perform_RLSA_ver(imgArr,100,"Outputs/RLSA_v/",filename_4)

Total_nos_images ==> 21


21it [04:22, 12.51s/it]


RLSA final - Title/Content

In [72]:
# Combine the horizontal and vertical RLSA outputs image by image.
# filepath_3 and filepath_4 are the title_RLSA folders set in the two
# previous cells, so those cells must have run first.
filepath_5 = "Outputs/title_RLSA/RLSA_f/"
_,data_h = read_all_images(filepath_3,mode=False)
_,data_v = read_all_images(filepath_4,mode=False)

# The val=100 variants written to Outputs/RLSA_h/ and Outputs/RLSA_v/.
_,data_h_1 = read_all_images("Outputs/RLSA_h/",mode=False)
_,data_v_1 = read_all_images("Outputs/RLSA_v/",mode=False)

# get_RLSA_final ORs each pair and inverts it; the two result sets go to
# title_RLSA/RLSA_f/ and RLSA_f/ under the same file name.
for ind in tqdm(range(0,len(data_h))):
    filename_5 = 'rlsa_f_'+str(ind)+".png"
    get_RLSA_final(data_h[ind],data_v[ind],filepath_5,filename_5)
    get_RLSA_final(data_h_1[ind],data_v_1[ind],"Outputs/RLSA_f/",filename_5)

Total_nos_images ==> 21
Total_nos_images ==> 21
Total_nos_images ==> 21
Total_nos_images ==> 21


100%|██████████████████████████████████████████████████████████████████████████████████| 21/21 [00:01<00:00, 16.80it/s]


### Extracting Title masks from RLSA output

In [73]:
# First title pass, height filter only: contours with bounding height above
# 9.055 are candidates and those below 37.949 are drawn as filled
# rectangles. The same image list serves as contour source and as the
# reference for the mask shape.
filepath_5 = "Outputs/title_RLSA/RLSA_f/"
_,data_rlsa = read_all_images(filepath_5, mode='binary')

title_mask_path = 'Outputs/title_masks/'
final_bbox = get_title_masks(data_rlsa, data_rlsa, title_mask_path, 
                                             filter_height = (True, [9.055, 37.949]),
                                             morph_params = ('dilation', (6,1), 2),
                                             no_fill = False, bbox = True)

Total_nos_images ==> 21


In [74]:
# Second title pass on the masks written by the previous cell, width filter
# only: boxes wider than 0.5 times the average contour width are drawn.
filepath_6 = "Outputs/title_masks/"
_,data_rlsa = read_all_images(filepath_6, mode='binary')

title_mask_path = 'Outputs/title_masks_f/'
final_bbox = get_title_masks(data_rlsa, data_rlsa, title_mask_path,
                                            filter_height = (False, [0,0]),
                                            filter_width = (True, [0,0.5]),
                                            morph_params = ('dilation', (2,5), 1),
                                            no_fill = False, bbox = True)

Total_nos_images ==> 21


Extract title bounding boxes

In [75]:
# Read the final title masks and derive per-image bounding boxes from their
# contours (dict: image index -> list of rectangles).
filepath_7 = "Outputs/title_masks_f/"
_,title_masks = read_all_images(filepath_7, mode='binary')

title_bboxes = extract_bboxes(title_masks)

Total_nos_images ==> 21


### Extract Content masks

In [76]:
# Build content masks from the val=100 RLSA results: the title boxes are
# blanked, the image is dilated, and contours whose height and width exceed
# 0.25 times the reference averages defined inside get_content_masks are
# drawn as filled rectangles.
filepath_8 = "Outputs/RLSA_f/"
_,data_rlsa = read_all_images(filepath_8, mode='binary')

content_mask_path = 'Outputs/content_masks_f/'
final_bbox = get_content_masks(data_rlsa, 
                               content_mask_path, title_bboxes,
                               filter_height = (True, 0.25),
                               filter_width = (True, 0.25),
                               morph_params = ('dilation', (1,3), 2),
                               no_fill = False, bbox = True)

Total_nos_images ==> 21


Extract content bounding boxes

In [77]:
# Read the content masks and derive per-image bounding boxes from their
# contours (dict: image index -> list of rectangles).
filepath_9 = "Outputs/content_masks_f/"
_,content_masks = read_all_images(filepath_9, mode='binary')

content_bboxes = extract_bboxes(content_masks)

Total_nos_images ==> 21


In [78]:
# Draw on a copy so the resized array `org` itself stays untouched.
original = copy.deepcopy(org)

final_path = "Results/"
for ind in range(len(original)):
    final_filename = "Result"+str(ind)+".png"
    # apply_bbox draws in place and saves on every call, so the second call
    # overwrites the file with title and content boxes both drawn.
    out = apply_bbox(original[ind], final_path, final_filename, title_bboxes[ind], (180, 180, 180))
    apply_bbox(out, final_path, final_filename, content_bboxes[ind], (180, 180, 180))