In [16]:
import json

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.cluster import DBSCAN

In [2]:
def read_compos(file='compo.json'):
    """Load UI components from a JSON file into a DataFrame.

    The file must hold an object with a ``'compos'`` list. For every entry
    two derived fields are computed:

    * ``area``   -- ``height * width``
    * ``center`` -- ``(column midpoint, row midpoint)`` as a tuple

    Rows whose ``class`` is ``'TextView'`` are dropped. The remaining rows keep
    their position in the ``'compos'`` list as index label, so the index may
    contain gaps.

    Args:
        file: Path of the JSON file to read.

    Returns:
        An independent DataFrame (safe to add columns to) with the columns
        id, column_min, column_max, row_min, row_max, center, height, width,
        area and class.

    Raises:
        KeyError: If the ``'compos'`` key, or a field needed for the derived
            values, is missing.
    """
    columns = ['id', 'column_min', 'column_max', 'row_min', 'row_max',
               'center', 'height', 'width', 'area', 'class']

    # Context manager so the file handle is closed even if parsing fails.
    with open(file) as fh:
        compos = json.load(fh)['compos']

    records = []
    for compo in compos:
        record = dict(compo)
        record['area'] = compo['height'] * compo['width']
        record['center'] = ((compo['column_min'] + compo['column_max']) / 2,
                            (compo['row_min'] + compo['row_max']) / 2)
        records.append(record)

    # Build the frame once instead of enlarging it row by row.
    df = pd.DataFrame(records, columns=columns)

    # .copy() detaches the result from the unfiltered frame so callers can add
    # columns without triggering SettingWithCopyWarning.
    return df[df['class'] != 'TextView'].copy()

In [3]:
def draw(org, compos, opt='class'):
    """Show ``org`` with each component outlined and labelled.

    The image is resized to the ``width``/``height`` stored in the first row
    of ``compos``. Every row then gets a rectangle from
    ``(column_min, row_min)`` to ``(column_max, row_max)`` and a text label
    holding ``str(row[opt])``. The result is shown in an OpenCV window named
    ``'board'``; the call blocks until a key is pressed and then closes all
    OpenCV windows.

    Args:
        org: Image passed to ``cv2.resize``.
        compos: DataFrame of components; must contain at least one row.
        opt: Name of the column whose value is used as the label text.
    """
    box_colour = (255, 0, 0)
    text_colour = (0, 0, 255)

    # The first row supplies the target canvas size.
    first = compos.iloc[0]
    canvas = cv2.resize(org, (first.width, first.height)).copy()

    for row in (compos.iloc[pos] for pos in range(len(compos))):
        top_left = (row.column_min, row.row_min)
        bottom_right = (row.column_max, row.row_max)
        # Offset the label slightly so it sits inside the box.
        label_origin = (row.column_min + 5, row.row_min + 20)
        canvas = cv2.rectangle(canvas, top_left, bottom_right, box_colour)
        canvas = cv2.putText(canvas, str(row[opt]), label_origin,
                             cv2.FONT_HERSHEY_SIMPLEX, 0.4, text_colour, 1)

    cv2.imshow('board', canvas)
    cv2.waitKey()
    cv2.destroyAllWindows()

In [4]:
def inspect_data(compos, area_limit=10000):
    """Scatter-plot component area (x axis) against component id (y axis).

    Draws on the current pyplot figure. The right edge of the x axis is
    clamped to ``area_limit`` so that a few very large components do not
    squash the rest of the points.

    Args:
        compos: DataFrame with ``'area'`` and ``'id'`` columns.
        area_limit: Upper bound of the x axis; defaults to 10000.
    """
    # Uses the notebook-level pyplot import; no function-scope import needed.
    plt.xlim(right=area_limit)
    plt.scatter(list(compos['area']), list(compos['id']))
    plt.xlabel('area')
    plt.ylabel('id')

In [9]:
def dbscan_clustering_by_area(org, compos, eps=200, min_samples=1):
    """Cluster components by area with DBSCAN and display the result.

    The ``'area'`` column is clustered as a one-dimensional feature. The
    resulting labels are written to a new ``'cluster'`` column of ``compos``
    **in place** (later cells read that column), and the labelled components
    are shown via ``draw``.

    Args:
        org: Image forwarded to ``draw``.
        compos: DataFrame with an ``'area'`` column; gains a ``'cluster'``
            column as a side effect.
        eps: DBSCAN neighbourhood radius, in the same units as ``'area'``.
        min_samples: DBSCAN ``min_samples``. With the default of 1 every
            sample forms or joins a cluster, so no noise label (-1) is
            produced (per scikit-learn's DBSCAN semantics).

    Returns:
        The highest cluster label as an ``int``. Labels start at 0, so this
        is the number of clusters minus one, not the cluster count.
    """
    # DBSCAN expects a 2-D (n_samples, n_features) array.
    areas = np.asarray(compos['area'], dtype=float).reshape(-1, 1)
    labels = DBSCAN(eps=eps, min_samples=min_samples).fit(areas).labels_

    compos['cluster'] = labels
    draw(org, compos, 'cluster')
    return int(labels.max())

In [10]:
compos = read_compos()
org = cv2.imread('9.png')
# cv2.imread does not raise on a missing or unreadable file; it returns None,
# which would otherwise only fail later inside cv2.resize with an obscure error.
if org is None:
    raise FileNotFoundError("could not read image '9.png'")
# Optional preview of the raw components:
# draw(org, compos)

In [11]:
# Adds a 'cluster' column to `compos` in place. The returned value is the
# highest cluster label (labels start at 0), i.e. one less than the number
# of clusters, despite the variable name.
cluster_num = dbscan_clustering_by_area(org, compos)

10

In [15]:
# Map each cluster label to the index labels of the components assigned to it.
compos.groupby('cluster').groups

{0: Int64Index([0], dtype='int64'),
 1: Int64Index([1], dtype='int64'),
 2: Int64Index([3], dtype='int64'),
 3: Int64Index([4, 5, 6, 16, 26, 28, 29, 30, 31, 32, 33, 34, 83], dtype='int64'),
 4: Int64Index([14], dtype='int64'),
 5: Int64Index([15], dtype='int64'),
 6: Int64Index([19, 20, 21, 22, 23], dtype='int64'),
 7: Int64Index([40, 49, 57, 66, 71, 76, 80], dtype='int64'),
 8: Int64Index([42, 47, 52], dtype='int64'),
 9: Int64Index([84], dtype='int64'),
 10: Int64Index([85], dtype='int64')}

In [8]:
# Display the component table, including the 'cluster' column added by
# dbscan_clustering_by_area.
# NOTE(review): this cell's execution count is lower than the clustering
# cell's, yet its output already shows 'cluster' -- re-run top to bottom to confirm.
compos

Unnamed: 0,id,column_min,column_max,row_min,row_max,center,height,width,area,class,cluster
0,0,0,635,0,800,"(317.5, 400.0)",800,635,508000,Background,0
1,0,0,633,0,10,"(316.5, 5.0)",10,633,6330,Compo,1
3,2,133,434,20,42,"(283.5, 31.0)",22,301,6622,Compo,2
4,3,510,548,25,38,"(529.0, 31.5)",13,38,494,Compo,3
5,4,567,597,27,37,"(582.0, 32.0)",10,30,300,Compo,3
6,5,24,52,53,63,"(38.0, 58.0)",10,28,280,Compo,3
14,13,460,611,85,172,"(535.5, 128.5)",87,151,13137,Compo,4
15,14,23,433,86,221,"(228.0, 153.5)",135,410,55350,Compo,5
16,15,462,475,175,189,"(468.5, 182.0)",14,13,182,Compo,3
19,18,23,102,225,262,"(62.5, 243.5)",37,79,2923,Compo,6
