In [20]:
from glob import glob
import random
import os
import xml.etree.ElementTree as ET
import shutil
import numpy as np
import pandas as pd

from PIL import Image

In [21]:
import sys
sys.path.append("../")

from src.constants import *

In [22]:
BASE_DIR_IMG = f"{PATH}/data/tiles/image_slices/"
BASE_DIR_XML = f"{PATH}/data/tiles/xml_slices/"

CSV = f"{PATH}/data/tiles/data_cardinalidades_sliced.csv"

In [23]:
FILES = glob(BASE_DIR_XML+"*.xml") + glob(BASE_DIR_IMG+"*.png")
IMAGES = glob(os.path.join(BASE_DIR_IMG,"*.png"))

In [24]:
CLASSES = ['muchos_opcional','muchos_obligatorio','uno_opcional','uno_obligatorio']

In [55]:
def generate_train_test(dataset):    
    for (dtype, image_paths, output_csv) in dataset:
        print ("[INFO] creating '{}' set...".format(dtype))
        print ("[INFO] {} total images in '{}' set".format(len(image_paths), dtype))

        csv = open(output_csv, "w")
        
        header_row = ["image_path","xmin", "ymin", "xmax", "ymax", "label"]
        csv.write("{}\n".format(",".join(header_row)))
        
        for image_path in image_paths:
            fname = image_path.split("/")[-1]
            fname = fname[:fname.rfind(".")]+".xml"
            annot_path = BASE_DIR_XML + fname
            tree = ET.parse(annot_path)
            root = tree.getroot()
            size = root.find("size")
#             h = int(size.find("height").text)
#             w = int(size.find("width").text)

            w,h = Image.open(image_path).size
            print(image_path, w, h)
            
            for label in CLASSES:
                for o in tree.iter("object"):
                    if o.find("name").text==label:
                        box = o.find("bndbox")
                        xmin = int(box.find("xmin").text)
                        ymin = int(box.find("ymin").text)
                        xmax = int(box.find("xmax").text)
                        ymax = int(box.find("ymax").text)
                        # truncate any bounding box coordinates that fall outside
                        # the boundaries of the image
                        xmin = max(0, xmin)
                        ymin = max(0, ymin)
                        xmax = min(w, xmax)
                        ymax = min(h, ymax)

                        # ignore the bounding boxes where the minimum values are larger
                        # than the maximum values and vice-versa due to annotation errors
                        if xmin >= xmax or ymin >= ymax:
                            print("a")
                            continue
                        elif xmax <= xmin or ymax <= ymin:
                            print("b")
                            continue

                        row = [os.path.abspath(image_path),str(xmin), str(ymin), str(xmax), str(ymax), str(label)]
                        csv.write("{}\n".format(",".join(row)))
        csv.close()

In [56]:
def split_wrapper():    
    dataset = [("csv", IMAGES, CSV)]
    #Create and write train and test csv
    generate_train_test(dataset)

In [57]:
def get_val_count(df):
    return df[df.columns[-1]].value_counts()

split_wrapper()

csv = pd.read_csv(CSV)
csv.head()

[INFO] creating 'csv' set...
[INFO] 240 total images in 'csv' set
/home/nacho/TFI-Cazcarra/data/tiles/image_slices/000005.png 516 637
muchos_obligatorio
279 64 297 83
279 64 297 83
muchos_obligatorio
210 133 228 150
210 133 228 150
uno_opcional
546 64 563 84
546 64 563 84
uno_opcional
283 317 302 337
283 317 302 337
uno_obligatorio
384 65 400 83
384 65 400 83
uno_obligatorio
209 252 230 270
209 252 230 270
/home/nacho/TFI-Cazcarra/data/tiles/image_slices/000094.png 529 504
muchos_obligatorio
442 430 463 446
442 430 463 446
muchos_obligatorio
435 166 453 182
435 166 453 182
uno_obligatorio
368 49 384 63
368 49 384 63
uno_obligatorio
415 47 433 63
415 47 433 63
uno_obligatorio
399 174 416 190
399 174 416 190
/home/nacho/TFI-Cazcarra/data/tiles/image_slices/000212.png 312 242
muchos_opcional
10 178 33 199
10 178 33 199
uno_opcional
11 257 33 277
11 257 33 277
uno_opcional
227 180 173 202
227 180 173 202
a
/home/nacho/TFI-Cazcarra/data/tiles/image_slices/000089.png 585 665
muchos_obligator

Unnamed: 0,image_path,xmin,ymin,xmax,ymax,label
0,/home/nacho/TFI-Cazcarra/data/tiles/image_slic...,279,64,297,83,muchos_obligatorio
1,/home/nacho/TFI-Cazcarra/data/tiles/image_slic...,210,133,228,150,muchos_obligatorio
2,/home/nacho/TFI-Cazcarra/data/tiles/image_slic...,546,64,563,84,uno_opcional
3,/home/nacho/TFI-Cazcarra/data/tiles/image_slic...,283,317,302,337,uno_opcional
4,/home/nacho/TFI-Cazcarra/data/tiles/image_slic...,384,65,400,83,uno_obligatorio


In [58]:
get_val_count(csv)

uno_obligatorio       579
muchos_obligatorio    495
uno_opcional          134
muchos_opcional        87
Name: label, dtype: int64

In [59]:
mapper = pd.read_csv("../data/tiles/mapper.csv")
mapper.columns = ["image_name","tile1","tile2","tile3","tile4","tile5","tile6"]

for c in mapper.columns[1:]:
    mapper[c] = mapper[c].apply(lambda x: str(x).zfill(6))

In [60]:
mapper.head()

Unnamed: 0,image_name,tile1,tile2,tile3,tile4,tile5,tile6
0,ERDiagramsMySQL-1,1,2,3,4,5,6
1,ERDiagramsMySQL-10,7,8,9,10,11,12
2,ERDiagramsMySQL-11,13,14,15,16,17,18
3,ERDiagramsMySQL-12,19,20,21,22,23,24
4,ERDiagramsMySQL-13,25,26,27,28,29,30


In [61]:
train_original = pd.read_csv("../data/csv/train_diagramas_linux.csv")
train_images = train_original.image_path.unique()
train_images = [img_path.split(os.sep)[-1].split(".")[0] for img_path in train_images]

In [62]:
train_sliced = csv[csv['image_path'].isin(train_images)]
test_sliced = csv[~csv['image_path'].isin(train_images)]

In [63]:
def get_original_image(slice_path, mapper=mapper):
    img_path = slice_path.split(os.sep)[-1]
    img_path = img_path.split(".")[0]
    mapper = mapper.set_index("image_name")
    mapper_t = mapper.T
    for c in mapper_t.columns:
        if img_path in mapper_t[c].values:
            return c
    return None

csv['original_img_path'] = csv['image_path'].apply(lambda x: get_original_image(x))
csv.head()

Unnamed: 0,image_path,xmin,ymin,xmax,ymax,label,original_img_path
0,/home/nacho/TFI-Cazcarra/data/tiles/image_slic...,279,64,297,83,muchos_obligatorio,ERDiagramsMySQL-1
1,/home/nacho/TFI-Cazcarra/data/tiles/image_slic...,210,133,228,150,muchos_obligatorio,ERDiagramsMySQL-1
2,/home/nacho/TFI-Cazcarra/data/tiles/image_slic...,546,64,563,84,uno_opcional,ERDiagramsMySQL-1
3,/home/nacho/TFI-Cazcarra/data/tiles/image_slic...,283,317,302,337,uno_opcional,ERDiagramsMySQL-1
4,/home/nacho/TFI-Cazcarra/data/tiles/image_slic...,384,65,400,83,uno_obligatorio,ERDiagramsMySQL-1


In [64]:
train_sliced = csv[csv['original_img_path'].isin(train_images)]
test_sliced = csv[~csv['original_img_path'].isin(train_images)]

In [65]:
print(train_sliced.shape)
print(test_sliced.shape)

(985, 7)
(310, 7)


In [66]:
train_sliced.drop(columns=["original_img_path"]).to_csv(f"{PATH}/data/tiles/train_cardinalidades_linux.csv")
test_sliced.drop(columns=["original_img_path"]).to_csv(f"{PATH}/data/tiles/test_cardinalidades_linux.csv")

In [67]:
train_sliced.head(6)

Unnamed: 0,image_path,xmin,ymin,xmax,ymax,label,original_img_path
0,/home/nacho/TFI-Cazcarra/data/tiles/image_slic...,279,64,297,83,muchos_obligatorio,ERDiagramsMySQL-1
1,/home/nacho/TFI-Cazcarra/data/tiles/image_slic...,210,133,228,150,muchos_obligatorio,ERDiagramsMySQL-1
2,/home/nacho/TFI-Cazcarra/data/tiles/image_slic...,546,64,563,84,uno_opcional,ERDiagramsMySQL-1
3,/home/nacho/TFI-Cazcarra/data/tiles/image_slic...,283,317,302,337,uno_opcional,ERDiagramsMySQL-1
4,/home/nacho/TFI-Cazcarra/data/tiles/image_slic...,384,65,400,83,uno_obligatorio,ERDiagramsMySQL-1
5,/home/nacho/TFI-Cazcarra/data/tiles/image_slic...,209,252,230,270,uno_obligatorio,ERDiagramsMySQL-1
