In [1]:
from glob import glob
import random
import os
import xml.etree.ElementTree as ET
import shutil
import numpy as np
import pandas as pd

from PIL import Image

In [2]:
import sys
sys.path.append("../")

from src.constants import *

In [3]:
BASE_DIR_IMG = f"{PATH}/data/tiles/image_slices/"
BASE_DIR_XML = f"{PATH}/data/tiles/xml_slices/"

CSV = f"{PATH}/data/tiles/data_cardinalidades_sliced.csv"

In [4]:
FILES = glob(BASE_DIR_XML+"*.xml") + glob(BASE_DIR_IMG+"*.png") + glob(os.path.join(BASE_DIR_IMG,"*.jpg"))
IMAGES = glob(os.path.join(BASE_DIR_IMG,"*.png")) + glob(os.path.join(BASE_DIR_IMG,"*.jpg"))

In [5]:
CLASSES = ['muchos_opcional','muchos_obligatorio','uno_opcional','uno_obligatorio']

In [6]:
def generate_train_test(dataset, sep="/"):    
    for (dtype, image_paths, output_csv) in dataset:
        print ("[INFO] creating '{}' set...".format(dtype))
        print ("[INFO] {} total images in '{}' set".format(len(image_paths), dtype))

        csv = open(output_csv, "w")
        
        header_row = ["image_path","xmin", "ymin", "xmax", "ymax", "label"]
        csv.write("{}\n".format(",".join(header_row)))
        
        for image_path in image_paths:
            fname = image_path.split(sep)[-1]
            fname = fname[:fname.rfind(".")]+".xml"
            annot_path = BASE_DIR_XML + fname
            tree = ET.parse(annot_path)
            root = tree.getroot()
            size = root.find("size")
#             h = int(size.find("height").text)
#             w = int(size.find("width").text)

            w,h = Image.open(image_path).size
            print(image_path, w, h)
            
            for label in CLASSES:
                for o in tree.iter("object"):
                    if o.find("name").text==label:
                        box = o.find("bndbox")
                        xmin = int(box.find("xmin").text)
                        ymin = int(box.find("ymin").text)
                        xmax = int(box.find("xmax").text)
                        ymax = int(box.find("ymax").text)
                        # truncate any bounding box coordinates that fall outside
                        # the boundaries of the image
                        xmin = max(0, xmin)
                        ymin = max(0, ymin)
                        xmax = min(w, xmax)
                        ymax = min(h, ymax)

                        # ignore the bounding boxes where the minimum values are larger
                        # than the maximum values and vice-versa due to annotation errors
                        if xmin >= xmax or ymin >= ymax:
                            print("a")
                            continue
                        elif xmax <= xmin or ymax <= ymin:
                            print("b")
                            continue

                        row = [os.path.abspath(image_path),str(xmin), str(ymin), str(xmax), str(ymax), str(label)]
                        csv.write("{}\n".format(",".join(row)))
        csv.close()

In [7]:
def split_wrapper():    
    dataset = [("csv", IMAGES, CSV)]
    #Create and write train and test csv
    generate_train_test(dataset)

In [8]:
def get_val_count(df):
    return df[df.columns[-1]].value_counts()

split_wrapper()

csv = pd.read_csv(CSV)
csv.head()

[INFO] creating 'csv' set...
[INFO] 462 total images in 'csv' set
/home/nacho/TFI-Cazcarra/data/tiles/image_slices/000413.png 1465 1407
a
/home/nacho/TFI-Cazcarra/data/tiles/image_slices/000005.png 637 516
/home/nacho/TFI-Cazcarra/data/tiles/image_slices/000403.png 774 704
a
/home/nacho/TFI-Cazcarra/data/tiles/image_slices/000094.png 504 529
/home/nacho/TFI-Cazcarra/data/tiles/image_slices/000273.png 215 421
/home/nacho/TFI-Cazcarra/data/tiles/image_slices/000429.png 277 383
/home/nacho/TFI-Cazcarra/data/tiles/image_slices/000341.png 637 537
a
/home/nacho/TFI-Cazcarra/data/tiles/image_slices/000212.png 242 312
a
/home/nacho/TFI-Cazcarra/data/tiles/image_slices/000383.png 581 541
/home/nacho/TFI-Cazcarra/data/tiles/image_slices/000396.png 606 421
/home/nacho/TFI-Cazcarra/data/tiles/image_slices/000089.png 665 585
/home/nacho/TFI-Cazcarra/data/tiles/image_slices/000072.png 438 722
/home/nacho/TFI-Cazcarra/data/tiles/image_slices/000185.png 337 278
/home/nacho/TFI-Cazcarra/data/tiles/imag

Unnamed: 0,image_path,xmin,ymin,xmax,ymax,label
0,/home/nacho/TFI-Cazcarra/data/tiles/image_slic...,277,318,333,366,muchos_opcional
1,/home/nacho/TFI-Cazcarra/data/tiles/image_slic...,376,466,433,519,muchos_opcional
2,/home/nacho/TFI-Cazcarra/data/tiles/image_slic...,538,270,590,321,muchos_opcional
3,/home/nacho/TFI-Cazcarra/data/tiles/image_slic...,609,366,661,421,muchos_opcional
4,/home/nacho/TFI-Cazcarra/data/tiles/image_slic...,717,738,769,801,muchos_opcional


In [9]:
get_val_count(csv)

uno_obligatorio       1006
muchos_obligatorio     823
uno_opcional           325
muchos_opcional        314
Name: label, dtype: int64

In [11]:
mapper = pd.read_csv("../data/tiles/mapper.csv")
mapper.columns = ["image_name","tile1","tile2","tile3","tile4","tile5","tile6"]

for c in mapper.columns[1:]:
    mapper[c] = mapper[c].apply(lambda x: str(x).zfill(6))

In [12]:
mapper.head()

Unnamed: 0,image_name,tile1,tile2,tile3,tile4,tile5,tile6
0,ERDiagramsMySQL-1,1,2,3,4,5,6
1,ERDiagramsMySQL-10,7,8,9,10,11,12
2,ERDiagramsMySQL-11,13,14,15,16,17,18
3,ERDiagramsMySQL-12,19,20,21,22,23,24
4,ERDiagramsMySQL-13,25,26,27,28,29,30


In [14]:
train_original = pd.read_csv("../data/csv/train_diagramas_2023.csv")
train_images = train_original.image_path.unique()
train_images = [img_path.split(os.sep)[-1].split(".")[0] for img_path in train_images]

In [15]:
train_sliced = csv[csv['image_path'].isin(train_images)]
test_sliced = csv[~csv['image_path'].isin(train_images)]

In [16]:
def get_original_image(slice_path, mapper=mapper):
    img_path = slice_path.split(os.sep)[-1]
    img_path = img_path.split(".")[0]
    mapper = mapper.set_index("image_name")
    mapper_t = mapper.T
    for c in mapper_t.columns:
        if img_path in mapper_t[c].values:
            return c
    return None

csv['original_img_path'] = csv['image_path'].apply(lambda x: get_original_image(x))
csv.head()

Unnamed: 0,image_path,xmin,ymin,xmax,ymax,label,original_img_path
0,/home/nacho/TFI-Cazcarra/data/tiles/image_slic...,277,318,333,366,muchos_opcional,example35
1,/home/nacho/TFI-Cazcarra/data/tiles/image_slic...,376,466,433,519,muchos_opcional,example35
2,/home/nacho/TFI-Cazcarra/data/tiles/image_slic...,538,270,590,321,muchos_opcional,example35
3,/home/nacho/TFI-Cazcarra/data/tiles/image_slic...,609,366,661,421,muchos_opcional,example35
4,/home/nacho/TFI-Cazcarra/data/tiles/image_slic...,717,738,769,801,muchos_opcional,example35


In [17]:
train_sliced = csv[csv['original_img_path'].isin(train_images)]
test_sliced = csv[~csv['original_img_path'].isin(train_images)]

In [18]:
print(train_sliced.shape)
print(test_sliced.shape)

(1659, 7)
(809, 7)


In [19]:
train_sliced.drop(columns=["original_img_path"]).to_csv(f"{PATH}/data/tiles/train_cardinalidades_2023.csv")
test_sliced.drop(columns=["original_img_path"]).to_csv(f"{PATH}/data/tiles/test_cardinalidades_2023.csv")

In [20]:
train_sliced.head(6)

Unnamed: 0,image_path,xmin,ymin,xmax,ymax,label,original_img_path
0,/home/nacho/TFI-Cazcarra/data/tiles/image_slic...,277,318,333,366,muchos_opcional,example35
1,/home/nacho/TFI-Cazcarra/data/tiles/image_slic...,376,466,433,519,muchos_opcional,example35
2,/home/nacho/TFI-Cazcarra/data/tiles/image_slic...,538,270,590,321,muchos_opcional,example35
3,/home/nacho/TFI-Cazcarra/data/tiles/image_slic...,609,366,661,421,muchos_opcional,example35
4,/home/nacho/TFI-Cazcarra/data/tiles/image_slic...,717,738,769,801,muchos_opcional,example35
5,/home/nacho/TFI-Cazcarra/data/tiles/image_slic...,920,530,970,579,muchos_opcional,example35
