In [23]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import xml.etree.cElementTree as ET
import glob
import os
import json
import random
import shutil
import PIL
from PIL import Image, ImageOps
from tqdm import tqdm
from skmultilearn.model_selection import iterative_train_test_split
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
import os

#### Bronze

In [3]:
def xml_to_yolo_bbox(bbox, w, h):
    """Turn a corner-style box into a normalised centre-style box.

    Args:
        bbox: Indexable holding ``xmin, ymin, xmax, ymax`` in that order.
        w: Value the x-axis quantities are divided by (callers in this
            notebook pass the image width read from the annotation).
        h: Value the y-axis quantities are divided by (callers in this
            notebook pass the image height read from the annotation).

    Returns:
        ``[x_center, y_center, width, height]``, each divided by ``w`` or
        ``h`` respectively.
    """
    left, top, right, bottom = bbox[0], bbox[1], bbox[2], bbox[3]

    # Midpoint of each axis, still in the input units.
    mid_x = (right + left) / 2
    mid_y = (bottom + top) / 2

    return [
        mid_x / w,
        mid_y / h,
        (right - left) / w,
        (bottom - top) / h,
    ]

def yolo_to_xml_bbox(bbox, w, h):
    """Turn a normalised centre-style box back into integer corners.

    Args:
        bbox: Indexable holding ``x_center, y_center, width, height`` in
            that order.
        w: Factor the x-axis quantities are multiplied by.
        h: Factor the y-axis quantities are multiplied by.

    Returns:
        ``[xmin, ymin, xmax, ymax]`` where every entry has been passed
        through ``int()`` (i.e. truncated towards zero).
    """
    # Scaled centre of the box.
    cx = bbox[0] * w
    cy = bbox[1] * h

    # Half of the scaled extent along each axis.
    half_w = (bbox[2] * w) / 2
    half_h = (bbox[3] * h) / 2

    return [
        int(cx - half_w),
        int(cy - half_h),
        int(cx + half_w),
        int(cy + half_h),
    ]

In [4]:
# Known class names; the position in this list is the class index written to
# the YOLO label files. Labels not listed here are appended by the conversion
# loop as they are encountered.
classes = ["without_mask", "with_mask", "mask_weared_incorrect"]

# Input (raw) and output (bronze) locations, relative to the notebook.
raw_label_dir = "../data/raw/annotations"
raw_image_dir = "../data/raw/images"
bronze_label_dir = "../data/bronze/labels"
bronze_label_metadata_dir = "../data/bronze"

# makedirs (rather than mkdir) also creates the missing "../data/bronze"
# parent, and exist_ok makes the cell safe to re-run.
os.makedirs(bronze_label_dir, exist_ok=True)

# glob returns paths in arbitrary order; sort so the processing order (and
# therefore the index given to any newly discovered class) is deterministic.
files = sorted(glob.glob(os.path.join(raw_label_dir, "*.xml")))

In [22]:
# Convert every XML annotation into a YOLO-format text file (one row per
# object: "<class index> <x_center> <y_center> <width> <height>").
for file in tqdm(files):
    # Stem shared by the annotation and its image ("foo" for "foo.xml").
    basename = os.path.basename(file)
    filename = os.path.splitext(basename)[0]
    if not os.path.exists(os.path.join(raw_image_dir, f"{filename}.png")):
        print(f"{filename} image does not exist!")
        continue

    result = []

    tree = ET.parse(file)
    root = tree.getroot()
    size = root.find("size")
    width = int(size.find("width").text)
    height = int(size.find("height").text)

    for obj in root.findall("object"):
        label = obj.find("name").text

        # Unknown labels are registered on the fly so they still get an index.
        if label not in classes:
            classes.append(label)
        index = classes.index(label)

        # Read the corners by tag name instead of relying on the order of the
        # <bndbox> children, so a differently ordered annotation cannot
        # silently produce a wrong box.
        bndbox = obj.find("bndbox")
        pil_bbox = [int(bndbox.find(tag).text) for tag in ("xmin", "ymin", "xmax", "ymax")]
        yolo_bbox = xml_to_yolo_bbox(pil_bbox, width, height)

        bbox_string = " ".join([str(x) for x in yolo_bbox])
        result.append(f"{index} {bbox_string}")

    # Annotations without any <object> produce no label file at all.
    if result:
        with open(os.path.join(bronze_label_dir, f"{filename}.txt"), "w", encoding = "utf-8") as f:
            f.write("\n".join(result))

100%|██████████████████████████████████████████████████████████████████████████████| 853/853 [00:00<00:00, 2049.16it/s]


In [23]:
# Persist the class-name list as a JSON array next to the bronze labels.
classes_path = f"{bronze_label_metadata_dir}/classes.txt"
with open(classes_path, mode="w", encoding="utf-8") as classes_file:
    serialized = json.dumps(classes)
    classes_file.write(serialized)

#### Silver

In [29]:
# glob returns matches in arbitrary order; sort both listings so downstream
# cells see the same, stable ordering on every run and platform.
bronze_label_files = sorted(glob.glob(f'{bronze_label_dir}/*.txt'))
image_files = sorted(glob.glob(f'{raw_image_dir}/*.png'))

In [34]:
# Collect the paths of images that are empty, fail Pillow's integrity check,
# or cannot be fully decoded.
corrupt_img = []
for file in tqdm(image_files):
    try:
        # A zero-byte file can never be a valid image; no need to decode it.
        if os.stat(file).st_size == 0:
            corrupt_img.append(file)
            continue

        # Context managers guarantee the file handle is released even when
        # the check raises.
        with Image.open(file) as im:
            im.verify() #detect defect on image

        #reload is necessary: the image object cannot be used after verify()
        with Image.open(file) as im:
            im.transpose(PIL.Image.FLIP_LEFT_RIGHT) #detect truncated image
    except Exception:
        # `Exception` rather than a bare `except:` so that interrupting the
        # kernel (KeyboardInterrupt) is not mistaken for a corrupt file.
        corrupt_img.append(file)

print(f'{len(corrupt_img)} image files is corrupted')

100%|████████████████████████████████████████████████████████████████████████████████| 853/853 [00:39<00:00, 21.54it/s]

0 image files is corrupted





In [None]:
# For every bronze label file, collect the sorted list of distinct class ids
# (as strings) that occur in it. Entries are in the same order as
# `bronze_label_files`.
# NOTE(review): this rebinds `classes`, which holds the class *names* in the
# Bronze section; re-running the Bronze cells after this one will misbehave.
classes = []
for file in tqdm(bronze_label_files):
    with open(file, "r", encoding = "utf-8") as f:
        # The class id is the first whitespace-separated token of each row.
        # Taking the whole token (not just its first character) keeps
        # multi-digit ids intact, and blank lines (e.g. a trailing newline)
        # are skipped instead of raising IndexError.
        class_ = sorted({line.split()[0] for line in f if line.strip()})
    classes.append(class_)

In [170]:
# Multi-hot encoding of class presence per label file: column `class_<k>` is 1
# when class id "<k>" occurs in the file and 0 otherwise.
# Exact list membership is used instead of a substring search on the
# stringified list, so an id such as "10" can never be counted as "1" or "0".
df = pd.DataFrame(
    {
        f"class_{class_id}": [int(str(class_id) in file_classes) for file_classes in classes]
        for class_id in range(3)
    }
).astype("int32")

In [171]:
def _image_for_label(label_file):
    """Return the raw image path that shares its file stem with `label_file`."""
    stem = os.path.splitext(os.path.basename(label_file))[0]
    return os.path.join(raw_image_dir, f"{stem}.png")

# Pair each label file with the image of the same stem. Zipping two
# independent directory listings would mis-pair (or silently truncate) as soon
# as the listings differ in order or length, e.g. when an image has no label
# file.
X = np.array([[_image_for_label(l), l] for l in bronze_label_files])
y = df.values

# One target row per (image, label) pair; fail loudly if the cells above were
# run out of order.
assert len(X) == len(y), f"{len(X)} samples but {len(y)} target rows"

In [172]:
# Multi-label stratified split: 75% train / 25% validation.
# Seed NumPy's global RNG first so the split is reproducible across runs.
# NOTE(review): assumes the stratifier draws from NumPy's global RNG when
# breaking ties - confirm against the installed skmultilearn version.
np.random.seed(42)
X_train, y_train, X_val, y_val = iterative_train_test_split(X, y, test_size = 0.25)

In [173]:
def _as_split_frame(paths, targets):
    """Place the path columns and the target columns of one split side by side.

    Columns 0/1 of `paths` are renamed to "image"/"label"; columns 0/1/2 of
    `targets` are renamed to "label_0"/"label_1"/"label_2".
    """
    path_part = pd.DataFrame(paths).rename(columns={0: "image", 1: "label"})
    target_part = pd.DataFrame(targets).rename(columns={0: "label_0", 1: "label_1", 2:"label_2"})
    return pd.concat([path_part, target_part], axis=1)

df_train = _as_split_frame(X_train, y_train)

df_val = _as_split_frame(X_val, y_val)

In [174]:
silver_label_dir = '../data/silver/labels'
silver_image_dir = '../data/silver/images'
# NOTE(review): `train_split` is not referenced by any visible cell; the split
# above is driven by test_size = 0.25.
train_split = 0.8

# Create every split folder unconditionally. Checking only the parent folder
# would skip the train/val subfolders after a partial earlier run, and the
# copies below would then fail.
for split_name in ("train", "val"):
    os.makedirs(f'{silver_label_dir}/{split_name}', exist_ok=True)
    os.makedirs(f'{silver_image_dir}/{split_name}', exist_ok=True)


def _copy_split(split_df, split_name):
    """Copy the image/label file pairs listed in `split_df` into the silver
    `<split_name>` folders, keeping the original file names."""
    for im, lbl in split_df[['image', 'label']].values.tolist():
        # Normalise Windows separators so the file name can be split off.
        im = im.replace('\\', '/')
        lbl = lbl.replace('\\', '/')
        shutil.copyfile(im, f"{silver_image_dir}/{split_name}/{im.split('/')[-1]}")
        shutil.copyfile(lbl, f"{silver_label_dir}/{split_name}/{lbl.split('/')[-1]}")


_copy_split(df_train, "train")
_copy_split(df_val, "val")