# TL-Seoul dataset preprocessing
* Convert the per-image annotation text files into a single CSV file (`train.csv`) with one row per bounding box:
    ```
    |class|top|left|width|height| -> |image_id|image_name|category_id|bbox(x_min|y_min|width|height)|trainvaltest|
    ```

In [20]:
from IPython.core.display import Image, display
import cv2
import matplotlib.pyplot as plt
%matplotlib inline

image_date = "2018-09-04-14-24-39"
image_file = "pylon_camera_node-000000-1536038679.956.jpg"

## display image with bounding box
# image = cv2.imread(image_path)
# cv2.rectangle(image,(bbox[0], bbox[1]),(bbox[2]+bbox[0], bbox[3]+bbox[1]),(220, 0, 0), 1)

# plt.figure(figsize=(20,20))
# plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
# plt.show()


  from IPython.core.display import Image, display


In [1]:
from pathlib import Path
from tqdm.notebook import tqdm
import pandas as pd

# Root directory of the dataset. The split manifests (train.txt, valid.txt,
# test.txt), the per-recording image folders and the exported train.csv are
# all resolved relative to it.
# NOTE(review): machine-specific absolute path; adjust before running elsewhere.
base_path = r"C:\Users\kyung\Downloads\images"

def get_images_from_txt(path):
    """Read a split manifest and return the set of entries it lists.

    Every line of the text file at ``path`` is stripped of surrounding
    whitespace and collected into a set. Blank or whitespace-only lines are
    skipped so that they do not show up as an empty-string entry and inflate
    the reported split size.

    Args:
        path: Location of the text file to read, one entry per line.

    Returns:
        set[str]: The distinct, stripped, non-empty lines of the file.

    Raises:
        OSError: If the file cannot be opened for reading.
    """
    with open(path) as file:
        # Iterate the handle lazily rather than loading all lines up front.
        stripped_lines = (line.strip() for line in file)
        return {entry for entry in stripped_lines if entry}

# ---------------------------------------------------------------------------
# Load the three split manifests. Their entries are compared below against
# keys of the form "./<folder>/<file>".
# ---------------------------------------------------------------------------
# train
train_images = get_images_from_txt(f"{base_path}/train.txt")
print(f"train dataset size:      {len(train_images)}")

# val
val_images = get_images_from_txt(f"{base_path}/valid.txt")
print(f"validation dataset size: {len(val_images)}")

# test
test_images = get_images_from_txt(f"{base_path}/test.txt")
print(f"test dataset size:       {len(test_images)}")

# Column-oriented accumulator for the output table: one entry per bounding
# box, or a single placeholder entry for an image without annotation files.
data = dict(image_id=list(), image_name=list(), category_id=list(), bbox=list(), trainvaltest=list())
image_id_cnt = 0

# Map every sub-folder of base_path to the .jpg file names it contains.
# Directory listings come back in arbitrary order, so both levels are sorted
# to make the image_id assignment reproducible from run to run.
images = {}
for path in sorted(Path(base_path).glob('*')):
    if path.is_dir():
        images[path.name] = sorted(p.name for p in path.glob('*.jpg'))

no_anno_list = []  # images in a split whose bbox or bulb CSV is missing
no_use_list = []   # images that appear in none of the three manifests
cnt_trainvaltest = dict(train=0, val=0, test=0)

tk = tqdm(images.items())
for image_folder, image_file_list in tk:
    for idx, image_file in enumerate(image_file_list):
        # Report per-folder progress for every image, including skipped ones.
        tk.set_postfix(images=f"{idx+1}/{len(image_file_list)}")

        image_path = f"{base_path}/{image_folder}/{image_file}"
        # Key used both for manifest lookup and for the image_name column.
        image_name = f"./{image_folder}/{image_file}"

        # check annotation
        # Annotation files are named after the full image file name,
        # extension included, with ".csv" appended.
        annotation_path = f"{base_path}/{image_folder}/anno"
        bbox_path = f"{annotation_path}/bbox/{image_file}.csv"
        bulb_path = f"{annotation_path}/bulb/{image_file}.csv"

        # check train/val/test
        # If an image is listed in several manifests, test takes precedence
        # over val, which takes precedence over train.
        trainvaltest = None
        if image_name in test_images:
            trainvaltest = "test"
        elif image_name in val_images:
            trainvaltest = "val"
        elif image_name in train_images:
            trainvaltest = "train"

        if trainvaltest is None:
            # Not part of any split: remember it and move on without an id.
            no_use_list.append(image_path)
            continue

        cnt_trainvaltest[trainvaltest] += 1

        # The bulb CSV is only checked for existence; its contents are not
        # read anywhere in this cell.
        if (not Path(bbox_path).exists()) or (not Path(bulb_path).exists()):
            # Keep the image in the table with a placeholder row:
            # category_id -1 and an empty bbox.
            no_anno_list.append(image_path)
            data["image_id"].append(image_id_cnt)
            data["image_name"].append(image_name)
            data["category_id"].append(-1)
            data["bbox"].append([])
            data["trainvaltest"].append(trainvaltest)
            image_id_cnt += 1
            continue

        with open(bbox_path, "r") as file:
            for anno in file:
                anno = anno.strip()
                if not anno:
                    # A blank line would otherwise crash int('') below.
                    continue
                # First field is the class id, the remaining fields form the
                # bbox and are copied in file order.
                category_id, *bbox = map(int, anno.split(","))
                data["image_id"].append(image_id_cnt)
                data["image_name"].append(image_name)
                data["category_id"].append(category_id)
                data["bbox"].append(bbox)
                data["trainvaltest"].append(trainvaltest)

        # NOTE(review): a bbox file without any annotation row adds nothing to
        # `data` but still consumes an image id, so such images are absent
        # from the exported table. Confirm whether they should get a
        # placeholder row like the missing-annotation case above.
        image_id_cnt += 1

# Export the table. Despite its name, train.csv holds the rows of all three
# splits; the trainvaltest column tells them apart.
dataframe = pd.DataFrame.from_dict(data)
dataframe.to_csv(f'{base_path}/train.csv', index=False)
print(dataframe.head())

train dataset size:      15555
validation dataset size: 5185
test dataset size:       5142


  0%|          | 0/483 [00:00<?, ?it/s]

   image_id                                         image_name  category_id  \
0         0  ./2018-09-04-14-24-39/pylon_camera_node-000000...            1   
1         1  ./2018-09-04-14-24-39/pylon_camera_node-000010...            1   
2         2  ./2018-09-04-14-24-39/pylon_camera_node-000020...            1   
3         3  ./2018-09-04-14-24-39/pylon_camera_node-000030...            1   
4         4  ./2018-09-04-14-24-39/pylon_camera_node-000040...            1   

                  bbox trainvaltest  
0  [1453, 594, 31, 10]        train  
1  [1359, 596, 29, 10]         test  
2   [1285, 598, 29, 9]          val  
3  [1212, 597, 26, 10]          val  
4   [1138, 596, 25, 9]         test  


In [2]:
# Summarise which images were left out or lacked annotations during conversion.
missing_anno_total = len(no_anno_list)
unused_total = len(no_use_list)

print("no annotation:", missing_anno_total)
print("no use_list:", unused_total)
# Same count as the previous line, reported under a second label.
print("no train/val/test:", unused_total)

# Last expression of the cell: display the per-split image counters.
cnt_trainvaltest

no annotation: 808
no use_list: 44
no train/val/test: 44


{'train': 15555, 'val': 5185, 'test': 5142}

In [1]:
import pandas as pd
# Dataset root, defined again in this cell so it does not depend on an
# earlier cell having run in the same kernel.
# NOTE(review): machine-specific absolute path; adjust before running elsewhere.
base_path = r"C:\Users\kyung\Downloads\images"
# Reload the exported table from disk.
dataframe = pd.read_csv(f'{base_path}/train.csv')
# Count distinct values per column. read_csv is called without converters, so
# the bbox column is presumably compared as text here; verify if exact
# box-level uniqueness matters.
dataframe.nunique()

image_id        24933
image_name      24933
category_id         5
bbox            74942
trainvaltest        3
dtype: int64