## Reformatting LISA datasets

The LISA datasets-- traffic lights and traffic signs-- are stored in a series of different folders. Within each folder is a csv file, where each row corresponds to one object. In order to train the YOLO model, I need three folders-- a train, validate, and test folder-- which each contain two folders: images and labels. For each image, I have a imageName.jpg file in the images folder-- the image itself-- and a imageName.txt file in the labels folder, with one row per object in that image.

This script converts from the LISA format into the YOLO format.

In [1]:
# Importing libraries
import os
import pandas as pd
import numpy as np
import math
import shutil

In [4]:
# Getting all training images
lisa_relative_path = "raw_images/LISA_traffic_lights"
day_folders = [f"{lisa_relative_path}/dayTrain/dayClip{i}" for i in range(1, 14)]
night_folders = [f"{lisa_relative_path}/dayTrain/dayClip{i}" for i in range(1, 14)]
folders = day_folders + night_folders

# Creating two folder sets: one with labels and images for day, one for night
save = True # Whether I save the images in the YOLO format
dest_dir = "YOLO_data/LISA_traffic_lights"
day_dir = os.path.join(dest_dir, "day")
night_dir = os.path.join(dest_dir, "night")
if not os.path.exists(dest_dir): os.mkdir(dest_dir)
if not os.path.exists(day_dir): os.mkdir(day_dir)
if not os.path.exists(night_dir): os.mkdir(night_dir)

for this_dir in [day_dir, night_dir]:
    img_dir = os.path.join(this_dir, 'images')
    txt_dir = os.path.join(this_dir, 'labels')
    if not os.path.exists(img_dir): os.mkdir(img_dir)
    if not os.path.exists(txt_dir): os.mkdir(txt_dir)

for f in folders: print(f)

raw_images/LISA_traffic_lights/dayTrain/dayClip1
raw_images/LISA_traffic_lights/dayTrain/dayClip2
raw_images/LISA_traffic_lights/dayTrain/dayClip3
raw_images/LISA_traffic_lights/dayTrain/dayClip4
raw_images/LISA_traffic_lights/dayTrain/dayClip5
raw_images/LISA_traffic_lights/dayTrain/dayClip6
raw_images/LISA_traffic_lights/dayTrain/dayClip7
raw_images/LISA_traffic_lights/dayTrain/dayClip8
raw_images/LISA_traffic_lights/dayTrain/dayClip9
raw_images/LISA_traffic_lights/dayTrain/dayClip10
raw_images/LISA_traffic_lights/dayTrain/dayClip11
raw_images/LISA_traffic_lights/dayTrain/dayClip12
raw_images/LISA_traffic_lights/dayTrain/dayClip13
raw_images/LISA_traffic_lights/dayTrain/dayClip1
raw_images/LISA_traffic_lights/dayTrain/dayClip2
raw_images/LISA_traffic_lights/dayTrain/dayClip3
raw_images/LISA_traffic_lights/dayTrain/dayClip4
raw_images/LISA_traffic_lights/dayTrain/dayClip5
raw_images/LISA_traffic_lights/dayTrain/dayClip6
raw_images/LISA_traffic_lights/dayTrain/dayClip7
raw_images/LISA_

In [5]:
# Copying images from into YOLO Folder

for x, f in enumerate(folders):
    print(f)

    # Loading the data frame for this folders' training sequence
    csv_path = os.path.join(lisa_relative_path, f"Annotations/Annotations/{f}/frameAnnotationsBOX.csv")
    col_names = ['file_name', 'label', 'xmin', 'ymin', 'xmax', 'ymax', 'origin', 'frame', 'origin2', 'frame2']
    label_df = pd.read_csv(csv_path, sep=';', header=0, names=col_names)

    # Cleaning up filenames
    if f[:3] == "day": label_df['file_name'] = 'frames/' + label_df['file_name'].str[12:]
    else: label_df['file_name'] = 'frames/' + label_df['file_name'].str[14:]
    display(label_df.head())

    # Determining list of unique labels
    unique_labels = label_df['label'].unique()
    unique_dict = dict(zip(range(unique_labels.shape[0]), unique_labels))
    lookup_dict = {}
    for key, value in unique_dict.items(): lookup_dict[value] = key
    display(unique_dict)

    # Determining list of unique files
    unique_files = label_df['file_name'].unique()
    n_unique_files = unique_files.shape[0]
    all_indices = np.linspace(0, n_unique_files-1, n_unique_files-1)

    # Determining if these files belong in the day or night folder
    this_mode = "night"
    if f[:3] == "day": this_mode = "day"

    # Compiling all data for .txt file for each image
    for i, (img_path, txt_df) in enumerate(label_df.groupby('filename')):
        lines = list()
        for row in txt_df.itertuples():
            xcenter = int(row.xmin + row.xmax / 2)
            ycenter = int(row.ymin + row.ymax / 2) / 960
            xwidth = (row.xmax - row.xmin) / 1280
            ywidth = (row.ymax - row.ymin) / 960
            line = f"{lookup_dict[row.label]} {xcenter} {ycenter} {xwidth} {ywidth}"
            lines.append(line)

        old_img_path = f"{lisa_relative_path}/{this_mode}Train/{f}/{img_path}"
        new_img_path = f"{dest_dir}/{this_mode}/images/{x:03d}_{i:04d}.jpg"
        new_txt_path = f"{dest_dir}/{this_mode}/labels/{x:03d}_{i:04d}.txt"

        # Saving files to new YOLO location
        if save:
            shutil.copy(old_img_path, new_img_path)
            with open(new_txt_path, 'w') as file:
                for line in lines:
                    file.write(line + '\n')

SyntaxError: invalid syntax (1897432207.py, line 18)

The next step is to create more manageable, usable datasets from this large dataset. To start, I will make two datasets: One will have 2,000 images-- 60% training, 20% validation, and 20% testing. Each component will be 50% day photos, 50% night photos. The other dataset will have the same ratios but only 20 total images, taken from the first dataset. This second dataset will be used only for proof of concept runs-- i.e. I'll use it to quickly train a model to establish that all parts of a workflow work correctly, but the results of this model will never matter.

In [None]:
# Creating Framework for datasets
dest_dir1 = "YOLO_data/Dataset1_2000"
dest_dir2 = "YOLO_data/Dataset2_20"

if not os.path.exists(dest_dir1): os.mkdir(dest_dir1)
if not os.path.exists(dest_dir2): os.mkdir(dest_dir2)

for this_dir in [dest_dir1, dest_dir2]:
    test_dir = os.path.join(this_dir, 'test')
    train_dir = os.path.join(this_dir, 'train')
    valid_dir = os.path.join(this_dir, 'valid')
    if not os.path.exists(test_dir): os.mkdir(test_dir)
    if not os.path.exists(train_dir): os.mkdir(train_dir)
    if not os.path.exists(valid_dir): os.mkdir(valid_dir)

for this_dir in [test_dir, train_dir, valid_dir]:
    img_dir = os.path.join(this_dir, 'images')
    txt_dir = os.path.join(this_dir, 'labels')
    if not os.path.exists(img_dir): os.mkdir(img_dir)
    if not os.path.exists(txt_dir): os.mkdir(txt_dir)

# Getting list of image filenames
import glob
import random
day_filenames = glob.glob(f"{dest_dir}/day/labels")
night_filenames = glob.glob(f"{dest_dir}/night/labels")

# Shuffling list of filenames (to randomly select files to use)
random.shuffle(day_filenames)
random.shuffle(night_filenames)

# Confirming there are enough files here to use
assert len(day_filenames) >= 1000
assert len(night_filenames) >= 1000

# Copying files dataset1 framework
# Copying files into train dataset
for i in range(0, 600):
    # Copying over daytime file
    old_img_path = day_filenames[i]
    new_img_path = f"{dest_dir1}/train/images/day_{i+1}.jpg)"
    shutil.copy(old_img_path, new_img_path)
    # Copying over nighttime file
    old_img_path = night_filenames[i]
    new_img_path = f"{dest_dir1}/train/images/night_{i+1}.jpg)"
    shutil.copy(old_img_path, new_img_path)
# Copying files into valid dataset
for i in range(600, 800):
    # Copying over daytime file
    old_img_path = day_filenames[i]
    new_img_path = f"{dest_dir1}/valid/images/day_{i+1}.jpg)"
    shutil.copy(old_img_path, new_img_path)
    # Copying over nighttime file
    old_img_path = night_filenames[i]
    new_img_path = f"{dest_dir1}/valid/images/night_{i+1}.jpg)"
    shutil.copy(old_img_path, new_img_path)
# Copying files into test dataset
for i in range(800, 1000):
    # Copying over daytime file
    old_img_path = day_filenames[i]
    new_img_path = f"{dest_dir1}/test/images/day_{i+1}.jpg)"
    shutil.copy(old_img_path, new_img_path)
    # Copying over nighttime file
    old_img_path = night_filenames[i]
    new_img_path = f"{dest_dir1}/test/images/night_{i+1}.jpg)"
    shutil.copy(old_img_path, new_img_path)

# Copying files dataset2 framework
# Copying files into train dataset
for i in range(0, 6):
    # Copying over daytime file
    old_img_path = day_filenames[i]
    new_img_path = f"{dest_dir1}/train/images/day_{i+1}.jpg)"
    shutil.copy(old_img_path, new_img_path)
    # Copying over nighttime file
    old_img_path = night_filenames[i]
    new_img_path = f"{dest_dir1}/train/images/night_{i+1}.jpg)"
    shutil.copy(old_img_path, new_img_path)
# Copying files into valid dataset
for i in range(6, 8):
    # Copying over daytime file
    old_img_path = day_filenames[i]
    new_img_path = f"{dest_dir1}/valid/images/day_{i+1}.jpg)"
    shutil.copy(old_img_path, new_img_path)
    # Copying over nighttime file
    old_img_path = night_filenames[i]
    new_img_path = f"{dest_dir1}/valid/images/night_{i+1}.jpg)"
    shutil.copy(old_img_path, new_img_path)
# Copying files into test dataset
for i in range(8, 10):
    # Copying over daytime file
    old_img_path = day_filenames[i]
    new_img_path = f"{dest_dir1}/test/images/day_{i+1}.jpg)"
    shutil.copy(old_img_path, new_img_path)
    # Copying over nighttime file
    old_img_path = night_filenames[i]
    new_img_path = f"{dest_dir1}/test/images/night_{i+1}.jpg)"
    shutil.copy(old_img_path, new_img_path)