## Reformatting LISA datasets

The LISA datasets -- traffic lights and traffic signs -- are stored in a series of different folders. Within each folder is a CSV file, where each row corresponds to one object. In order to train the YOLO model, I need three folders -- train, validate, and test -- each of which contains two folders: images and labels. For each image, I have an imageName.jpg file in the images folder (the image itself) and an imageName.txt file in the labels folder, with one row per object in that image.

This script converts from the LISA format into the YOLO format.

In [1]:
# Importing libraries
import os
import pandas as pd
import numpy as np
import math
import shutil

In [2]:
# Getting all training images
# (13 day clips and 5 night clips, listed by their folder path)
lisa_relative_path = "raw_images/LISA_traffic_lights"
day_folders = [f"{lisa_relative_path}/dayTrain/dayClip{i}" for i in range(1, 14)]
night_folders = [f"{lisa_relative_path}/nightTrain/nightClip{i}" for i in range(1, 6)]
folders = day_folders + night_folders

# Creating two folder sets: one with labels and images for day, one for night
save = True # Whether I save the images in the YOLO format
dest_dir = "YOLO_data/LISA_traffic_lights"
day_dir = os.path.join(dest_dir, "day")
night_dir = os.path.join(dest_dir, "night")

# os.makedirs also creates any missing parent folder (dest_dir and the
# "YOLO_data" folder above it), and exist_ok=True makes re-running this
# cell harmless when the folders are already there
for this_dir in [day_dir, night_dir]:
    img_dir = os.path.join(this_dir, 'images')
    txt_dir = os.path.join(this_dir, 'labels')
    os.makedirs(img_dir, exist_ok=True)
    os.makedirs(txt_dir, exist_ok=True)

for f in folders: print(f)

raw_images/LISA_traffic_lights/dayTrain/dayClip1
raw_images/LISA_traffic_lights/dayTrain/dayClip2
raw_images/LISA_traffic_lights/dayTrain/dayClip3
raw_images/LISA_traffic_lights/dayTrain/dayClip4
raw_images/LISA_traffic_lights/dayTrain/dayClip5
raw_images/LISA_traffic_lights/dayTrain/dayClip6
raw_images/LISA_traffic_lights/dayTrain/dayClip7
raw_images/LISA_traffic_lights/dayTrain/dayClip8
raw_images/LISA_traffic_lights/dayTrain/dayClip9
raw_images/LISA_traffic_lights/dayTrain/dayClip10
raw_images/LISA_traffic_lights/dayTrain/dayClip11
raw_images/LISA_traffic_lights/dayTrain/dayClip12
raw_images/LISA_traffic_lights/dayTrain/dayClip13
raw_images/LISA_traffic_lights/nightTrain/nightClip1
raw_images/LISA_traffic_lights/nightTrain/nightClip2
raw_images/LISA_traffic_lights/nightTrain/nightClip3
raw_images/LISA_traffic_lights/nightTrain/nightClip4
raw_images/LISA_traffic_lights/nightTrain/nightClip5


In [3]:
# Copying images from the LISA folders into the YOLO folder, and writing one
# YOLO label file (.txt) per image

# Pixel size used to normalize the box coordinates into fractions
# (hardcoded; assumes every LISA frame is 1280x960 -- TODO confirm)
img_width, img_height = 1280, 960

# Determining list of unique labels
unique_dict = {0: 'stop', 1: 'stopLeft', 2: 'go', 3: 'goLeft', 4: 'warning', 5: 'warningLeft'}
lookup_dict = {value: key for key, value in unique_dict.items()}  # label name -> class number
first_lookup_dict = {}  # new YOLO .txt path -> original LISA image path

for x, f in enumerate(folders):
    print(x, f)

    # Determining if these files belong in the day or night folder
    # (this only depends on the folder, so it is done once per folder)
    this_mode = "day" if "day" in f else "night"
    clip_name = f.rsplit('/', 1)[1]

    # Loading the data frame for this folder's training sequence
    # clip_subpath is f without the leading "<lisa_relative_path>/"
    clip_subpath = f[len(lisa_relative_path) + 1:]
    csv_path = os.path.join(lisa_relative_path, f"Annotations/Annotations/{clip_subpath}/frameAnnotationsBOX.csv")
    col_names = ['file_name', 'label', 'xmin', 'ymin', 'xmax', 'ymax', 'origin', 'frame', 'origin2', 'frame2']
    label_df = pd.read_csv(csv_path, sep=';', header=0, names=col_names)

    # Cleaning up filenames: drop the first 12 (day) or 14 (night) characters
    # and prepend 'frames/'
    # NOTE(review): presumably the dropped part is a "dayTraining/" /
    # "nightTraining/" prefix -- confirm against the CSV contents
    prefix_length = 12 if this_mode == "day" else 14
    label_df['file_name'] = 'frames/' + label_df['file_name'].str[prefix_length:]
    # display(label_df.head())

    # Compiling all data for .txt file for each image
    for i, (img_path, txt_df) in enumerate(label_df.groupby('file_name')):
        lines = []
        for row in txt_df.itertuples():
            # NOTE(review): int() truncates the box centre to a whole pixel
            # before normalizing; kept so the label values stay unchanged
            xcenter = int((row.xmin + row.xmax) / 2) / img_width
            ycenter = int((row.ymin + row.ymax) / 2) / img_height
            xwidth = (row.xmax - row.xmin) / img_width
            ywidth = (row.ymax - row.ymin) / img_height
            line = f"{lookup_dict[row.label]} {xcenter} {ycenter} {xwidth} {ywidth}"
            lines.append(line)

            # Sanity check: print any box whose center reaches the right edge
            if xcenter >= 1: print(row.xmin, row.xmax, xcenter)
            if xcenter > 1.0: print(line)

        # Getting file paths
        old_img_path = f"{lisa_relative_path}/{this_mode}Train/{this_mode}Train/{clip_name}/{img_path}"
        new_img_path = f"{dest_dir}/{this_mode}/images/{x:03d}_{i:04d}.jpg"
        new_txt_path = f"{dest_dir}/{this_mode}/labels/{x:03d}_{i:04d}.txt"

        first_lookup_dict[new_txt_path] = old_img_path

        # Saving files to new YOLO location
        if save:
            shutil.copy(old_img_path, new_img_path)
            with open(new_txt_path, 'w') as file:
                file.writelines(f"{line}\n" for line in lines)

0 raw_images/LISA_traffic_lights/dayTrain/dayClip1
1 raw_images/LISA_traffic_lights/dayTrain/dayClip2
2 raw_images/LISA_traffic_lights/dayTrain/dayClip3
3 raw_images/LISA_traffic_lights/dayTrain/dayClip4
4 raw_images/LISA_traffic_lights/dayTrain/dayClip5
5 raw_images/LISA_traffic_lights/dayTrain/dayClip6
6 raw_images/LISA_traffic_lights/dayTrain/dayClip7
7 raw_images/LISA_traffic_lights/dayTrain/dayClip8
8 raw_images/LISA_traffic_lights/dayTrain/dayClip9
9 raw_images/LISA_traffic_lights/dayTrain/dayClip10
10 raw_images/LISA_traffic_lights/dayTrain/dayClip11
11 raw_images/LISA_traffic_lights/dayTrain/dayClip12
12 raw_images/LISA_traffic_lights/dayTrain/dayClip13
13 raw_images/LISA_traffic_lights/nightTrain/nightClip1
14 raw_images/LISA_traffic_lights/nightTrain/nightClip2
15 raw_images/LISA_traffic_lights/nightTrain/nightClip3
16 raw_images/LISA_traffic_lights/nightTrain/nightClip4
17 raw_images/LISA_traffic_lights/nightTrain/nightClip5


The next step is to create more manageable, usable datasets from this large dataset. To start, I will make two datasets: One will have 2,000 images-- 60% training, 20% validation, and 20% testing. Each component will be 50% day photos, 50% night photos. The other dataset will have the same ratios but only 20 total images, taken from the first dataset. This second dataset will be used only for proof of concept runs-- i.e. I'll use it to quickly train a model to establish that all parts of a workflow work correctly, but the results of this model will never matter.

Note that, for every dataset, I also need to adjust the class numbering to the classes that actually appear in it. Basically, once I grab the X images for a dataset and copy them in, I need to determine how many different classes of objects are in the dataset, adjust the class numbers listed in my .txt files, and create a corresponding .yaml file.

In [4]:
def create_dataset_skeleton(this_dir):
    """Create the empty train/valid/test folder layout inside this_dir.

    Creates three folders called 'test', 'train', and 'valid' inside of
    this_dir, each with an "images" and a "labels" folder inside of them.
    this_dir itself and any missing parent folders are created as well, and
    folders that already exist are left untouched, so the function can be
    called repeatedly on the same path.

    Args:
        this_dir: Path of the main dataset folder.
    """
    for split_name in ("test", "train", "valid"):
        for content_name in ("images", "labels"):
            # makedirs builds the whole chain (this_dir/split/content) in one go
            os.makedirs(os.path.join(this_dir, split_name, content_name), exist_ok=True)

# Taken from https://github.com/ncallahanml/potential_vehicle_projects/blob/main/Examples/YOLOv8Train%26Deployment.ipynb
# this io could be handled with PyYAML
def write_yaml_config(class_dict, save_path, primary_path=None, train_path="train/", test_path="test/", valid_path="valid/"):
    """Write a dataset config file in YAML form to save_path.

    The file lists the dataset root (path), the train/val/test folders, and
    a "names:" section with one "  <class number>: <class name>" row per
    entry of class_dict, in sorted key order.

    Args:
        class_dict: Maps class number to class name.
        save_path: Output file path; must end with '.yaml'.
        primary_path: Dataset root dir written after "path:".
        train_path: Train folder written after "train:".
        test_path: Test folder written after "test:".
        valid_path: Validation folder written after "val:".

    Raises:
        AssertionError: If save_path does not end with '.yaml'.
    """
    # Fixed part of the file; the two leading empty strings reproduce the
    # two blank lines the file starts with
    header_rows = [
        "",
        "",
        f"path: {primary_path}  # dataset root dir",
        f"train: {train_path}  # train images (relative to 'path')",
        f"val: {valid_path}  # val images (relative to 'path')",
        f"test: {test_path}",
        "",
        "names:",
    ]
    name_rows = [f"  {class_id}: {class_dict[class_id]}" for class_id in sorted(class_dict.keys())]
    yaml_text = "\n".join(header_rows + name_rows)

    assert save_path.endswith('.yaml'), 'End file with .yaml extension'
    with open(save_path, 'w') as out_file:
        out_file.write(yaml_text)

# 


In [16]:
import glob
import random

# Creating Dataset
# Hardcoding certain parameters
dest_dir1 = "YOLO_data/DatasetMixed_2000"
dataset_size = 2000 # Total number of images in the dataset

day_proportion, night_proportion = 0.5, 0.5 # Must add to 1
train_proportion, val_proportion, test_proportion = 0.6, 0.2, 0.2 # Must add to 1


def _label_path_for(image_path):
    """Return the .../labels/<name>.txt path matching a .../images/<name>.<ext> path."""
    images_dir, image_name = os.path.split(image_path)
    stem = os.path.splitext(image_name)[0]
    return f"{os.path.dirname(images_dir)}/labels/{stem}.txt"


def _classes_in(label_path):
    """Return the set of class numbers (first field of each row) in one label file."""
    with open(label_path, 'r') as file:
        return {int(line.split()[0]) for line in file if line.strip()}


def _split_boundaries(mode, n_images):
    """Return [(mode, split, start, end), ...] index ranges for train/valid/test.

    Uses the train/val/test proportions set at the top of this cell. The
    boundaries are rounded (not truncated) so that floating point error in
    the products cannot drop an image, and each range starts exactly where
    the previous one ends.
    """
    train_end = round(n_images * train_proportion)
    valid_end = round(n_images * (train_proportion + val_proportion))
    test_end = round(n_images * (train_proportion + val_proportion + test_proportion))
    return [(mode, "train", 0, train_end),
            (mode, "valid", train_end, valid_end),
            (mode, "test", valid_end, test_end)]


def _copy_split(filenames, boundaries, lookup):
    """Copy images into dest_dir1 and write their label files with converted class numbers.

    For every (mode, split, start, end) range in boundaries, filenames[start:end]
    are copied to dest_dir1/<split>/images/<mode>_<index+1>.jpg. The matching
    label file is rewritten to dest_dir1/<split>/labels/<mode>_<index+1>.txt with
    the class number of each row mapped through conversion_dict. lookup is
    filled with new label path -> old label path.
    """
    for mode, split, start, end in boundaries:
        for i in range(start, end):
            # Copying image to new location
            old_img_path = filenames[i]
            new_img_path = f"{dest_dir1}/{split}/images/{mode}_{i+1}.jpg"
            shutil.copy(old_img_path, new_img_path)

            old_txt_path = _label_path_for(old_img_path)
            new_txt_path = f"{dest_dir1}/{split}/labels/{mode}_{i+1}.txt"

            # Reading rows from old file; the first field is the class number
            # (For one-class training, write "0" instead of the converted class)
            rows = []
            with open(old_txt_path, 'r') as file:
                for line in file:
                    fields = line.split()
                    if not fields:
                        continue  # skip blank lines
                    fields[0] = str(conversion_dict[int(fields[0])])
                    rows.append(" ".join(fields))

            # Writing rows to new file, exactly one row per line
            with open(new_txt_path, 'w') as file:
                file.writelines(f"{row}\n" for row in rows)

            lookup[new_txt_path] = old_txt_path


# Creating Framework for datasets
create_dataset_skeleton(dest_dir1)

# Getting list of image filenames
day_filenames = glob.glob(f"{dest_dir}/day/images/*")
print(dest_dir)
night_filenames = glob.glob(f"{dest_dir}/night/images/*")

# Shuffling list of filenames (to randomly select files to use)
# No seed is set, so a different selection is made on every run
random.shuffle(day_filenames)
random.shuffle(night_filenames)

# Determining which classes appear in the dataset
unique_dict = {0: 'stop', 1: 'stopLeft', 2: 'go', 3: 'goLeft', 4: 'warning', 5: 'warningLeft'} # Hardcoded; all classes in big dataset

# Confirming there are enough files here to use
n_day_images = round(dataset_size * day_proportion)
n_night_images = round(dataset_size * night_proportion)
if len(day_filenames) < n_day_images:
    raise ValueError(f"Need {n_day_images} day images but only found {len(day_filenames)}")
if len(night_filenames) < n_night_images:
    raise ValueError(f"Need {n_night_images} night images but only found {len(night_filenames)}")

# Adding each object's class, over every file that will go in the dataset
this_dataset_classes = set()
for image_path in day_filenames[:n_day_images] + night_filenames[:n_night_images]:
    this_dataset_classes |= _classes_in(_label_path_for(image_path))

# Creating conversion between all possible classes and classes for this dataset
this_dataset_classes = sorted(this_dataset_classes)
# The result: conversion_dict is used to convert from the classNum in the txt files into the classNum corresponding to the final YAML file
conversion_dict = {old_class: new_class for new_class, old_class in enumerate(this_dataset_classes)}
# The result: this_dataset_dict is essentially the key to be used in the final yaml file
this_dataset_dict = {new_class: unique_dict[old_class] for old_class, new_class in conversion_dict.items()}

# Creating this YAML file
# (For one-class training, pass {0: 'traffic light'} instead of this_dataset_dict)
dataset_name = dest_dir1.split('/')[-1]
write_yaml_config(
    this_dataset_dict,
    f'./{dataset_name}.yaml',
    primary_path=f'/home/letucker/road_asset_images/YOLO_data/{dataset_name}',
    train_path='train/',
    test_path='test/',
    valid_path='valid/',
)

# Copying files into new dataset
day_range_boundaries = _split_boundaries("day", n_day_images)
night_range_boundaries = _split_boundaries("night", n_night_images)

second_lookup_dict = {}  # new label path -> old label path

# Copying all day files, then all night files, into new dataset
_copy_split(day_filenames, day_range_boundaries, second_lookup_dict)
_copy_split(night_filenames, night_range_boundaries, second_lookup_dict)

YOLO_data/LISA_traffic_lights
