# Setup

## Imports

In [1]:
import os
import shutil

import pandas as pd
import numpy as np
import cv2

## Output

### Name of output dataset

In [24]:
# Name of the generated dataset; it is used below as the directory name under
# "datasets" and "raw-datasets" and as the base name of the exported archive.
dataset_name = "43-classes"
# Alternative configuration (enable together with the 12-class list):
# dataset_name = "12-classes"

### Which classes from the initial dataset to use

In [25]:
# Source class folders to include: zero-padded five-digit ids "00000".."00042".
classes = [f"{class_index:05d}" for class_index in range(43)]

In [26]:
# classes = ["00012", "00013", "00014", "00019", "00020", "00021", "00027", "00028", "00031", "00033", "00034", "00035"]

### Image dimensions

In [27]:
# Target size of every exported image; it is passed as the size argument to
# cv2.resize, which interprets the tuple as (width, height).
dims = (32, 32)

# Separating images with the desired classes

## Define paths

### New dataset paths

In [28]:
# Resolve every output location first, then create the directories.
# Processed images go under datasets/<dataset_name>; renamed copies of the
# source images go under raw-datasets/<dataset_name>.
path_dataset = os.path.join("datasets", dataset_name)
path_raw_dataset = os.path.join("raw-datasets", dataset_name)
path_raw_train = os.path.join(path_raw_dataset, "raw_train")
path_raw_test = os.path.join(path_raw_dataset, "raw_test")
path_train = os.path.join(path_dataset, "train")
path_test = os.path.join(path_dataset, "test")

# os.makedirs also creates missing parents ("datasets", "raw-datasets" and the
# per-dataset folders), and exist_ok=True lets this cell be re-run.
# NOTE: an already existing output folder is reused as-is, so delete it by
# hand when the class selection changes to avoid leftover files.
for output_dir in (path_raw_train, path_raw_test, path_train, path_test):
    os.makedirs(output_dir, exist_ok=True)

### Initial dataset paths

In [29]:
# Locations inside the unpacked source download ("initial-dataset").

# One sub-folder per class; each holds the images and a GT-<class>.csv file.
path_initial_train = os.path.join(
    "initial-dataset", "GTSRB_Final_Training_Images", "GTSRB", "Final_Training", "Images"
)
# Folder containing the test images.
path_initial_test_images = os.path.join(
    "initial-dataset", "GTSRB_Final_Test_Images", "GTSRB", "Final_Test", "Images"
)
# Annotation csv for the test images (file names, ROI and ClassId columns).
path_initial_test_csv = os.path.join(
    "initial-dataset", "GTSRB_Final_Test_GT", "GT-final_test.csv"
)

## Train data

### Copy files and create one dataframe df_train_raw

In [30]:
# Copy the selected training classes into path_raw_train under new file names
# and collect their annotations, with the classes renumbered from 0.
dfs = []

new_class_id = 0

# original ClassId -> new consecutive ClassId (reused for the test split)
classes_updates = {}

# every entry of `classes` is expected to be a sub-directory of path_initial_train
for cls in classes:
    # folder holding the images of this class and their annotation csv
    dir_path = os.path.join(path_initial_train, cls)

    # annotations of all images in that folder
    df_dir = pd.read_csv(os.path.join(dir_path, f"GT-{cls}.csv"), sep=";")

    # new names: raw_<new class id>_<position in csv>.ppm
    class_tag = str(new_class_id).zfill(6)
    new_names = [
        f"raw_{class_tag}_{str(position).zfill(6)}.ppm"
        for position in range(len(df_dir))
    ]

    # copy every image to raw-datasets/{dataset_name}/raw_train
    for source_name, target_name in zip(df_dir["Filename"], new_names):
        shutil.copy2(
            os.path.join(dir_path, source_name),
            os.path.join(path_raw_train, target_name),
        )
    df_dir["Filename"] = pd.Series(new_names)

    # remember the original id before the column is overwritten
    old_class_id = df_dir["ClassId"].iloc[0]
    classes_updates[old_class_id] = new_class_id
    df_dir["ClassId"] = new_class_id
    new_class_id += 1

    dfs.append(df_dir)

# one frame for all classes, with a fresh 0..n-1 index
df_train_raw = pd.concat(dfs).reset_index(drop=True)
df_train_raw.to_csv(os.path.join(path_raw_dataset, "raw_train.csv"), sep=";")

## Test data

### Read csv

In [31]:
# Load the test-set annotations; the file is semicolon-separated.
df_initial_test = pd.read_csv(path_initial_test_csv, sep=";")

### Drop unused classes

In [32]:
# Keep only the rows whose original ClassId was selected for the training set.
is_selected_class = df_initial_test["ClassId"].isin(classes_updates.keys())
df_initial_test = df_initial_test[is_selected_class]

### Update ClassId to new one

In [33]:
# Work on an independent copy and translate every original ClassId into the
# consecutive id assigned while processing the training data.
df_test_raw = df_initial_test.copy(deep=True)
remapped_ids = df_initial_test["ClassId"].apply(lambda old_id: classes_updates[old_id])
df_test_raw["ClassId"] = remapped_ids

### Reset index

In [34]:
# Renumber the rows 0..n-1: filtering left gaps in the index, and the new file
# names are assigned later as a pd.Series, which aligns on the index.
df_test_raw = df_test_raw.reset_index(drop=True)

### Copy files to raw-datasets/{dataset_name}/raw_test

In [35]:
# Copy every remaining test image into path_raw_test as raw_<row number>.ppm
# and store the new names in the dataframe.
new_names = [
    f"raw_{str(position).zfill(6)}.ppm"
    for position in range(len(df_test_raw))
]
for source_name, target_name in zip(df_test_raw["Filename"], new_names):
    shutil.copy2(
        os.path.join(path_initial_test_images, source_name),
        os.path.join(path_raw_test, target_name),
    )
df_test_raw["Filename"] = pd.Series(new_names)

### Save csv

In [36]:
# Write the annotations of the raw test images next to the raw image folders.
raw_test_csv_path = os.path.join(path_raw_dataset, "raw_test.csv")
df_test_raw.to_csv(raw_test_csv_path, sep=";")

# Extracting ROI from raw images

### Train

In [37]:
# Only the file name and the label go into the final annotation file.
# .copy() makes df_train an independent frame, so the per-row Filename updates
# written to it afterwards do not trigger pandas' SettingWithCopyWarning.
df_train = df_train_raw[["Filename", "ClassId"]].copy()

In [38]:
# Crop each raw training image to its annotated region of interest, resize it
# to `dims`, save it to path_train and record the new file name in df_train.
for index, row in df_train_raw.iterrows():
    img_path = os.path.join(path_raw_train, row["Filename"])
    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread reports a missing or unreadable file by returning None
        raise OSError(f"Could not read image: {img_path}")
    # image arrays are indexed [row, column], i.e. [y, x]
    roi = img[row["Roi.Y1"]:row["Roi.Y2"], row["Roi.X1"]:row["Roi.X2"]]
    roi_resized = cv2.resize(roi, dims)
    new_name = row["Filename"][4:]  # drop the leading 'raw_'
    df_train.loc[index, "Filename"] = new_name
    out_path = os.path.join(path_train, new_name)
    # cv2.imwrite returns False instead of raising when the write fails
    if not cv2.imwrite(out_path, roi_resized):
        raise OSError(f"Could not write image: {out_path}")

In [39]:
# Write the final training annotations (file name and class) into the dataset folder.
train_csv_path = os.path.join(path_dataset, "train.csv")
df_train.to_csv(train_csv_path, sep=";")

### Test

In [40]:
# Only the file name and the label go into the final annotation file.
# .copy() makes df_test an independent frame, so the per-row Filename updates
# written to it afterwards do not trigger pandas' SettingWithCopyWarning.
df_test = df_test_raw[["Filename", "ClassId"]].copy()

In [41]:
# Crop each raw test image to its annotated region of interest, resize it to
# `dims`, save it to path_test and record the new file name in df_test.
for index, row in df_test_raw.iterrows():
    img_path = os.path.join(path_raw_test, row["Filename"])
    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread reports a missing or unreadable file by returning None
        raise OSError(f"Could not read image: {img_path}")
    # image arrays are indexed [row, column], i.e. [y, x]
    roi = img[row["Roi.Y1"]:row["Roi.Y2"], row["Roi.X1"]:row["Roi.X2"]]
    roi_resized = cv2.resize(roi, dims)
    new_name = row["Filename"][4:]  # drop the leading 'raw_'
    df_test.loc[index, "Filename"] = new_name
    out_path = os.path.join(path_test, new_name)
    # cv2.imwrite returns False instead of raising when the write fails
    if not cv2.imwrite(out_path, roi_resized):
        raise OSError(f"Could not write image: {out_path}")

In [42]:
# Write the final test annotations (file name and class) into the dataset folder.
test_csv_path = os.path.join(path_dataset, "test.csv")
df_test.to_csv(test_csv_path, sep=";")

# Export to archive

In [43]:
# Pack the processed dataset folder into zip/<dataset_name>.zip.
archive_base = os.path.join("zip", dataset_name)
shutil.make_archive(archive_base, "zip", root_dir=path_dataset)

'/home/szymonkepinski/internship-cnn-trafic/zip/43-classes.zip'