# Data Augmentation On The PCB Defect Dataset

This notebook contains all the code needed to use TAO augmentation on subsets of the PCB defect dataset to showcase how augmentation can be used to improve KPIs for small datasets.

This notebook requires the TAO Launcher, Docker and NGC to be set up.

The GitHub README has steps for setting up the prerequisites.

This notebook also requires preprocess_pcb.py to be in the same directory to function. 

This notebook takes the following steps
1) Download and unpack the PCB defect dataset

2) Convert the dataset to kitti format 

3) Split the dataset into test and train subsets

4) Map local directories to the TAO launcher

5) Generate offline augmentation spec file and apply augmentation to the training sets

6) Generate TF Records for the test and training sets

7) Downloads pretrained object detection weights needed for the trainings

8) Launch trainings and evaluation

The last section of this notebook contains all the commands needed to run training and evaluation on all 6 datasets.  
Steps 1-7 only need to run once. The trainings in step 8 can be run in any order once steps 1-7 have successfully run.  
A common test set of 500 images is used for validation on all trainings

Datasets
100 subset x1  
100 subset x10  
100 subset x20  
500 subset x1  
500 subset x10  
500 subset x20  


In [None]:
!python3 -m pip install matplotlib

In [None]:
import os
from preprocess_pcb import convert_annotation, create_subset

In [None]:
#paths relative to local repository
#NOTE: repo_home is derived from the current working directory, so this cell must be run
#before any cell that changes directory with %cd.
#abspath collapses the "../../" segment so every derived path (including the "source"
#entries written to the TAO mounts file) is a clean absolute path.
repo_home = os.path.abspath(os.path.join(os.getcwd(), "../../"))
model_home = os.path.join(repo_home, "workspace/models")
dataset_home = os.path.join(repo_home, "datasets/pcb_defect")
exp_home = os.path.join(repo_home, "workspace/pcb_data_aug")

#paths for inside the container
#these mirror dataset_home and exp_home under the /datasets and /tlt_exp mount destinations
dataset_home_cont = "/datasets/pcb_defect/"
exp_home_cont = "/tlt_exp/pcb_data_aug/"

## Download and unpack the PCB defect dataset

In [None]:
%cd $dataset_home

In [None]:
#download and unzip
!wget https://www.dropbox.com/s/h0f39nyotddibsb/VOC_PCB.zip 
!unzip VOC_PCB.zip

## Convert the dataset to kitti format

In [None]:
#setup folders for dataset images and labels
#relative paths: they resolve against the current directory, which the earlier %cd set to dataset_home
os.makedirs("original/images", exist_ok=True)
os.makedirs("original/labels", exist_ok=True)
#copy the contents of the unpacked JPEGImages folder into original/images
!cp -r VOC_PCB/JPEGImages/. original/images

In [None]:
#Input folder with the annotations from the unpacked archive and output folder for the converted labels
#(original/labels itself is created in the previous cell, not here)
xml_label_path = "VOC_PCB/Annotations"
kitti_label_output = "original/labels"

#Convert labels to kitti and put into output folder
#every entry returned by os.listdir is passed on, no filtering by file extension is done
#convert_annotation comes from preprocess_pcb; presumably it writes one kitti label file per input file - TODO confirm
for x in os.listdir(xml_label_path):
    current_label_path = os.path.join(xml_label_path, x)
    convert_annotation(current_label_path, kitti_label_output)

## Split the dataset into test and train subsets

In [None]:
#Paths to the predefined list files (stored in exp_home) that define each subset
test_500 = os.path.join(exp_home, "test_500_list.txt")
train_100 = os.path.join(exp_home, "train_100_list.txt")
train_500 = os.path.join(exp_home, "train_500_list.txt")


#Create the output folder of each subset (relative to the current directory)
os.makedirs("500_subset_test_x1", exist_ok=True)
os.makedirs("100_subset_train_x1", exist_ok=True)
os.makedirs("500_subset_train_x1", exist_ok=True)

#Create the subsets based on predefined lists
#create_subset comes from preprocess_pcb; presumably it copies the listed samples from "original" into the target folder - TODO confirm
create_subset("original", test_500, "500_subset_test_x1")
create_subset("original", train_100, "100_subset_train_x1")
create_subset("original", train_500, "500_subset_train_x1")

In [None]:
## Map local directories to the TAO launcher

In [None]:
# Mapping up the local directories to the TAO docker.
# NOTE: the mounts file is opened in "w" mode below, so an existing ~/.tao_mounts.json is overwritten.
import json
mounts_file = os.path.expanduser("~/.tao_mounts.json")

# Define the dictionary with the mapped drives
drive_map = {
    "Mounts": [
        # Mapping the data directory (matches the /datasets prefix of dataset_home_cont)
        {
            "source": os.path.join(repo_home, "datasets"),
            "destination": "/datasets"
        },
        # Mapping the workspace directory, which holds the models and the experiment folders
        # (matches the /tlt_exp prefix of exp_home_cont)
        {
            "source": os.path.join(repo_home, "workspace"),
            "destination": "/tlt_exp"
        },
    ]
}

# Writing the mounts file.
with open(mounts_file, "w") as mfile:
    json.dump(drive_map, mfile, indent=4)

## Generate offline augmentation spec file and apply augmentation to the training sets

In [None]:
from preprocess_pcb import gen_random_aug_spec, combine_kitti, visualize_images
from random import randint

In [None]:
#Input dataset folder to augment, augment output folder and number of augmentations. Requires local paths and container paths
#For each augment a randomized spec file and augmented dataset is produced
#Also outputs a dataset with all combined augmentations
def generate_augments(dataset_folder, dataset_folder_cont,  output_folder, output_folder_cont, num_augments):
    for i in range(0,num_augments):
        spec_out = os.path.join(output_folder, "aug_spec" + str(i) + ".txt")
        spec_out_cont = os.path.join(output_folder_cont, "aug_spec" + str(i) + ".txt")
        gen_random_aug_spec(600,600,"jpg", spec_out)
        !cat $spec_out

        aug_folder = os.path.join(output_folder, "aug" + str(i))
        aug_folder_cont = os.path.join(output_folder_cont, "aug" + str(i))
        !tao augment -a $spec_out_cont -o $aug_folder_cont -d $dataset_folder_cont

        if i == 0:
            d1 = dataset_folder
            d2 = aug_folder
            d3 = os.path.join(output_folder, "combined_x2")
            combine_kitti(d1,d2,d3)
        else:
            d1 = os.path.join(output_folder, "combined_x" + str(i+1))
            d2 = aug_folder
            d3 = os.path.join(output_folder, "combined_x" + str(i+2))
            combine_kitti(d1,d2,d3)

In [None]:
#generate augmentations for 100 image subset
#local paths are relative to the current directory; the *_cont paths point to the same folders inside the container
dataset_folder = "100_subset_train_x1" #folder for the existing dataset to be augmented. This folder will not be modified
dataset_folder_cont = os.path.join(dataset_home_cont, "100_subset_train_x1")

output_folder = "100_subset_train_aug" #folder for the augmented output. Does not need to exist
output_folder_cont = os.path.join(dataset_home_cont, output_folder)

#19 augmented copies plus the original give combined_x2 ... combined_x20
num_augments = 19 #number of augmented datasets to generate
os.makedirs(output_folder, exist_ok=True)

generate_augments(dataset_folder,dataset_folder_cont,output_folder, output_folder_cont, num_augments)

In [None]:
#Display some of the augmented images
#Rerun to see new images each time
#output_folder and num_augments are the values set by the most recently run augmentation cell
#randint includes both bounds, so one of aug0 ... aug<num_augments-1> is picked
aug_choice = str(randint(0,num_augments-1))
visualize_images(os.path.join(output_folder, "aug"+aug_choice+"/images"), num_images=8)

In [None]:
#generate augmentations for 500 image subset
#local paths are relative to the current directory; the *_cont paths point to the same folders inside the container
dataset_folder = "500_subset_train_x1" #folder for the existing dataset to be augmented. This folder will not be modified
dataset_folder_cont = os.path.join(dataset_home_cont, "500_subset_train_x1")

output_folder = "500_subset_train_aug" #folder for the augmented output. Does not need to exist
output_folder_cont = os.path.join(dataset_home_cont, "500_subset_train_aug")

#19 augmented copies plus the original give combined_x2 ... combined_x20
num_augments = 19 #number of augmented datasets to generate
os.makedirs(output_folder, exist_ok=True)

generate_augments(dataset_folder, dataset_folder_cont, output_folder, output_folder_cont, num_augments)

In [None]:
#Display some of the augmented images
#Rerun to see new images each time
#output_folder and num_augments are the values set by the most recently run augmentation cell
#randint includes both bounds, so one of aug0 ... aug<num_augments-1> is picked
aug_choice = str(randint(0,num_augments-1))
visualize_images(os.path.join(output_folder, "aug"+aug_choice+"/images"), num_images=8)

In [None]:
#Place important datasets in the dataset folder

!mv 100_subset_train_aug/combined_x10 100_subset_train_x10
!mv 100_subset_train_aug/combined_x20 100_subset_train_x20

!mv 500_subset_train_aug/combined_x10 500_subset_train_x10
!mv 500_subset_train_aug/combined_x20 500_subset_train_x20

## Generate TF Records for the test and training sets

In [None]:
def gen_tf_spec(dataset_path):
    """Return the tf record conversion config for one dataset as a string.

    dataset_path: dataset folder name, appended to the container path /datasets/pcb_defect/
                  to form root_directory_path (the path must be valid inside the container)

    The returned text is a single kitti_config block with fixed settings for the image and
    label folder names, the image extension, random partitioning and the shard count.
    """
    #one "key: value" entry per line of the kitti_config block
    kitti_fields = [
        f'root_directory_path: "/datasets/pcb_defect/{dataset_path}"',
        'image_dir_name: "images"',
        'label_dir_name: "labels"',
        'image_extension: ".jpg"',
        'partition_mode: "random"',
        'num_partitions: 2',
        'val_split: 20',
        'num_shards: 10',
    ]
    #entries are indented by 6 spaces, the surrounding braces by 4 spaces
    block_body = "".join("      " + field + "\n" for field in kitti_fields)
    return "\n    kitti_config {\n" + block_body + "    }\n    "

In [None]:
#Loop through all datasets to generate tf records
dataset_paths = ["500_subset_test_x1", "500_subset_train_x1", "500_subset_train_x10", "500_subset_train_x20", "100_subset_train_x1", "100_subset_train_x10", "100_subset_train_x20"]
for path in dataset_paths:
    #the spec file is written on the host (record_path) and read by tao inside the container (record_path_cont)
    record_path = os.path.join(dataset_home, path, "tfrecord_spec.txt")
    record_path_cont = os.path.join(dataset_home_cont, path, "tfrecord_spec.txt")
    
    #output location of the tf records; only the container path is passed to tao, record_output is not used in this cell
    record_output = os.path.join(dataset_home, path, "tfrecords_rcnn/")
    record_output_cont = os.path.join(dataset_home_cont, path, "tfrecords_rcnn/")
    
    print("************" + record_path)
    #the with block closes the spec file before the conversion command reads it
    with open(record_path, "w+") as spec:
        spec.write(gen_tf_spec(path))
    !tao faster_rcnn dataset_convert -d $record_path_cont -o $record_output_cont

## Downloads pretrained object detection weights needed for the trainings

In [None]:
#requires NGC to be configured
#create the fasterRCNN model folder and switch into it before downloading (presumably ngc downloads into the current directory - TODO confirm)
#NOTE: %cd changes the working directory of the kernel for all following cells
os.makedirs(os.path.join(model_home, "fasterRCNN"), exist_ok=True)
%cd $model_home/fasterRCNN
!ngc registry model download-version "nvidia/tlt_pretrained_object_detection:resnet18"

## Launch trainings and evaluation

Each cell in this section will train and evaluate on 1 dataset in the experiment. The results will be output to the respective experiment folder. 

The trainings may take several hours depending on your hardware. 

In [None]:
#Root folder of the experiments: container path (passed to the tao commands) and host path (used to read the logs)
experiments_cont = os.path.join(exp_home_cont, "experiments")
experiments = os.path.join(exp_home, "experiments")

In [None]:
#offline_aug / 100_subset_train_x1: train, evaluate with the same spec file and key, then print the evaluation log
#tao receives container paths ($experiments_cont); cat reads the same log through the host path ($experiments)
!tao faster_rcnn train -e $experiments_cont/offline_aug/100_subset_train_x1/training_spec.txt -k tlt_encode
!tao faster_rcnn evaluate -e $experiments_cont/offline_aug/100_subset_train_x1/training_spec.txt -k tlt_encode --log_file $experiments_cont/offline_aug/100_subset_train_x1/eval_log.txt
!cat $experiments/offline_aug/100_subset_train_x1/eval_log.txt

In [None]:
#offline_aug / 100_subset_train_x10: train, evaluate with the same spec file and key, then print the evaluation log
#tao receives container paths ($experiments_cont); cat reads the same log through the host path ($experiments)
!tao faster_rcnn train -e $experiments_cont/offline_aug/100_subset_train_x10/training_spec.txt -k tlt_encode
!tao faster_rcnn evaluate -e $experiments_cont/offline_aug/100_subset_train_x10/training_spec.txt -k tlt_encode --log_file $experiments_cont/offline_aug/100_subset_train_x10/eval_log.txt
!cat $experiments/offline_aug/100_subset_train_x10/eval_log.txt

In [None]:
#offline_aug / 100_subset_train_x20: train, evaluate with the same spec file and key, then print the evaluation log
#tao receives container paths ($experiments_cont); cat reads the same log through the host path ($experiments)
!tao faster_rcnn train -e $experiments_cont/offline_aug/100_subset_train_x20/training_spec.txt -k tlt_encode
!tao faster_rcnn evaluate -e $experiments_cont/offline_aug/100_subset_train_x20/training_spec.txt -k tlt_encode --log_file $experiments_cont/offline_aug/100_subset_train_x20/eval_log.txt
!cat $experiments/offline_aug/100_subset_train_x20/eval_log.txt

In [None]:
#offline_aug / 500_subset_train_x1: train, evaluate with the same spec file and key, then print the evaluation log
#tao receives container paths ($experiments_cont); cat reads the same log through the host path ($experiments)
!tao faster_rcnn train -e $experiments_cont/offline_aug/500_subset_train_x1/training_spec.txt -k tlt_encode
!tao faster_rcnn evaluate -e $experiments_cont/offline_aug/500_subset_train_x1/training_spec.txt -k tlt_encode --log_file $experiments_cont/offline_aug/500_subset_train_x1/eval_log.txt
!cat $experiments/offline_aug/500_subset_train_x1/eval_log.txt

In [None]:
#offline_aug / 500_subset_train_x10: train, evaluate with the same spec file and key, then print the evaluation log
#tao receives container paths ($experiments_cont); cat reads the same log through the host path ($experiments)
!tao faster_rcnn train -e $experiments_cont/offline_aug/500_subset_train_x10/training_spec.txt -k tlt_encode
!tao faster_rcnn evaluate -e $experiments_cont/offline_aug/500_subset_train_x10/training_spec.txt -k tlt_encode --log_file $experiments_cont/offline_aug/500_subset_train_x10/eval_log.txt
!cat $experiments/offline_aug/500_subset_train_x10/eval_log.txt

In [None]:
#offline_aug / 500_subset_train_x20: train, evaluate with the same spec file and key, then print the evaluation log
#tao receives container paths ($experiments_cont); cat reads the same log through the host path ($experiments)
!tao faster_rcnn train -e $experiments_cont/offline_aug/500_subset_train_x20/training_spec.txt -k tlt_encode
!tao faster_rcnn evaluate -e $experiments_cont/offline_aug/500_subset_train_x20/training_spec.txt -k tlt_encode --log_file $experiments_cont/offline_aug/500_subset_train_x20/eval_log.txt
!cat $experiments/offline_aug/500_subset_train_x20/eval_log.txt

In [None]:
#offline_online_aug / 100_subset_train_x1: train, evaluate with the same spec file and key, then print the evaluation log
#tao receives container paths ($experiments_cont); cat reads the same log through the host path ($experiments)
!tao faster_rcnn train -e $experiments_cont/offline_online_aug/100_subset_train_x1/training_spec.txt -k tlt_encode
!tao faster_rcnn evaluate -e $experiments_cont/offline_online_aug/100_subset_train_x1/training_spec.txt -k tlt_encode --log_file $experiments_cont/offline_online_aug/100_subset_train_x1/eval_log.txt
!cat $experiments/offline_online_aug/100_subset_train_x1/eval_log.txt

In [None]:
#offline_online_aug / 100_subset_train_x10: train, evaluate with the same spec file and key, then print the evaluation log
#tao receives container paths ($experiments_cont); cat reads the same log through the host path ($experiments)
!tao faster_rcnn train -e $experiments_cont/offline_online_aug/100_subset_train_x10/training_spec.txt -k tlt_encode
!tao faster_rcnn evaluate -e $experiments_cont/offline_online_aug/100_subset_train_x10/training_spec.txt -k tlt_encode --log_file $experiments_cont/offline_online_aug/100_subset_train_x10/eval_log.txt
!cat $experiments/offline_online_aug/100_subset_train_x10/eval_log.txt

In [None]:
#offline_online_aug / 100_subset_train_x20: train, evaluate with the same spec file and key, then print the evaluation log
#tao receives container paths ($experiments_cont); cat reads the same log through the host path ($experiments)
!tao faster_rcnn train -e $experiments_cont/offline_online_aug/100_subset_train_x20/training_spec.txt -k tlt_encode
!tao faster_rcnn evaluate -e $experiments_cont/offline_online_aug/100_subset_train_x20/training_spec.txt -k tlt_encode --log_file $experiments_cont/offline_online_aug/100_subset_train_x20/eval_log.txt
!cat $experiments/offline_online_aug/100_subset_train_x20/eval_log.txt

In [None]:
#offline_online_aug / 500_subset_train_x1: train, evaluate with the same spec file and key, then print the evaluation log
#tao receives container paths ($experiments_cont); cat reads the same log through the host path ($experiments)
!tao faster_rcnn train -e $experiments_cont/offline_online_aug/500_subset_train_x1/training_spec.txt -k tlt_encode
!tao faster_rcnn evaluate -e $experiments_cont/offline_online_aug/500_subset_train_x1/training_spec.txt -k tlt_encode --log_file $experiments_cont/offline_online_aug/500_subset_train_x1/eval_log.txt
!cat $experiments/offline_online_aug/500_subset_train_x1/eval_log.txt

In [None]:
#offline_online_aug / 500_subset_train_x10: train, evaluate with the same spec file and key, then print the evaluation log
#tao receives container paths ($experiments_cont); cat reads the same log through the host path ($experiments)
!tao faster_rcnn train -e $experiments_cont/offline_online_aug/500_subset_train_x10/training_spec.txt -k tlt_encode
!tao faster_rcnn evaluate -e $experiments_cont/offline_online_aug/500_subset_train_x10/training_spec.txt -k tlt_encode --log_file $experiments_cont/offline_online_aug/500_subset_train_x10/eval_log.txt
!cat $experiments/offline_online_aug/500_subset_train_x10/eval_log.txt

In [None]:
#offline_online_aug / 500_subset_train_x20: train, evaluate with the same spec file and key, then print the evaluation log
#tao receives container paths ($experiments_cont); cat reads the same log through the host path ($experiments)
!tao faster_rcnn train -e $experiments_cont/offline_online_aug/500_subset_train_x20/training_spec.txt -k tlt_encode
!tao faster_rcnn evaluate -e $experiments_cont/offline_online_aug/500_subset_train_x20/training_spec.txt -k tlt_encode --log_file $experiments_cont/offline_online_aug/500_subset_train_x20/eval_log.txt
!cat $experiments/offline_online_aug/500_subset_train_x20/eval_log.txt