# Data Preprocess for the Dataset

This notebook is split into 2 parts:
- Baseline Preprocess
- Enhanced Preprocess

Each part will have 2 steps:
- Prepare training data
- Prepare public data and private data

In [None]:
import os
import shutil

In [None]:
# check if `pytorch-CycleGAN-and-pix2pix` is already cloned
if not os.path.exists('../pytorch-CycleGAN-and-pix2pix'):
    os.chdir('../')
    !git clone https://github.com/junyanz/pytorch-CycleGAN-and-pix2pix
    os.chdir('./dataset')	# change directory back to `./dataset`
else:
    print('pytorch-CycleGAN-and-pix2pix is already cloned.')

In [None]:
# Print the current working directory; the relative paths used in later cells are resolved against it.
!pwd

## Download Dataset
The script will download the dataset if you haven't downloaded it yet.

In [None]:
# Run the project's download script with bash (the path is relative to the current working directory).
!bash ../scripts/download_official_dataset.sh

# Baseline Preprocess

we will cover the following steps:
- Prepare training data
- Prepare public data and private data

## Prepare Raw Training Data (Baseline)

The training dataset contains two subfolders:
- label_img: contains the draft images
- img: contains the corresponding ground truth images

In [None]:
import zipfile

train_dataset_zip = '34_Competition 1_Training dataset.zip'

# Extract every member of the training archive into the current working directory.
with zipfile.ZipFile(train_dataset_zip, mode='r') as archive:
    archive.extractall()

In [None]:
train_dir = 'training_dataset'
extracted_dir = 'Training dataset'  # folder name used inside the zip archive

# Rename the freshly extracted folder to `train_dir`.
# Safe to re-run: a stale `train_dir` from a previous run is replaced by the new
# extraction (os.rename cannot overwrite a non-empty directory), and when only
# `train_dir` exists the rename has already happened, so nothing is done.
if os.path.isdir(extracted_dir):
    if os.path.isdir(train_dir):
        shutil.rmtree(train_dir)
    os.rename(extracted_dir, train_dir)
elif not os.path.isdir(train_dir):
    raise FileNotFoundError(
        f"Neither '{extracted_dir}' nor '{train_dir}' exists; extract the training zip first."
    )

In [None]:
# Point `train_dir` at the renamed folder (with an explicit `./` prefix) and list its entries.
train_dir = './training_dataset'
print(os.listdir(train_dir))

### Rename the subfolders as trainA and trainB
mapping the folder name to the model input:
- `training_dataset/label_img` -> `training_dataset/trainA`
- `training_dataset/img` -> `training_dataset/trainB`

In [None]:
# Give each raw subfolder the name the model expects: label_img -> trainA, img -> trainB.
for old_name, new_name in (('label_img', 'trainA'), ('img', 'trainB')):
    os.rename(train_dir + '/' + old_name, train_dir + '/' + new_name)

### Data Filter (Optional)

In this section, we filter out the low-quality images.
A text file (`file_to_delete.txt`) listing the low-quality images is provided.

In [None]:
# Load the names of the low-quality images listed in `file_to_delete.txt`.
# Lines containing `#` are treated as comments and skipped. Blank lines are
# skipped as well, so the list never holds an empty name: an empty entry would
# match any dot-file (e.g. `.DS_Store`) in the filter step, because the text
# before the first '.' of such a name is ''.
file_to_delete = []
with open('file_to_delete.txt', 'r') as file:
    for line in file:
        if '#' in line:
            continue
        # Remove surrounding whitespace (including the trailing newline)
        name = line.strip()
        if name:
            file_to_delete.append(name)
# print(file_to_delete)

In [None]:
# Report the number of files per subfolder, delete every image whose name prefix
# (the text before the first '.') is listed in `file_to_delete`, then report again.

# Only real subfolders are processed; a stray file directly under `train_dir`
# would make os.listdir() on it raise NotADirectoryError.
subfolders = [
    name for name in os.listdir(train_dir)
    if os.path.isdir(os.path.join(train_dir, name))
]
# Built once so each membership test is O(1) instead of a scan over the list.
names_to_delete = set(file_to_delete)

# Original file size
print('Original file size')
for subfolder in subfolders:
    print(f"{subfolder} size: {len(os.listdir(os.path.join(train_dir, subfolder)))}")

# Filter
for subfolder in subfolders:
    folder_path = os.path.join(train_dir, subfolder)
    for image_name in os.listdir(folder_path):
        prefix_name = image_name.split('.')[0]
        if prefix_name in names_to_delete:
            os.remove(os.path.join(folder_path, image_name))

# New file size
print('After filtering')
for subfolder in subfolders:
    print(f"{subfolder} size: {len(os.listdir(os.path.join(train_dir, subfolder)))}")

### Dataset Resize (Optional)

Since `unet256` is the model that we will use, we need to resize the images to 256x256.

In [None]:
import cv2

# Resize every training image in place to 256x256.
# trainA is resized with nearest-neighbour interpolation, trainB with area interpolation.
# NOTE(review): nearest-neighbour presumably keeps the draft/label colours unblended — confirm.
interpolations = {'trainA': cv2.INTER_NEAREST, 'trainB': cv2.INTER_AREA}

for subfolder, interpolation in interpolations.items():
    print(f"Resizing {subfolder} images...")
    folder_path = os.path.join(train_dir, subfolder)
    for image_name in os.listdir(folder_path):
        image_path = os.path.join(folder_path, image_name)
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread returns None (it does not raise) when a file cannot be
            # read or decoded; skip it instead of crashing inside cv2.resize.
            print(f"Skipping unreadable image: {image_path}")
            continue
        # resize the image as 256x256 and overwrite the original file
        image = cv2.resize(image, (256, 256), interpolation=interpolation)
        cv2.imwrite(image_path, image)
print("Resize completed")


### Augmentation (Optional)

We will apply the following augmentation techniques:
- RandomHorizontalFlip(0.5)
- RandomVerticalFlip(0.5)

The same transform is applied to images that share a filename in trainA and trainB.

In [None]:
from augmenter import augmentation_AB

# Run the project's augmentation helper on the training folder.
# NOTE(review): the helper is defined in `augmenter.py`, not in this notebook — confirm there
# that the same flip is applied to matching trainA/trainB files.
augmentation_AB(root=train_dir)

In [None]:
# Print how many entries each subfolder of the training directory holds after augmentation.
print('After Augmentation')
for subfolder in os.listdir(train_dir):
    entry_count = len(os.listdir(os.path.join(train_dir, subfolder)))
    print(f"{subfolder} size: {entry_count}")

### Align trainA and trainB

In [None]:
from align_dataset import align_images

# Run the project's alignment helper on the training folder.
# NOTE(review): presumably pairs up the trainA/trainB contents — confirm in `align_dataset.py`.
align_images(train_dir)

### Copy the folder to the model input folder

Copy the `training_dataset` folder to `../pytorch-CycleGAN-and-pix2pix/datasets`.

In [None]:
# Publish the training folder into the model's dataset directory,
# discarding any copy left there by a previous run.
target_dir = '../pytorch-CycleGAN-and-pix2pix/datasets'
destination = target_dir + '/' + train_dir

if os.path.exists(destination):
    # copytree refuses to write into an existing folder, so clear it first
    shutil.rmtree(destination)
shutil.copytree(train_dir, destination)

## Prepare Public and Private Testing Data (Baseline)

1. Each extracted zip file contains only a `label_img` folder,
2. so we need to create the parent folder `testing_dataset`
3. and place the `label_img` folder inside `testing_dataset`.

In [None]:
import os

# Best-effort switch to the dataset folder. When the relative path cannot be
# entered from the current location, the working directory is left unchanged.
try:
    os.chdir('../../dataset')
except OSError:
    # Only filesystem errors are expected here; a bare `except` would also
    # swallow KeyboardInterrupt and genuine bugs.
    print("Already in the root directory")

In [None]:
import zipfile

public_testing_dataset_zip = '34_Competition 1_public testing dataset.zip'
private_testing_dataset_zip = '34_Competition 1_Private Test Dataset.zip'
test_dir = 'testing_dataset'

# Extract the public archive first, then the private one, into the same folder.
for archive_path in (public_testing_dataset_zip, private_testing_dataset_zip):
    with zipfile.ZipFile(archive_path, 'r') as zip_ref:
        zip_ref.extractall(test_dir)

In [None]:
# Show what the two extractions produced inside the testing folder.
test_dir = 'testing_dataset'
print(os.listdir(test_dir))

### Rename the subfolder as testA

Since the ground truth images are not provided, we only need to rename the folder `label_img` to `testA`.

mapping the folder name to the model input:
- `testing_dataset/label_img` -> `testing_dataset/testA`

In [None]:
# Rename the extracted `label_img` folder to `testA`, the name listed in the mapping above.
os.rename(test_dir + '/label_img', test_dir + '/testA')

### Copy the folders to the model input folder

Copy the `testing_dataset` folder to `../pytorch-CycleGAN-and-pix2pix/datasets`.

In [None]:
# Publish the testing folder into the model's dataset directory,
# discarding any copy left there by a previous run.
target_dir = '../pytorch-CycleGAN-and-pix2pix/datasets'
destination = target_dir + '/' + test_dir

if os.path.exists(destination):
    # copytree refuses to write into an existing folder, so clear it first
    shutil.rmtree(destination)
shutil.copytree(test_dir, destination)

# Enhanced Preprocess (2 domain datasets)

## Prepare Training Data (Enhanced)

### Extract the Images from Raw Training Data
Both the `trainA` and `trainB` subfolders contain 2 types of images:
- River images(e.g. TRA_RI_1000000.png)
- Road images(e.g. TRA_RO_1000000.png)

So we need to create 2 folders:
- `train_RIVER` (contains `trainA` and `trainB` subfolders, each holding only river images)
- `train_ROAD` (contains `trainA` and `trainB` subfolders, each holding only road images)

In [None]:
train_dir = 'training_dataset'
train_river_dir = 'train_RIVER'
train_road_dir = 'train_ROAD'

# Split the training images into one dataset per domain.
# The filename marker decides the destination: `_RI_` -> river, `_RO_` -> road
# (checked in that order, so a name containing both goes to river).
domain_dirs = {'_RI_': train_river_dir, '_RO_': train_road_dir}

# create the folders (exist_ok makes the cell safe to re-run)
for domain_dir in domain_dirs.values():
    os.makedirs(domain_dir, exist_ok=True)

for subdir in os.listdir(train_dir):
    source_subdir = os.path.join(train_dir, subdir)
    if not os.path.isdir(source_subdir):
        # a stray file directly under train_dir would make os.listdir() below raise
        continue

    # mirror the subfolder (e.g. trainA / trainB) inside every domain folder
    for domain_dir in domain_dirs.values():
        os.makedirs(os.path.join(domain_dir, subdir), exist_ok=True)

    # copy each file into the domain folder matching its marker
    for file_name in os.listdir(source_subdir):
        for marker, domain_dir in domain_dirs.items():
            if marker in file_name:
                shutil.copy(
                    os.path.join(source_subdir, file_name),
                    os.path.join(domain_dir, subdir, file_name),
                )
                break
        else:
            print('ERROR: file name not recognized: ' + file_name)

### Copy the folders to the model input folder

Copy the `train_RIVER` and `train_ROAD` folders to `../pytorch-CycleGAN-and-pix2pix/datasets`.

In [None]:
# Copy both domain folders into the model's dataset directory.
# Any copy left by a previous run is removed first, matching the baseline copy
# cells; shutil.copytree raises FileExistsError when the destination exists.
target_dir = '../pytorch-CycleGAN-and-pix2pix/datasets'
for domain_dir in (train_river_dir, train_road_dir):
    destination = os.path.join(target_dir, domain_dir)
    if os.path.exists(destination):
        shutil.rmtree(destination)
    shutil.copytree(domain_dir, destination)

## Prepare Public and Private Testing Data (Enhanced)

### Extract the Images from Raw Testing Data
`testing_dataset/testA` subfolder contains 2 types of images:
- River images(e.g. PUB_RI_1000000.png or PRI_RI_1000000.png)
- Road images(e.g. PUB_RO_1000459.png or PRI_RO_1000459.png)

So we need to create 2 folders:
- `test_RIVER` (contains a `testA` subfolder that holds only river images)
- `test_ROAD` (contains a `testA` subfolder that holds only road images)

In [None]:
import os

# Best-effort switch to the dataset folder. When the relative path cannot be
# entered from the current location, the working directory is left unchanged.
try:
    os.chdir('../../dataset')
except OSError:
    # Only filesystem errors are expected here; a bare `except` would also
    # swallow KeyboardInterrupt and genuine bugs.
    print("Already in the root directory")

In [None]:
import os
import shutil

test_dir = 'testing_dataset'
test_river_dir = 'test_RIVER'
test_road_dir = 'test_ROAD'

# Split the testing images into one dataset per domain.
# The filename marker decides the destination: `_RI_` -> river, `_RO_` -> road
# (checked in that order, so a name containing both goes to river).
domain_dirs = {'_RI_': test_river_dir, '_RO_': test_road_dir}

# create the folders (exist_ok makes the cell safe to re-run)
for domain_dir in domain_dirs.values():
    os.makedirs(domain_dir, exist_ok=True)

for subdir in os.listdir(test_dir):
    source_subdir = os.path.join(test_dir, subdir)
    if not os.path.isdir(source_subdir):
        # a stray file directly under test_dir would make os.listdir() below raise
        continue

    # mirror the subfolder (e.g. testA) inside every domain folder
    for domain_dir in domain_dirs.values():
        os.makedirs(os.path.join(domain_dir, subdir), exist_ok=True)

    # copy each file into the domain folder matching its marker
    for file_name in os.listdir(source_subdir):
        for marker, domain_dir in domain_dirs.items():
            if marker in file_name:
                shutil.copy(
                    os.path.join(source_subdir, file_name),
                    os.path.join(domain_dir, subdir, file_name),
                )
                break
        else:
            print('ERROR: file name not recognized: ' + file_name)

### Copy the folders to the model input folder

copy the `test_RIVER` and `test_ROAD` folders to `../pytorch-CycleGAN-and-pix2pix/datasets`

In [None]:
# Copy both domain folders into the model's dataset directory.
# Any copy left by a previous run is removed first, matching the baseline copy
# cells; shutil.copytree raises FileExistsError when the destination exists.
target_dir = '../pytorch-CycleGAN-and-pix2pix/datasets'
for domain_dir in (test_river_dir, test_road_dir):
    destination = os.path.join(target_dir, domain_dir)
    if os.path.exists(destination):
        shutil.rmtree(destination)
    shutil.copytree(domain_dir, destination)