<a href="https://colab.research.google.com/github/LM1997610/AdavancedML/blob/main/AML_project/data_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Preparing the Dataset

In [None]:
import os
import sys
import cv2
import shutil
import random

from tqdm.auto import tqdm
from zipfile import ZipFile

The **Portrait dataset** was downloaded from [this link](https://github.com/mahmoudnafifi/HistoGAN?tab=readme-ov-file#portrait-dataset) and uploaded to the shared Drive.\
Here the images are resized from 1024x1024 to 256x256 pixels.\
The resized dataset is then divided into training, validation and test sets.

In [None]:
# Download the dataset archive from Google Drive (by file id), then extract it quietly.
! gdown 1tPw_esFGuiZQ7FGGhc8AhENVDPqmvB-i
! unzip -qqq /content/portrait_faces.zip

Downloading...
From: https://drive.google.com/uc?id=1tPw_esFGuiZQ7FGGhc8AhENVDPqmvB-i
To: /content/portrait_faces.zip
100% 586M/586M [00:10<00:00, 54.2MB/s]


In [None]:
# Resize every image found in `directory` to `new_shape` and write the result,
# under the same file name, into `new_dir`.
directory = "portrait_faces"

new_dir = "resized_portrait_faces/"
os.makedirs(new_dir, exist_ok=True)

new_shape = (256, 256)  # cv2.resize takes the target size as (width, height)

# Paths that could not be read, resized or written; summarised after the loop.
failed_files = []

for filename in tqdm(os.listdir(directory)):

    file_path = os.path.join(directory, filename)

    # cv2.imread does not raise on an unreadable / non-image file: it returns
    # None. Check for that explicitly instead of relying on an AttributeError.
    image = cv2.imread(file_path)
    if image is None:
        print(file_path)
        failed_files.append(file_path)
        continue

    sys.stdout.write('\r Processing: %s → from size: %s ' % (filename, image.shape))
    sys.stdout.flush()

    try:
        # NOTE(review): cv2.resize uses its default interpolation here; the
        # OpenCV docs suggest INTER_AREA when shrinking. Left unchanged so the
        # produced pixels stay the same as before.
        out = cv2.resize(image, new_shape)
        # cv2.imwrite reports a failed write through its boolean return value.
        saved = cv2.imwrite(os.path.join(new_dir, filename), out)
    except cv2.error as e:
        print(f'{file_path}: {e}')
        failed_files.append(file_path)
        continue

    if not saved:
        print(file_path)
        failed_files.append(file_path)

if failed_files:
    print(f'\n{len(failed_files)} file(s) could not be processed')

  0%|          | 0/7009 [00:00<?, ?it/s]

 Processing: image_00439.jpg → from size: (1024, 1024, 3) 

In [None]:
# Compare the number of entries in the original and in the resized folder.
lista_file = os.listdir(new_dir)

n_original = len(os.listdir(directory))
n_resized = len(lista_file)

print(f'number of original images: {n_original}')
print(f'number of resized images: {n_resized}')

number of original images: 7009
number of resized images: 7009


**Splitting** into train, validation and test sets:

In [None]:
def split_dataset(source_directory,
                  destination_directory,
                  train_percent=0.8, validation_percent=0.1, test_percent=0.1):
    """Copy the files of a folder into train / validation / test sub-folders.

    The file names are sorted, shuffled with a fixed seed (123) and then cut
    into three consecutive slices. Files are copied, so the source folder is
    left untouched. A short summary of the resulting counts is printed.

    Args:
        source_directory: Folder holding the files to split. Only regular
            files are considered; sub-directories are skipped.
        destination_directory: Folder in which the 'train', 'validation' and
            'test' sub-folders are created (if missing) and filled.
        train_percent: Fraction of the files assigned to the training set.
        validation_percent: Fraction of the files assigned to the validation
            set.
        test_percent: Accepted for backward compatibility but not used: the
            test set always receives whatever is left after the train and
            validation slices have been taken.

    Raises:
        ValueError: If a fraction is negative or if train and validation
            fractions together exceed 1.
    """
    if train_percent < 0 or validation_percent < 0:
        raise ValueError("train_percent and validation_percent must be non-negative")
    # Small tolerance so that sums such as 0.9 + 0.1 are not rejected because
    # of floating-point rounding.
    if train_percent + validation_percent > 1 + 1e-9:
        raise ValueError("train_percent + validation_percent must not exceed 1")

    # Seeds the module-level generator, exactly as before, so any later use of
    # `random` in the same session keeps seeing the same state.
    random.seed(123)

    # os.listdir returns entries in arbitrary order; sorting first is what
    # makes the seeded shuffle reproducible across machines and runs.
    all_files = sorted(
        name for name in os.listdir(source_directory)
        if os.path.isfile(os.path.join(source_directory, name))
    )
    num_files = len(all_files)

    num_train = int(num_files * train_percent)
    num_validation = int(num_files * validation_percent)

    random.shuffle(all_files)

    subsets = {
        'train': all_files[:num_train],
        'validation': all_files[num_train:num_train + num_validation],
        'test': all_files[num_train + num_validation:],
    }

    for dataset, files in subsets.items():
        dataset_path = os.path.join(destination_directory, dataset)
        os.makedirs(dataset_path, exist_ok=True)
        for file in files:
            shutil.copy(os.path.join(source_directory, file),
                        os.path.join(dataset_path, file))

    print("Splitting complete \n")
    for dataset in subsets:
        dataset_path = os.path.join(destination_directory, dataset)
        print(f"{len(os.listdir(dataset_path))} files in {dataset}")

In [None]:
# Split the resized images into train / validation / test folders created
# under destination_dir.
source_dir = "resized_portrait_faces"
destination_dir = "train_val_test_split"

split_dataset(source_dir, destination_dir, train_percent=0.85, validation_percent=0.1, test_percent=0.05)

Splitting complete 

5957 files in train
700 files in validation
352 files in test


### Directory **train_val_test_split**

The folder we want to create has the following structure:

train_val_test_split \
├── test_sets \
│ &emsp;&emsp; ├── masks_large_256 \
│ &emsp;&emsp; ├── masks_small_256 \
│ &emsp;&emsp; └── test \
├── train.zip \
└── validation.zip

Then we will incorporate the `/train_val_test_split` directory into our clone of the [**MAT repo**](https://github.com/fenglinglwb/MAT).\
The same goes for **CelebA-HQ_256.pkl**, the pre-trained model file: it will be placed in its designated directory, named `/pretrained_dir`.


In [None]:
def zip_folder(folder_path, zip_path):
    """Write every file found under `folder_path` into a new zip archive.

    The archive is created (or overwritten) at `zip_path`. Each file is stored
    under its path relative to `folder_path`, so the top-level folder name
    itself does not appear inside the archive.
    """
    with ZipFile(zip_path, 'w') as archive:
        for current_dir, _subdirs, filenames in os.walk(folder_path):
            for filename in filenames:
                absolute_path = os.path.join(current_dir, filename)
                archive.write(absolute_path,
                              os.path.relpath(absolute_path, folder_path))

In [None]:
base_path = '/content'
input_folder = 'train_val_test_split'
output_folder = 'train_val_test_split.zip'
test_sets_folder = 'test_sets'

# Root of the split; every path used below is built from it.
split_root = os.path.join(base_path, input_folder)

# Archive train and validation next to their folders, then delete the folders
# so that only the .zip files remain.
for folder_name in ('train', 'validation'):
    folder_path = os.path.join(split_root, folder_name)
    zip_path = os.path.join(split_root, folder_name + '.zip')
    zip_folder(folder_path, zip_path)
    shutil.rmtree(folder_path)

# Relocate the test images to <split_root>/test_sets/test.
test_sets_path = os.path.join(split_root, test_sets_folder)
os.makedirs(test_sets_path, exist_ok=True)
shutil.move(os.path.join(split_root, 'test'), os.path.join(test_sets_path, 'test'))


'/content/train_val_test_split/test_sets/test'

### Mask
The masks we employ have been provided by MAT researchers and downloaded from [this link](https://mycuhk-my.sharepoint.com/personal/1155137927_link_cuhk_edu_hk/_layouts/15/onedrive.aspx?id=%2Fpersonal%2F1155137927%5Flink%5Fcuhk%5Fedu%5Fhk%2FDocuments%2FRelease%2FMAT&ga=1) into our shared Drive.\
Same for the pre-trained model.

To ensure a one-to-one correspondence between **masks** and **test images**, a random subset of the downloaded masks is kept, so that the number of masks equals the number of test images.\
A total of 352 masks per group (small and large) were chosen from the downloaded collection.\
Only the **large masks** will be used later for testing.


In [None]:
# Get small masks: download the archive from Google Drive, extract it,
# rename the masks_val_256_small_eval folder to masks_small_256 and
# delete the archive.

! gdown 1kfitYjSOCe0Y3M6eeiot1q0b7SG-NJvm
! unzip -q masks_small_celebahq_val_256.zip
! mv masks_val_256_small_eval masks_small_256
! rm -r masks_small_celebahq_val_256.zip

Downloading...
From: https://drive.google.com/uc?id=1kfitYjSOCe0Y3M6eeiot1q0b7SG-NJvm
To: /content/masks_small_celebahq_val_256.zip
  0% 0.00/2.73M [00:00<?, ?B/s]100% 2.73M/2.73M [00:00<00:00, 164MB/s]


In [None]:
# Get large masks: download the archive from Google Drive, extract it,
# rename the masks_val_256_eval folder to masks_large_256 and
# delete the archive.

! gdown 1i1anLUkHomUFZFwQ4SP0ssXKu0-S6SwF
! unzip -q masks_large_celebahq_val_256.zip
! mv masks_val_256_eval masks_large_256
! rm -r masks_large_celebahq_val_256.zip

Downloading...
From: https://drive.google.com/uc?id=1i1anLUkHomUFZFwQ4SP0ssXKu0-S6SwF
To: /content/masks_large_celebahq_val_256.zip
100% 4.40M/4.40M [00:00<00:00, 24.4MB/s]


In [None]:
# Report how many entries each downloaded mask folder contains.
mask_counts = {}
for label, folder in (("small", "masks_small_256"), ("large", "masks_large_256")):
    mask_counts[label] = len(os.listdir(folder))
    print(f'{label} masks: {mask_counts[label]} imgs')

small_masks, large_masks = mask_counts["small"], mask_counts["large"]

small masks: 2993 imgs
large masks: 2993 imgs


In [None]:
# Keep exactly as many masks per group as there are test images: delete a
# random surplus of .png masks from each mask folder, then move the folder
# under train_val_test_split/test_sets.
mask_dir_list = ["masks_small_256", "masks_large_256"]
test_dataset_path = 'train_val_test_split/test_sets/test'

# Destination of the pruned mask folders (loop-invariant, so defined once).
target_path = 'train_val_test_split/test_sets'

number_test_img = len(os.listdir(test_dataset_path))
print(f'number_test_img: {number_test_img}\n')

# Dedicated, seeded generator: the selection no longer depends on whatever
# state the module-level `random` generator happens to be in when this cell
# is executed.
mask_rng = random.Random(123)

for mask_dir in mask_dir_list:

    print(mask_dir)

    # os.listdir order is arbitrary; sort so the seeded sample is reproducible.
    all_masks = sorted(os.listdir(mask_dir))
    file_png = [file for file in all_masks if file.lower().endswith(".png")]
    print(f' number masks: {len(all_masks)}')

    numero_file_da_eliminare = len(file_png) - number_test_img
    print(f' numero_file_da_eliminare: {numero_file_da_eliminare}')

    # random.sample rejects a negative size with a generic message; fail with
    # an explanation of what is actually wrong instead.
    if numero_file_da_eliminare < 0:
        raise ValueError(
            f'{mask_dir} holds {len(file_png)} .png masks, '
            f'fewer than the {number_test_img} test images'
        )

    file_da_eliminare = mask_rng.sample(file_png, numero_file_da_eliminare)

    for file in file_da_eliminare:
        percorso_completo = os.path.join(mask_dir, file)
        os.remove(percorso_completo)

    print(f' >> final number masks: {len(os.listdir(mask_dir))} in {mask_dir}\n')

    # Move the mask directory to the test dataset path
    new_mask_dir_path = os.path.join(target_path, os.path.basename(mask_dir))
    shutil.move(mask_dir, new_mask_dir_path)

number_test_img: 352

masks_small_256
 number masks: 2993
 numero_file_da_eliminare: 2641
 >> final number masks: 352 in masks_small_256

masks_large_256
 number masks: 2993
 numero_file_da_eliminare: 2641
 >> final number masks: 352 in masks_large_256



In [None]:
# Count the masks left in each group after pruning and moving.
small_mask_entries = os.listdir("train_val_test_split/test_sets/masks_small_256")
n_mask_small = len(small_mask_entries)
print(f'small masks: {n_mask_small} imgs')

large_mask_entries = os.listdir("train_val_test_split/test_sets/masks_large_256")
n_mask_large = len(large_mask_entries)
print(f'large masks: {n_mask_large} imgs')

small masks: 352 imgs
large masks: 352 imgs


### Zip and save files

In [None]:
# -------------------------------
# Mount Google Drive at /content/drive (Colab only).
from google.colab import drive
drive.mount('/content/drive')
# -------------------------------

Mounted at /content/drive


In [None]:
# Save train_val_test_split on Drive: zip the folder recursively, then copy
# the archive into the AML_project folder of the mounted Drive.

! zip -q -r  train_val_test_split.zip train_val_test_split
! cp train_val_test_split.zip /content/drive/MyDrive/AML_project

In [None]:
# Save the resized images on Drive: zip the folder recursively, then copy
# the archive into the AML_project folder of the mounted Drive.

! zip -q -r resized_portrait_faces.zip resized_portrait_faces
! cp resized_portrait_faces.zip /content/drive/MyDrive/AML_project