# BDC - Satria Data 2021

Task : Preprocessing Image

## Authors

1. Muhammad Amanda
2. Naufal Zhafran A.
3. Wahyu Setianto

## Running On

| Environment | Keterangan | Link |
| ----------- | ---------- | ---- |
| Kaggle      | **Main Environment** menggunakan GPU NVIDIA Tesla P100 | [![kaggle](https://img.shields.io/badge/%20-Kaggle-blue?logo=kaggle)](https://www.kaggle.com/wahyusetianto/preprocessing-bdc-2021) |
| Colab       | **Secondary** menggunakan GPU NVIDIA Tesla K80 | [![colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Hyuto/bdc-2021/blob/master/notebook/%5BPREPROCESS%5D%20BDC%20-%202021.ipynb) |

## First Thing First

Menginstal library yang diperlukan, mengimpor library-library yang akan digunakan, serta mengatur variabel config yang akan digunakan di dalam notebook ini.

1. Menginstal library `MTCNN` dan `Albumentations`
   
   Library `MTCNN` dan `Albumentations` adalah library yang digunakan untuk preprocessing data gambar pada notebook ini

In [1]:
# Install/upgrade the face detector (MTCNN) and the augmentation library (Albumentations).
!pip -q install mtcnn --upgrade
!pip -q install albumentations --upgrade
# OpenCV is pinned to an exact headless build.
# NOTE(review): presumably pinned for compatibility with the packages above - confirm before bumping.
!pip -q install opencv-python-headless==4.1.2.30

2. Importing library
   
   Mengimport library yang akan digunakan dalam notebook ini.

In [2]:
# Umum
import os, random, zipfile  
from tqdm.notebook import tqdm
import numpy as np
import pandas as pd

# Tensorflow
import tensorflow as tf
from tensorflow.keras.preprocessing.image import load_img, img_to_array

# Plotting
import matplotlib.pyplot as plt

# Image Preprocessing
import cv2
import albumentations as A
from PIL import Image
from mtcnn import MTCNN

print("Tensorflow :", tf.__version__)

3. Setup `CONFIG`
   
   Menyiapkan variabel-variabel yang digunakan sebagai config pada notebook ini

In [3]:
# Seed passed to `random.seed` wherever the notebook needs repeatable sampling.
SEED = 2021
# Target size handed to `tf.image.resize` for every preprocessed image.
SIZE = (256, 256)
# NOTE(review): not referenced anywhere in the visible part of this notebook.
BATCH_SIZE = 32
# Minimum MTCNN confidence a detection needs before it is used as a face crop.
FACE_THRESHOLD = 0.95
# Single shared detector instance, built once and reused by `get_faces`.
FACE_DETECTOR = MTCNN()

## Dataset

Mendownload dataset dan Load dataset yang mengandung informasi `path` dari data gambar.

1. Mendownload dataset dari [repository Github](https://github.com/Hyuto/bdc-2021)
   
   **Note** : Pastikan `git` telah terinstall

In [None]:
# Clone the competition repository; it is cloned into ./bdc-2021.
!git clone https://github.com/Hyuto/bdc-2021.git

In [None]:
# Root of the cloned repository and the folder holding the CSV files and image directories.
MAIN_DIR = os.path.join(".", "bdc-2021")
DATA_DIR = os.path.join(MAIN_DIR, "data")

2. Read Dataset
   
   Membaca data `csv` untuk mendapatkan informasi `path`, `jenis kelamin` dan `umur` dari data gambar

In [4]:
# Training metadata; the columns "nomor", "jenis kelamin" and "usia" are read further below.
train = pd.read_csv(f"{DATA_DIR}/train.csv")
# Submission template; its `id` column is used to build the test image file names.
test = pd.read_csv(f"{DATA_DIR}/submission.csv")
# Last expression of the cell: previews the first rows of the training table.
train.head()

memperjelas `path` ke setiap data gambar

In [5]:
# Parallel lists: images[i] is a file path, labels[i] / ages[i] its metadata.
images = []
labels = []
ages = []
# One entry per row of `test`, in the same order (None when the file is missing).
test_images = []

TRAIN_DIR = f"{DATA_DIR}/Training"
TEST_DIR = f"{DATA_DIR}/Testing"

# Every training subject has its own folder named after its "nomor" value;
# each image in that folder inherits the subject's label and age.
for no, label, usia in train[["nomor", "jenis kelamin", "usia"]].values:
    TEMP_DIR = os.path.join(TRAIN_DIR, str(no))
    # os.listdir returns entries in arbitrary order; sorting keeps the list
    # order (and every seeded sample drawn from it) the same on every machine.
    for file in sorted(os.listdir(TEMP_DIR)):
        # Skip stray ".ini" files. Only the file name is tested, so a
        # directory whose path happens to contain ".ini" no longer hides
        # every image underneath it.
        if file.lower().endswith(".ini"):
            continue
        images.append(os.path.join(TEMP_DIR, file))
        labels.append(label)
        ages.append(usia)

# Test images are stored flat as "<id>.jpg".
for no in test.id.values:
    file_dir = os.path.join(TEST_DIR, f"{no}.jpg")
    if os.path.isfile(file_dir):
        test_images.append(file_dir)
    else:
        # Keep a placeholder so positions stay aligned with `test`, and
        # report the missing path.
        test_images.append(None)
        print(file_dir)

menampilkan dan mengecek beberapa gambar pada data `train`

In [6]:
def read(path):
    """
    Open the image file at `path` with PIL and return the resulting image object.
    """
    return Image.open(path)

def show_images(list_dir, label = None, age = None, load_image = read, seed = SEED):
    """
    Display up to 5 randomly sampled images per class in a grid.

    When `label` is given, one row is drawn per distinct label and each image
    is titled with its label (and its age when `age` is given). Without
    `label`, a single untitled row sampled from all of `list_dir` is drawn.

    Parameters
    ----------
    list_dir : list of image paths.
    label : optional list of labels, parallel to `list_dir`.
    age : optional list of ages, parallel to `list_dir`.
    load_image : callable turning a path into something `imshow` can draw.
    seed : seed given to `random.seed` before sampling.
    """
    random.seed(seed)
    n_cols = 5
    # An explicit flag replaces the old "init" sentinel, which would have
    # collided with a real label equal to "init".
    grouped = bool(label)
    # Sorted by string form so the row order does not depend on set ordering.
    classes = sorted(set(label), key=str) if grouped else [None]

    # squeeze=False keeps `axes` two-dimensional even with a single row, so
    # indexing works when only one class is present.
    fig, axes = plt.subplots(len(classes), n_cols, figsize = (20, 5 * len(classes)),
                             squeeze = False)
    for row, current in enumerate(classes):
        if not grouped:
            pool = [(path,) for path in list_dir]
        elif age is not None:
            pool = [rec for rec in zip(list_dir, label, age) if rec[1] == current]
        else:
            pool = [rec for rec in zip(list_dir, label) if rec[1] == current]
        # Never ask for more samples than exist (random.sample would raise).
        picks = random.sample(pool, min(n_cols, len(pool)))

        for col in range(n_cols):
            ax = axes[row, col]
            if col < len(picks):
                rec = picks[col]
                ax.imshow(load_image(rec[0]))
                if grouped:
                    text = f'Label : {rec[1]}'
                    if age is not None:
                        text += f'\nAge : {rec[2]}'
                    ax.set_title(text, fontsize = 14)
            # Hide ticks and frame, including on cells left empty.
            ax.axis('off')
    fig.tight_layout()
    plt.show()

In [7]:
# Preview raw training images per label, titled with label and age.
show_images(images, label = labels, age = ages, seed= 20)

### 1. MTCNN

Mendapatkan koordinat wajah dengan menggunakan model `MTCNN` dengan nilai confidence lebih dari `0.95` lalu dilakukan cropping.

![mtcnn flow](https://raw.githubusercontent.com/Hyuto/bdc-2021/master/assets/MTCNN.png)

### 2. Resizing

Melakukan resize pada setiap gambar menjadi ukuran `SIZE` (`256 x 256`) sehingga seluruh gambar memiliki ukuran yang sama.

In [8]:
def get_faces(path):
    """
    Run the shared MTCNN detector on the image file at `path`.

    The file is read with OpenCV and converted from BGR to RGB before being
    passed to `FACE_DETECTOR.detect_faces`, whose result is returned as-is.
    Callers in this notebook read the 'box' and 'confidence' keys of each
    detection.

    Raises
    ------
    OSError : if OpenCV cannot read or decode the file.
    """
    bgr = cv2.imread(path)
    # cv2.imread reports failure by returning None instead of raising; fail
    # here with the offending path rather than inside cvtColor.
    if bgr is None:
        raise OSError(f"Cannot read image file: {path}")
    rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
    return FACE_DETECTOR.detect_faces(rgb)

def load_and_preprocess_image(path: str, size = SIZE):
    """
    Load one image, crop it to its most confident face and resize it.

    Steps:
      1. Load the image as a float array.
      2. Keep detections whose confidence exceeds `FACE_THRESHOLD` and crop
         to the most confident one.
      3. If no usable face exists, fall back to a centered square crop whose
         side is the shorter image dimension.
      4. Resize to `size` and divide by 255.

    Returns the result as a float32 NumPy array.

    NOTE(review): the file is decoded twice (here and inside `get_faces`);
    the box coordinates assume both decoders produce the same orientation -
    confirm for EXIF-rotated files.
    """
    image = img_to_array(load_img(path))
    confident = [det for det in get_faces(path) if det['confidence'] > FACE_THRESHOLD]

    crop = None
    if confident:
        # Use the highest-confidence face rather than whichever the detector
        # happened to list first.
        x, y, w, h = max(confident, key=lambda det: det['confidence'])['box']
        # The box origin can fall outside the image; a negative start would be
        # interpreted as a wrap-around index, so clamp it to the border.
        crop = image[max(y, 0):y + h, max(x, 0):x + w]
        if crop.size == 0:
            crop = None

    if crop is not None:
        img = tf.convert_to_tensor(crop, dtype=tf.float32)
    else:
        img = tf.convert_to_tensor(image, dtype=tf.float32)
        shapes = tf.shape(img)
        dim = tf.minimum(shapes[-3], shapes[-2])
        img = tf.image.resize_with_crop_or_pad(img, dim, dim)

    img = tf.image.resize(img, size) / 255.0
    return img.numpy()

In [9]:
# Same preview as before, but each image goes through face cropping and resizing first.
show_images(images, labels, ages, load_image = load_and_preprocess_image, seed=20)

### 3. Augmentation

Melakukan augmentasi untuk memperbanyak data. Metode augmentasi yang digunakan yaitu:

1. Horizontal flip dengan peluang `0.4`
2. Downscale kualitas gambar pada range `0.6 - 0.9` dengan peluang `0.3`
3. Random rotate dengan rentang `-30` sampai `30` derajat dengan peluang `0.6`
4. Shift, scale, dan rotate gambar dengan peluang `0.4`
5. Blur dengan peluang `0.4`
6. Random brightness pada rentang limit `-0.25` sampai `0.15` dengan peluang `0.4`

In [10]:
# Augmentation pipeline; each transform fires independently with its own probability `p`.
# It is applied to the already resized images, which are floats scaled to [0, 1].
aug = A.Compose([
    # Mirror left/right.
    A.HorizontalFlip(p=0.4),
    # Degrade quality by downscaling with a factor in [0.6, 0.9].
    A.Downscale(scale_min=0.6, scale_max=0.9, p=0.3),
    # Rotate by an angle in [-30, 30] degrees.
    A.Rotate(limit=(-30,30), p=0.6),
    # Small combined shift / zoom / rotation.
    A.ShiftScaleRotate(shift_limit=(-0.07, 0.07), scale_limit=(-0.05, 0.1), rotate_limit=(-15, 15), p=0.4),
    # With probability 0.4, apply one of the three blur variants.
    A.OneOf([
        A.MotionBlur(p=.4),
        A.MedianBlur(blur_limit=3, p=0.4),
        A.Blur(blur_limit=3, p=0.4),
    ], p=0.4),
    # Brightness change limited to [-0.25, 0.15]; the contrast limit is left at the library default.
    A.RandomBrightnessContrast(brightness_limit=(-0.25, 0.15), p=0.4),
])

**Visualisasi** augmentasi data

In [11]:
def visualize_aug(path):
    """
    Plot one preprocessed image next to four augmented variants of it.

    The first panel shows the output of `load_and_preprocess_image`; each of
    the remaining panels shows a fresh, independent pass of that same image
    through the `aug` pipeline.
    """
    fig, axes = plt.subplots(1, 5, figsize = (20, 5))
    base = load_and_preprocess_image(path)
    for idx, ax in enumerate(axes):
        # Panel 0 is the reference; every other panel re-augments the reference.
        frame = base if idx == 0 else aug(image=base)['image']
        ax.imshow(frame)
        ax.axis('off')
    fig.tight_layout()
    plt.show()

In [12]:
# Seed Python's `random` module before augmenting.
# NOTE(review): whether Albumentations draws from this generator depends on its version - confirm.
random.seed(SEED)
# Show the augmentations for the first three training images (not a random pick).
for i in range(3):
    visualize_aug(images[i])

## Running Preprocessing

Melakukan seluruh proses `preprocessing` terhadap data gambar lalu menyimpan data gambar yang telah di preprocess menjadi data baru.

In [13]:
import shutil

def _crop_face(image, box):
    """Cut the MTCNN `box` (x, y, w, h) out of `image`; return None if the crop is empty."""
    x, y, w, h = box
    # The box origin can fall outside the image; a negative start would be
    # interpreted as a wrap-around index, so clamp it to the border.
    crop = image[max(y, 0):y + h, max(x, 0):x + w]
    return crop if crop.size else None


def _center_square(image):
    """Center-crop `image` to a square whose side is its shorter dimension."""
    tensor = tf.convert_to_tensor(image, dtype=tf.float32)
    shapes = tf.shape(tensor)
    dim = tf.minimum(shapes[-3], shapes[-2])
    return tf.image.resize_with_crop_or_pad(tensor, dim, dim)


def _resize_and_scale(image):
    """Resize `image` to `SIZE` and divide by 255; returns a float32 NumPy array."""
    tensor = tf.convert_to_tensor(image, dtype=tf.float32)
    return (tf.image.resize(tensor, SIZE) / 255.0).numpy()


def image_preprocessing(new_dir, images, labels=None, ages=None):
    """
    Preprocess every image and write the results into a fresh `new_dir`.

    `new_dir` is deleted first if it already exists.

    Labelled images (training data):
      * every detection above `FACE_THRESHOLD` is cropped, resized and saved
        as "<j>_<name>", where j is its rank by confidence;
      * each saved image also gets 3 augmented copies "aug-<k>_<j>_<name>";
      * with no usable face, a centered square crop is saved as "<name>" with
        3 augmented copies "aug-<k>_<name>".

    Unlabelled images (test data):
      * only the most confident usable detection is kept, with no confidence
        threshold and no augmentation, saved as "<name>";
      * with no usable face, the centered square crop is saved.

    A `None` entry in `images` (a missing file) is not read; a `None`
    placeholder is recorded instead so positions stay aligned with the input.

    Parameters
    ----------
    new_dir : output directory (recreated from scratch).
    images : list of image paths, possibly containing None.
    labels, ages : optional lists parallel to `images`.

    Returns
    -------
    (new_images, new_labels, new_ages) : output paths plus the label and age
    of each output. Labels and ages are recorded only when they are not None,
    so for unlabelled data those two lists are empty.

    NOTE(review): output names use only the file's base name, so inputs from
    different folders that share a name would overwrite each other - confirm
    names are unique across folders.
    NOTE(review): the file is decoded twice (here and inside `get_faces`);
    the box coordinates assume both decoders produce the same orientation -
    confirm for EXIF-rotated files.
    """
    if os.path.isdir(new_dir):
        shutil.rmtree(new_dir)
    os.makedirs(new_dir)

    new_images, new_labels, new_ages = [], [], []
    labels = labels if labels else [None] * len(images)
    ages = ages if ages else [None] * len(images)

    def _record(img_dir, label, age):
        # Keep the three output lists aligned; missing metadata is skipped.
        new_images.append(img_dir)
        if label is not None:
            new_labels.append(label)
        if age is not None:
            new_ages.append(age)

    def _save(array, filename, label, age, augment):
        # Write the image, then optionally 3 augmented copies of it.
        img_dir = os.path.join(new_dir, filename)
        _record(img_dir, label, age)
        tf.keras.preprocessing.image.save_img(img_dir, array)
        if not augment:
            return
        for k in range(3):
            augmented = aug(image=array)['image']
            aug_dir = os.path.join(new_dir, f'aug-{k}_{filename}')
            _record(aug_dir, label, age)
            tf.keras.preprocessing.image.save_img(aug_dir, augmented)

    for path, label, age in tqdm(zip(images, labels, ages), total=len(images)):
        if path is None:
            _record(None, label, age)
            continue

        labelled = label is not None
        image = img_to_array(load_img(path))
        # basename instead of split("/") so Windows-style separators work too.
        name = os.path.basename(path)

        detections = sorted(get_faces(path), key=lambda det: det['confidence'],
                            reverse=True)
        if labelled:
            detections = [det for det in detections
                          if det['confidence'] > FACE_THRESHOLD]

        # (rank, crop) pairs; detections whose crop is empty are dropped but
        # the rank used in the file name is preserved.
        faces = []
        for j, det in enumerate(detections):
            crop = _crop_face(image, det['box'])
            if crop is not None:
                faces.append((j, crop))

        if faces and labelled:
            for j, crop in faces:
                _save(_resize_and_scale(crop), f'{j}_{name}', label, age, augment=True)
        elif faces:
            _save(_resize_and_scale(faces[0][1]), name, label, age, augment=False)
        else:
            _save(_resize_and_scale(_center_square(image)), name, label, age,
                  augment=labelled)

    return new_images, new_labels, new_ages

In [14]:
# Output folders for the preprocessed images.
new_train_dir = "./train"
new_test_dir = "./test"

# Seed Python's `random` module before the augmentation-heavy run.
random.seed(SEED)
# Training data: passed with labels and ages.
new_images, new_labels, new_ages = image_preprocessing(new_train_dir, images, labels, ages)
# Test data: passed without labels or ages; only the output paths are kept.
new_test_images, _, _ = image_preprocessing(new_test_dir, test_images)

**Membuat csv**

In [15]:
# One row per saved training image, with its label and age.
preprocessed = pd.DataFrame({"image" : new_images, "label" : new_labels, "age" : new_ages})
preprocessed.to_csv("preprocessed.csv", index=False)

**Mengkompress menjadi ZIP**

In [16]:
def zipit(folders, zip_filename):
    """
    Compress directories and single files into one ZIP archive.

    Modified from https://stackoverflow.com/a/46359964

    Archive names are computed relative to the parent of `folders[0]`.
    Directories are added recursively, files only (empty directories are not
    stored). A single file is stored relative to that same base when it lies
    inside it, otherwise under its bare file name.

    Parameters
    ----------
    folders : list of directory and/or file paths; an empty list yields an
        empty archive.
    zip_filename : path of the archive to create (overwritten if present).
    """
    # The context manager closes the archive even if a write fails, so no
    # file handle is leaked.
    with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as archive:
        if not folders:
            return
        base = os.path.join(folders[0], os.pardir)
        for entry in folders:
            if os.path.isdir(entry):
                for dirpath, _dirnames, filenames in os.walk(entry):
                    for filename in filenames:
                        full_path = os.path.join(dirpath, filename)
                        archive.write(full_path, os.path.relpath(full_path, base))
            else:
                arcname = os.path.relpath(entry, base)
                # A file outside the base would get a "../" name; store it
                # under its own file name instead.
                if arcname == os.pardir or arcname.startswith(os.pardir + os.sep):
                    arcname = os.path.basename(entry)
                archive.write(entry, arcname)

# Bundle both image folders and the CSV into a single archive.
zipit(["./train", "./test", "./preprocessed.csv"], "preprocessed.zip")