# BDC - Satria Data 2021

Task : Gender Detection

## Authors

1. Muhammad Amanda
2. Naufal Zhafran A.
3. Wahyu Setianto

## Running On

Kaggle [using GPU]

## First Thing First

Menginstal library yang diperlukan, mengimpor library - library yang akan digunakan, serta mengatur variable config yang akan digunakan di dalam notebook ini.

1. Menginstal library `MTCNN`

Library `MTCNN` adalah library yang digunakan untuk preprocessing data gambar pada notebook ini

In [None]:
!pip -q install mtcnn

2. Importing library

Mengimport library yang akan digunakan dalam notebook ini.

In [None]:
# Umum
import os, random, re
from tqdm.notebook import tqdm
import numpy as np
import pandas as pd
from PIL import Image

# Tensorflow
import tensorflow as tf
from tensorflow.keras.preprocessing.image import load_img, img_to_array

# Metrics & Splitting data
from sklearn.metrics import *
from sklearn.model_selection import *

# Plotting
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocessing
import cv2
from mtcnn import MTCNN
import albumentations as A

print("Tensorflow :", tf.__version__)

3. Setup `CONFIG`

Mengatur variable - variable yang digunakan sebagai config pada notebook ini

In [None]:
# Global configuration shared by the cells below.
SEED = 2021  # passed to `random.seed`, `train_test_split` and `StratifiedKFold`
SIZE = (200, 200)  # target size handed to `tf.image.resize`
BATCH_SIZE = 32  # batch size of the tf.data pipelines
FACE_THRESHOLD = 0.95  # confidence cut-off applied to MTCNN detections
FACE_DETECTOR = MTCNN()  # single detector instance reused by `get_faces`

## Dataset

Load dataset yang mengandung informasi `path` dari data gambar

In [None]:
# `train` supplies the `nomor` and `jenis kelamin` columns and `test` the `id`
# column that the next cell uses to locate the image files.
train = pd.read_csv("../input/bdc-2021/train.csv")
test = pd.read_csv("../input/bdc-2021/submission.csv")
train.head()

memperjelas `path` ke setiap data gambar

In [None]:
# Collect the path (and label) of every training image and the expected path
# of every test image.
images = []
labels = []
test_images = []

TRAIN_DIR = "../input/bdc-2021/Training"
TEST_DIR = "../input/bdc-2021/Testing"

for no, label in train[["nomor", "jenis kelamin"]].values:
    TEMP_DIR = os.path.join(TRAIN_DIR, str(no))
    # `os.listdir` returns entries in arbitrary order; sorting keeps `images`
    # (and everything sampled from it with a fixed seed) reproducible.
    for file in sorted(os.listdir(TEMP_DIR)):
        file_dir = os.path.join(TEMP_DIR, file)
        # Skip `.ini` entries (presumably desktop.ini artefacts, not images).
        if ".ini" not in file_dir:
            images.append(file_dir)
            labels.append(label)

for no in test.id.values:
    file_dir = os.path.join(TEST_DIR, f"{no}.jpg")
    if os.path.isfile(file_dir):
        test_images.append(file_dir)
    else:
        # Keep a placeholder so `test_images` stays aligned with `test`,
        # and print the path of the missing file.
        test_images.append(None)
        print(file_dir)

menampilkan dan mengecek beberapa gambar pada data `train`

In [None]:
def read(path):
    """
    Open the image file at `path` and return the resulting PIL image.
    """
    return Image.open(path)

def show_images(list_dir, label = None, load_image = read, seed = SEED):
    """
    Display up to five randomly sampled images, one row per distinct label.

    Args:
        list_dir: sequence of image paths.
        label: optional sequence of labels aligned with `list_dir`. When given
            (and non-empty), images are sampled separately for every distinct
            label and each subplot is titled with its label.
        load_image: callable that turns a path into something `imshow` accepts.
        seed: value passed to `random.seed` so the sample is reproducible.
    """
    random.seed(seed)
    # Explicit check instead of truthiness so numpy arrays are accepted too.
    has_label = label is not None and len(label) > 0
    # Sorted so the row order does not depend on set iteration order.
    groups = sorted(set(label)) if has_label else [None]
    # squeeze=False keeps `axes` two-dimensional even when there is one row.
    fig, axes = plt.subplots(len(groups), 5, figsize = (20, 5 * len(groups)),
                             squeeze = False)
    for row, group in enumerate(groups):
        if has_label:
            pool = [path for path, lab in zip(list_dir, label) if lab == group]
        else:
            pool = list(list_dir)
        # Never ask for more samples than the pool holds.
        chosen = random.sample(pool, min(5, len(pool)))
        for col in range(5):
            ax = axes[row, col]
            ax.axis('off')
            if col >= len(chosen):
                continue  # leave the unused cell blank
            ax.imshow(load_image(chosen[col]))
            if has_label:
                ax.set_title(f'Label : {group}', fontsize = 14)
    fig.tight_layout()
    plt.show()

In [None]:
# Preview randomly sampled raw training images for every label.
show_images(images, labels, seed=20)

## Preprocess Data

Metode yang digunakan:

1. Mengekstrak wajah - wajah yang terdapat pada gambar menjadi gambar - gambar baru dengan label yang sama dengan menggunakan model `MTCNN`
2. Pada data test jika terdapat dua wajah yang terdeteksi pada satu gambar akan di ambil wajah dengan tingkat confidence terbesar yang diberikan oleh model `MTCNN`.
3. Jika tidak ada wajah yang terdeteksi pada suatu gambar maka akan dilakukan crop pada bagian tengah gambar sehingga gambar berbentuk persegi atau `jxj` pixel.
4. Selanjutnya gambar akan di-resize menjadi ukuran `200x200` pixel (sesuai nilai `SIZE` pada `CONFIG`).

berikut adalah contoh hasil preprocess data gambar.

In [None]:
def get_faces(path):
    """
    Run the shared MTCNN detector on the image stored at `path`.

    Returns:
        Whatever `FACE_DETECTOR.detect_faces` returns for the RGB image; the
        callers in this file read the 'box' and 'confidence' keys of each entry.

    Raises:
        FileNotFoundError: if OpenCV cannot read `path`.
    """
    image = cv2.imread(path)
    if image is None:
        # cv2.imread reports failure by returning None rather than raising,
        # which would otherwise surface as an opaque cvtColor error.
        raise FileNotFoundError(f"Cannot read image: {path}")
    # OpenCV loads BGR; the detector is given RGB.
    return FACE_DETECTOR.detect_faces(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))

def load_and_preprocess_image(path: str, size = SIZE):
    """
    Load an image, crop it to a detected face and resize it.

    The first detection whose confidence exceeds `FACE_THRESHOLD` is used as
    the crop. When no usable face is found, the central square of the image is
    used instead.

    Args:
        path: path of the image file.
        size: target size passed to `tf.image.resize`.

    Returns:
        A float32 numpy array of the resized image, divided by 255.
    """
    image = img_to_array(load_img(path))
    boxes = [det['box'] for det in get_faces(path) if det['confidence'] > FACE_THRESHOLD]

    face = None
    if boxes:
        x, y, w, h = boxes[0]
        # A box may start at negative coordinates; a negative slice start
        # would index from the far edge, so clamp the origin to 0.
        candidate = image[max(0, y):y + h, max(0, x):x + w]
        if candidate.size:
            face = candidate

    if face is not None:
        img = tf.convert_to_tensor(face, dtype=tf.float32)
    else:
        # No usable face: keep the central square of the whole image.
        img = tf.convert_to_tensor(image, dtype=tf.float32)
        shapes = tf.shape(img)
        dim = tf.minimum(shapes[-3], shapes[-2])
        img = tf.image.resize_with_crop_or_pad(img, dim, dim)

    img = tf.image.resize(img, size)
    img = tf.cast(img, tf.float32) / 255.0
    return img.numpy()

In [None]:
# Same preview as before, but each image goes through the face-crop/resize step.
show_images(images, labels, load_image = load_and_preprocess_image, seed=20)

## Augmentasi Data

Melakukan augmentasi untuk memperbanyak data. Metode augmentasi yang digunakan yaitu:

1. Horizontal flip
2. Downscale kualitas gambar
3. Random rotate dengan rentang -30 sampai 30 derajat
4. Shift, scale, dan rotate gambar
5. Blur
6. Random brightness

In [None]:
# Augmentation pipeline used for the previews and for the extra training
# copies written by `image_preprocessing`; every transform carries its own
# probability `p`.
aug = A.Compose([
    A.HorizontalFlip(p=0.4),
    A.Downscale(scale_min=0.6, scale_max=0.9, p=0.3),
    A.Rotate(limit=(-30,30), p=0.6),
    A.ShiftScaleRotate(shift_limit=(-0.07, 0.07), scale_limit=(-0.05, 0.1), rotate_limit=(-15, 15), p=0.4),
    # Blur group: the three blur variants are wrapped in a single `OneOf`.
    A.OneOf([
        A.MotionBlur(p=.4),
        A.MedianBlur(blur_limit=3, p=0.4),
        A.Blur(blur_limit=3, p=0.4),
    ], p=0.4),
    A.RandomBrightnessContrast(brightness_limit=(-0.25, 0.15), p=0.4),
])

In [None]:
def visualize_aug(path):
    """
    Plot the preprocessed image at `path` next to four augmented versions of it.
    """
    fig, axes = plt.subplots(1, 5, figsize = (20, 5))
    base = load_and_preprocess_image(path)
    for idx, ax in enumerate(axes):
        # First cell shows the untouched image, the rest fresh augmentations.
        shown = base if idx == 0 else aug(image=base)['image']
        ax.imshow(shown)
        ax.axis('off')
    fig.tight_layout()
    plt.show()

In [None]:
# Show the augmentation preview for the first three training images.
random.seed(SEED)
for i in range(3):
    visualize_aug(images[i])

Running preprocessing pada data gambar secara keseluruhan

In [None]:
def _crop_box(image, box):
    """Return the `(x, y, w, h)` region of `image`, clamping a negative origin to 0."""
    x, y, w, h = box
    return image[max(0, y):y + h, max(0, x):x + w]


def _to_model_input(image, center_crop=False):
    """Resize `image` to `SIZE` and divide by 255, optionally cropping the central square first."""
    img = tf.convert_to_tensor(image, dtype=tf.float32)
    if center_crop:
        shapes = tf.shape(img)
        dim = tf.minimum(shapes[-3], shapes[-2])
        img = tf.image.resize_with_crop_or_pad(img, dim, dim)
    img = tf.image.resize(img, SIZE)
    return tf.cast(img, tf.float32) / 255.0


def _save_with_augmentations(img, new_dir, name, label, new_images, new_labels, n_aug):
    """Save `img` as `name` plus `n_aug` augmented copies, recording every path and label."""
    def _record(file_name, array):
        img_dir = os.path.join(new_dir, file_name)
        new_images.append(img_dir)
        new_labels.append(label)
        tf.keras.preprocessing.image.save_img(img_dir, array)

    _record(name, img)
    for k in range(n_aug):
        _record(f'aug-{k}_{name}', aug(image=img.numpy())['image'])


def image_preprocessing(new_dir, images, labels=None):
    """
    Preprocess every image and write the results into `new_dir`.

    `new_dir` is deleted first if it already exists. For a labelled image,
    every detection above `FACE_THRESHOLD` becomes its own output image
    (`{j}_{file}`) followed by three augmented copies (`aug-{k}_{j}_{file}`).
    For an unlabelled image only the most confident detection is kept and no
    augmentation is written. When no usable face is found, the central square
    of the image is used instead (augmented only when labelled).

    Args:
        new_dir: output directory.
        images: sequence of image paths.
        labels: optional sequence of labels aligned with `images`; omit it for
            test data.

    Returns:
        `(new_images, new_labels)`: the written paths and the label recorded
        for each of them (None for unlabelled input).
    """
    import shutil  # local import: only this function needs it

    if os.path.isdir(new_dir):
        shutil.rmtree(new_dir)
    os.mkdir(new_dir)

    new_images, new_labels = [], []
    # Explicit emptiness check so numpy arrays are accepted as `labels`.
    if labels is None or len(labels) == 0:
        labels = [None] * len(images)

    for path, label in tqdm(zip(images, labels), total=len(images)):
        labelled = label is not None
        file_name = path.split("/")[-1]
        image = img_to_array(load_img(path))

        # Most confident detection first; the threshold only applies to
        # labelled (training) images.
        detections = sorted(get_faces(path), key=lambda det: det['confidence'], reverse=True)
        if labelled:
            detections = [det for det in detections if det['confidence'] > FACE_THRESHOLD]

        # Keep the detection index `j` for the file name and drop boxes that
        # end up empty after clamping to the image bounds.
        crops = [(j, _crop_box(image, det['box'])) for j, det in enumerate(detections)]
        crops = [(j, crop) for j, crop in crops if crop.size]

        if not crops:
            _save_with_augmentations(_to_model_input(image, center_crop=True), new_dir, file_name,
                                     label, new_images, new_labels, n_aug=3 if labelled else 0)
        elif labelled:
            for j, crop in crops:
                _save_with_augmentations(_to_model_input(crop), new_dir, f'{j}_{file_name}',
                                         label, new_images, new_labels, n_aug=3)
        else:
            _, crop = crops[0]
            _save_with_augmentations(_to_model_input(crop), new_dir, file_name,
                                     label, new_images, new_labels, n_aug=0)

    return new_images, new_labels

Untuk menghemat waktu running akan di skip bagian ini dan di ganti dengan meload data hasil preprocess yang sudah di save pada run sebelumnya. Namun jika ingin melakukan preprocess pada run sekarang maka uncomment code di bawah ini.

**Peringatan** : running block code di bawah memakan waktu sekitar 50 menit dengan GPU Nvidia Tesla P100-PCIE.

In [None]:
# new_train_dir = "./train"
# new_test_dir = "./test"

# random.seed(SEED)
# new_images, new_labels = image_preprocessing(new_train_dir, images, labels)
# new_test_images, _ = image_preprocessing(new_test_dir, test_images)

**Note** : Comment dua block kode di bawah jika melakukan preprocess pada run saat ini.

In [None]:
# Index of the images preprocessed in an earlier run; the `image` and `label`
# columns are read when the path/label arrays are built below.
preprocessed = pd.read_csv("../input/bdc-2021/preprocessed-augmented/preprocessed.csv")
preprocessed.head()

In [None]:
!wget https://raw.githubusercontent.com/Hyuto/fun/master/excluded.txt
    
# Read the exclusion list: one pattern per line, later combined into a single
# regular expression that is matched against the preprocessed image names.
with open("./excluded.txt") as f:
    # Drop blank lines: an empty alternative (e.g. from a trailing newline)
    # would make the combined pattern match every file name.
    excluded = [line.strip() for line in f.read().splitlines() if line.strip()]

# `(?!)` never matches, so an empty exclusion list excludes nothing.
patterns = "|".join(excluded) if excluded else r"(?!)"

In [None]:
# Build the training path/label arrays from the preprocessed index, leaving
# out every image whose name matches the exclusion pattern.
preprocessed_dir = "../input/bdc-2021/preprocessed-augmented"

kept = [
    (os.path.join(preprocessed_dir, name), lab)
    for name, lab in preprocessed[["image", "label"]].values
    if re.search(patterns, name) is None
]
new_images = np.asarray([img_path for img_path, _ in kept])
new_labels = np.asarray([lab for _, lab in kept])

# Test paths follow the order of `test.id`.
new_test_images = np.asarray(
    [os.path.join(preprocessed_dir, "test", f"{x}.jpg") for x in test.id.values]
)

Mengecek distribusi label pada data

In [None]:
# Bar chart with the number of samples per label.
plt.figure(figsize=(5, 5))
sns.countplot(x=new_labels)
plt.show()

Jumlah data yang berlabel `0` dan `1` cenderung sama.

**Splitting Data**

Split data train menjadi data `train` dan data `valid` dengan proporsi `85:15`

In [None]:
# Stratified 85/15 hold-out split.
# NOTE: `x_train`, `x_valid`, `y_train` and `y_valid` are reassigned for every
# fold in the cross-validation cell further down.
x_train, x_valid, y_train, y_valid = train_test_split(new_images, new_labels, test_size=0.15, 
                                                      stratify=new_labels, random_state=SEED)

**Tensorflow Data**

Load data gambar menggunakan `Tensorflow Data` agar penggunaan memori pada saat pelatihan model dapat lebih optimal

In [None]:
def decode_image(filename, label=None, image_size=SIZE):
    """
    Read a JPEG file and return it as a resized float image tensor.

    Args:
        filename: string tensor (or path) of the JPEG file.
        label: optional label; when given it is returned next to the image.
        image_size: target size passed to `tf.image.resize`.

    Returns:
        The image divided by 255 and resized to `image_size`, or an
        `(image, label)` tuple when `label` is not None.
    """
    bits = tf.io.read_file(filename)
    image = tf.image.decode_jpeg(bits, channels=3)
    image = tf.cast(image, tf.float32) / 255.0
    # Resize to the requested `image_size` rather than the global default.
    image = tf.image.resize(image, image_size)

    if label is None:  # test data: no label to forward
        return image
    return image, label

In [None]:
def build_model(kernel_s=(3,3), input_shape=(200, 200, 3)):
    """
    Build and compile the CNN used for binary classification.

    The network stacks five Conv2D + MaxPooling2D blocks (32, 64, 64, 128 and
    128 filters), a 256-unit dense layer and a single sigmoid output. L2
    regularisation is applied to the first convolution and to the dense layer.

    Args:
        kernel_s: kernel size shared by all convolutions.
        input_shape: shape of one input image; the default matches the
            200x200 RGB images used in this notebook.

    Returns:
        A `tf.keras` Sequential model compiled with binary cross-entropy,
        the Adam optimizer and the accuracy metric.
    """
    layers = tf.keras.layers
    l2 = tf.keras.regularizers.l2

    stack = [
        layers.Conv2D(32, kernel_s, activation='relu', input_shape=input_shape,
                      kernel_regularizer=l2(0.001), padding="VALID"),
        layers.MaxPooling2D((2,2)),
    ]
    # Remaining convolution blocks only differ in their number of filters.
    for filters in (64, 64, 128, 128):
        stack.append(layers.Conv2D(filters, kernel_s, activation='relu'))
        stack.append(layers.MaxPooling2D((2,2)))
    stack += [
        layers.Flatten(),
        layers.Dense(256, activation='relu', kernel_regularizer=l2(5e-4)),
        layers.Dense(1, activation='sigmoid'),
    ]

    model = tf.keras.models.Sequential(stack)
    model.compile(loss="binary_crossentropy", optimizer="adam", metrics=['accuracy'])
    return model

In [None]:
# Stratified k-fold training: one model per fold, validation metrics collected
# per fold, and test predictions summed over the folds.
split = 5
# Running sum of the test predictions of every fold; it is divided by `split`
# when the submission is built.
prediksi = np.zeros((len(new_test_images), 1), dtype=np.float32)
acc_scores, f1_scores = [], []

# Unlabelled test pipeline, built once and reused by every fold.
test_dataset = (
    tf.data.Dataset
    .from_tensor_slices((new_test_images))
    .map(decode_image)
    .batch(BATCH_SIZE)
)

# NOTE(review): `new_images` appears to contain augmented copies of the same
# source photo (see `image_preprocessing`), so different folds may share
# near-duplicates and the validation scores may be optimistic -- confirm.
cv = StratifiedKFold(n_splits=split, shuffle=True, random_state=SEED)
for i, (train_index, test_index) in enumerate(cv.split(new_images, new_labels)):
    # Reset Keras global state before building the model of this fold.
    tf.keras.backend.clear_session()
    x_train, x_valid = new_images[train_index], new_images[test_index]
    y_train, y_valid = new_labels[train_index], new_labels[test_index]

    # Training pipeline repeats indefinitely, hence `steps_per_epoch` in `fit`.
    train_dataset = (
        tf.data.Dataset
        .from_tensor_slices((x_train, y_train))
        .map(decode_image, num_parallel_calls=tf.data.AUTOTUNE)
        .cache()
        .repeat()
        .shuffle(1024)
        .batch(BATCH_SIZE)
        .prefetch(tf.data.AUTOTUNE)
    )
    
    valid_dataset = (
        tf.data.Dataset
        .from_tensor_slices((x_valid, y_valid))
        .map(decode_image, num_parallel_calls=tf.data.AUTOTUNE)
        .batch(BATCH_SIZE)
        .cache()
        .prefetch(tf.data.AUTOTUNE)
    )

    model = build_model()
    # Keep only the weights of the epoch with the best validation accuracy.
    # NOTE(review): the file name says "Resnet50" although `build_model`
    # returns the small custom CNN.
    checkpoint = tf.keras.callbacks.ModelCheckpoint(f'{i}_Resnet50_best_model.h5', monitor='val_accuracy', 
                                                save_best_only=True, save_weights_only=True, 
                                                mode='max')
    print(f"\nCV {i+1}")
    model.fit(train_dataset, epochs=30, validation_data=valid_dataset, 
              steps_per_epoch=len(x_train) // BATCH_SIZE,
              callbacks = [checkpoint])
    # Restore the best checkpoint before evaluating and predicting.
    model.load_weights(f'{i}_Resnet50_best_model.h5')
    # Threshold the sigmoid outputs at 0.5 to get class predictions.
    val_pred_classes = np.array(model.predict(valid_dataset).flatten() >= .5, dtype = 'int')
    acc, f1 = accuracy_score(y_valid, val_pred_classes), f1_score(y_valid, val_pred_classes)

    acc_scores.append(acc)
    f1_scores.append(f1)
    prediksi += model.predict(test_dataset)

    # Drop the references to the fold datasets before the next fold starts.
    del train_dataset
    del valid_dataset

In [None]:
# Print the validation metrics of every fold, then their averages.
for fold in range(1, split + 1):
    fold_acc = acc_scores[fold - 1]
    fold_f1 = f1_scores[fold - 1]
    print(f"Split {fold} : {fold_acc} acc - {fold_f1} f1")

mean_acc = sum(acc_scores) / split
print("\nMean Acc", mean_acc)
mean_f1 = sum(f1_scores) / split
print("Mean F1 ", mean_f1)

## Membuat Submission

In [None]:
# Average the summed fold predictions and threshold them at 0.5.
predicted = ((prediksi / split).flatten() >= .5).astype('int')

# The test predictions are position-aligned with `test`, so the ids are taken
# from `test` itself instead of being re-parsed from the file paths (which
# turns them into strings and truncates any id containing a dot).
submission = pd.DataFrame({'id': test['id'].values,
                           'jenis kelamin': predicted})

# Assign by position rather than merging on `id`: a merge fails when the id
# dtypes differ and, if `test` already carries a 'jenis kelamin' column
# (TODO confirm against submission.csv), yields suffixed duplicate columns.
test['jenis kelamin'] = submission['jenis kelamin'].values
test.head()

In [None]:
# Write the submission file without the DataFrame index column.
test.to_csv("submission-1_1.csv", index=False)