# Face Detector Training

Training uses the [RetinaFace official annotations](https://drive.google.com/file/d/1vgCABX1JI3NGBzsHxwBXlmRjaLV3NIsG/view) for the [WIDER FACE dataset](http://shuoyang1213.me/WIDERFACE/index.html), a set containing thousands of images taken in different contexts (e.g. a marching band at a parade, people during demonstrations/riots, people inside shopping malls, and so on). The official annotations differ from the original ones (which consist only of the bounding-box ground truth), since RetinaFace is trained to do classification, bounding-box regression and landmark regression. Example:
>\# 0--Parade/0_Parade_marchingband_1_849.jpg
>
> \# [subdir/image_name]
>
>449 330 122 149  488.906 373.643 0.0  542.089 376.442 0.0  515.031 412.83 0.0  485.174 425.893 0.0  538.357 431.491 0.0  0.82
>
>[bounding box]  [left eye]  [right eye]  [nose]  [left corner mouth]  [right corner mouth]  [validity]

The folder structure for converting the dataset and then training should be like this:

```
face_detection/data/widerface/train/
                               images/
                               label.txt
```
        
The structure of this notebook is as follows (internal hyperlinks do not work on colab):
 - [Converting the dataset to tensorflow record for training](#convert)
 - [Training the model](#train)

### Imports

In [1]:
# due to relative imports from modules, this notebook should be run from the main folder hence the 'cd'
%cd ..

C:\Users\cirib\Desktop\MainFolder


In [2]:
# imports
from absl import logging
import random
import os
import time
import numpy as np
import tensorflow as tf
import tqdm

from face_detection.models import RetinaFaceModel
from face_detection.lr_scheduler import MultiStepWarmUpLR
from face_detection.losses import MultiBoxLoss
from face_detection.anchor import prior_box
from face_detection.utils import (set_memory_growth, load_yaml, load_dataset, ProgressBar)



# Project helper imported from face_detection.utils; per the original note it is
# called to avoid GPU memory problems (its implementation is not shown here).
set_memory_growth()# avoid memory problems
# Expose only GPU 0 to this process.
# NOTE(review): this is set after tensorflow was imported and after
# set_memory_growth() ran — it may be too late to take effect; confirm.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'  

# get rid of tensorflow warnings
# NOTE(review): TF_CPP_MIN_LOG_LEVEL is also set after `import tensorflow`;
# presumably it needs to be set before the import to silence native logs — verify.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
logger = tf.get_logger()
logger.disabled = True
# NOTE(review): `logging` here is absl's logging module (see the imports), so
# FATAL is absl's constant rather than the stdlib logging level — confirm it
# maps to the intended threshold. The logger is already disabled on the line above.
logger.setLevel(logging.FATAL)

<a id ='convert'></a>
## Converting the dataset

From the WIDER FACE dataset, the training set contains 12880 images with ground truth for face bounding boxes, facial landmarks and validity of face (e.g. '-1.0' faces do not have landmarks because they are too blurred or otherwise undetectable).

The .zip doesn't contain the tfrecord used for training; the next two cells will create it.

#### Functions

In [3]:
def _bytes_feature(value):
    """Wrap `value` in a tf.train.Feature holding a bytes_list.

    `value` is handed to BytesList(value=...) as-is, so callers in this
    notebook pass a list of byte strings (e.g. ``[img_name]``).  If `value`
    is an eager tensor it is first converted with ``.numpy()``.
    """
    # type(tf.constant(0)) is the concrete tensor class produced in eager mode.
    eager_tensor_type = type(tf.constant(0))
    payload = value.numpy() if isinstance(value, eager_tensor_type) else value
    bytes_list = tf.train.BytesList(value=payload)
    return tf.train.Feature(bytes_list=bytes_list)


def _float_feature(value):
    """Wrap `value` in a tf.train.Feature holding a float_list.

    `value` is forwarded unchanged to FloatList(value=...); callers in this
    notebook pass one column of the per-image target array.
    """
    float_list = tf.train.FloatList(value=value)
    return tf.train.Feature(float_list=float_list)


def _int64_feature(value):
    """Wrap `value` in a tf.train.Feature holding an int64_list.

    `value` is forwarded unchanged to Int64List(value=...).
    """
    int64_list = tf.train.Int64List(value=value)
    return tf.train.Feature(int64_list=int64_list)


def make_example(img_name, img_path, target):
    """Build a tf.train.Example for one image and its face annotations.

    Args:
        img_name: image identifier; stored (wrapped in a one-element list)
            under 'image/img_name' as a bytes feature.
        img_path: path of the image file; it is opened in binary mode and
            its raw contents are stored under 'image/encoded'.
        target: 2-D array whose columns 0..14 are read as
            xmin, ymin, xmax, ymax, five landmark (x, y) pairs and the
            landmark validity flag, one row per face.

    Returns:
        A tf.train.Example containing the features above.
    """
    # Feature key for each column of `target`, in column order (0..14).
    column_keys = (
        'image/object/bbox/xmin',
        'image/object/bbox/ymin',
        'image/object/bbox/xmax',
        'image/object/bbox/ymax',
        'image/object/landmark0/x',
        'image/object/landmark0/y',
        'image/object/landmark1/x',
        'image/object/landmark1/y',
        'image/object/landmark2/x',
        'image/object/landmark2/y',
        'image/object/landmark3/x',
        'image/object/landmark3/y',
        'image/object/landmark4/x',
        'image/object/landmark4/y',
        'image/object/landmark/valid',
    )

    feature = {'image/img_name': _bytes_feature([img_name])}
    for column, key in enumerate(column_keys):
        feature[key] = _float_feature(target[:, column])

    # Use a context manager so the image file handle is closed promptly;
    # this function is called once per image while writing the tfrecord.
    with open(img_path, 'rb') as img_file:
        img_str = img_file.read()
    feature['image/encoded'] = _bytes_feature([img_str])

    return tf.train.Example(features=tf.train.Features(feature=feature))


def load_info(txt_path):
    """Read image paths and per-image annotations from a label.txt file.

    Lines starting with '#' introduce a new image: the text after the first
    two characters is the image's relative path, which is appended to the
    images directory derived from `txt_path` ('label.txt' replaced by
    'images/').  Every other non-blank line is one annotation: a
    whitespace-separated list of numbers, parsed as floats.

    Args:
        txt_path: path to the label.txt annotation file.

    Returns:
        (img_paths, words): two parallel lists.  img_paths[i] is the path of
        the i-th image and words[i] is the list of its annotations (each a
        list of floats).  Both lists are empty when the file declares no
        image.

    Raises:
        ValueError: if an annotation line contains a non-numeric token.
    """
    img_paths = []
    words = []
    images_dir = txt_path.replace('label.txt', 'images/')

    labels = []
    # The context manager guarantees the file is closed, even on a parse error.
    with open(txt_path, 'r') as f:
        for line in f:
            line = line.rstrip()
            if not line:
                # Blank lines carry no annotation; skip them instead of
                # failing on float('').
                continue
            if line.startswith('#'):  # path to image indicator
                if img_paths:
                    # Close the annotation list of the previous image.
                    words.append(labels)
                    labels = []
                img_paths.append(images_dir + line[2:])
            else:
                # split() with no argument tolerates runs of spaces/tabs
                # between values.
                labels.append([float(x) for x in line.split()])

    # Flush the annotations of the last image; skipping this when no image
    # header was seen keeps the two lists the same length.
    if img_paths:
        words.append(labels)
    return img_paths, words


def get_target(labels):
    """Convert the raw annotations of one image into an (N, 15) array.

    Each input label is indexed as
    [x, y, w, h, l0_x, l0_y, _, l1_x, l1_y, _, ..., l4_x, l4_y, ...];
    the value following each landmark pair is skipped.

    Output columns per face:
        0-3   bounding box as x1, y1, x2, y2 (x2 = x + w, y2 = y + h)
        4-13  the five landmark (x, y) pairs
        14    -1 if the first landmark x is negative (no landmarks), else 1

    Args:
        labels: sequence of per-face annotation sequences (at least 18
            values each).

    Returns:
        A float numpy array of shape (len(labels), 15); shape (0, 15) when
        `labels` is empty.
    """
    # Allocate the result once instead of growing it with np.append, which
    # copies the whole array on every iteration.
    target = np.zeros((len(labels), 15))

    for row, label in zip(target, labels):
        # bbox: convert (x, y, w, h) to corner coordinates
        row[0] = label[0]  # x1
        row[1] = label[1]  # y1
        row[2] = label[0] + label[2]  # x2
        row[3] = label[1] + label[3]  # y2

        # landmarks: take x, y of each triple and skip the third value
        row[4] = label[4]    # l0_x
        row[5] = label[5]    # l0_y
        row[6] = label[7]    # l1_x
        row[7] = label[8]    # l1_y
        row[8] = label[10]   # l2_x
        row[9] = label[11]   # l2_y
        row[10] = label[13]  # l3_x
        row[11] = label[14]  # l3_y
        row[12] = label[16]  # l4_x
        row[13] = label[17]  # l4_y

        # validity flag: a negative first landmark marks "w/o landmark"
        row[14] = -1 if row[4] < 0 else 1

    return target

In [4]:
# Convert the annotated training set into a single TFRecord file.
# set dataset path
dataset_path = 'face_detection/data/widerface/train'

# reads info: image paths and their per-image annotation lists, paired up and
# shuffled (unseeded, so the record order differs between runs)
img_paths, words = load_info(os.path.join(dataset_path, 'label.txt'))
samples = list(zip(img_paths, words))
random.shuffle(samples)

# refuse to overwrite an existing record file
if os.path.exists('face_detection/data/widerface_train_bin.tfrecord'):
    raise Exception("tfrecord already exists")

# writing tfrecord: one serialized tf.train.Example per image
with tf.io.TFRecordWriter('face_detection/data/widerface_train_bin.tfrecord') as writer:
    for img_path, word in tqdm.tqdm(samples):
        target = get_target(word)
        # image name = file name with every '.jpg' occurrence removed
        img_name = os.path.basename(img_path).replace('.jpg', '')

        # both name and path are passed as encoded bytes
        tf_example = make_example(img_name=str.encode(img_name),
                                  img_path=str.encode(img_path),
                                  target=target)

        writer.write(tf_example.SerializeToString())

100%|███████████████████████████████████████████████████████████████████████████| 12880/12880 [00:38<00:00, 338.86it/s]


<a id ='train'></a>
## Training

<p style="color:#FF0000;">Do not run if you're on CPU.</p>

In [None]:
# instance network: load the YAML config and build the model in training mode
cfg = load_yaml('face_detection/configs/retinaface_res50.yaml') # res50 or mbv2
model = RetinaFaceModel(cfg, training=True)
model.summary(line_length=80)

# define prior box (built from the square input size and the config's
# min_sizes / steps / clip entries)
priors = prior_box((cfg['input_size'], cfg['input_size']),
                   cfg['min_sizes'],  cfg['steps'], cfg['clip'])

# load dataset
train_dataset = load_dataset(cfg, priors, shuffle=True)

# define optimizer: SGD with Nesterov momentum, driven by a schedule whose
# decay and warm-up boundaries are given in epochs in the config and converted
# to step counts here
steps_per_epoch = cfg['dataset_len'] // cfg['batch_size']
learning_rate = MultiStepWarmUpLR(
    initial_learning_rate=cfg['init_lr'],
    lr_steps=[e * steps_per_epoch for e in cfg['lr_decay_epoch']],
    lr_rate=cfg['lr_rate'],
    warmup_steps=cfg['warmup_epoch'] * steps_per_epoch,
    min_lr=cfg['min_lr'])
optimizer = tf.keras.optimizers.SGD(
    learning_rate=learning_rate, momentum=0.9, nesterov=True)

# define losses function
multi_box_loss = MultiBoxLoss()

# load checkpoint: the checkpoint tracks the global step counter, the
# optimizer and the model; only the 3 most recent checkpoints are kept
checkpoint_dir = 'face_detection/checkpoints/' + cfg['sub_name']
checkpoint = tf.train.Checkpoint(step=tf.Variable(0, name='step'),
                                 optimizer=optimizer,
                                 model=model)
manager = tf.train.CheckpointManager(checkpoint=checkpoint,
                                     directory=checkpoint_dir,
                                     max_to_keep=3)
# resume from the latest checkpoint when one exists, otherwise start fresh
if manager.latest_checkpoint:
    checkpoint.restore(manager.latest_checkpoint)
    print('[*] load ckpt from {} at step {}.'.format(manager.latest_checkpoint, checkpoint.step.numpy()))
else:
    print("[*] training from scratch.")
    
# define training step function
def train_step(inputs, labels):
    """Run one optimization step on a batch and return its losses.

    Uses the notebook-level `model`, `multi_box_loss` and `optimizer`.

    Returns:
        (total_loss, losses): the summed loss and a dict with the individual
        terms under the keys 'reg', 'loc', 'landm' and 'class'.
    """
    with tf.GradientTape() as tape:
        predictions = model(inputs, training=True)

        # Regularization term collected from the model's own losses.
        reg_loss = tf.reduce_sum(model.losses)
        loc_loss, landm_loss, class_loss = multi_box_loss(labels, predictions)
        losses = {
            'reg': reg_loss,
            'loc': loc_loss,
            'landm': landm_loss,
            'class': class_loss,
        }
        total_loss = tf.add_n(list(losses.values()))

    trainable = model.trainable_variables
    grads = tape.gradient(total_loss, trainable)
    optimizer.apply_gradients(zip(grads, trainable))

    return total_loss, losses

summary_writer = tf.summary.create_file_writer('face_detection/logs/' + cfg['sub_name'])       # tf log
# total planned steps minus the steps already recorded in the checkpoint, never negative
remain_steps = max(steps_per_epoch * cfg['epoch'] - checkpoint.step.numpy(), 0)   # remaining steps from last training
# the bar is started at the position within the current epoch
prog_bar = ProgressBar(steps_per_epoch,checkpoint.step.numpy() % steps_per_epoch) # progress bar for visualization

# training loop
for inputs, labels in train_dataset.take(remain_steps):
    # the step counter lives in the checkpoint so it is saved and restored
    # together with the model and optimizer
    checkpoint.step.assign_add(1)
    steps = checkpoint.step.numpy()

    total_loss, losses = train_step(inputs, labels)

    # steps is 1-based here, hence (steps - 1) when deriving the epoch number
    prog_bar.update("epoch={}/{}, loss={:.4f}, lr={:.1e}".format(
        ((steps - 1) // steps_per_epoch) + 1, cfg['epoch'],
        total_loss.numpy(), optimizer.lr(steps).numpy()))

    # write loss and learning-rate summaries every 10 steps
    if steps % 10 == 0:
        with summary_writer.as_default():
            tf.summary.scalar('loss/total_loss', total_loss, step=steps)
            for k, l in losses.items():
                tf.summary.scalar('loss/{}'.format(k), l, step=steps)
            tf.summary.scalar('learning_rate', optimizer.lr(steps), step=steps)

    # make checkpoint
    if steps % cfg['save_steps'] == 0:
        manager.save()
        print("\n[*] save ckpt file at {}".format(manager.latest_checkpoint))

# final save once the loop has consumed all remaining steps
manager.save()
print("\n[*] training done! save ckpt file at {}".format(manager.latest_checkpoint))



Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5
Model: "RetinaFaceModel"
________________________________________________________________________________
Layer (type)              Output Shape      Param #  Connected to               
input_image (InputLayer)  [(None, 640, 640, 0                                   
________________________________________________________________________________
tf_op_layer_strided_slice [(None, 640, 640, 0        input_image[0][0]          
________________________________________________________________________________
tf_op_layer_BiasAdd (Tens [(None, 640, 640, 0        tf_op_layer_strided_slice[0
________________________________________________________________________________
ResNet50_extractor (Funct ((None, 80, 80, 5 23587712 tf_op_layer_BiasAdd[0][0]  
________________________________________________________________________________
FPN (FPN)             