In [3]:
import sys
sys.path.append("../../")

In [4]:
from facenet_pytorch import MTCNN

In [5]:
from torch.utils import data
from torch import optim
import torch
from torch import nn
from torch.optim import lr_scheduler

In [6]:
from glob import glob 
import json
import pandas
import typing
import os

In [7]:
from src.training.losses import losses
from src.training.metrics import metrics as eval_metrics
from src.training.trainers import regularization
from src.preprocessing import augmentations

In [6]:
TRAIN_DIR = "../experiments/experiment1/data/train_data"
TRAIN_ANNOTATIONS = "../experiments/experiment1/labels/train_labels"
VALIDATION_DIR = "../experiments/experiment1/data/validation_data"
VALIDATION_ANNOTATIONS = "../experiments/experiment1/labels/validation_labels"
DATA_CONFIG_DIR = "../experiments/experiment1/data/"

In [7]:
class XMLAnnotationParser(object):
    """
    Class for extracting annotations
    from .xml files.
    """
    def parse_annotations(self, input_path: str) -> pandas.DataFrame:
        pass

In [8]:
def load_xml_annotations(annotations_path: str) -> pandas.DataFrame:
    """
    Loads xml annotations from the specified 
    'annotations_path' directory folder.

    Parameters:
    -----------
        annotations_path - path, containing xml annotations for data
    """
    parser = XMLAnnotationParser()
    output_anns = None 
    
    for path in glob(pathname="**/*.%s" % file_ext, root_dir=annotations_path):
        ann_path = os.path.join(annotations_path, path)
        
        annotations = parser.parse_annotations(ann_path)
        if output_anns is None:
            output_anns = annotations
        else:
            output_anns = pandas.concat([output_anns, annotations], axis=0)
    return output_anns

In [9]:
def load_image_data(root_dir: str, file_extensions: typing.List):
    output_images = []
    
    for ext in file_extensions:
        found_images = glob(pathname="**/*.%s" % ext, root_dir=root_dir)
        output_images.extend(found_images)
        
    return output_images

# Loading annotations for training and validation data

In [9]:
train_annotations = load_xml_annotations(TRAIN_ANNOTATIONS)
validation_annotations = load_xml_annotations(VALIDATION_ANNOTATIONS)

# Loading image train and validation datasets

In [None]:
train_dataset = load_image_data(TRAIN_DATASET_DIR, ["jpeg", "png", "jpg"])
validation_dataset = load_image_data(VALIDATION_DATASET_DIR, ["jpeg", "png", "jpg"])

In [None]:
train_hashes = []
train_annotations.set_index("video_name")
sorted_train_annotations = train_annotations.reindex(train_hashes)

In [None]:
validation_hashes = []
validation_annotations.set_index("video_name")
sorted_validation_annotations = validation_annotations.reindex(validation_hashes)

# Augmentations

In [None]:
%%time

train_augmentations = albumentations.Compose(
    transforms=[
        albumentations.ImageCompression(compression_type=0, quality_upper=100, quality_lower=60),
        albumentations.GaussianNoise(p=0.05),
        albumentations.GaussianBlur(p=0.1),
        albumentations.RandomGamma(p=0.5),
        albumentations.OneOf([
            resize.IsoptropicResize(
                MTCNN_IMAGE_SIZE, 
                interpolation_up=cv2.INTER_LINEAR, 
                interpolation_down=cv2.INTER_NEAREST
            ),
            resize.IsotropicResize(
                MTCNN_IMAGE_SIZE, 
                interpolation_up=cv2.INTER_CUBIC, 
                interpolation_down=cv2.INTER_LINEAR
            ),
            resize.IsotropicResize(
                MTCNN_IMAGE_SIZE,
                interpolation_up=cv2.INTER_NEAREST,
                interpolation_down=cv2.INTER_LINEAR
            ),
        ]),
        albumentations.HorizontalFlip(p=0.5),
        albumentations.OneOf(
            transforms=[
                albumentations.RandomBrightnessContrast(
                    
                ),
                albumentations.FancyPCA(),
                albumentations.HueSaturationValue(),
            ]
        )
        albumentations.ShiftScaleRotate(
            rotation_limit=0,
            scale_limit=0.2,
            shift_limit=1,
            border_mode=cv2.BORDER_CONSTANT
        )
    ], bbox_params=albumentations.BboxParams(
        format='pascal_voc', 
        label_fields=["bbox_labels"] # when calling 'transforms()' 
        # you need to pass bounding box labels, under `bbox_labels`
    )
)

validation_augmentations = albumentations.Compose(
    transforms=[
        albumentations.OneOf([
            resize.IsoptropicResize(
                MTCNN_IMAGE_SIZE, 
                interpolation_up=cv2.INTER_LINEAR, 
                interpolation_down=cv2.INTER_NEAREST
            ),
            resize.IsotropicResize(
                MTCNN_IMAGE_SIZE, 
                interpolation_up=cv2.INTER_CUBIC, 
                interpolation_down=cv2.INTER_LINEAR
            ),
            resize.IsotropicResize(
                MTCNN_IMAGE_SIZE,
                interpolation_up=cv2.INTER_NEAREST,
                interpolation_down=cv2.INTER_LINEAR
            ),
        ]),
        albumentations.HorizontalFlip(p=0.5)
    ], bbox_params=albumentations.BboxParams(
        format='pascal_voc', 
        label_fields=['bbox_classes']
    )
)

# Initializing datasets

In [None]:
%%time

train_dataset = datasets.MTCNNFineTuneDataset(
    image_paths=train_dataset,
    boxes=sorted_train_annotations['boxes'].tolist(),
    transformations=train_augmentations
)

validation_dataset = datasets.MTCNNFineTuneDataset(
    image_paths=validation_dataset,
    boxes=sorted_validation_annotations['boxes'].tolist(),
    transformations=validation_augmentations
)

early_stop_dataset = datasets.MTCNNFineTuneDataset(
    image_paths=early_dataset,
    boxes=early_annotations['boxes'].tolist(),
    transformations=validation_augmentations
)

# Initializing configuration

In [14]:
max_epochs = 20
learning_rate = 1e-5
weight_decay = 0.02

if torch.cuda.is_available():
    device = torch.device("cpu")
else:
    device = torch.device("cpu")

# early stopping patience

early_patience = 5
min_diff = 0.05
early_start = 2
early_stopper = regularization.EarlyStopping(early_patience, min_diff)

# initialization data loaders

workers_per_loader = max(os.cpu_count()-1, 0) // 3 # for train, val and early loaders

In [22]:
# eval metric and loss function for fine-tuning
loss_function = losses.CIOULoss()
eval_metric = eval_metrics.IOUScore()

# network, optimizer and LR Scheduling techniques

network = MTCNN(
    min_face_size=160, 
    margin=10, 
    post_process=False, 
    thresholds=[0.8, 0.9, 0.95],
    device=device
)

optimizer = optim.Adam(
    network.parameters(), 
    lr=learning_rate, 
    weight_decay=weight_decay
)

scheduler = lr_scheduler.MultiStepLR(
    optimizer, 
    [5, 10, 15, 20], 
    gamma=0.6
)

In [13]:
%%time

train_loader = data.DataLoader(
    dataset=train_dataset,
    shuffle=True,
    num_workers=workers_per_loader,
    batch_size=train_batch_size
)

val_loader = data.DataLoader(
    batch_size=1,
    dataset=validation_dataset,
    shuffle=True,
    num_workers=workers_per_loader
)

early_loader = data.DataLoader(
    batch_size=1,
    dataset=early_dataset,
    shuffle=True,
    num_workers=workers_per_loader
)

NameError: name 'train_dataset' is not defined

In [17]:
def print_model_architecture(model):
    
    for name, module in model.named_modules():
        print("Module name: ", name)
        print("Module type: ", type(module))
        if hasattr(module, '_modules'):
            print("Submodules: ", module._modules)

def print_model_layers(model):
    for name, param in model.named_parameters():
        print('param name: %s' % name)
        print('param type: %s' % type(param))

In [15]:
print_model_architecture(MTCNN())

Module name:  
Module type:  <class 'facenet_pytorch.models.mtcnn.MTCNN'>
Submodules:  OrderedDict([('pnet', PNet(
  (conv1): Conv2d(3, 10, kernel_size=(3, 3), stride=(1, 1))
  (prelu1): PReLU(num_parameters=10)
  (pool1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=True)
  (conv2): Conv2d(10, 16, kernel_size=(3, 3), stride=(1, 1))
  (prelu2): PReLU(num_parameters=16)
  (conv3): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1))
  (prelu3): PReLU(num_parameters=32)
  (conv4_1): Conv2d(32, 2, kernel_size=(1, 1), stride=(1, 1))
  (softmax4_1): Softmax(dim=1)
  (conv4_2): Conv2d(32, 4, kernel_size=(1, 1), stride=(1, 1))
)), ('rnet', RNet(
  (conv1): Conv2d(3, 28, kernel_size=(3, 3), stride=(1, 1))
  (prelu1): PReLU(num_parameters=28)
  (pool1): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=True)
  (conv2): Conv2d(28, 48, kernel_size=(3, 3), stride=(1, 1))
  (prelu2): PReLU(num_parameters=48)
  (pool2): MaxPool2d(kernel_size=3, stride=2, padding

In [18]:
print_model_layers(MTCNN())

param name: pnet.conv1.weight
param type: <class 'torch.nn.parameter.Parameter'>
param name: pnet.conv1.bias
param type: <class 'torch.nn.parameter.Parameter'>
param name: pnet.prelu1.weight
param type: <class 'torch.nn.parameter.Parameter'>
param name: pnet.conv2.weight
param type: <class 'torch.nn.parameter.Parameter'>
param name: pnet.conv2.bias
param type: <class 'torch.nn.parameter.Parameter'>
param name: pnet.prelu2.weight
param type: <class 'torch.nn.parameter.Parameter'>
param name: pnet.conv3.weight
param type: <class 'torch.nn.parameter.Parameter'>
param name: pnet.conv3.bias
param type: <class 'torch.nn.parameter.Parameter'>
param name: pnet.prelu3.weight
param type: <class 'torch.nn.parameter.Parameter'>
param name: pnet.conv4_1.weight
param type: <class 'torch.nn.parameter.Parameter'>
param name: pnet.conv4_1.bias
param type: <class 'torch.nn.parameter.Parameter'>
param name: pnet.conv4_2.weight
param type: <class 'torch.nn.parameter.Parameter'>
param name: pnet.conv4_2.bi

In [23]:
%%time

class TunePipeline(object):
    """
    Training pipeline for tuning
    MTCNN face detector
    """
    def __init__(self,
        network: nn.Module,
        loss_function: nn.Module,
        eval_metric: nn.Module,
        max_epochs: int,
        batch_size: int,
        optimizer: nn.Module,
        lr_scheduler: nn.Module,
        snapshot_path: str,
        inf_device
    ):
        self.network = network.to(device)
        self.loss_function = loss_function
        self.eval_metric = eval_metric
        self.max_epochs: int = max_epochs
        self.batch_size: int = batch_size
        self.optimizer = optimizer
        self.lr_scheduler = lr_scheduler
        self.snapshot_path = snapshot_path
        self.inf_device = inf_device
        self.freeze_mtcnn_layers()

    def freeze_mtcnn_layers(self, layers: typing.Dict[str, int]):
        """
        Freezes layers of MTCNN face detector
        Parameters:
        ----------
            layers - (typing.Dict) - dict structure: {"block_name": "layer_name"}
            Example:
                layers = {"onet": "conv1", "rnet": "dense3"}
        """
        for name, param in self.network.named_parameters():
            if name[:4].lower() not in layers:
                param.requires_grad = True
            else:
                layer_name = name[4:].split(".")[0]
                if layer_name in layers.values():
                    param.requires_grad = False
                else:
                    param.requires_grad = True

    def save_checkpoint(self, epoch: int, loss: float):
        
        snapshot_full_path = os.path.join(
            self.snapshot_path, 
            "model_epoch_%s.pth" % epoch
        )
        
        snapshot = {
            'model_state': self.network.state_dict(),
            'optimizer_state': self.optimizer.state_dict(),
            'lr_scheduler_state': self.lr_scheduler.state_dict(),
        }
        torch.save(
            obj=snapshot, 
            f=snapshot_full_path
        )

    def train(self, loader: data.DataLoader):
        """
        Runs fine-tuning process
        of MTCNN detector.

        Parameters:
        -----------
        loader - data loader, containing train dataset

        Returns:
            - train_loss - (float) value, representing
            the best loss across all executed epochs
        """
        self.network.train()
        train_loss = float('inf')
        
        for epoch in range(self.max_epochs):
            
            epoch_loss = []
            for images, boxes in loader:
                
                predictions = self.network.detect_faces()
                pred_boxes, act_boxes = self.match_boxes(predictions, boxes)
                
                loss = self.loss_function(pred_boxes, act_boxes)
                loss.backward()
                
                epoch_loss.append(loss.item())
                
            train_loss = min(train_loss, numpy.mean(epoch_loss))
            self.optimizer.step()

            if self.lr_scheduler is not None:
                self.lr_scheduler.step()

            if epoch % self.save_eer
        return train_loss
        
    def evaluate(self, loader: data.DataLoader):
        
        eval_metric = 0
        with torch.no_grad():
            for image, boxes in loader:
                pass
        return eval_metric
    

CPU times: user 59 µs, sys: 20 µs, total: 79 µs
Wall time: 68.2 µs


In [None]:
%%time

trainer = TuningPipeline(
    network=network,
    loss_function=loss_function,
    eval_metric=eval_metric,
    max_epochs=max_epochs,
    early_stopper=early_stopper,
    early_dataset=early_dataset,
    device=device
)

In [None]:
train_loss = trainer.train(train_loader)

In [None]:
print('loss on fine tuning dataset: %s ' % train_loss)

In [None]:
eval_metric = trainer.evaluate(val_loader)

In [None]:
print('evaluation metric on validation dataset: %s ' % eval_metric)

# Explaining MTCNN predictions manually

In [None]:
fg, ax = plt.subplots(ncols=2, nrows=3)

for idx in range(6):
    
    img, _ = validation_dataset[idx]
    output_img = img.copy()
    boxes = trainer.predict(img)
    
    for box in range(len(boxes)):
        x1, y1, x2, y2 = box
        cv2.rectangle(img, (x1, y1), (x2, y2), color=(255,0,0), thickness=2)
    ax[idx, 0].imshow(img)
    ax[idx, 1].imshow(output_img)
plt.imshow()

# Measuring speed of the network

In [52]:
import time
import numpy 
import gc

def flush_cache():
    torch.cuda.empty_cache()
    _ = gc.collect()
    
def measure_face_detector_inference_time(
    detector: MTCNN,
    input_images: list,
    total_repetitions: int,
    warmup_iters: int,
    device: str
):
    """
    Measures the approximate
    inference time of the face detector
    on a given set of input images
    
    Parameters:
    -----------
    detector: MTCNN face detector
    input_images - list of numpy images
    total_repetitions - total number of times to repeat measure iteration
    warmup_iters - number of iterations for gpu warmup
    device - device, used for inference test (usually the one used in production env)
    """
    detector.device = torch.device(device)
    data = [torch.from_numpy(img) for img in input_images]
    
    for _ in range(warmup_iters):
        _, _ = detector.detect(data[0].unsqueeze(0).to(device))

    if device.startswith("cuda"):
        starter = torch.cuda.Event(enable_timing=True)
        ender = torch.cuda.Event(enable_timing=True)
        
    avg_times = []

    data = torch.stack(data).to(device)
    
    for _ in range(total_repetitions):
            
        flush_cache()
            
        if device.startswith("cuda"):
            starter.record()
            _, _ = detector.detect(data, landmarks=False)
            ender.record()
            torch.cuda.synchronize()
            total_time = ender.elapsed_time(starter) / len(input_images)
                
        elif device.lower() == "cpu":
            start_time = time.time()
            _, _ = detector.detect(data, landmarks=False)
            end_time = time.time()
            total_time = (end_time - start_time) / len(input_images)
                
        avg_times.append(total_time)
    return numpy.mean(avg_times)

In [53]:
from facenet_pytorch import MTCNN
import torch
import cv2

model = MTCNN(margin=0, min_face_size=100)
data = cv2.imread("../../test_input_3.jpeg", cv2.IMREAD_UNCHANGED)


In [54]:
print(model.detect(data))

(array([[837.1934204101562, 109.53155517578125, 933.6088256835938,
        229.70932006835938],
       [386.7955017089844, 125.48906707763672, 467.9786376953125,
        246.12094116210938],
       [655.2368774414062, 116.82803344726562, 736.0848999023438,
        228.511962890625]], dtype=object), array([0.9985974431037903, 0.9821616411209106, 0.9999810457229614],
      dtype=object))


In [55]:
measure_face_detector_inference_time(model, [data], 100, 10, "cpu")

0.013583757877349854