# Face Detection and Recognition CNN Models

In [33]:
import numpy as np
import torch
from torch import nn
import scipy.io
from PIL import Image
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import torch.optim as optim
import math

# mirar diferencias entre estos dos y elegir uno
from torch.utils.data import random_split

In [34]:
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using {device} device")

Using cpu device


In [35]:
# Hyperparameters
lr = 0.001
epoch_step = 20
batch_size = 32 # o 16
images_path = 'TRAINING/'

## Loading the dataset

In [82]:
class MyData(Dataset):
    def __init__(self, data, transform=None):
        self.data = data
        self.transform = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        image_name, label = self.data[idx]
        with Image.open(images_path + image_name) as image:
            # Apply transformations if specified
            if self.transform:
                image = image.convert('RGB')
                # image = np.float32(image)
                image = self.transform(image)
        return image, label

In [83]:
def load_data(labels_path, labels_wanted='boundaries', tr_size=0.8, val_size=0.1):
    mat = scipy.io.loadmat(labels_path)['AGC_Challenge3_TRAINING'][0]
    data = []
    for entry in mat:
        key = entry[1][0]
        if (labels_wanted == 'boundaries'):
            data.append([key, entry[2]])
        elif (labels_wanted == 'identity'):
            data.append([key, entry[0][0][0]])

    return data

In [84]:
# mean and std extracted from the train_dataset part of AGC_Challenge3_TRAINING
def compute_mean_std(dataset):
    all_pixels = []
    for image_path, _ in dataset:
        with Image.open(images_path + image_path) as image:
            image_array = np.array(image)
            all_pixels.append(image_array)

    all_images = np.stack(all_pixels, axis=0)

    mean = np.mean(all_images, axis=(0, 1, 2)) / 255.0 
    std_dev = np.std(all_images, axis=(0, 1, 2)) / 255.0

    return mean, std_dev

In [85]:
# transforms son copiados, revisar
# calcular valores de mean y std del dataset para cada channel
# o las de imagenet
# flipping, cambios de colores, grayscale
# contraste de saturacion 
# color spaces
################

all_data = load_data('AGC_Challenge3_Training.mat')
train_data, val_data, test_data = random_split(all_data, [0.8, 0.1, 0.1], generator=torch.Generator().manual_seed(42))

# train_mean, train_std = compute_mean_std(train_data)

transform = transforms.Compose([
    transforms.RandomResizedCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(20),
    transforms.ToTensor(),
    # transforms.Normalize(mean=train_mean, std=train_std)
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

train_dataset = MyData(train_data, transform)
val_dataset = MyData(val_data, transform)
test_dataset = MyData(test_data, transform)

batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

## The detection model

In [86]:
class DetectionCNNModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        self.cnn_layers = nn.Sequential(
            nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(16),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),

            nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),

            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2)
        )

    def forward(self, data):
        x = self.cnn_layers(data)
        # x = self.flatten(x) # !!! antes del linear layer 
        return x

    def fit(self, training_data, loss_fn, optimizer: optim.Optimizer): # cross entropy con softmax + adam
        self.train()
        for batch_idx, (batch_data, target) in enumerate(training_data):
            optimizer.zero_grad()  # Clear the gradients
            output = self.forward(batch_data)
            loss = loss_fn(output, target)  # Compute the loss
            loss.backward()  # Backpropagation
            optimizer.step()  # Update the model's parameters

            if batch_idx % epoch_step == 0:
                print(f'Epoch {batch_idx} has loss {loss.item()}')

    def evaluate(self, validation_data, loss_fn):
        self.eval()
        total_loss = 0.0
        correct_predictions = 0
        total_predictions = 0

        for images, labels in validation_data:
            images = images.to(device)
            labels = labels.to(device)
            
            output = self.forward(images)
            loss = loss_fn(output, labels)
            total_loss += loss.item()

            # Compute accuracy
            _, predicted = torch.max(output, 1)
            total_predictions += labels.size(0)
            correct_predictions += (predicted == labels).sum().item()

        average_loss = total_loss / len(validation_data)
        accuracy = correct_predictions / total_predictions

        return average_loss, accuracy

    def predict(self, test_image):
        self.eval()
        with torch.inference_mode(mode=True):
            return self.forward(test_image)

In [87]:
detection = DetectionCNNModel().to(device)
print(detection)

DetectionCNNModel(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (cnn_layers): Sequential(
    (0): Conv2d(3, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (4): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (5): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): ReLU(inplace=True)
    (7): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (8): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (10): ReLU(inplace=True)
    (11): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
)


In [88]:
pytorch_total_params = sum(p.numel() for p in detection.parameters())
print(pytorch_total_params)

23808


In [89]:
# train loop as train and then test
loss = nn.CrossEntropyLoss()
optimizer = optim.Adam(detection.parameters(), lr=lr)

detection.fit(train_loader, loss, optimizer)

TypeError: can't convert np.ndarray of type numpy.uint16. The only supported types are: float64, float32, float16, complex64, complex128, int64, int32, int16, int8, uint8, and bool.

In [None]:
torch.save(detection, 'detection_model.pth') # not model.state_dict() because we want to store the class also

## The recognition model

In [None]:
class RecognitionCNNModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()

# 2 linear layers
# va a sacar un vector y hacer argmax
# multimodal ??

In [None]:
model = RecognitionCNNModel().to(device)
print(model)

In [None]:
torch.save(model, 'recognition_model.pth') # not model.state_dict() because we want to store the class also