# Face Detection Challenge

In [1]:
import numpy as np 
import pandas as pd

### Objectives
-  visualize images
-  preprocess images for Neural Networks
-  fit a custom CNN for a regression task
-  fine-tune and analyse model performance

## 1. Visualize images

👉 Load the dataset (200 Mo)

In [None]:
# Read the face dataset straight from its public URL into a DataFrame,
# then display it (pandas truncates the rendering of large frames).
data = pd.read_csv("https://wagon-public-datasets.s3.amazonaws.com/certification_paris_2021Q1/images.csv")
data

Each row represents the image of a face in black and white
- `age` is the age in years
- `pixels` contains the 2304 (= 48 * 48) flattened pixel values of the image, in grayscale from 0 to 255, stored as a single space-separated string

❓ Plot the histogram of age in your dataset

In [None]:
# Distribution of the target variable
data["age"].hist()

❓ Visually display one image of your choice.

In [None]:
#Function to transform the string into a list of integers
import matplotlib.pyplot as plt

def get_img(img):
    """Parse a pixel string into a flat list of integers.

    Parameters
    ----------
    img : str
        Pixel intensities written as whitespace-separated integers
        (e.g. ``"12 0 255"``).

    Returns
    -------
    list of int
        One integer per pixel, in the order they appear in the string.

    Raises
    ------
    ValueError
        If a token cannot be converted to an integer.
    """
    # `split()` without an argument tolerates leading/trailing whitespace and
    # repeated separators, which `split(" ")` would turn into empty tokens
    # that crash `int()`.
    return [int(token) for token in img.split()]

In [None]:
#Plotting a few faces from the dataframe, each titled with its age
for idx in (0, 5000, 10000, 15000, 20000):
    age = data.age[idx]
    face = np.reshape(get_img(data.pixels[idx]), (48, 48))

    plt.figure(figsize=(2, 2))
    plt.imshow(face, cmap='gray')
    plt.title(age)
    plt.show()

❓ What is the physical characteristic of the image at **index** `10000`?

In [None]:
#The face at index 10000, titled with its age
age_10000 = data.age[10000]
face_10000 = np.reshape(get_img(data.pixels[10000]), (48, 48))

plt.figure(figsize=(2, 2))
plt.imshow(face_10000, cmap='gray')
plt.title(age_10000)
plt.show()

In [None]:
# Store your result in the variable below
# (answers describe the image at index 10000 displayed above)

# Apparent gender of the person in the image
gender = "male"
# gender = "female"

# Whether the person in the image is smiling
# smile = True
smile = False

In [None]:
from nbresult import ChallengeResult

# Hand the two answers chosen above to nbresult under the 'C14' key
answers = dict(gender=gender, smile=smile)
result = ChallengeResult('C14', **answers)
result.write()

## 2. Processing

❓ Your goal is to train a convolutional neural network model to predict the age of a person based solely on a picture of his/her face.
- Create your feature matrix `X` as a NumPy array with a shape suited for a CNN, scaled between 0 and 1
- Create your target vector `y`
- Create a holdout set (`X_train`, `y_train`) (`X_test`, `y_test`) keeping `30%` in the test set, randomly sampled out of the whole dataset

😌 Don't worry, you will be given the solution in the next question in case you don't make it

In [None]:
#Function to transform string into list of scaled numbers and then to reshape the image
def get_img(img):
    """Parse a pixel string into a flat list of floats scaled by 1/255.

    Parameters
    ----------
    img : str
        Pixel intensities written as whitespace-separated integers.

    Returns
    -------
    list of float
        Each integer divided by 255, in the order read from the string.

    Raises
    ------
    ValueError
        If a token cannot be converted to an integer.
    """
    # `split()` without an argument ignores leading/trailing whitespace and
    # repeated separators, which `split(" ")` would turn into empty tokens
    # that crash `int()`.
    return [int(token) / 255. for token in img.split()]


def transform_pix(img):
    """Turn a pixel string into a (48, 48, 1) array of scaled intensities.

    Parameters
    ----------
    img : str
        Whitespace-separated pixel string holding exactly 48 * 48 values.

    Returns
    -------
    numpy.ndarray
        Array of shape (48, 48, 1); the trailing axis is the single channel.

    Raises
    ------
    ValueError
        If the string does not contain exactly 2304 integer tokens.
    """
    return np.reshape(get_img(img), (48, 48, 1))

In [None]:
#Creation of X
# Convert every pixel string and stack the results into one array
X = np.asarray([transform_pix(pixel_string) for pixel_string in data.pixels])

In [None]:
# Dimensions of the feature array
X.shape

In [None]:
#Creation of y
# The target is the age column
y = data["age"]

In [None]:
# Dimensions of the target vector
y.shape

In [None]:
#Train test split
from sklearn.model_selection import train_test_split

# Keep 30% of the samples, drawn at random, as the test set.
# The fixed seed makes the split (and every metric computed from it)
# identical after a kernel restart; without it each run samples differently.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [None]:
from nbresult import ChallengeResult

# Hand the training-set shapes and the first training image to nbresult
# under the 'C1415' key
result = ChallengeResult(
    'C1415',
    X_train_shape=X_train.shape,
    y_train_shape=y_train.shape,
    first_image=X_train[0],
)
result.write()

## Convolutional Neural Network for Age prediction

❓ Build a convolutional neural network

- Do not use transfer learning
- Store number of trainable parameters in a variable `params_number`
- Plot your metric & loss at each epoch
- make sure your model does not overfit with appropriate control techniques
- Compute the mean absolute error on your test set and store it as `mae_test`
- Compute, store and compare it with a `mae_baseline`

💡 You will not be judged by the computing power of your computer, but you should obtain significantly better performance than the baseline in less than 3 minutes, even without GPUs

👉 Feel free to start back from the solution by running the cell below (~500Mo download)

In [None]:
import pickle

# The four pickled arrays to fetch, listed in the order they are unpacked below.
file_names = ["images_X_train.pickle", "images_X_test.pickle", "images_y_train.pickle", "images_y_test.pickle"]
pickles = dict()

for file_name in file_names:
    url = f"https://wagon-public-datasets.s3.amazonaws.com/certification_paris_2021Q1/{file_name}"
    # `!` is the IPython shell escape: the download is done by the `curl`
    # executable, so this cell only works inside IPython/Jupyter with curl available.
    command = f"curl --output {file_name} {url}"
    ! eval {command}
    # NOTE(security): unpickling can execute arbitrary code. This is only
    # reasonable if the bucket above is trusted; do not reuse this pattern
    # with files from an unknown source.
    with open(file_name, 'rb') as handle:
        pickles[file_name] = pickle.load(handle)
    # Delete the downloaded file now that its content is held in memory.
    command = f"rm {file_name}"
    ! eval {command}
        
# dicts keep insertion order, so values() comes out in the same order as
# `file_names`. This rebinds X_train, X_test, y_train and y_test.
X_train, X_test, y_train, y_test = pickles.values()

In [None]:
#Baseline model => always predict the mean age of the training set
baseline_prediction = y_train.mean()
absolute_errors = abs(y_test - baseline_prediction)
mae_baseline = absolute_errors.mean()
mae_baseline

In [None]:
#Definition of the CNN model
from tensorflow.keras import Sequential, layers
def initialize_model():
    """Build the (uncompiled) CNN used in this notebook.

    Layers, in order: a 3x3 convolution with 32 filters (ReLU, 'same'
    padding) on 48x48x1 inputs, 2x2 max-pooling, flatten, a 15-unit ReLU
    dense layer, 50% dropout, and a single linear output unit.

    Returns
    -------
    Sequential
        A freshly initialised model; call `compile` before fitting.
    """
    convolutional_part = [
        layers.Conv2D(32, (3,3), input_shape=(48, 48, 1), activation='relu', padding='same'),
        layers.MaxPool2D(pool_size=(2,2)),
    ]
    dense_part = [
        layers.Flatten(),
        layers.Dense(15, activation='relu'),
        layers.Dropout(rate=0.5),
        layers.Dense(1, activation='linear'),
    ]
    return Sequential(convolutional_part + dense_part)

In [None]:
# Build one instance of the network and print its layer-by-layer summary
model = initialize_model()
model.summary()

In [None]:
# Count the trainable parameters from the model itself instead of copying the
# number from the summary by hand, so the value cannot go stale if the
# architecture in `initialize_model` changes.
params_number = sum(int(np.prod(tuple(w.shape))) for w in model.trainable_weights)

In [None]:
#Compilation function for the model
def compile_model(model):
    """Compile `model` and return it.

    The model is compiled with the Adam optimizer, mean squared error as
    the loss, and mean absolute error as the tracked metric.

    Parameters
    ----------
    model : object exposing a Keras-style ``compile`` method

    Returns
    -------
    The same `model` object that was passed in.
    """
    training_setup = {
        'optimizer': 'adam',
        'loss': 'mse',
        'metrics': ['mae'],
    }
    model.compile(**training_setup)
    return model

In [None]:
#Init, compile and fit the model
from tensorflow.keras import callbacks

# Stop once the monitored quantity has not improved for 5 epochs and roll the
# weights back to the best epoch seen.
es = callbacks.EarlyStopping(patience=5, restore_best_weights=True)

model = compile_model(initialize_model())

# 30% of the training data is set aside for validation during the fit
history = model.fit(
    X_train,
    y_train,
    validation_split=0.3,
    epochs=100,
    batch_size=32,
    callbacks=[es],
    verbose=1,
)

In [None]:
#The history of the model
def plot_history(history, title='', axs=None, exp_name=""):
    """Plot training/validation loss and MAE curves side by side.

    Parameters
    ----------
    history : object with a ``history`` dict
        Must provide the keys 'loss', 'val_loss', 'mae' and 'val_mae'.
    title : str
        Accepted but not used by this function.
    axs : pair of axes, optional
        Axes to draw on (loss first, MAE second). When omitted, a new
        1x2 figure is created.
    exp_name : str
        Optional experiment name appended to the legend labels; a leading
        underscore is added when it does not already have one.

    Returns
    -------
    tuple
        ``(ax1, ax2)``, the loss and MAE axes.
    """
    if axs is None:
        _, axs = plt.subplots(1, 2, figsize=(12, 4))
    ax1, ax2 = axs

    # Normalise the legend suffix: empty stays empty, otherwise ensure a leading '_'
    if not exp_name or exp_name[0] == '_':
        suffix = exp_name
    else:
        suffix = '_' + exp_name

    # (axis, panel title, train key, validation key, train label, validation label, y-axis upper bound)
    panels = (
        (ax1, 'Loss', 'loss', 'val_loss', 'train', 'val', 600.),
        (ax2, 'MAE', 'mae', 'val_mae', 'train mae', 'val mae', 20.),
    )
    for ax, heading, train_key, val_key, train_label, val_label, y_top in panels:
        ax.plot(history.history[train_key], label=train_label + suffix)
        ax.plot(history.history[val_key], label=val_label + suffix)
        ax.set_ylim(0., y_top)
        ax.set_title(heading)
        ax.legend()

    return (ax1, ax2)

# Draw the loss and MAE curves recorded in `history`
plot_history(history)

In [None]:
# Evaluate the fitted model on the test set (displays the returned values)
model.evaluate(X_test, y_test)

In [None]:
#How the model performs on the test set: keep the value at index 1 of what
#`evaluate` returns, stored as the test MAE
mae_test = model.evaluate(X_test, y_test, verbose=0)[1]

In [None]:
#Sanity check: the model's test MAE must be lower than the baseline MAE
assert mae_test < mae_baseline

In [None]:
from nbresult import ChallengeResult

# Hand the parameter count and both MAE values to nbresult under the 'C1516' key
result = ChallengeResult(
    'C1516',
    params_number=params_number,
    mae_baseline=mae_baseline,
    mae_test=mae_test,
)
result.write()

In [None]:
#What do the predictions look like for the previous images ?
sample_indices = [0, 5000, 10000, 15000, 20000]

# One batched forward pass instead of one `predict` call per image:
# the output has one row per image and a single column.
sample_preds = model.predict(X[sample_indices])[:, 0]

for i, raw_pred in zip(sample_indices, sample_preds):
    # Convert to a plain Python int so the title always reads e.g. "25 => 27"
    pred = int(round(float(raw_pred)))
    label = f'{data.age[i]} => {pred}'
    # X[i] already holds the parsed image; just drop the channel axis
    # instead of re-parsing the pixel string.
    img = np.reshape(X[i], (48, 48))

    plt.figure(figsize=(2,2))
    plt.imshow(img, cmap='gray')
    plt.title(label)
    plt.show()