In [15]:
import pickle
X, y = pickle.load(open("input/dataset.pkl", "rb"))

In [16]:
import numpy as np

It contains images of people smiling `y=1` or not `y=0`.

You will create a classifier based on those features and labels using three methods:
- A PCA that retains 99% of the information followed by a random forest
- A CNN
- A facial landmark extraction followed by a random forest

For each case, you may have to handle the data a bit differently. In each case, compute the accuracy and compare the final results: what method is the most accurate? What method gives the best results for a limited development time?

In [17]:
print(X.shape)
print(y.shape)

size_x = X.shape[1]
size_y = X.shape[2]

(800, 350, 350)
(800,)


In [18]:
# split train test
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [19]:
# Make the data preparation
# flatten arrays
X_flat_train = X_train.reshape(X_train.shape[0],-1)
X_flat_test  = X_test.reshape(X_test.shape[0],-1)

## SCALE THE FEATURES (pixels grey levels)
# numeric features scaling
from sklearn.preprocessing import MinMaxScaler

X_flat_train.shape

scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_flat_train)
X_test  = scaler.transform(X_flat_test)

## PCA 90% + RF

In [20]:
from sklearn.decomposition import PCA

pca = PCA(n_components=0.9)
X_train_PCA = pca.fit_transform(X_train)
X_test_PCA = pca.transform(X_test)

In [21]:
# RF apply to PCA
print(np.unique(y))

[0. 1.]


In [22]:
X_train_PCA.shape ## il reste 98 features out of 350*350 initially

(640, 98)

In [23]:
## 1er hyperparm : max_depth !!! pour la régularization

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, accuracy_score

clf = RandomForestClassifier(max_depth=2, random_state=0)
clf.fit(X_train_PCA, y_train)

y_pred_train = clf.predict(X_train_PCA)
y_pred_test = clf.predict(X_test_PCA)

f1score = f1_score(y_test, y_pred_test, average=None)
print(f'f1score_test: {f1score}')

acc_score_train = accuracy_score(y_train, y_pred_train)
print(f'acc_score_train: {acc_score_train}')

acc_score_test = accuracy_score(y_test, y_pred_test)
print(f'acc_score_test: {acc_score_test}')

f1score_test: [0.82894737 0.8452381 ]
acc_score_train: 0.8875
acc_score_test: 0.8375


## CNN

In [24]:
## ne pas flatten les images
X_train = X_train.reshape(X_train.shape[0], size_x, size_y)
X_test  = X_test.reshape(X_test.shape[0], size_x, size_y)

X_train.shape

(640, 350, 350)

In [25]:
### tester le CNN avec du dropout : désactiver le % indiqué des couches

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import MaxPooling2D, Conv2D, Flatten, Dense


def lenet5():
    model = Sequential()

    # Layer C1
    model.add(Conv2D(filters=6, kernel_size=(3, 3), activation='relu', input_shape=(size_x,size_y,1)))
    # Layer S2
    model.add(MaxPooling2D(pool_size=(2, 2)))
    # Layer C3
    model.add(Conv2D(filters=16, kernel_size=(3, 3), activation='relu'))
    # Layer S4
    model.add(MaxPooling2D(pool_size=(2, 2)))
    # Before going into layer C5, we flatten our units
    model.add(Flatten())
    # Layer C5
    model.add(Dense(units=120, activation='relu'))
    # Layer F6
    model.add(Dense(units=84, activation='relu'))
    # Output layer
    model.add(Dense(units=1, activation = 'sigmoid'))
    
    return model

In [26]:
## CNN as seen on June,5th in 03-CNN
## bien vérfier que les images sont scalées

from tensorflow.keras.callbacks import EarlyStopping, TensorBoard

# Instantiate the model
model = lenet5()

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Define the callbacks
callbacks = [EarlyStopping(monitor='val_loss', patience=5),
            TensorBoard(log_dir='./Graph', histogram_freq=0, write_graph=True, write_images=True)]


# Finally fit the model
model.fit(x=X_train, y=y_train, validation_data=(X_test, y_test), epochs=10, batch_size=64, callbacks=callbacks)

### optimum biais-variance à epoch 6: acc_train: 0.90 et acc_test: 0.90
### regarder les courbes acc_train et acc_test dans TensorBoard
##Epoch 6/10
##10/10 [==============================] - 4s 452ms/step - loss: 0.2686 - accuracy: 0.9031 - val_loss: 0.2561 - val_accuracy: 0.9062


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f5c00621210>

In [27]:
# Compute the accuracy
print('accuracy on train with CNN:', model.evaluate(X_train, y_train, verbose=0)[1])
print('accuracy on test with CNN:', model.evaluate(X_test, y_test, verbose=0)[1])

accuracy on train with CNN: 0.9984375238418579
accuracy on test with CNN: 0.9375


In [None]:
## voir avec le CCN imagenet + Dense layers vu en 03-CNN

### A facial landmark extraction followed by a random forest

je n'ai que 1h30 à consacrer aux challenges du J.06/06
voir avec le formateur pour la mise à jour de CMake si besoin dans la suite

In [34]:
! pip install dlib

Collecting dlib
  Using cached dlib-19.24.4.tar.gz (3.3 MB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hBuilding wheels for collected packages: dlib
  Building wheel for dlib (pyproject.toml) ... [?25lerror
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mBuilding wheel for dlib [0m[1;32m([0m[32mpyproject.toml[0m[1;32m)[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m [31m[10 lines of output][0m
  [31m   [0m running bdist_wheel
  [31m   [0m running build
  [31m   [0m running build_ext
  [31m   [0m Traceback (most recent call last):
  [31m   [0m   File "/home/michaelfaivre/.pyenv/versions/3.10.12/envs/artefact/bin/cmake", line 5, in <module>
  [31m   [0m     from cmake import cmake
  [31m   [0m ModuleNotFoundError: No module named 'cmake'
  [31m   [0m 
  [31m   [0m ERROR: CM

In [35]:
### je suis bloqué dans l'installation de la dépence dlib à cause de:
### ModuleNotFoundError: No module named 'cmake'
### ERROR: CMake must be installed to build dlib

##  X_train = X_train.reshape(*X_train.shape, 1)  en partant d'un X_train avec une shape (xxx, 350, 350)
## Rappel : face_recognition est une image embedding


import face_recognition

landmarks_train = face_recognition.face_landmarks(X_train)

landmarks_train[0]

## construire un vecteur de caractéristiques à partir des landmarks
## les landmarks sont numérotées
## matrice de caractéristiques
## 

ModuleNotFoundError: No module named 'face_recognition'