<a href="https://www.kaggle.com/code/giovanniimbesi/progettoml-real-and-fake-faces-classification?scriptVersionId=164878936" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Real and Fake Faces Classification

In questo notebook viene presentato un modello di machine learning per la Real and Fake Faces Classification. Il notebook prende spunto da una challenge proposta un paio di anni fa e ha come obiettivo lo sviluppo di un modello in grado di discriminare tra immagini di volti reali e immagini create artificialmente.

**Di default il training è disabilitato. Per testare il modello occorre:**
1. Importare le librerie
2. Inizializzare l'oggetto config
3. Eseguire le celle nella sezione Live Test



# Librerie

Di seguito vengono riportate le librerie utilizzate nel notebook.

In [None]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.metrics import classification_report
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.utils import plot_model
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
import numpy as np
from tabulate import tabulate
import os
import glob
import json
from sklearn import metrics
import shutil

# Config

La classe Config è stata creata per memorizzare le configurazioni utilizzate nel modello. Ogni attributo della classe rappresenta una specifica configurazione utilizzata durante l'addestramento.

In [None]:
class Config:
    """Plain settings holder shared by the notebook cells.

    Every value is assigned as an instance attribute on construction:
    image size, number of epochs, random seed, batch size, dataset root,
    checkpoint file name and log directory.
    """

    def __init__(self):
        # Image size, in pixels.
        self.image_width, self.image_height = 128, 128
        # Number of epochs, random seed and batch size.
        self.epoch, self.seed, self.batch_size = 15, 42, 64
        # Filesystem locations (dataset root, checkpoint file, log directory).
        self.dataset_path = (
            '/kaggle/input/140k-real-and-fake-faces/real_vs_fake/real-vs-fake/'
        )
        self.checkpoint_filepath = 'model_checkpoint.h5'
        self.logs_path = '/kaggle/working/logs'


In [None]:
# Single settings object read by the cells below.
config = Config()

In [None]:
# Start from an empty log directory: drop whatever is already there.
logs_dir = config.logs_path
if os.path.exists(logs_dir):
    shutil.rmtree(logs_dir)

# Dataset

Inizialmente il dataset viene caricato in un dataframe al fine di poterlo analizzare.

In [None]:
# Index every JPEG found under <dataset_path>/<where>/<status>/ into a
# DataFrame with one row per image: its path, the name of its class folder
# (img_status) and the name of its split folder (where).
dataset = {"image_path": [], "img_status": [], "where": []}
for where in os.listdir(config.dataset_path):
    split_dir = os.path.join(config.dataset_path, where)
    if not os.path.isdir(split_dir):
        # A stray file at the dataset root would make os.listdir() raise.
        continue
    for status in os.listdir(split_dir):
        # os.path.join builds the same paths regardless of whether
        # dataset_path ends with a separator.
        for image in glob.glob(os.path.join(split_dir, status, "*.jpg")):
            dataset["image_path"].append(image)
            dataset["img_status"].append(status)
            dataset["where"].append(where)
dataset = pd.DataFrame(dataset)

In [None]:
# Preview the first rows of the image index.
dataset.head()

In [None]:
# Bar chart with the number of indexed images per class.
status_counts = dataset["img_status"].value_counts()
fig, ax = plt.subplots(figsize=(8, 5))
ax.bar(status_counts.index, status_counts)
ax.set_xlabel("Class")
ax.set_ylabel("Count")
ax.set_title("Distribution of Image Classes")
plt.show()

In [None]:
# Show four randomly sampled fake faces side by side.
fake_images = dataset[dataset['img_status'] == 'fake'].sample(4)

# One row is enough for four images; a 2x4 grid would leave half of the
# figure empty.
fig, axes = plt.subplots(1, 4, figsize=(20, 5))
for ax, image_path in zip(axes, fake_images["image_path"].values):
    ax.imshow(plt.imread(image_path))
    ax.set_title('Fake', size=10)
    ax.set_xticks([])
    ax.set_yticks([])

plt.show()

In [None]:
# Show four randomly sampled real faces side by side.
real_images = dataset[dataset['img_status'] == 'real'].sample(4)

# This figure is independent from any previous one, so the images go in a
# single row starting at the first slot (slots 5-8 of a 2x4 grid would leave
# the whole top row blank).
fig, axes = plt.subplots(1, 4, figsize=(20, 5))
for ax, image_path in zip(axes, real_images["image_path"].values):
    ax.imshow(plt.imread(image_path))
    ax.set_title('Real', size=10)
    ax.set_xticks([])
    ax.set_yticks([])

plt.show()


# Data Augmentation

Prima di procedere con il training del modello, i dati vengono sottoposti ad augmentation al fine di aumentarne la variabilità. Questo permette di ottenere un modello più robusto e con una migliore capacità di generalizzazione.

In [None]:
# Generator options for the training images: horizontal flips enabled,
# vertical flips disabled, pixel values multiplied by 1/255.
augmentation_options = dict(
    rescale=1./255,
    horizontal_flip=True,
    vertical_flip=False,
)
image_gen = ImageDataGenerator(**augmentation_options)

In [None]:
# Shuffled, augmented training batches read from <dataset_path>/train/.
train_generator = image_gen.flow_from_directory(
    config.dataset_path + 'train/',
    # Keras expects target_size as (height, width).
    target_size=(config.image_height, config.image_width),
    # Same value the trainer uses to derive the number of steps per epoch.
    batch_size=config.batch_size,
    class_mode='binary',
    shuffle=True,
    seed=config.seed,
)

In [None]:
# Validation batches read from <dataset_path>/valid/: only rescaled, never
# augmented, and kept in directory order.
valid_generator = ImageDataGenerator(rescale=1./255).flow_from_directory(
    config.dataset_path + 'valid/',
    # Keras expects target_size as (height, width).
    target_size=(config.image_height, config.image_width),
    # Same value the trainer uses to derive the number of validation steps.
    batch_size=config.batch_size,
    class_mode='binary',
    shuffle=False,
)

In [None]:
# Test batches read from <dataset_path>/test/: only rescaled and kept in
# directory order, so predictions line up with `test_generator.classes`.
test_generator = ImageDataGenerator(rescale=1./255).flow_from_directory(
    config.dataset_path + 'test/',
    # Keras expects target_size as (height, width).
    target_size=(config.image_height, config.image_width),
    # Batched inference: with shuffle=False the prediction order is the same
    # as with batch_size=1, without running one predict step per image.
    batch_size=config.batch_size,
    shuffle=False,
    class_mode='binary',
)

Indicheremo con la classe 0 tutte quelle immagini classificate come fake, viceversa con la classe 1 indicheremo quelle immagini che rappresentano volti reali. 

In [None]:
def show_training_images(generator, n_images=4):
    """Display the first images of the generator's next batch with their class.

    Args:
        generator: Iterator yielding ``(images, labels)`` batches, where a
            label of 0 means fake and 1 means real.
        n_images: Maximum number of images to draw. Fewer are drawn when the
            batch holds fewer images.
    """
    # The built-in next() works on any iterator, not only on objects exposing
    # a .next() method.
    images, labels = next(generator)
    label_names = ('Fake', 'Real')

    # squeeze=False keeps `axes` an array even when n_images == 1.
    fig, axes = plt.subplots(1, n_images, figsize=(15, 15), squeeze=False)
    axes = axes.ravel()

    # Hide every frame up front so unused slots stay blank.
    for ax in axes:
        ax.axis('off')

    # zip() stops at the shortest input, so a short batch cannot raise
    # IndexError.
    for ax, image, label in zip(axes, images, labels.astype(int)):
        ax.imshow(image)
        ax.set_title(label_names[label])

    plt.tight_layout()
    plt.show()


In [None]:
# Visual sanity check on one batch of training images.
show_training_images(train_generator)

# Modello

Il modello prevede l'utilizzo di una ResNet50, usata come fixed feature extractor. Successivamente sono stati aggiunti dei layer finali per la classificazione binaria.

In [None]:
# ResNet50 backbone with ImageNet weights and without its classification head.
# NOTE(review): the accompanying text calls this a fixed feature extractor,
# but the backbone is never frozen here (no `trainable = False`) — confirm
# which of the two is intended.
resNet = ResNet50(
    weights='imagenet',
    include_top=False,
    # (height, width, channels), read from config so it cannot drift from the
    # size the generators resize the images to.
    input_shape=(config.image_height, config.image_width, 3),
)

In [None]:
# Binary classifier: ResNet backbone, global average pooling, one hidden
# dense layer with batch normalization, and a single sigmoid output unit.
model = Sequential()
model.add(resNet)
model.add(layers.GlobalAveragePooling2D())
model.add(layers.Dense(512, activation='relu'))
model.add(layers.BatchNormalization())
model.add(layers.Dense(1, activation='sigmoid'))

In [None]:
# Draw the layer graph, annotated with tensor shapes.
plot_model(model,show_shapes=True)

In [None]:
# Print the layer-by-layer summary of the model.
model.summary()

In [None]:
# Binary cross-entropy matches the single sigmoid output unit.
# `learning_rate` is the supported argument name; `lr` is a deprecated alias.
model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(learning_rate=0.001),
    metrics=['accuracy'],
)

## Callbacks

Di seguito vengono inizializzate le callback utilizzate durante il training: salvataggio dei pesi migliori, riduzione del learning rate quando la validation loss smette di migliorare, early stopping e logging per TensorBoard.

In [None]:
# Write the weights to disk every time the validation loss improves.
checkpoint_callback = ModelCheckpoint(config.checkpoint_filepath,
                                      monitor='val_loss',
                                      save_weights_only=True,
                                      save_best_only=True)

# Multiply the learning rate by 0.2 after 3 epochs without a val_loss
# improvement of at least 0.0001.
reduce_lr = ReduceLROnPlateau(monitor='val_loss',
                              factor=0.2,
                              patience=3,
                              verbose=1,
                              min_delta=0.0001
                             )

# Stop after 5 epochs without val_loss improvement; the longer patience gives
# the learning-rate reduction above a chance to act first.
early_stopping = EarlyStopping(monitor='val_loss',
                               patience=5,
                               verbose=1,
                               restore_best_weights=True
                              )

# Log to the configured directory, i.e. the one that is wiped before training,
# rather than to a path relative to the current working directory.
tensorboard_callback = tf.keras.callbacks.TensorBoard(config.logs_path)

callbacks = [tensorboard_callback, reduce_lr, early_stopping, checkpoint_callback]

# Training

In [None]:
class ModelTrainer:
    """Bundle a model, its data generators and the settings of one training run.

    Args:
        model: Object exposing a Keras-style ``fit`` method.
        train_generator: Training batch generator; must expose ``n`` (number
            of samples) and may expose ``batch_size``.
        valid_generator: Validation batch generator, same requirements.
        config: Object exposing ``batch_size`` and ``epoch``.
        callbacks: Callbacks forwarded to ``fit``.
    """

    def __init__(self, model, train_generator, valid_generator, config, callbacks):
        self.model = model
        self.train_generator = train_generator
        self.valid_generator = valid_generator
        self.steps_per_epoch = self._count_steps(train_generator, config.batch_size)
        self.validation_steps = self._count_steps(valid_generator, config.batch_size)
        self.epochs = config.epoch
        self.callbacks = callbacks

    @staticmethod
    def _count_steps(generator, default_batch_size):
        """Return the number of batches needed to see every sample once."""
        # Prefer the generator's own batch size so the step count cannot
        # disagree with the batches it actually produces.
        batch_size = getattr(generator, 'batch_size', default_batch_size)
        # Ceiling division: the last, partial batch is not dropped, and a
        # dataset smaller than one batch still yields one step instead of zero.
        return -(-generator.n // batch_size)

    def train(self):
        """Fit the model with the stored settings and return what ``fit`` returns."""
        history = self.model.fit(
            self.train_generator,
            steps_per_epoch=self.steps_per_epoch,
            validation_data=self.valid_generator,
            validation_steps=self.validation_steps,
            epochs=self.epochs,
            callbacks=self.callbacks
        )
        return history


In [None]:
# Run the training; `history` holds the value returned by model.fit and is
# read by the plotting cells below.
trainer = ModelTrainer(model, train_generator, valid_generator, config, callbacks)
history = trainer.train()

In [None]:
# The checkpoint callback stores the best-val_loss weights at this same path.
# Reload them before exporting: depending on the Keras version,
# EarlyStopping(restore_best_weights=True) may leave the last-epoch weights in
# memory when training runs through all epochs, and saving those would
# overwrite the best checkpoint.
if os.path.exists(config.checkpoint_filepath):
    model.load_weights(config.checkpoint_filepath)
model.save(config.checkpoint_filepath)

Dopo il training il modello viene salvato, usando i pesi che hanno ottenuto i risultati migliori sul validation set.

## Analisi dei risultati

Sono state analizzate le curve di loss del modello e l'accuracy ottenuta sia per il train che per il validation set.

In [None]:
# Per-epoch training metrics recorded during fitting.
train_loss = history.history['loss']
train_acc = history.history['accuracy']

# Loss on the left, accuracy on the right.
fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(12, 5))

loss_ax.plot(train_loss, label='Train Loss', linewidth=2, color='blue')
loss_ax.set(xlabel='Epochs', ylabel='Loss', title='Training Set Loss')
loss_ax.legend()

acc_ax.plot(train_acc, label='Train Accuracy', linewidth=2, color='green')
acc_ax.set(xlabel='Epochs', ylabel='Accuracy', title='Training Set Accuracy')
acc_ax.legend()

plt.tight_layout()
plt.show()


In [None]:
# Per-epoch validation metrics recorded during fitting.
val_loss = history.history['val_loss']
val_acc = history.history['val_accuracy']

# Loss on the left, accuracy on the right.
fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(12, 5))

loss_ax.plot(val_loss, label='Validation Loss', linewidth=2, color='blue')
loss_ax.set(xlabel='Epochs', ylabel='Loss', title='Validation Set Loss')
loss_ax.legend()

acc_ax.plot(val_acc, label='Validation Accuracy', linewidth=2, color='green')
acc_ax.set(xlabel='Epochs', ylabel='Accuracy', title='Validation Set Accuracy')
acc_ax.legend()

plt.tight_layout()
plt.show()


In [None]:
# Average the per-epoch training loss and accuracy over the whole run.
loss_per_epoch = history.history['loss']
accuracy_per_epoch = history.history['accuracy']
mean_loss = sum(loss_per_epoch) / len(loss_per_epoch)
mean_accuracy = sum(accuracy_per_epoch) / len(accuracy_per_epoch)

table = [['Mean Loss', mean_loss], ['Mean Accuracy', mean_accuracy]]
summary_text = tabulate(table, headers=['Metric', 'Value'], tablefmt='fancy_grid')
print(summary_text)

## Valutazione sul Test

In [None]:
# Model outputs for the whole test generator (the model ends with a single
# sigmoid unit, so these are scores between 0 and 1).
y_pred = model.predict(test_generator)
# Ground-truth labels as exposed by the generator.
# NOTE(review): assumes `classes` follows the same order as the predictions,
# which relies on the generator having been built with shuffle=False.
y_test = test_generator.classes

In [None]:
# Confusion matrix of the rounded predictions against the true labels.
conf_matrix = metrics.confusion_matrix(y_test, y_pred.round())
plt.figure(figsize=(10, 8))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues")
plt.show()

In [None]:
# Threshold the scores at 0.5 (strictly greater means class 1) and print the
# per-class precision/recall/F1 report.
y_pred_binary = (y_pred > 0.5).astype(int)
report_text = classification_report(y_test, y_pred_binary)
print(report_text)

# Live Test


Di seguito un piccolo test che sceglie delle immagini random dal test set e predice le relative etichette.

In [None]:
# Load a previously saved model from a fixed Kaggle input path, so this
# section does not depend on the training cells having been run.
model_path = '/kaggle/input/ig-model/model_checkpoint.h5'
model = keras.models.load_model(model_path)

In [None]:
# Test images read from <dataset_path>/test/, only rescaled and kept in
# directory order.
test_generator = ImageDataGenerator(rescale=1./255).flow_from_directory(
    config.dataset_path + 'test/',
    # Keras expects target_size as (height, width).
    target_size=(config.image_height, config.image_width),
    # One image per batch: the cells below pick single images by index.
    batch_size=1,
    shuffle=False,
    class_mode='binary',
)

In [None]:
# Pick a handful of distinct positions in the test set to preview.
num_random_images = 5
num_images = len(test_generator.filenames)
random_indices = np.random.choice(num_images, size=num_random_images, replace=False)

# Accumulators filled by the prediction loop.
predictions, labels, predicted_labels = [], [], []

In [None]:
# Score each selected batch and keep the raw score, the true label and the
# thresholded decision (score strictly above 0.5).
for sample_index in random_indices:
    batch_image, batch_label = test_generator[sample_index]
    score = model.predict(batch_image)
    predictions.append(score)
    labels.append(batch_label)
    predicted_labels.append(score > 0.5)

# Convert the accumulators to arrays for the plotting cell.
predictions, labels, predicted_labels = (
    np.array(predictions),
    np.array(labels),
    np.array(predicted_labels),
)

In [None]:
# Show each selected image with its predicted and true class; the title is
# green when they agree and red otherwise.
fig, axes = plt.subplots(1, num_random_images, figsize=(15, 5))
# plt.subplots returns a bare Axes when a single column is requested.
axes = np.atleast_1d(axes)
class_names = ("fake", "real")  # index 0 -> fake, index 1 -> real

for i, ax in enumerate(axes):
    image = np.squeeze(test_generator[random_indices[i]][0][0])
    ax.imshow(image)
    ax.axis('off')

    # Each entry holds a single value wrapped in extra axes; squeeze it down
    # to a plain class index.
    true_class = int(np.squeeze(labels[i]))
    predicted_class = int(np.squeeze(predicted_labels[i]))

    color = 'green' if predicted_class == true_class else 'red'
    # Report the prediction with the same class names used for the label
    # instead of a bare True/False.
    ax.set_title(
        f'Predicted: {class_names[predicted_class]}\nLabel: {class_names[true_class]}',
        color=color,
    )

plt.tight_layout()
plt.show()