In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from datetime import datetime as dt
import glob
from PIL import Image

from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix

from tensorflow.keras.layers import Dense, Input, Dropout, Rescaling, Conv2D, MaxPooling2D, Flatten
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import SGD, Adam
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.metrics import Precision, Recall

from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.express as px

import warnings 
warnings.filterwarnings('ignore')

2024-11-08 09:25:21.127562: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-08 09:25:21.139397: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-08 09:25:21.142798: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-08 09:25:21.153401: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


I0000 00:00:1731054323.665095    1900 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1731054323.716408    1900 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1731054323.716616    1900 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355


#### Train/test sets

In [4]:
# X_train, y_train
train_imgs = glob.glob(f'../CIFAKE/train/*/*')
X_train = []
y_train = []

for path, img_path in enumerate(train_imgs):
    class_ = img_path.split('/')[-2]

    # FAKE : 0 -  REAL : 1
    if class_=='FAKE': img_class = 0
    if class_=='REAL': img_class = 1

    # X_train to tensor
    img_pil = Image.open(img_path)
    img_np = np.asarray(img_pil)

    X_train.append(img_np)
    y_train.append(img_class)

X_train = np.asarray(X_train, dtype='uint8')
y_train = np.asarray(y_train)

# X_test, y_test
test_imgs = glob.glob(f'../CIFAKE/test/*/*')

X_test = []
y_test = []

for path, img_path in enumerate(test_imgs):
    class_ = img_path.split('/')[-2]

    # FAKE : 0 -  REAL : 1
    if class_=='FAKE': img_class = 0
    if class_=='REAL': img_class = 1

    # X_test to tensor
    img_pil = Image.open(img_path)
    img_np = np.asarray(img_pil)

    X_test.append(img_np)
    y_test.append(img_class)
    
X_test = np.asarray(X_train, dtype='uint8')
y_test = np.asarray(y_train)

In [5]:
print(f'X_train shape: {X_train.shape} | y_train shape: {y_train.shape}')
print(f'X_test shape: {X_test.shape} | y_test shape: {y_test.shape}')

X_train shape: (100000, 32, 32, 3) | y_train shape: (100000,)
X_test shape: (100000, 32, 32, 3) | y_test shape: (100000,)


#### Class balance

In [6]:
import glob
print('Class distribution: ')
print( f'Train REAL images: {len(glob.glob('../CIFAKE/train/REAL/*'))}'  )
print( f'Train FAKE images: {len(glob.glob('../CIFAKE/train/FAKE/*'))}'  )

print( f'Test REAL images: {len(glob.glob('../CIFAKE/test/REAL/*'))}'  )
print( f'Test FAKE images: {len(glob.glob('../CIFAKE/test/FAKE/*'))}'  )

Class distribution: 
Train REAL images: 50000
Train FAKE images: 50000
Test REAL images: 10000
Test FAKE images: 10000


#### Build model

In [7]:
# Build model
def CNN_model01(n_neurons_layer01=128, n_neurons_layer02=128, activation='relu', dropout_rate=.2):
    model = Sequential([
        Rescaling(1./255),

        Conv2D(n_neurons_layer01, 3, activation=activation),
        MaxPooling2D(),

        Conv2D(n_neurons_layer02, 3, activation=activation),
        MaxPooling2D(),

        Flatten(),
        Dense(24, activation=activation),
        Dropout(dropout_rate),
        Dense(1, activation='sigmoid')
    ])

    model.compile( optimizer=Adam(), 
                  loss=BinaryCrossentropy(),
                  metrics = ['accuracy', Precision(), Recall()],                
                  )

    model.build(input_shape=(None, 32, 32, 3))
    return model

#### Train model

In [8]:
N_EPOCHS = 30

In [None]:
from sklearn.model_selection import GridSearchCV
from scikeras.wrappers import KerasClassifier
from datetime import datetime as dt


def train(model):
  time_start = dt.now()
  print(f'Start training, time: {time_start.time()}')

  history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=N_EPOCHS,
    verbose=1
  )

  print(f'Time elapsed: {dt.now() - time_start}')

  return history

# Train with GRID search
def train_gridSearch(model):

  keras_classifier = KerasClassifier(model=model)

  param_grid = dict(
      epochs=[30],
      batch_size= [32, 64],
      model__n_neurons_layer01 = np.arange(64,513,64),
      model__n_neurons_layer02 = np.arange(64,513,64),
      model__dropout_rate = np.arange(.2, .51, .1),
      model__activation = ['relu', 'tanh'],
  )

  grid = GridSearchCV(estimator=keras_classifier, param_grid=param_grid,
                      # cv=3,
                       scoring='accuracy')

  start_t = dt.now()
  print(f'GridSearch start time: {start_t.hour}:{start_t.minute:02}:{start_t.second:02}')
  grid_result = grid.fit(X_train, y_train, validation_data=(X_test, y_test))

  print(f'GridSearch complete. Time elapsed: {dt.now()-start_t}')

  # Best results 
  print(f'Best parameters: {grid_result.best_params_}')
  print(f'Best result (R2): {grid_result.best_score_}')

  history = grid_result.best_estimator_.history_

  return [history, grid_result.best_estimator_]

In [10]:
history, best_model = train_gridSearch(CNN_model01)

GridSearch start time: 9:25:52


I0000 00:00:1731054352.947458    1900 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1731054352.947668    1900 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1731054352.947823    1900 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1731054353.006189    1900 cuda_executor.cc:1015] successful NUMA node read from SysFS ha

Epoch 1/30


2024-11-08 09:25:53.549878: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 245760000 exceeds 10% of free system memory.
2024-11-08 09:25:53.776144: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 245760000 exceeds 10% of free system memory.
I0000 00:00:1731054354.795290    1980 service.cc:146] XLA service 0x7821cc00bc40 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1731054354.795323    1980 service.cc:154]   StreamExecutor device (0): NVIDIA GeForce GTX 1060 6GB, Compute Capability 6.1
2024-11-08 09:25:54.820443: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-11-08 09:25:54.937082: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:531] Loaded cuDNN version 8907


[1m   8/1250[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m17s[0m 14ms/step - accuracy: 0.4440 - loss: 0.7143 - precision: 0.4382 - recall: 0.4583

I0000 00:00:1731054356.961238    1980 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m1247/1250[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 13ms/step - accuracy: 0.7471 - loss: 0.5062 - precision: 0.7551 - recall: 0.7179

2024-11-08 09:26:13.062989: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 307200000 exceeds 10% of free system memory.
2024-11-08 09:26:13.371183: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 307200000 exceeds 10% of free system memory.


[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 18ms/step - accuracy: 0.7474 - loss: 0.5060 - precision: 0.7551 - recall: 0.7180 - val_accuracy: 0.8880 - val_loss: 0.2715 - val_precision: 0.8678 - val_recall: 0.9155
Epoch 2/30
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 17ms/step - accuracy: 0.8810 - loss: 0.2983 - precision: 0.8777 - recall: 0.8858 - val_accuracy: 0.9171 - val_loss: 0.2096 - val_precision: 0.9015 - val_recall: 0.9365
Epoch 3/30
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 17ms/step - accuracy: 0.9045 - loss: 0.2454 - precision: 0.9012 - recall: 0.9078 - val_accuracy: 0.9195 - val_loss: 0.2006 - val_precision: 0.9515 - val_recall: 0.8841
Epoch 4/30
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 17ms/step - accuracy: 0.9180 - loss: 0.2141 - precision: 0.9142 - recall: 0.9230 - val_accuracy: 0.9311 - val_loss: 0.1789 - val_precision: 0.9364 - val_recall: 0.9249
Epoch 5/30
[1m1250

2024-11-08 09:36:05.075286: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 245760000 exceeds 10% of free system memory.


[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 17ms/step - accuracy: 0.7836 - loss: 0.4535 - precision_2: 0.7808 - recall_2: 0.7851 - val_accuracy: 0.8944 - val_loss: 0.2562 - val_precision_2: 0.9158 - val_recall_2: 0.8687
Epoch 2/30
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 16ms/step - accuracy: 0.8886 - loss: 0.2758 - precision_2: 0.8910 - recall_2: 0.8848 - val_accuracy: 0.9242 - val_loss: 0.1982 - val_precision_2: 0.9264 - val_recall_2: 0.9217
Epoch 3/30
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 15ms/step - accuracy: 0.9137 - loss: 0.2217 - precision_2: 0.9156 - recall_2: 0.9124 - val_accuracy: 0.8977 - val_loss: 0.2455 - val_precision_2: 0.8479 - val_recall_2: 0.9691
Epoch 4/30
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 16ms/step - accuracy: 0.9219 - loss: 0.2001 - precision_2: 0.9209 - recall_2: 0.9229 - val_accuracy: 0.9362 - val_loss: 0.1638 - val_precision_2: 0.9526 - val_reca

#### Visualisation

In [11]:
hist = pd.DataFrame(history)
cols = hist.columns

n = int(cols[-1].split('_')[-1])


dict_ = { 'loss' : hist['loss'],
         'accuracy' : hist[f'accuracy'],
         'precision' : hist[f'precision_{n}'],
         'recall' : hist[f'recall_{n}'],
         'val_loss' : hist['val_loss'],
         'val_accuracy' : hist[f'val_accuracy'],
         'val_precision' : hist[f'val_precision_{n}'],
         'val_recall' : hist[f'val_recall_{n}']

}
hist = pd.DataFrame(dict_)

In [None]:
# ---------- Metrics over epoch ----------
fig_metrics = make_subplots(rows=2, cols=2, subplot_titles=("Loss", f"Accuracy", "Recall", "Precision"), vertical_spacing=0.07)

fig_metrics.add_trace( go.Scatter(x=list(range(len(hist['loss']))), y=hist['loss'], mode='lines+markers', name='Training Loss'), row=1, col=1 )
fig_metrics.add_trace( go.Scatter(x=list(range(len(hist['val_loss']))), y=hist['val_loss'], mode='lines+markers', name='Validation Loss'), row=1, col=1 )

fig_metrics.add_trace(  go.Scatter(x=list(range(len(hist['precision']))), y=hist['precision'],  mode='lines+markers', name=f'Training precision'),  row=2, col=2 )
fig_metrics.add_trace( go.Scatter(x=list(range(len(hist['val_precision']))), y=hist['val_precision'], mode='lines+markers', name=f'Validation precision'), row=2, col=2)

fig_metrics.add_trace(  go.Scatter(x=list(range(len(hist['recall']))), y=hist['recall'],  mode='lines+markers', name=f'Training recall'),  row=2, col=1 )
fig_metrics.add_trace( go.Scatter(x=list(range(len(hist['val_recall']))), y=hist['val_recall'], mode='lines+markers', name=f'Validation recall'), row=2, col=1)

fig_metrics.add_trace(  go.Scatter(x=list(range(len(hist['accuracy']))), y=hist['accuracy'],  mode='lines+markers', name=f'Training accuracy'),  row=1, col=2 )
fig_metrics.add_trace( go.Scatter(x=list(range(len(hist['val_accuracy']))), y=hist['val_accuracy'], mode='lines+markers', name=f'Validation accuracy'), row=1, col=2)

fig_metrics.update_layout(
    showlegend=True,
    margin=dict(l=10, r=10, b=10, t=30),
    width=1400, height=800
)

for annotation in fig_metrics['layout']['annotations']:
    annotation['y'] = annotation['y'] + 0.002


# ---------- Confusion matrix ----------
y_pred = best_model.predict(X_test)
y_pred
conf_matrix = confusion_matrix(y_test, y_pred)
conf_matrix

cm = confusion_matrix(y_test, y_pred)
class_labels = ['FAKE', 'REAL']

fig_confMatrix = go.Figure(data=go.Heatmap(
    z=cm,
    x= class_labels ,   # Predicted labels
    y= class_labels,    # True labels
    hoverongaps=False,
    colorscale='Blues',
    showscale=True,
    text=cm,
    texttemplate="%{text}",
    textfont={"size":15}
))

fig_confMatrix.update_layout(
    title='Confusion Matrix',
    xaxis_title='Predicted Label',
    yaxis_title='True Label',
    width=600,
    height=500,
)

# ---------- ROC AUC ----------
from sklearn.metrics import roc_curve, auc
fpr, tpr, _ = roc_curve(y_test, y_pred)
roc_auc = auc(fpr, tpr)

fig_rocauc = go.Figure()

fig_rocauc.add_trace(go.Scatter(
    x=fpr, y=tpr,
    mode='lines',
    line=dict(color='blue', width=2),
    name=f'ROC curve (AUC = {roc_auc:0.2f})'
))

fig_rocauc.add_trace(go.Scatter(
    x=[0, 1], y=[0, 1],
    mode='lines',
    line=dict(color='black', dash='dash'),
    showlegend=False,
    hoverinfo='skip'
))

fig_rocauc.update_layout(
    title='ROC AUC for Binary Classification',
    xaxis_title='False Positive Rate',
    yaxis_title='True Positive Rate',
    width=700,
    height=600,
    legend=dict(x=0.6, y=0.1),
    margin=dict(l=40, r=40, t=40, b=40),)



# ---------- Classification Report ----------
from sklearn.metrics import classification_report

report = classification_report(y_test, y_pred, target_names=class_labels, 
                               zero_division=False,
                               labels = [0, 1])

[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step


In [None]:
fig = make_subplots(
    rows=4, cols=2, 
    subplot_titles=("", "",
                    "Loss", "Precision", 
                    "Accuracy", 'Recall',
                    'Confusion Matrix', 'ROC-AUC curve'), 
    horizontal_spacing=0.05, 
    vertical_spacing=0.05  
)

fig.add_trace(
    go.Scatter(
        x=[0.5], y=[0.5], 
        text=[
            # top_layers_text
        ],
        mode='text',
        showlegend=False,
    ),
    row=1, col=1  
)

fig.add_trace(
    go.Scatter(
        x=[0.5], y=[0.5], 
        text=[
            f"Classification report<br>"
            f"{report.replace('\n','<br>')}"
        ],
        mode='text',
        showlegend=False,
    ),
    row=1, col=2  
)

fig.update_xaxes(visible=False, row=1, col=1)
fig.update_yaxes(visible=False, row=1, col=1)
fig.update_xaxes(visible=False, row=1, col=2)
fig.update_yaxes(visible=False, row=1, col=2)

# Loss and Precision
fig.add_trace(fig_metrics['data'][0], row=2, col=1)
fig.add_trace(fig_metrics['data'][1], row=2, col=1)
fig.add_trace(fig_metrics['data'][2], row=2, col=2)
fig.add_trace(fig_metrics['data'][3], row=2, col=2)

# Accuracy and Recall 
fig.add_trace(fig_metrics['data'][4], row=3, col=1)
fig.add_trace(fig_metrics['data'][5], row=3, col=1)
fig.add_trace(fig_metrics['data'][6], row=3, col=2)
fig.add_trace(fig_metrics['data'][7], row=3, col=2)

# Confusion Matrix and ROC-AUC curve 
fig.add_trace(fig_confMatrix['data'][0], row=4, col=1)
fig.add_trace(fig_rocauc['data'][0], row=4, col=2)
fig.add_trace(fig_rocauc['data'][1], row=4, col=2)

fig.update_layout(
    height=400*4, 
    width=1400, 
    showlegend=False, 
    margin=dict(l=10, r=10, t=50, b=10),  
)

fig.show()

#### Save model

In [14]:
model_name = 'GridSearch_customModel'
best_model.model().save(f'{model_name}.keras')