In [1]:
# ! pip install pandas numpy scikit-learn plotly matplotlib
# ! pip install nbformat
# ! pip install --upgrade nbformat
# ! pip install tensorflow[and-cuda]

#### Notes

### Imports

In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
from datetime import datetime as dt
import glob

from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix

from tensorflow.keras.applications import VGG16, VGG19
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.layers import Dense, Input, Dropout, Rescaling, Conv2D, MaxPooling2D, Flatten, Dropout, Activation, BatchNormalization
from tensorflow.keras.models import Sequential,Model
from tensorflow.keras.optimizers import SGD, Adam
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.regularizers import l2
from tensorflow.keras.metrics import Precision, Recall, F1Score
import visualkeras

from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.express as px

import warnings 
warnings.filterwarnings('ignore')

2024-10-18 13:24:02.018746: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-18 13:24:02.029973: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-18 13:24:02.033585: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-10-18 13:24:02.044130: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Check if GPUs are available for training 
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


I0000 00:00:1729250644.310442   41477 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1729250644.347130   41477 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1729250644.347385   41477 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355


In [4]:
DATASET_FOLDER_TRAIN = 'CIFAKE/train'
DATASET_FOLDER_TEST = 'CIFAKE/test'

BATCH_SIZE = 24
COLOR_MODE = 'grayscale'
CLASS_MODE = 'categorical'
TARGET_SIZE = (32, 32)
LEARN_RATE = 0.0005
SEED = 42

N_EPOCHS = 50
MODEL = VGG16(weights='imagenet', include_top=False, input_shape=(32,32,3))      

I0000 00:00:1729250644.368905   41477 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1729250644.369175   41477 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1729250644.369358   41477 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1729250644.440047   41477 cuda_executor.cc:1015] successful NUMA node read from SysFS ha

### Train/test sets

In [5]:
train_datagen = ImageDataGenerator(
    rescale=1./255,  
    rotation_range=45,
    width_shift_range=0.5,
    height_shift_range=0.5,
    zoom_range=0.5,
    horizontal_flip=True,  
    vertical_flip=True,  
    validation_split=0.3,  
)

# Load training data from directory and apply transformations
train_generator = train_datagen.flow_from_directory(
    DATASET_FOLDER_TRAIN,  
    target_size=TARGET_SIZE,    
    color_mode=COLOR_MODE,  
    batch_size=BATCH_SIZE,
    class_mode=CLASS_MODE,  
    subset='training', 
    seed = SEED
)

# Load validation data (20% of the training data)
validation_generator = train_datagen.flow_from_directory(
    DATASET_FOLDER_TRAIN, 
    target_size=TARGET_SIZE,
    color_mode=COLOR_MODE,
    batch_size=BATCH_SIZE,
    class_mode=CLASS_MODE,
    subset='validation', 
    shuffle=False,
)

test_datagen = ImageDataGenerator(rescale=1./255)

test_generator = test_datagen.flow_from_directory(
    DATASET_FOLDER_TEST,  
    target_size=TARGET_SIZE,
    color_mode=COLOR_MODE,
    batch_size=BATCH_SIZE,
    class_mode=CLASS_MODE,
)

Found 70000 images belonging to 2 classes.
Found 30000 images belonging to 2 classes.
Found 20000 images belonging to 2 classes.


### EDA

In [6]:
print('Class distribution: ')
print( f'Train REAL images: {len(glob.glob('CIFAKE/train/REAL/*'))}'  )
print( f'Train FAKE images: {len(glob.glob('CIFAKE/train/FAKE/*'))}'  )

print( f'Test REAL images: {len(glob.glob('CIFAKE/test/REAL/*'))}'  )
print( f'Test FAKE images: {len(glob.glob('CIFAKE/test/FAKE/*'))}'  )

Class distribution: 
Train REAL images: 50000
Train FAKE images: 50000
Test REAL images: 10000
Test FAKE images: 10000


In [7]:
# Show random image

#### Image normalisation

In [8]:
# Image normalisation

### Load model

In [9]:
# Convert to rgb
def convert_grayscale_to_rgb(batch):
    tensor_batch = tf.convert_to_tensor(batch)

    rgb_batch = tf.image.grayscale_to_rgb(tensor_batch)
    return rgb_batch

def rgb_wrapper(generator):
    for batch, labels in generator:
        yield convert_grayscale_to_rgb(batch), labels

train_generator_rgb = rgb_wrapper(train_generator)
validation_generator_rgb = rgb_wrapper(validation_generator)
test_generator_rgb = rgb_wrapper(test_generator)

In [10]:
n_classes = train_generator.num_classes

VGG_model = MODEL

for layer in VGG_model.layers:
    layer.trainable = False

x = Flatten()(VGG_model.output)
x = Dense(32, activation='relu')(x)
x = Dense(32, activation='relu')(x)
x = Dense(n_classes, activation='sigmoid')(x)

# Custom model
customVGG = Model(inputs=VGG_model.input, outputs=x)

customVGG.compile(loss=BinaryCrossentropy(), 
                  optimizer=Adam(learning_rate=LEARN_RATE), 
                  metrics=['accuracy', Precision(), Recall()])

### Train model

In [11]:
history = customVGG.fit(train_generator_rgb, 
                             callbacks=[EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)],
                             epochs=N_EPOCHS, 
                             validation_data=validation_generator_rgb,
                             steps_per_epoch=train_generator.samples // BATCH_SIZE,
                             validation_steps=validation_generator.samples // BATCH_SIZE,
                        )

Epoch 1/50


I0000 00:00:1729250647.388067   41549 service.cc:146] XLA service 0x70ea9c004200 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1729250647.388085   41549 service.cc:154]   StreamExecutor device (0): NVIDIA GeForce GTX 1060 6GB, Compute Capability 6.1
2024-10-18 13:24:07.429134: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-10-18 13:24:07.608934: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:531] Loaded cuDNN version 8907


[1m  17/2916[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m30s[0m 11ms/step - accuracy: 0.5413 - loss: 0.6919 - precision: 0.5315 - recall: 0.5676

I0000 00:00:1729250649.860922   41549 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m2916/2916[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 15ms/step - accuracy: 0.6623 - loss: 0.6129 - precision: 0.6588 - recall: 0.6621 - val_accuracy: 0.7011 - val_loss: 0.5711 - val_precision: 0.6990 - val_recall: 0.7037
Epoch 2/50
[1m2916/2916[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 15ms/step - accuracy: 0.7008 - loss: 0.5709 - precision: 0.6995 - recall: 0.7035 - val_accuracy: 0.7136 - val_loss: 0.5586 - val_precision: 0.7107 - val_recall: 0.7197
Epoch 3/50
[1m2916/2916[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 15ms/step - accuracy: 0.7074 - loss: 0.5651 - precision: 0.7060 - recall: 0.7097 - val_accuracy: 0.7128 - val_loss: 0.5566 - val_precision: 0.7128 - val_recall: 0.7147
Epoch 4/50
[1m2916/2916[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 15ms/step - accuracy: 0.7122 - loss: 0.5615 - precision: 0.7120 - recall: 0.7115 - val_accuracy: 0.7162 - val_loss: 0.5514 - val_precision: 0.7154 - val_recall: 0.7187
Epoch 5/50
[1m2916

In [12]:
hist = history.history
cols = list(history.history.keys())
cols

['accuracy',
 'loss',
 'precision',
 'recall',
 'val_accuracy',
 'val_loss',
 'val_precision',
 'val_recall']

In [13]:
# n = int(cols[3][-1])

# dict_ = { 'loss' : hist['loss'],
#          'accuracy' : hist[f'accuracy'],
#          'precision' : hist[f'precision_{n}'],
#          'recall' : hist[f'recall_{n}'],
#          'val_loss' : hist['val_loss'],
#          'val_accuracy' : hist[f'val_accuracy'],
#          'val_precision' : hist[f'val_precision_{n}'],
#          'val_recall' : hist[f'val_recall_{n}']

# }
# hist = pd.DataFrame(dict_)

### Evaluation

In [14]:
fig_metrics = make_subplots(rows=2, cols=2, subplot_titles=("Loss", f"Precision", "Accuracy", "Recall"), vertical_spacing=0.07)

# Loss
fig_metrics.add_trace( go.Scatter(x=list(range(len(hist['loss']))), y=hist['loss'], mode='lines+markers', name='Train Loss'), row=1, col=1 )
fig_metrics.add_trace( go.Scatter(x=list(range(len(hist['val_loss']))), y=hist['val_loss'], mode='lines+markers', name='Val Loss'), row=1, col=1 )

# Precision
fig_metrics.add_trace(  go.Scatter(x=list(range(len(hist['precision']))), y=hist['precision'],  mode='lines+markers', name=f'Train precision'),  row=1, col=2 )
fig_metrics.add_trace( go.Scatter(x=list(range(len(hist['val_precision']))), y=hist['val_precision'], mode='lines+markers', name=f'Val precision'), row=1, col=2)

# Accuracy
fig_metrics.add_trace(  go.Scatter(x=list(range(len(hist['accuracy']))), y=hist['accuracy'],  mode='lines+markers', name=f'Train accuracy'),  row=2, col=1 )
fig_metrics.add_trace( go.Scatter(x=list(range(len(hist['val_accuracy']))), y=hist['val_accuracy'], mode='lines+markers', name=f'Val accuracy'), row=2, col=1)

# Recall
fig_metrics.add_trace(  go.Scatter(x=list(range(len(hist['recall']))), y=hist['recall'],  mode='lines+markers', name=f'Train recall'),  row=2, col=2 )
fig_metrics.add_trace( go.Scatter(x=list(range(len(hist['val_recall']))), y=hist['val_recall'], mode='lines+markers', name=f'Val recall'), row=2, col=2)

# fig.update_xaxes(title_text="Epochs", row=1, col=1)
fig_metrics.update_yaxes(title_text="Loss", row=1, col=1)
fig_metrics.update_yaxes(title_text=f"Precision", row=1, col=2)
fig_metrics.update_yaxes(title_text=f"Accuracy", row=2, col=1)
fig_metrics.update_yaxes(title_text=f"Recall", row=2, col=2)

fig_metrics.update_layout(
    # title_text="Training and validation metrics over epochs",
    showlegend=True,
    margin=dict(l=10, r=10, b=10, t=30),
    width=1400, height=800
)

for annotation in fig_metrics['layout']['annotations']:
    annotation['y'] = annotation['y'] + 0.002

In [15]:
# Evaluate the model on test data
test_loss, test_acc, test_prec, test_recall = customVGG.evaluate(
    test_generator_rgb,
    steps=test_generator.samples // test_generator.batch_size
)

[1m833/833[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 8ms/step - accuracy: 0.7818 - loss: 0.4582 - precision: 0.7820 - recall: 0.7824


In [16]:
from sklearn.metrics import confusion_matrix
y_true = test_generator.classes
y_pred = customVGG.predict(test_generator_rgb, steps=test_generator.samples // test_generator.batch_size)
y_pred_classes = np.argmax(y_pred, axis=1)
# y_pred_classes = (y_pred > 0.5).astype(int)
y_true = y_true[:len(y_pred_classes)]

[1m833/833[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 10ms/step


In [17]:
y_true.shape, y_pred_classes.shape, test_generator.class_indices.keys()

((19976,), (19976,), dict_keys(['FAKE', 'REAL']))

In [18]:
# Generate the confusion matrix
cm = confusion_matrix(y_true, y_pred_classes)

class_labels = list(test_generator.class_indices.keys())

# Plotly heatmap for confusion matrix
fig_confMatrix = go.Figure(data=go.Heatmap(
    z=cm,
    x=['REAL', 'FAKE'] ,  # Predicted labels
    y= ['FAKE', 'REAL'],  # True labels
    hoverongaps=False,
    colorscale='Blues',
    showscale=True,
    text=cm,
    texttemplate="%{text}",
    textfont={"size":15}
))

# Update layout to add labels and title
fig_confMatrix.update_layout(
    title='Confusion Matrix',
    xaxis_title='Predicted Label',
    yaxis_title='True Label',
    width=600,
    height=500,
)
print()




In [19]:
test_generator.classes

array([0, 0, 0, ..., 1, 1, 1], dtype=int32)

In [20]:
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_curve, auc


# Compute ROC curve and ROC AUC
fpr, tpr, _ = roc_curve(y_true, y_pred_classes)
roc_auc = auc(fpr, tpr)

# Plot the ROC curve using Plotly
fig_rocauc = go.Figure()

# Add the ROC curve
fig_rocauc.add_trace(go.Scatter(
    x=fpr, y=tpr,
    mode='lines',
    line=dict(color='blue', width=2),
    name=f'ROC curve (AUC = {roc_auc:0.2f})'
))

# Add the diagonal line (random classifier)
fig_rocauc.add_trace(go.Scatter(
    x=[0, 1], y=[0, 1],
    mode='lines',
    line=dict(color='black', dash='dash'),
    showlegend=False,
    hoverinfo='skip'
))

# Update layout with axis titles and legend
fig_rocauc.update_layout(
    title='ROC AUC for Binary Classification',
    xaxis_title='False Positive Rate',
    yaxis_title='True Positive Rate',
    width=700,
    height=600,
    legend=dict(x=0.6, y=0.1),
    margin=dict(l=40, r=40, t=40, b=40),)
print()




In [21]:
# Classification report 
from sklearn.metrics import classification_report

report = classification_report(y_true, y_pred_classes, target_names=class_labels, 
                               zero_division=False,
                               labels = [0, 1])

### Evaluation plots+

In [22]:
# print(hist)
# hist

In [23]:
fig = make_subplots(
    rows=4, cols=2, 
    subplot_titles=("", "",
                    "Loss", "Precision", 
                    "Accuracy", 'Recall',
                    'Confusion Matrix', 'ROC-AUC curve'), 
    horizontal_spacing=0.05, 
    vertical_spacing=0.05  
)

fig.add_trace(
    go.Scatter(
        x=[0.5], y=[0.5], 
        text=[
            f"""x = Flatten()(VGG_model.output)<br>x = Dense(32, activation='relu')(x)<br>x = Dense(32, activation='relu')(x)<br>x = Dense(n_classes, activation='sigmoid')(x)"""       
        ],
        mode='text',
        showlegend=False,
    ),
    row=1, col=1  
)

fig.add_trace(
    go.Scatter(
        x=[0.5], y=[0.5], 
        text=[
            f"Test loss: {test_loss:.4f}<br>"
            f"Test accuracy: {test_acc:.4f}<br>"
            f"Test precision: {test_prec:.4f}<br>"
            f"Test recall: {test_recall:.4f}<br><br>"
            f"ROC-AUC: {roc_auc:.4f}<br><br>"
            f"Classification report<br>"
            f"{report.replace('\n','<br>')}"
        ],
        mode='text',
        showlegend=False,
    ),
    row=1, col=2  
)


# fig.update_layout(
#     annotations=[dict(text="Model Metrics", x=0.25, y=1.05, showarrow=False, xref="paper", yref="paper", font=dict(size=16))], 
# )

# Hide the axes for the annotation row
fig.update_xaxes(visible=False, row=1, col=1)
fig.update_yaxes(visible=False, row=1, col=1)


# Loss and Precision
fig.add_trace(fig_metrics['data'][0], row=2, col=1)
fig.add_trace(fig_metrics['data'][1], row=2, col=1)
fig.add_trace(fig_metrics['data'][2], row=2, col=2)
fig.add_trace(fig_metrics['data'][3], row=2, col=2)

# Accuracy and Recall 
fig.add_trace(fig_metrics['data'][4], row=3, col=1)
fig.add_trace(fig_metrics['data'][5], row=3, col=1)
fig.add_trace(fig_metrics['data'][6], row=3, col=2)
fig.add_trace(fig_metrics['data'][7], row=3, col=2)

# Confusion Matrix and ROC-AUC curve 
fig.add_trace(fig_confMatrix['data'][0], row=4, col=1)
fig.add_trace(fig_rocauc['data'][0], row=4, col=2)
fig.add_trace(fig_rocauc['data'][1], row=4, col=2)

fig.update_layout(
    height=400*4, 
    width=1400, 
    title_text=f"--- {MODEL.name} ---",
    showlegend=False, 
    margin=dict(l=10, r=10, t=50, b=10),  
)

fig.show()

### Class activation map