In [None]:

# ==========================================
# 0) IMPORT & CONFIG
# ==========================================
import os, sys, platform, random, json, shutil, math, time, pathlib, glob
import numpy as np
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns

# Log the TensorFlow version and the GPUs TensorFlow can see, so a run's
# output records the environment it was produced in.
print("TensorFlow:", tf.__version__)
print("GPU:", tf.config.list_physical_devices('GPU'))


In [None]:

# ==========================================
# 1) HYPERPARAMETERS
# ==========================================
IMG_SIZE = (224, 224)      # EfficientNetB0 input size; passed to tf.image.resize
BATCH_SIZE = 64            # batch size used by every tf.data pipeline
SHUFFLE_BUFFER = 2048      # buffer size given to Dataset.shuffle for training data

N_SHARDS = 15              # number of small training rounds (the train set is split into this many shards)
EPOCHS_PER_ROUND = 5       # epochs trained on each shard
STEPS_PER_EPOCH_CAP = 300  # upper bound on steps per epoch (None disables the cap)


In [None]:

# ==========================================
# 2) DATA ROOT & CLASS LOADING
# ==========================================
DATA_ROOT = "/kaggle/input/deepfake-and-real-images/Dataset"
TRAIN_DIR = os.path.join(DATA_ROOT, "Train")
VAL_DIR   = os.path.join(DATA_ROOT, "Validation")
TEST_DIR  = os.path.join(DATA_ROOT, "Test")

# mapping class name -> int label
# Only real sub-directories count as classes: stray files or hidden folders
# (e.g. ".ipynb_checkpoints", ".DS_Store") inside TRAIN_DIR would otherwise
# become bogus labels and silently widen the softmax head.
classes = sorted(
    entry
    for entry in os.listdir(TRAIN_DIR)
    if not entry.startswith(".") and os.path.isdir(os.path.join(TRAIN_DIR, entry))
)
if not classes:
    raise ValueError(f"No class sub-directories found in {TRAIN_DIR!r}")

# Labels follow the alphabetical order of the class folder names.
class_indices = {cls: idx for idx, cls in enumerate(classes)}
print("Classes:", class_indices)

def load_files_labels(root_dir, patterns=("*.jpg",)):
    """Collect image paths and integer labels for one dataset split.

    Args:
        root_dir: Directory containing one sub-directory per class name
            listed in the module-level ``classes``.
        patterns: Glob patterns (relative to each class directory) selecting
            the image files. Defaults to ``("*.jpg",)``, the original behavior.

    Returns:
        ``(files, labels)``: two parallel lists. ``files`` holds paths grouped
        by class and sorted within each class; ``labels`` holds the matching
        ``class_indices`` value for every path.
    """
    files, labels = [], []
    for cls in classes:
        # Escape the directory part so characters such as "[" in a path are
        # not interpreted as glob syntax.
        cls_dir = glob.escape(os.path.join(root_dir, cls))
        matched = set()
        for pattern in patterns:
            matched.update(glob.glob(os.path.join(cls_dir, pattern)))
        # glob returns entries in arbitrary filesystem order; sorting makes the
        # listing (and any seeded shuffle built on it) reproducible.
        cls_files = sorted(matched)
        files.extend(cls_files)
        labels.extend([class_indices[cls]] * len(cls_files))
    return files, labels

# Build the (paths, labels) lists for the three dataset splits.
train_files, train_labels = load_files_labels(TRAIN_DIR)
val_files,   val_labels   = load_files_labels(VAL_DIR)
test_files,  test_labels  = load_files_labels(TEST_DIR)

print("Train:", len(train_files), "Val:", len(val_files), "Test:", len(test_files))

# Fail fast: a wrong DATA_ROOT or unexpected file extension yields empty lists
# here, which would otherwise only surface later as an obscure tf.data/Keras error.
assert train_files and val_files and test_files, (
    f"Empty dataset split under {DATA_ROOT!r} "
    f"(train={len(train_files)}, val={len(val_files)}, test={len(test_files)})"
)


In [None]:

# ==========================================
# 3) SHARD SPLITTING
# ==========================================
def make_shards(files, labels, n_shards, seed=None):
    """Shuffle paired files/labels and split them into ``n_shards`` parts.

    Args:
        files: Sequence of file paths.
        labels: Sequence of labels, same length as ``files``.
        n_shards: Number of shards to produce (passed to ``np.array_split``,
            so shard sizes differ by at most one).
        seed: Optional seed for a reproducible shuffle. When ``None`` the
            global NumPy random state is used, as before.

    Returns:
        ``(file_shards, label_shards)``: two lists of ``n_shards`` arrays;
        element ``i`` of each list describes the same samples.

    Raises:
        ValueError: If ``files`` and ``labels`` have different lengths.
    """
    files = np.asarray(files)
    labels = np.asarray(labels)
    if len(files) != len(labels):
        raise ValueError(
            f"files and labels must have the same length, got {len(files)} and {len(labels)}"
        )

    if seed is None:
        # Keep honoring np.random.seed(...) set by the caller.
        order = np.random.permutation(len(files))
    else:
        order = np.random.default_rng(seed).permutation(len(files))

    # The same permutation is applied to both arrays so pairs stay aligned.
    return np.array_split(files[order], n_shards), np.array_split(labels[order], n_shards)

# Pair every shard of file paths with its shard of labels:
# train_shards[i] == (files_array_i, labels_array_i).
train_shards = [
    (shard_files, shard_labels)
    for shard_files, shard_labels in zip(*make_shards(train_files, train_labels, N_SHARDS))
]


In [None]:

# ==========================================
# 4) DATA PIPELINE
# ==========================================
def process_path(file_path, label):
    """Load one image from disk and prepare it for EfficientNetB0.

    Args:
        file_path: Scalar string tensor with the image path.
        label: Label associated with the image; returned unchanged.

    Returns:
        ``(img, label)`` where ``img`` is a float32 tensor resized to
        ``IMG_SIZE`` with 3 channels and pixel values left in ``[0, 255]``.
    """
    raw = tf.io.read_file(file_path)
    img = tf.image.decode_jpeg(raw, channels=3)           # uint8, values 0..255
    # Keras EfficientNet models contain their own rescaling/normalization
    # layers and expect raw [0, 255] pixels. Dividing by 255 here (as
    # convert_image_dtype does) would feed the pretrained backbone near-black
    # images, so only the dtype is changed.
    img = tf.cast(img, tf.float32)
    img = tf.image.resize(img, IMG_SIZE)                  # (224,224,3)
    return img, label

def make_ds(files, labels, training=True):
    """Build a batched, prefetched ``tf.data`` pipeline of (image, label) pairs.

    Args:
        files: Sequence of image paths.
        labels: Sequence of labels aligned with ``files``.
        training: When True the samples are shuffled and the final partial
            batch is dropped. When False the order of ``files`` is preserved
            and every sample is kept, so evaluation covers the full split.

    Returns:
        A ``tf.data.Dataset`` yielding batches of up to ``BATCH_SIZE`` samples.
    """
    ds = tf.data.Dataset.from_tensor_slices((files, labels))
    if training:
        # Shuffle the lightweight (path, label) records *before* decoding;
        # shuffling after the map would keep SHUFFLE_BUFFER decoded float
        # images in memory (about 1.2 GB at 224x224x3 float32).
        ds = ds.shuffle(SHUFFLE_BUFFER)
    ds = ds.map(process_path, num_parallel_calls=tf.data.AUTOTUNE)
    # Dropping the remainder keeps training batch shapes constant, but for
    # validation/test it would silently exclude samples from the metrics.
    ds = ds.batch(BATCH_SIZE, drop_remainder=training).prefetch(tf.data.AUTOTUNE)
    return ds

# Evaluation pipelines: built once, unshuffled, and reused for every round.
val_ds, test_ds = (
    make_ds(split_files, split_labels, training=False)
    for split_files, split_labels in (
        (val_files, val_labels),
        (test_files, test_labels),
    )
)


In [None]:

# ==========================================
# 5) MODEL
# ==========================================
base = keras.applications.EfficientNetB0(
    include_top=False,
    input_shape=(224,224,3),
    pooling="avg",
    weights="imagenet"
)
base.trainable = False

x = keras.layers.Dense(128, activation="relu")(base.output)
x = keras.layers.Dropout(0.5)(x)
output = keras.layers.Dense(len(classes), activation="softmax")(x)

model = keras.Model(inputs=base.input, outputs=output)

model.compile(
    optimizer=keras.optimizers.Adam(1e-3),
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy", keras.metrics.AUC(name="auc")]
)
model.summary()


In [None]:

# ==========================================
# 6) CALLBACKS
# ==========================================
ckpt_path = "efficientnetb0_deepfake_best.keras"
callbacks = [
    keras.callbacks.ModelCheckpoint(ckpt_path, monitor="val_auc", mode="max", save_best_only=True, verbose=1),
    keras.callbacks.EarlyStopping(monitor="val_auc", mode="max", patience=5, restore_best_weights=True),
    keras.callbacks.ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=2, verbose=1),
]


In [None]:

# ==========================================
# 7) TRAINING WITH SHARDS
# ==========================================
history_all = []
round_counter = 0

for shard_idx, (files_shard, labels_shard) in enumerate(train_shards):
    print("\n" + "=" * 70)
    print(f">>> ROUND {shard_idx+1}/{N_SHARDS} | shard size = {len(files_shard)}")
    print("=" * 70)

    train_ds = make_ds(files_shard, labels_shard, training=True)

    steps_per_epoch = None
    if STEPS_PER_EPOCH_CAP is not None:
        steps_per_epoch = min(
            math.ceil(len(files_shard) / BATCH_SIZE),
            STEPS_PER_EPOCH_CAP
        )

    hist = model.fit(
        train_ds,
        validation_data=val_ds,
        epochs=EPOCHS_PER_ROUND,
        callbacks=callbacks,
        steps_per_epoch=steps_per_epoch,
        verbose=1,
    )
    history_all.append(hist.history)

    round_counter += 1
    if round_counter == max(2, N_SHARDS // 2):
        print("\n>> Unfreezing base model for fine-tuning...")
        base.trainable = True
        model.compile(
            optimizer=keras.optimizers.Adam(1e-4),
            loss="sparse_categorical_crossentropy",
            metrics=["accuracy", keras.metrics.AUC(name="auc")]
        )


In [None]:

# ==========================================
# 8) EVALUATION
# ==========================================
# Validation
def predict_with_labels(ds):
    """Run the global ``model`` over ``ds`` once.

    Predictions and labels are collected from the same batches, so they are
    aligned by construction and every image is decoded only once.

    Args:
        ds: Dataset yielding ``(images, labels)`` batches.

    Returns:
        ``(probs, y_true)`` as NumPy arrays: class probabilities of shape
        ``(n_samples, n_classes)`` and the matching integer labels.
    """
    probs, y_true = [], []
    for images, y in ds:
        probs.append(model.predict_on_batch(images))
        y_true.append(y.numpy())
    return np.concatenate(probs), np.concatenate(y_true)


# Explicit label ids keep the report rows and the confusion-matrix shape in
# sync with `classes` even if a class never occurs in a split.
label_ids = list(range(len(classes)))

# Validation
val_probs, val_true = predict_with_labels(val_ds)
val_pred = np.argmax(val_probs, axis=1)

print("\nClassification report (Validation):")
print(classification_report(val_true, val_pred, labels=label_ids, target_names=classes, digits=4))

cm = confusion_matrix(val_true, val_pred, labels=label_ids)
df_cm = pd.DataFrame(cm, index=classes, columns=classes)   # rows: true class, columns: predicted class
print("\nConfusion matrix:\n", df_cm)
df_cm.to_csv("/kaggle/working/confusion_matrix.csv", index=True)
print(">> Saved confusion_matrix.csv")

# Test
test_probs, test_true = predict_with_labels(test_ds)
test_pred = np.argmax(test_probs, axis=1)

print("\nClassification report (Test):")
print(classification_report(test_true, test_pred, labels=label_ids, target_names=classes, digits=4))


In [None]:

# ==========================================
# 9) SAVE FINAL MODEL
# ==========================================
# Write the model in its current in-memory state to the Kaggle working
# directory. This is a separate file from `ckpt_path`, which ModelCheckpoint
# overwrites whenever val_auc improves.
model.save("/kaggle/working/efficientnetb0_final.keras")
print(">> Saved final model.")
