In [1]:
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten, BatchNormalization
from tensorflow.keras import layers, models
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications import EfficientNetB3
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.model_selection import train_test_split
import cv2

In [2]:
def load_images(data_dir, img_size=(224, 224)):
    images = []
    labels = []
    
    for label in os.listdir(data_dir):
        label_dir = os.path.join(data_dir, label)
        if not os.path.isdir(label_dir):
            continue  # Skip non-directory files
        
        for img_file in os.listdir(label_dir):
            img_path = os.path.join(label_dir, img_file)

            # Read the image
            img = cv2.imread(img_path)

            if img is None:
                print(f"Warning: Could not read image {img_path}. Skipping...")
                continue  # Skip unreadable images

            # Resize and normalize the image
            img = cv2.resize(img, img_size)
            img = img.astype('float32') / 255.0  
            
            images.append(img)
            labels.append(label)

    return np.array(images), np.array(labels)  # Move return here

# Define dataset path
data_dir = r'E:\Deepfake detection\Dataset'

if os.path.exists(data_dir):
    print("Directory exists:", data_dir)
    print("Subfolders:", os.listdir(data_dir))
else:
    print("Error: Dataset directory does not exist!")

Directory exists: E:\Deepfake detection\Dataset
Subfolders: ['Test', 'Train', 'Validation']


In [3]:
for subset in ['Train', 'Test', 'Validation']:
    subset_path = os.path.join(data_dir, subset)
    
    if os.path.exists(subset_path):
        total_images = 0
        for label in os.listdir(subset_path):  # Iterate over "REAL" and "FAKE" folders
            label_dir = os.path.join(subset_path, label)
            if os.path.isdir(label_dir):  # Ensure it's a directory
                images = [f for f in os.listdir(label_dir) if f.lower().endswith(('.jpg', '.png', '.jpeg'))]
                print(f"{subset}/{label} - Found {len(images)} images")
                total_images += len(images)

        print(f"Total images in {subset}: {total_images}\n")
    else:
        print(f"Error: {subset} folder not found!")


Train/Fake - Found 70001 images
Train/Real - Found 70001 images
Total images in Train: 140002

Test/Fake - Found 5492 images
Test/Real - Found 5413 images
Total images in Test: 10905

Validation/Fake - Found 19641 images
Validation/Real - Found 19787 images
Total images in Validation: 39428



In [4]:
broken_images = []

for subset in ['Train', 'Test', 'Validation']:
    subset_path = os.path.join(data_dir, subset)
    
    for label in os.listdir(subset_path):  # Iterate over "REAL" and "FAKE"
        label_dir = os.path.join(subset_path, label)
        
        for img_file in os.listdir(label_dir):
            img_path = os.path.join(label_dir, img_file)
            img = cv2.imread(img_path)
            
            if img is None: 
                broken_images.append(img_path)

print(f"Found {len(broken_images)} broken images")
if broken_images:
    print("Example broken images:", broken_images[:5])


Found 0 broken images


In [5]:
# Define paths to dataset folders
train_dir = r'E:\Deepfake detection\Dataset\Train'
val_dir = r'E:\Deepfake detection\Dataset\Validation'
test_dir = r'E:\Deepfake detection\Dataset\Test'

# Define parameters
IMG_SIZE = (128, 128)  # Resize images to 128x128
BATCH_SIZE = 16  # Reduce batch size to save memory

# Load dataset using TensorFlow pipeline
train_ds = tf.keras.preprocessing.image_dataset_from_directory(
    train_dir,
    image_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    label_mode="binary"
)

val_ds = tf.keras.preprocessing.image_dataset_from_directory(
    val_dir,
    image_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    label_mode="binary"
)

test_ds = tf.keras.preprocessing.image_dataset_from_directory(
    test_dir,
    image_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    label_mode="binary"
)

Found 140002 files belonging to 2 classes.
Found 39428 files belonging to 2 classes.
Found 10905 files belonging to 2 classes.


In [6]:
# Optimize data loading
AUTOTUNE = tf.data.AUTOTUNE
train_ds = train_ds.prefetch(buffer_size=AUTOTUNE)
val_ds = val_ds.prefetch(buffer_size=AUTOTUNE)
test_ds = test_ds.prefetch(buffer_size=AUTOTUNE)

print("Efficient data pipeline created. No MemoryErrors!")

# Define CNN model
def create_model():
    model = models.Sequential([
        layers.Conv2D(32, (3, 3), activation='relu', input_shape=(128, 128, 3)),
        layers.MaxPooling2D((2, 2)),

        layers.Conv2D(64, (3, 3), activation='relu'),
        layers.MaxPooling2D((2, 2)),

        layers.Conv2D(128, (3, 3), activation='relu'),
        layers.MaxPooling2D((2, 2)),

        layers.Flatten(),
        layers.Dense(128, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(1, activation='sigmoid')  # Binary classification: Real (0) or Fake (1)
    ])
    
    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy']
    )
    
    return model

Efficient data pipeline created. No MemoryErrors!


In [7]:
# Initialize model
model = create_model()
model.summary()  # Check model architecture

# Train the model
EPOCHS = 10  

history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=EPOCHS
)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 126, 126, 32)      896       
                                                                 
 max_pooling2d (MaxPooling2D  (None, 63, 63, 32)       0         
 )                                                               
                                                                 
 conv2d_1 (Conv2D)           (None, 61, 61, 64)        18496     
                                                                 
 max_pooling2d_1 (MaxPooling  (None, 30, 30, 64)       0         
 2D)                                                             
                                                                 
 conv2d_2 (Conv2D)           (None, 28, 28, 128)       73856     
                                                                 
 max_pooling2d_2 (MaxPooling  (None, 14, 14, 128)      0

In [8]:
# Evaluate model
test_loss, test_acc = model.evaluate(test_ds)
print(f"Test Accuracy: {test_acc * 100:.2f}%")

Test Accuracy: 87.08%


In [9]:
model.save(r"E:\Deepfake detection\deepfake_detection_updated_v2.h5")