This Python notebook trains various ML models for the AI-generated face image identification task. The first step is to download the datasets using the script below.

In [8]:
!pip install kaggle




In [10]:
import os, requests, tarfile, kaggle, zipfile; 


In [18]:
# Kaggle dataset identifiers, keyed by the local sub-folder name used for each
datasets = dict(
    ai_faces="chelove4draste/5k-ai-generated-faces",
    real_faces="atulanandjha/lfwpeople",
)

# Root folder that receives every download; created up front if it is missing
download_dir = "./datasets"
os.makedirs(download_dir, exist_ok=True)


In [20]:
# Download one Kaggle dataset and have the Kaggle client unzip it
def download_and_extract(dataset_name, kaggle_path, target_dir):
    """Fetch a Kaggle dataset into ``target_dir`` and report progress.

    Args:
        dataset_name: Label used only in the progress messages.
        kaggle_path: Dataset identifier handed to the Kaggle API client.
        target_dir: Folder passed to the Kaggle client as the download path.
    """
    api = kaggle.api
    print(f"Downloading {dataset_name}...")
    # unzip=True asks the client to unpack the downloaded archive in place
    api.dataset_download_files(kaggle_path, path=target_dir, unzip=True)
    print(f"{dataset_name} downloaded and extracted to {target_dir}")
# Function to check if a dataset is already downloaded
def is_dataset_downloaded(target_dir):
    """Return True when ``target_dir`` is a directory holding at least one entry.

    Args:
        target_dir: Path of the folder a dataset is expected to live in.

    Returns:
        bool: False if the path is missing, is not a directory, or is an
        empty directory; True otherwise.
    """
    # isdir (rather than exists) keeps a stray regular file from reaching
    # scandir, which would raise NotADirectoryError.
    if not os.path.isdir(target_dir):
        return False
    # The scandir iterator owns an OS directory handle. We stop at the first
    # entry, so it must be closed explicitly to avoid a ResourceWarning.
    with os.scandir(target_dir) as entries:
        return next(entries, None) is not None



In [21]:
# Fetch every configured dataset unless its folder already holds files
for name in datasets:
    kaggle_path = datasets[name]
    dataset_path = os.path.join(download_dir, name)
    os.makedirs(dataset_path, exist_ok=True)

    if not is_dataset_downloaded(dataset_path):
        download_and_extract(name, kaggle_path, dataset_path)
        continue
    print(f"{name} dataset is already downloaded at {dataset_path}.")

print("Datasets are checked, downloaded, and organized.")


Downloading ai_faces...
Dataset URL: https://www.kaggle.com/datasets/chelove4draste/5k-ai-generated-faces
ai_faces downloaded and extracted to ./datasets/ai_faces
Downloading real_faces...
Dataset URL: https://www.kaggle.com/datasets/atulanandjha/lfwpeople
real_faces downloaded and extracted to ./datasets/real_faces
Datasets are checked, downloaded, and organized.


In [22]:
# The LFW archive is unpacked into the same folder it was downloaded to
extract_path = os.path.join(download_dir, "real_faces")
lfw_tgz_path = os.path.join(extract_path, "lfw-funneled.tgz")

# Function to extract .tgz files
def extract_tgz(file_path, target_dir):
    """Extract a gzip-compressed tar archive into ``target_dir``.

    The archive comes from an external download, so members are never allowed
    to land outside ``target_dir``.

    Args:
        file_path: Path to the ``.tgz`` / ``.tar.gz`` archive.
        target_dir: Destination folder; created if it does not exist.

    Raises:
        tarfile.TarError: If the archive cannot be read or a member would be
            written outside ``target_dir``.
        OSError: If the archive or destination cannot be accessed.
    """
    # exist_ok=True already covers the "directory is present" case
    os.makedirs(target_dir, exist_ok=True)
    with tarfile.open(file_path, "r:gz") as tar:
        if hasattr(tarfile, "data_filter"):
            # Extraction filters reject absolute paths, "..", and links that
            # escape the destination; passing one also avoids the
            # DeprecationWarning raised on Python 3.12/3.13.
            tar.extractall(path=target_dir, filter="data")
        else:
            # Older interpreters: best-effort check on member names only
            root = os.path.realpath(target_dir)
            for member in tar.getmembers():
                destination = os.path.realpath(os.path.join(root, member.name))
                if os.path.commonpath([root, destination]) != root:
                    raise tarfile.TarError(
                        f"Refusing to extract {member.name!r} outside {target_dir}"
                    )
            tar.extractall(path=target_dir)
    print(f"Extracted {file_path} to {target_dir}")

# Unpack the downloaded LFW archive into the real_faces folder
extract_tgz(file_path=lfw_tgz_path, target_dir=extract_path)

Extracted ./datasets/real_faces/lfw-funneled.tgz to ./datasets/real_faces


Now all the initial datasets are downloaded. We will load 2000 images from each dataset, then randomly pick 1250 of each for the training set and 400 of each for the validation set; the remaining 350 of each form the test set.

In [None]:
# Root folders of the two image collections; both are walked recursively by
# list_files, so images may sit in nested sub-folders.
ai_faces_dir = "./datasets/ai_faces/5k"
real_faces_dir = "./datasets/real_faces/lfw_funneled"

# Function to list files in a directory
def list_files(directory):
    """Recursively collect the paths of image files under ``directory``.

    A file counts as an image when its name ends in ``.png``, ``.jpg`` or
    ``.jpeg``, compared case-insensitively so that e.g. ``photo.JPG`` is not
    silently skipped.

    Args:
        directory: Root folder to walk.

    Returns:
        list[str]: Paths built as ``os.path.join(root, filename)``, in the
        order ``os.walk`` yields them. Empty if ``directory`` does not exist.
    """
    if not os.path.exists(directory):
        print(f"Directory does not exist: {directory}")
        return []
    print(f"Listing files in directory: {directory}")
    image_suffixes = ('.png', '.jpg', '.jpeg')
    return [
        os.path.join(root, filename)
        for root, _, filenames in os.walk(directory)  # Walk through the directory
        for filename in filenames
        if filename.lower().endswith(image_suffixes)  # Filter for image files
    ]

# Scan the AI-generated faces folder and report what was found
ai_faces_files = list_files(ai_faces_dir)
print(
    f"Found {len(ai_faces_files)} files in AI faces directory.",
    f"First 5 files: {ai_faces_files[:5]}",
    sep="\n",
)

# Same scan and report for the real faces folder
real_faces_files = list_files(real_faces_dir)
print(
    f"Found {len(real_faces_files)} files in Real faces directory.",
    f"First 5 files: {real_faces_files[:5]}",
    sep="\n",
)


Listing files in directory: ./datasets/ai_faces/5k
Found 5000 files in AI faces directory.
First 5 files: ['./datasets/ai_faces/5k/seed303843.png', './datasets/ai_faces/5k/seed302585.png', './datasets/ai_faces/5k/seed300392.png', './datasets/ai_faces/5k/seed301932.png', './datasets/ai_faces/5k/seed917028.png']
Listing files in directory: ./datasets/real_faces/lfw_funneled
Found 13233 files in Real faces directory.
First 5 files: ['./datasets/real_faces/lfw_funneled/German_Khan/German_Khan_0001.jpg', './datasets/real_faces/lfw_funneled/Stefano_Gabbana/Stefano_Gabbana_0001.jpg', './datasets/real_faces/lfw_funneled/Dragan_Covic/Dragan_Covic_0001.jpg', './datasets/real_faces/lfw_funneled/Jeff_Hornacek/Jeff_Hornacek_0001.jpg', './datasets/real_faces/lfw_funneled/Sureyya_Ayhan/Sureyya_Ayhan_0001.jpg']


In [63]:
!pip install numpy 
!pip install pillow
from PIL import Image
import numpy as np



In [65]:
# Function to load and preprocess images into an array
def load_images_to_array(file_paths, img_size):
    """Load images, convert them to RGB, resize, and scale pixels by 1/255.

    Files that fail to open or process are reported on stdout and skipped, so
    the result may hold fewer rows than ``file_paths`` has entries.

    Args:
        file_paths: Iterable of image file paths.
        img_size: ``(width, height)`` tuple passed to ``Image.resize``.

    Returns:
        numpy.ndarray: Float array of shape ``(n, height, width, 3)``. When no
        image could be loaded, ``n`` is 0 but the array keeps its four
        dimensions so downstream shape handling stays uniform.
    """
    images = []
    for filepath in file_paths:
        try:
            # The context manager releases the file handle deterministically
            # instead of waiting for garbage collection.
            with Image.open(filepath) as img:
                rgb = img.convert("RGB")  # Ensure RGB format
                resized = rgb.resize(img_size)  # Resize to target size
            images.append(np.array(resized) / 255.0)  # Normalize pixel values
        except Exception as e:
            # Deliberate best-effort: one bad file must not abort the batch
            print(f"Error processing {filepath}: {e}")
    if not images:
        # np.array([]) would have shape (0,); return an empty batch with the
        # same trailing dimensions a non-empty result would have.
        width, height = img_size
        return np.empty((0, height, width, 3))
    return np.array(images)


In [66]:
# Preprocessing settings: target (width, height) and per-class image cap
IMG_SIZE, MAX_IMAGES = (128, 128), 2000

# AI faces: take the first MAX_IMAGES paths and turn them into one array
ai_faces_array = load_images_to_array(ai_faces_files[0:MAX_IMAGES], IMG_SIZE)
print(f"AI faces array shape: {ai_faces_array.shape}")

# Real faces: same treatment
real_faces_array = load_images_to_array(real_faces_files[0:MAX_IMAGES], IMG_SIZE)
print(f"Real faces array shape: {real_faces_array.shape}")

# Persist both arrays so later cells can reload them without reprocessing
np.save(file="ai_faces.npy", arr=ai_faces_array)
np.save(file="real_faces.npy", arr=real_faces_array)

print("Images have been processed and saved to numpy arrays.")

AI faces array shape: (2000, 128, 128, 3)
Real faces array shape: (2000, 128, 128, 3)
Images have been processed and saved to numpy arrays.


In [67]:
# Read the preprocessed image arrays back from disk
ai_faces, real_faces = (
    np.load(npy_path) for npy_path in ("ai_faces.npy", "real_faces.npy")
)

# Per-class sample counts for the training and validation splits
train_size, val_size = 1250, 400

# Function to split the data randomly
def split_data(data, train_size, val_size, seed=None):
    """Randomly partition ``data`` into train, validation and test subsets.

    Args:
        data: Array indexable by an integer index array along its first axis.
        train_size: Number of samples in the training subset.
        val_size: Number of samples in the validation subset.
        seed: Optional seed for a reproducible split. When None, the global
            NumPy random state is used, as before.

    Returns:
        tuple: ``(train, val, test)``, where ``test`` holds every sample not
        assigned to the first two subsets.

    Raises:
        ValueError: If a size is negative or the requested sizes exceed
            ``len(data)``. Without this check the later subsets would
            silently come back short or empty.
    """
    total = len(data)
    if train_size < 0 or val_size < 0:
        raise ValueError("train_size and val_size must be non-negative")
    if train_size + val_size > total:
        raise ValueError(
            f"train_size + val_size ({train_size + val_size}) exceeds "
            f"the number of samples ({total})"
        )

    if seed is None:
        indices = np.random.permutation(total)  # Shuffle indices
    else:
        # A dedicated generator keeps the split independent of global state
        indices = np.random.default_rng(seed).permutation(total)

    val_end = train_size + val_size
    train_indices = indices[:train_size]
    val_indices = indices[train_size:val_end]
    test_indices = indices[val_end:]
    return data[train_indices], data[val_indices], data[test_indices]

# Partition the AI-generated faces
ai_train, ai_val, ai_test = split_data(ai_faces, train_size, val_size)
print(f"AI Train: {ai_train.shape}, Validation: {ai_val.shape}, Test: {ai_test.shape}")

# Partition the real faces
real_train, real_val, real_test = split_data(real_faces, train_size, val_size)
print(f"Real Train: {real_train.shape}, Validation: {real_val.shape}, Test: {real_test.shape}")

# Stack the two classes per split and build labels: 0 = AI, 1 = Real
X_train = np.concatenate([ai_train, real_train])
y_train = np.array([0] * ai_train.shape[0] + [1] * real_train.shape[0])

X_val = np.concatenate([ai_val, real_val])
y_val = np.array([0] * ai_val.shape[0] + [1] * real_val.shape[0])

X_test = np.concatenate([ai_test, real_test])
y_test = np.array([0] * ai_test.shape[0] + [1] * real_test.shape[0])

# Draw one permutation per split so the two classes end up interleaved
train_indices = np.random.permutation(X_train.shape[0])
val_indices = np.random.permutation(X_val.shape[0])
test_indices = np.random.permutation(X_test.shape[0])

# Apply the same permutation to images and labels so the pairs stay aligned
X_train = X_train[train_indices]
y_train = y_train[train_indices]
X_val = X_val[val_indices]
y_val = y_val[val_indices]
X_test = X_test[test_indices]
y_test = y_test[test_indices]

# Report the final dataset shapes
print(
    f"Training Set: X_train: {X_train.shape}, y_train: {y_train.shape}",
    f"Validation Set: X_val: {X_val.shape}, y_val: {y_val.shape}",
    f"Test Set: X_test: {X_test.shape}, y_test: {y_test.shape}",
    sep="\n",
)

AI Train: (1250, 128, 128, 3), Validation: (400, 128, 128, 3), Test: (350, 128, 128, 3)
Real Train: (1250, 128, 128, 3), Validation: (400, 128, 128, 3), Test: (350, 128, 128, 3)
Training Set: X_train: (2500, 128, 128, 3), y_train: (2500,)
Validation Set: X_val: (800, 128, 128, 3), y_val: (800,)
Test Set: X_test: (700, 128, 128, 3), y_test: (700,)
