As our own contribution, we are making the experiment again with cat images, instead of human faces, to identify if the models' perform is the same with animals and humans'
As for the score methodology, the FID score is model-agnostic, it compares the distribution of features of two image sets, so it can be used with any kind of sets, not only human faces

In [1]:
# ====================================
# STEP 1: IMPORT LIBRARIES
# ====================================
import os
import numpy as np
from PIL import Image
from tqdm import tqdm
import torch
import torchvision.transforms as transforms
import torchvision.models as models
from scipy import linalg
import torch.nn as nn

Unfortunatly, Midjourney requires a paid subcription, at least 10 USD/month for the first tier, and DALLE2 gives only a few free credits for the free tier.
Stable Diffusion is open-source, so this analysis will only perform on this model

In [2]:
# ====================================
# STEP 2: DEFINE PATHS (LOCAL SETUP)
# ====================================
real_path = r'C:\Users\hugo_\Documents\hugo_garcia\Big_Data_Analytics_georgian_college\AI management\1 semester\Machine Learning Programming\Final Project\datasets\Project_Cat_images\Real_data\Real_images\content\OID\Dataset\train\Cat\images'
sd_path = r'C:\Users\hugo_\Documents\hugo_garcia\Big_Data_Analytics_georgian_college\AI management\1 semester\Machine Learning Programming\Final Project\datasets\Project_Cat_images\Stable_Diffusion\Generated_images'

In [3]:
# ====================================
# STEP 3: IMAGE LOADING FUNCTION
# ====================================
def get_activations(folder, model, batch_size=50, max_images=2000):
    model.eval()
    preprocess = transforms.Compose([
        transforms.Resize((299, 299)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5]*3, std=[0.5]*3),
    ])

    images = []
    image_files = [f for f in os.listdir(folder) if f.lower().endswith(('jpg', 'png', 'jpeg'))]
    image_files = image_files[:max_images]  # limit number of images

    for file in tqdm(image_files, desc=f'Loading images from {os.path.basename(folder)}'):
        img_path = os.path.join(folder, file)
        img = Image.open(img_path).convert('RGB')
        img = preprocess(img)
        images.append(img)

    images = torch.stack(images)
    activations = []

    with torch.no_grad():
        for i in range(0, len(images), batch_size):
            batch = images[i:i+batch_size]
            output = model(batch)
            activations.append(output.cpu().numpy())

    activations = np.concatenate(activations, axis=0)
    return activations


In [4]:
# ====================================
# STEP 4: LOAD INCEPTIONV3 MODEL
# ====================================
from torchvision.models import inception_v3, Inception_V3_Weights

weights = Inception_V3_Weights.DEFAULT
inception = inception_v3(weights=weights, aux_logits=True)  
inception.fc = nn.Identity()
inception.eval()


Inception3(
  (Conv2d_1a_3x3): BasicConv2d(
    (conv): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), bias=False)
    (bn): BatchNorm2d(32, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
  )
  (Conv2d_2a_3x3): BasicConv2d(
    (conv): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), bias=False)
    (bn): BatchNorm2d(32, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
  )
  (Conv2d_2b_3x3): BasicConv2d(
    (conv): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (bn): BatchNorm2d(64, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
  )
  (maxpool1): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
  (Conv2d_3b_1x1): BasicConv2d(
    (conv): Conv2d(64, 80, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (bn): BatchNorm2d(80, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
  )
  (Conv2d_4a_3x3): BasicConv2d(
    (conv): Conv2d(80, 192, kernel_size=(3, 3), stri

In [5]:
# STEP 5: FID CALCULATION FUNCTION
# ====================================
def calculate_fid(act1, act2):
    mu1, sigma1 = act1.mean(axis=0), np.cov(act1, rowvar=False)
    mu2, sigma2 = act2.mean(axis=0), np.cov(act2, rowvar=False)

    diff = mu1 - mu2
    covmean, _ = linalg.sqrtm(sigma1 @ sigma2, disp=False)
    if np.iscomplexobj(covmean):
        covmean = covmean.real
    fid = diff @ diff + np.trace(sigma1 + sigma2 - 2 * covmean)
    return fid

In [6]:
# ====================================
# STEP 6: LOAD ACTIVATIONS
# ====================================
print("Extracting activations...")

act_real = get_activations(real_path, inception)
act_sd = get_activations(sd_path, inception)


Extracting activations...


Loading images from images: 100%|██████████████████████████████████████████████████| 2000/2000 [02:10<00:00, 15.36it/s]
Loading images from Generated_images: 100%|██████████████████████████████████████████| 100/100 [00:04<00:00, 23.07it/s]


In [9]:
# ====================================
# STEP 7: CALCULATE FID SCORES
# ====================================
print("\nCalculating FID Scores...\n")

fid_sd = calculate_fid(act_real, act_sd)

print(f'FID (Stable Diffusion vs Real): {fid_sd:.2f}')


Calculating FID Scores...

FID (Stable Diffusion vs Real): 187.67
