In [None]:
!pip install torch torchvision datasets scikit-learn matplotlib tqdm --quiet


In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Subset
from torchvision import datasets, transforms, models
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import numpy as np
from datasets import load_dataset
from tqdm import tqdm
from scipy.linalg import sqrtm
import random
from PIL import Image


class ResNetFeatureExtractor(nn.Module):
    """ResNet-50 (IMAGENET1K_V2 weights) with its last child module removed.

    The truncated network is put into eval mode when the extractor is built
    and is always evaluated without gradient tracking. ``forward`` returns one
    flattened feature vector per input sample.
    """

    def __init__(self):
        super().__init__()
        backbone = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V2)
        # Keep every child of the backbone except the final one.
        kept_layers = list(backbone.children())[:-1]
        self.features = nn.Sequential(*kept_layers)
        self.features.eval()

    def forward(self, x):
        """Return the truncated network's output reshaped to ``(batch, -1)``."""
        with torch.no_grad():
            pooled = self.features(x)
        batch_size = pooled.size(0)
        return pooled.view(batch_size, -1)

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# One shared feature extractor, moved onto the selected device.
model = ResNetFeatureExtractor()
model = model.to(device)

def extract_features(dataloader, model, device):
    """Run ``model`` over every batch of ``dataloader`` and collect the outputs.

    Args:
        dataloader: Iterable yielding ``(images, labels)`` batches.
        model: Callable applied to each image batch after it is moved to ``device``.
        device: Device the image batches are moved to before the forward pass.

    Returns:
        Tuple ``(features, labels)`` of NumPy arrays: the model outputs and the
        labels, each concatenated along the first axis in iteration order.

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    feature_batches = []
    label_batches = []
    with torch.no_grad():
        for images, lbls in tqdm(dataloader, desc="Extracting Features"):
            output = model(images.to(device))
            feature_batches.append(output.detach().cpu().numpy())
            # Convert each label batch in one step instead of iterating over
            # the tensor element by element (which creates one 0-d tensor per
            # sample and fails for tensors that do not live on the CPU).
            if torch.is_tensor(lbls):
                lbls = lbls.detach().cpu().numpy()
            label_batches.append(np.asarray(lbls))
    if not feature_batches:
        raise ValueError("dataloader yielded no batches; nothing to extract")
    return np.concatenate(feature_batches), np.concatenate(label_batches)

Function for computing the cosine similarity between the mean feature vectors of two datasets

In [None]:
def cosine_dataset_similarity(features_A, features_B):
    """Cosine similarity between the mean feature vectors of two datasets.

    Each feature array is averaged over its first axis, and the two resulting
    mean vectors are compared with ``cosine_similarity``.

    Returns:
        The single similarity value for the pair of mean vectors.
    """
    centroid_a, centroid_b = (
        np.mean(feats, axis=0) for feats in (features_A, features_B)
    )
    # cosine_similarity works on 2-D inputs, so wrap each mean vector as one row.
    pairwise = cosine_similarity([centroid_a], [centroid_b])
    return pairwise[0][0]

Function for computing an FID-like score (Fréchet distance between the mean/covariance statistics of two feature sets)

In [None]:

def fid_like_score(features_A, features_B, eps=1e-6):
    """Frechet (FID-style) distance between two sets of feature vectors.

    Each set is summarised by its mean vector and covariance matrix, and the
    score is ``||mu1 - mu2||^2 + Tr(sigma1 + sigma2 - 2 * sqrt(sigma1 @ sigma2))``.

    Args:
        features_A: Array-like of shape ``(n_samples_A, n_features)``.
        features_B: Array-like of shape ``(n_samples_B, n_features)``.
        eps: Value added to the diagonal of both covariance matrices when the
            matrix square root of their product contains non-finite entries.

    Returns:
        The distance as a floating point scalar.

    Raises:
        ValueError: If an input is not 2-D, has fewer than two samples, or the
            two inputs have a different number of features.
    """
    features_A = np.asarray(features_A, dtype=np.float64)
    features_B = np.asarray(features_B, dtype=np.float64)
    for name, feats in (("features_A", features_A), ("features_B", features_B)):
        if feats.ndim != 2:
            raise ValueError(f"{name} must be 2-D (samples x features), got shape {feats.shape}")
        if feats.shape[0] < 2:
            # A covariance estimate from fewer than two samples is undefined.
            raise ValueError(f"{name} needs at least 2 samples, got {feats.shape[0]}")
    if features_A.shape[1] != features_B.shape[1]:
        raise ValueError(
            "feature dimensions differ: "
            f"{features_A.shape[1]} vs {features_B.shape[1]}"
        )

    mu1 = features_A.mean(axis=0)
    mu2 = features_B.mean(axis=0)
    # np.cov returns a 0-d array for a single feature column; keep it a matrix.
    sigma1 = np.atleast_2d(np.cov(features_A, rowvar=False))
    sigma2 = np.atleast_2d(np.cov(features_B, rowvar=False))

    diff = mu1 - mu2
    covmean = sqrtm(sigma1.dot(sigma2))
    if not np.isfinite(covmean).all():
        # The product can be singular (e.g. fewer samples than features);
        # retry with a small ridge on both covariance diagonals.
        offset = np.eye(sigma1.shape[0]) * eps
        covmean = sqrtm((sigma1 + offset).dot(sigma2 + offset))
    if np.iscomplexobj(covmean):
        # Numerical error can introduce an imaginary component; keep the real part.
        covmean = covmean.real
    return diff.dot(diff) + np.trace(sigma1) + np.trace(sigma2) - 2.0 * np.trace(covmean)

Helper for plotting a heatmap of the cosine similarity scores between the different CIFAR-10 classes

In [None]:

def plot_heatmap(matrix, labels, title="Similarity Heatmap", colorbar_label="Cosine Similarity"):
    """Show ``matrix`` as a colour-coded heatmap with labelled axes.

    Args:
        matrix: 2-D array-like of values to display.
        labels: Tick labels, used for both the x and the y axis.
        title: Title drawn above the plot.
        colorbar_label: Caption of the colorbar. Defaults to the label that was
            previously hard-coded, so existing calls render the same caption.
    """
    fig, ax = plt.subplots(figsize=(8, 6))
    image = ax.imshow(matrix, cmap="viridis")

    tick_positions = list(range(len(labels)))
    ax.set_xticks(tick_positions)
    ax.set_xticklabels(labels, rotation=45)
    ax.set_yticks(tick_positions)
    ax.set_yticklabels(labels)

    fig.colorbar(image, ax=ax, label=colorbar_label)
    ax.set_title(title)
    plt.show()

Transforming the data (images) to the input size and normalization expected by ResNet-50.

In [None]:
# Preprocessing applied to every image: resize to 224x224, convert to a tensor,
# then normalize each channel with the commonly used ImageNet mean/std values.
transform = transforms.Compose(
    [
        transforms.Resize(size=(224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)

Experiment 1:
 Finding the similarity score and FID score for CIFAR-10

In [None]:
# Training split of CIFAR-10, stored under ./data, with the shared `transform`
# applied to every image.
cifar10 = datasets.CIFAR10(
    root="./data",
    train=True,
    download=True,
    transform=transform,
)
classes = cifar10.classes
print("\nCIFAR-10 Classes:", classes)

def get_class_subset(dataset, class_idx, n=200, seed=None):
    """Return a random ``Subset`` of ``n`` samples whose label equals ``class_idx``.

    Labels are read from ``dataset.targets`` when that attribute holds a 1-D
    sequence, so no sample has to be loaded (and transformed) just to inspect
    its label. Datasets without such an attribute are scanned item by item.

    Args:
        dataset: Indexable dataset yielding ``(sample, label)`` pairs.
        class_idx: Label value to select.
        n: Number of samples to draw without replacement.
        seed: Optional seed for a private random generator; when ``None`` the
            global ``random`` module state is used.

    Returns:
        A ``Subset`` of ``dataset`` restricted to the drawn indices.

    Raises:
        ValueError: If fewer than ``n`` samples carry the requested label.
    """
    indices = None
    targets = getattr(dataset, "targets", None)
    if targets is not None:
        target_array = np.asarray(targets)
        if target_array.ndim == 1:
            indices = np.flatnonzero(target_array == class_idx).tolist()
    if indices is None:
        # Fallback: iterate the dataset itself (loads every sample once).
        indices = [i for i, (_, label) in enumerate(dataset) if label == class_idx]

    if n > len(indices):
        raise ValueError(
            f"Requested {n} samples of class {class_idx}, "
            f"but only {len(indices)} are available"
        )
    rng = random if seed is None else random.Random(seed)
    return Subset(dataset, rng.sample(indices, n))

# Extract ResNet features for a random sample of every CIFAR-10 class.
class_features = {}
for class_idx, class_name in enumerate(classes):
    subset = get_class_subset(cifar10, class_idx)
    loader = DataLoader(subset, batch_size=32, shuffle=False)
    feats, _ = extract_features(loader, model, device)
    class_features[class_name] = feats

# Pairwise similarity between the class feature sets. The matrix is sized from
# the class list rather than a hard-coded 10 so it always matches `classes`.
num_classes = len(classes)
sim_matrix = np.zeros((num_classes, num_classes))
for i, class_a in enumerate(classes):
    for j, class_b in enumerate(classes):
        sim_matrix[i, j] = cosine_dataset_similarity(
            class_features[class_a], class_features[class_b]
        )

plot_heatmap(sim_matrix, classes, title="CIFAR-10 Semantic Similarity (ResNet50)")

Experiment 2:
     Finding the similarity and FID score for the CIFAKE real and fake data. Preprocessing uses integer labels: 0 = REAL, 1 = FAKE

In [None]:
# Load the CIFAKE dataset by its repository id.
cifake = load_dataset(path="dragonintelligence/CIFAKE-image-dataset")


def preprocess_hf(dataset, label_int, n=500, image_transform=None):
    """Collect up to ``n`` transformed images with a given label from ``dataset["train"]``.

    Rows are taken in dataset order and the scan stops as soon as ``n`` matching
    rows have been found, so the rest of the split is never read.

    Args:
        dataset: Mapping with a ``"train"`` entry that iterates over rows
            exposing ``"image"`` and ``"label"`` keys.
        label_int: Integer label to keep.
        n: Maximum number of rows to keep (``None`` keeps every match).
        image_transform: Callable applied to each row's image. Defaults to the
            module-level ``transform`` pipeline.

    Returns:
        List of ``(image, label)`` pairs, where every label is a 0-d integer
        tensor holding ``label_int``.

    Raises:
        ValueError: If no row with ``label_int`` is found.
    """
    from itertools import islice

    if image_transform is None:
        image_transform = transform

    matching_rows = (row for row in dataset["train"] if row["label"] == label_int)
    subset = list(islice(matching_rows, n))
    if len(subset) == 0:
        raise ValueError(f"No images found for label {label_int}")

    images = [image_transform(row["image"]) for row in subset]
    # Carry the requested label; the labels used to be all zeros regardless of
    # `label_int`, which mislabelled every non-zero class.
    labels = torch.full((len(images),), label_int, dtype=torch.long)
    return list(zip(images, labels))

# Label 0 selects the REAL images and label 1 the FAKE ones; keep 500 of each.
real_subset = preprocess_hf(dataset=cifake, label_int=0, n=500)
fake_subset = preprocess_hf(dataset=cifake, label_int=1, n=500)

def create_loader(subset, batch_size=32):
    """Wrap ``(image, label)`` pairs in a non-shuffling ``DataLoader``.

    Args:
        subset: Iterable of ``(image, label)`` pairs; it is copied into a list.
        batch_size: Number of pairs per batch.

    Returns:
        A ``DataLoader`` that yields the pairs in their original order. An
        empty ``subset`` gives a loader that yields no batches.
    """
    # The pairs are already in the layout DataLoader expects, so they are used
    # directly instead of being unzipped and zipped back together (which also
    # failed on an empty input).
    return DataLoader(list(subset), batch_size=batch_size, shuffle=False)

# Features of the REAL images.
real_loader = create_loader(real_subset)
real_feats, _ = extract_features(real_loader, model, device)

# Features of the FAKE images.
fake_loader = create_loader(fake_subset)
fake_feats, _ = extract_features(fake_loader, model, device)

# Compare the two feature sets with both metrics and report them.
cos_sim = cosine_dataset_similarity(real_feats, fake_feats)
fid_score = fid_like_score(real_feats, fake_feats)

print(f"Cosine Semantic Similarity: {cos_sim:.4f}")
print(f"FID-like Semantic Distance: {fid_score:.2f}")