# Implementation of NS-NET (Yan et al. 2025)
Since the NS-Net model is not publicly available, we have re-implemented it based on the paper (https://www.arxiv.org/abs/2508.01248).
Training and testing are done on the DALLE Recognition Dataset, which is available on Kaggle (https://www.kaggle.com/datasets/superpotato9/dalle-recognition-dataset).

Note: LLMs were used only for debugging the code.

## Importing the Dataset

In [None]:
import kagglehub

# Download the latest version of the DALLE Recognition Dataset through
# kagglehub; `path` holds whatever location dataset_download() returns.
path = kagglehub.dataset_download("superpotato9/dalle-recognition-dataset")

# Show where the dataset files ended up so later cells can point at them.
print("Path to dataset files:", path)

## Importing BLIP for captioning of images

In [None]:
import sys, types, importlib, open_clip

# Compatibility shim for open_clip builds that do not expose `get_tokenizer`.
if not hasattr(open_clip, "get_tokenizer"):
    def _get_tokenizer(model_name):
        """Fallback tokenizer factory for open_clip builds without get_tokenizer.

        ``model_name`` is accepted only to mirror the real signature. The
        module-level ``open_clip.tokenize`` function is returned because the
        pipeline invokes the tokenizer as ``tokenizer(list_of_strings)``;
        the internal ``tokenizer._tokenizer`` object returned previously is
        an encoder instance that older releases do not make callable.
        """
        return open_clip.tokenize
    open_clip.get_tokenizer = _get_tokenizer

# NOTE(review): after a successful `import open_clip` the package has normally
# already loaded its own `open_clip.tokenizer` submodule, so this branch is
# expected to be a no-op - confirm before relying on the stand-in below.
if "open_clip.tokenizer" not in sys.modules:
    tok_mod = types.SimpleNamespace(get_tokenizer=open_clip.get_tokenizer)
    sys.modules["open_clip.tokenizer"] = tok_mod

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.5 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━[0m [32m0.8/1.5 MB[0m [31m24.7 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m24.9 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/44.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25h



## NS-NET architecture
Individual components have been labelled accordingly in the code.

In [None]:
%%writefile nsnet_cpu.py

import os, math, json, random, numpy as np, torch, torch.nn as nn, torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
from PIL import Image
from tqdm import tqdm
from transformers import BlipProcessor, BlipForConditionalGeneration
import open_clip, loralib
from sklearn.metrics import accuracy_score, average_precision_score
import os, json
from tqdm import tqdm
from PIL import Image, UnidentifiedImageError
import torch
from transformers import BlipProcessor, BlipForConditionalGeneration

# ============== helpers ==============
def get_device():
    """Return the CUDA device when torch reports one is available, else the CPU."""
    if torch.cuda.is_available():
        return torch.device("cuda")
    return torch.device("cpu")

# ============== PATCH SELECTION ==============
def spectral_entropy(patch):
    """Return the Shannon entropy of a patch's normalised 2-D FFT magnitudes.

    ``patch`` must convert to an array with a third (channel) axis; the
    channels are averaged before the transform. A small epsilon (1e-8) guards
    both the normalisation and the logarithm.
    """
    # Collapse the channel axis to get a single-channel image.
    gray = np.asarray(patch).mean(axis=2)
    # Magnitude spectrum, flattened so it can be treated as a distribution.
    spectrum = np.abs(np.fft.fft2(gray)).ravel()
    prob = spectrum / (spectrum.sum() + 1e-8)
    entropy = -np.sum(prob * np.log(prob + 1e-8))
    return float(entropy)

def patch_select(img, patch_size=32, out_size=224):
    """Rebuild an image from its highest- and lowest-entropy patches.

    The image is converted to RGB, padded with black on the right/bottom up to
    a multiple of ``patch_size``, cut into ``patch_size`` x ``patch_size``
    tiles and ranked by ``spectral_entropy``. The top quarter and the bottom
    quarter of that ranking (at least one tile each) are shuffled and tiled,
    cycling when there are too few, into an ``out_size`` x ``out_size`` image.

    Args:
        img: PIL image.
        patch_size: side length of each square tile, in pixels.
        out_size: side length of the returned square image, in pixels.

    Returns:
        A new RGB PIL image of size ``(out_size, out_size)``.

    Raises:
        ValueError: if the image has zero width or height (no tiles).
    """
    img = img.convert("RGB")
    w, h = img.size
    new_w = math.ceil(w / patch_size) * patch_size
    new_h = math.ceil(h / patch_size) * patch_size
    padded = Image.new("RGB", (new_w, new_h))
    padded.paste(img, (0, 0))

    patches = [
        padded.crop((c, r, c + patch_size, r + patch_size))
        for r in range(0, new_h, patch_size)
        for c in range(0, new_w, patch_size)
    ]
    if not patches:
        raise ValueError("patch_select() needs an image with non-zero width and height")

    order = np.argsort([spectral_entropy(p) for p in patches])
    # Same count for both ends. The previous `idx[-len(idx)//4:]` parsed as
    # (-len)//4, which rounded the high-entropy group up while the low-entropy
    # group was rounded down (and was empty for fewer than four tiles).
    k = max(1, len(order) // 4)
    chosen = np.concatenate([order[-k:], order[:k]])
    sel = [patches[i] for i in chosen]
    random.shuffle(sel)

    grid = out_size // patch_size
    new = Image.new("RGB", (out_size, out_size))
    for i in range(grid * grid):
        r, c = divmod(i, grid)
        # Tiles are already patch_size x patch_size, so no resize is needed.
        new.paste(sel[i % len(sel)], (c * patch_size, r * patch_size))
    return new

# ============== DATASET ==============
class DALLEDataset(Dataset):
    """Real-vs-fake image dataset read from ``<root>/real`` and ``<root>/fakeV2/fake-v2``.

    Files are taken in sorted filename order so every instance sees the same
    listing. The "train" split is the first ``train_limit_*`` files of each
    class and the "test" split is the ``test_limit_*`` files that follow them,
    so the two splits never share an image and the test split is identical
    every time it is constructed. (Previously each instance reshuffled the
    listing and sliced from index 0, which let test images overlap with the
    training images and change between runs.) Any other ``split`` value keeps
    every listed file.

    Args:
        root: dataset root directory.
        split: "train", "test", or anything else for "all files".
        captions: mapping from "<parent folder>_<file name>" to caption text.
        transform: optional callable applied to the patch-selected PIL image.
        train_limit_real / train_limit_fake: images per class in the train split.
        test_limit_real / test_limit_fake: images per class in the test split.

    Each item is ``(image, label, caption)`` with label 0 for real and 1 for
    fake; the caption is "" when the key is missing from ``captions``.
    """

    _IMAGE_EXTS = (".png", ".jpg", ".jpeg")

    def __init__(self, root, split, captions, transform=None,
                 train_limit_real=2000, train_limit_fake=2000,
                 test_limit_real=200, test_limit_fake=200):
        self.samples = []
        self.transform = transform
        self.captions = captions

        real_imgs = self._list_images(os.path.join(root, "real"))
        fake_imgs = self._list_images(os.path.join(root, "fakeV2", "fake-v2"))

        if split == "train":
            real_imgs = real_imgs[:train_limit_real]
            fake_imgs = fake_imgs[:train_limit_fake]
        elif split == "test":
            # Start right after the training slice so the splits are disjoint.
            real_imgs = real_imgs[train_limit_real:train_limit_real + test_limit_real]
            fake_imgs = fake_imgs[train_limit_fake:train_limit_fake + test_limit_fake]

        self.samples.extend((img, 0) for img in real_imgs)  # REAL = 0
        self.samples.extend((img, 1) for img in fake_imgs)  # FAKE = 1

        print(f"[{split.upper()}] Loaded {len(real_imgs)} REAL and {len(fake_imgs)} FAKE images from '{root}'")

    @classmethod
    def _list_images(cls, folder):
        """Return the image paths inside ``folder`` in sorted order ([] if it is missing)."""
        if not os.path.isdir(folder):
            return []
        return [
            os.path.join(folder, name)
            for name in sorted(os.listdir(folder))
            if name.lower().endswith(cls._IMAGE_EXTS)
        ]

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, i):
        path, y = self.samples[i]
        # Close the file handle as soon as the pixels are decoded.
        with Image.open(path) as handle:
            img = handle.convert("RGB")
        img = patch_select(img)  # Apply spectral patch selection
        if self.transform:
            img = self.transform(img)
        # Caption key is "<parent folder>_<file name>".
        rel_folder = os.path.basename(os.path.dirname(path))
        rel_name = os.path.basename(path)
        unique_key = f"{rel_folder}_{rel_name}"
        cap = self.captions.get(unique_key, "")
        return img, y, cap

# ============== NULL SPACE ==============
def build_nullspace(text_feats, null_dim=None, tol=1e-6):
    """Build an orthogonal projector onto the (approximate) null space of text features.

    Args:
        text_feats: 2-D array of shape (n_samples, D).
        null_dim: number of null-space directions to keep. When falsy it is
            ``D - rank`` (at least 1, so for a full-rank input the direction
            with the smallest singular value is used).
        tol: absolute threshold; singular values above it count towards the rank.

    Returns:
        ``(P, N)``: ``P`` is a float32 torch tensor of shape (D, D) equal to
        ``N @ N.T``; ``N`` is the (D, null_dim) numpy basis.
    """
    text_feats = np.asarray(text_feats)
    n_rows, D = text_feats.shape
    # The reduced SVD only returns min(n, D) right singular vectors. With fewer
    # rows than columns that omits exactly the null-space directions, and the
    # trailing-columns slice would pick row-space vectors instead. Ask for the
    # full D x D basis in that case; for n >= D the reduced VT is already D x D.
    _, S, VT = np.linalg.svd(text_feats, full_matrices=bool(n_rows < D))
    rank = int(np.sum(S > tol))
    null_dim = null_dim or max(1, D - rank)
    null_dim = min(int(null_dim), D)
    # Rows of VT are ordered by decreasing singular value, so the last rows
    # span the directions least represented in text_feats.
    N = VT[-null_dim:].T
    P = N @ N.T
    return torch.from_numpy(P).float(), N

# ============== MODEL ==============
class NSNetHead(nn.Module):
    """Two-layer projection MLP followed by a single-logit linear classifier."""

    def __init__(self, dim, proj_dim=512):
        super().__init__()
        # Linear -> ReLU -> Linear; attribute names fix the state-dict keys.
        layers = [
            nn.Linear(dim, proj_dim),
            nn.ReLU(),
            nn.Linear(proj_dim, proj_dim),
        ]
        self.proj = nn.Sequential(*layers)
        self.cls = nn.Linear(proj_dim, 1)

    def forward(self, x):
        """Return ``(unit-norm projected features, per-sample logit)`` for a batch ``x``."""
        embedding = self.proj(x)
        # The classifier sees the un-normalised projection.
        logit = self.cls(embedding).squeeze(-1)
        unit_embedding = F.normalize(embedding, dim=1)
        return unit_embedding, logit

# ============== LOSSES & METRICS ==============
def nt_xent(f, y, T=0.07):
    """Label-supervised contrastive loss over a batch of embeddings.

    For each anchor with at least one other same-label sample in the batch the
    loss is ``-log(sum_pos exp(s/T) / sum_others exp(s/T))`` where ``s`` are
    dot products with the other samples; the result is the mean over those
    anchors.

    Anchors without any same-label partner are skipped: they have no positive
    term, and previously contributed a large constant-like penalty
    (``-log(1e-8 / denom)``). The ratio is evaluated with ``logsumexp`` so
    large similarities cannot overflow ``exp``.

    Args:
        f: (batch, dim) embeddings (the caller passes L2-normalised features).
        y: (batch,) labels.
        T: temperature.

    Returns:
        Scalar tensor; a zero that is still connected to ``f`` when no anchor
        has a positive partner.
    """
    n = f.shape[0]
    not_self = ~torch.eye(n, dtype=torch.bool, device=f.device)
    positives = (y.unsqueeze(1) == y.unsqueeze(0)) & not_self
    has_pos = positives.any(dim=1)
    if not bool(has_pos.any()):
        # Keeps the autograd graph intact so callers can still call backward().
        return f.sum() * 0.0

    neg_inf = float("-inf")
    sim = (f @ f.T) / T
    # Drop self-similarity, then keep only anchors that have a positive; every
    # remaining row has at least one finite entry, so both logsumexps are finite.
    sim = sim.masked_fill(~not_self, neg_inf)[has_pos]
    pos = positives[has_pos]
    log_num = torch.logsumexp(sim.masked_fill(~pos, neg_inf), dim=1)
    log_den = torch.logsumexp(sim, dim=1)
    return (log_den - log_num).mean()

def metrics(y, logit):
    """Compute accuracy, per-class accuracy and average precision from raw logits.

    ``y`` holds 0 (real) / 1 (fake) labels and ``logit`` the matching scores;
    predictions threshold the sigmoid probability at 0.5.
    """
    labels = np.array(y)
    probs = torch.sigmoid(torch.tensor(logit)).numpy()
    preds = (probs >= 0.5).astype(int)

    real_mask = labels == 0
    fake_mask = labels == 1
    return {
        "acc": accuracy_score(labels, preds),
        "r_acc": accuracy_score(labels[real_mask], preds[real_mask]),
        "f_acc": accuracy_score(labels[fake_mask], preds[fake_mask]),
        "ap": average_precision_score(labels, probs),
    }

# ============== PIPELINE ==============
def _list_caption_images(folder, limit):
    """Return up to ``limit`` image paths from ``folder`` in sorted filename order."""
    if not os.path.isdir(folder):
        return []
    names = sorted(
        f for f in os.listdir(folder)
        if f.lower().endswith((".jpg", ".jpeg", ".png"))
    )
    return [os.path.join(folder, f) for f in names[:limit]]


def _save_captions(caps, out_json):
    """Write the caption mapping to ``out_json`` and close the file."""
    with open(out_json, "w") as fh:
        json.dump(caps, fh)


def gen_captions1(data_root, out_json="captions.json",
                  limit_real=2200, limit_fake=2200):
    """Caption images with BLIP and store the captions as JSON.

    Images are taken from ``<data_root>/real`` and
    ``<data_root>/fakeV2/fake-v2`` in sorted filename order (so repeated runs
    caption the same files), at most ``limit_real`` / ``limit_fake`` of each.
    Captions are keyed by "<parent folder>_<file name>". The mapping is saved
    every 100 captions and once more at the end; images that fail are logged,
    skipped, and listed in ``skipped_images.txt``.

    Args:
        data_root: dataset root directory.
        out_json: path of the JSON file to write.
        limit_real: maximum number of real images to caption.
        limit_fake: maximum number of fake images to caption.

    Returns:
        dict mapping caption key to caption string.
    """
    device = get_device()
    proc = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
    model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)

    real_imgs = _list_caption_images(os.path.join(data_root, "real"), limit_real)
    fake_imgs = _list_caption_images(os.path.join(data_root, "fakeV2", "fake-v2"), limit_fake)
    imgs = real_imgs + fake_imgs
    print(f"Total selected images for captioning: {len(imgs)} "
          f"({len(real_imgs)} real + {len(fake_imgs)} fake)")

    caps = {}
    skipped = []
    for p in tqdm(imgs, desc="BLIP captioning"):
        rel_folder = os.path.basename(os.path.dirname(p))
        base_name = os.path.basename(p)
        unique_key = f"{rel_folder}_{base_name}"

        try:
            # Decode inside a context manager so the file handle is released.
            with Image.open(p) as raw:
                img = raw.convert("RGB")
            img.thumbnail((512, 512))
            inputs = proc(images=img, return_tensors="pt").to(device)
            with torch.no_grad():
                out = model.generate(**inputs, max_new_tokens=40)
            cap = proc.decode(out[0], skip_special_tokens=True)
        except UnidentifiedImageError:
            print(f"Skipped unreadable image: {p}")
            skipped.append(p)
            continue
        except Exception as e:
            # Deliberately broad: one bad image must not abort a multi-hour run.
            print(f"Error on {p}: {e}")
            skipped.append(p)
            continue

        caps[unique_key] = cap

        # Periodic autosave; a failed write is reported, not blamed on the image.
        if len(caps) % 100 == 0:
            try:
                _save_captions(caps, out_json)
            except OSError as e:
                print(f"Autosave to {out_json} failed: {e}")

    _save_captions(caps, out_json)
    print(f"✅ Saved {len(caps)} captions to {out_json}")
    print(f"⚠️ Skipped {len(skipped)} problematic images.")
    if skipped:
        with open("skipped_images.txt", "w") as f:
            f.write("\n".join(skipped))
        print("Skipped image list saved to skipped_images.txt")

    return caps

def build_null(captions):
    """Encode all caption strings with CLIP ViT-L-14 and derive the projection matrix.

    The text features are passed to ``build_nullspace``; the resulting matrix
    is written to ``nullspace.npz`` under the key ``P`` and returned on the
    active device.
    """
    device = get_device()

    # CLIP model plus its matching tokenizer.
    clip_model, _, _ = open_clip.create_model_and_transforms("ViT-L-14", pretrained="openai")
    clip_model.to(device).eval()
    tokenize = open_clip.get_tokenizer("ViT-L-14")

    texts = list(map(str, captions.values()))
    print(f"Encoding {len(texts)} captions for NULL-space...")

    chunk = 32
    encoded = []
    for start in tqdm(range(0, len(texts), chunk), desc="Text enc"):
        tokens = tokenize(texts[start:start + chunk]).to(device)
        with torch.no_grad():
            encoded.append(model_out := clip_model.encode_text(tokens).cpu())

    feats = torch.cat(encoded).numpy()

    P, _ = build_nullspace(feats)
    np.savez("nullspace.npz", P=P.cpu().numpy()) 
    print(f"✅ NULL-space saved. Shape: {tuple(P.shape)}")

    return P.to(device)

def train_nsnet(data_root = "/root/.cache/kagglehub/datasets/superpotato9/dalle-recognition-dataset/versions/7",
                train_limit_real=2000, train_limit_fake=2000,
                test_limit_real=200, test_limit_fake=200,
                epochs=2, batch_size=8, lr=2e-4):
    """Train the NSNetHead on projected, frozen CLIP image features.

    Reads ``captions.json`` and ``nullspace.npz`` from the working directory,
    trains only the head (the CLIP backbone runs under ``no_grad``) with
    ``nt_xent + 0.2 * BCE``, evaluates after every epoch and stores the head
    with the best average precision in ``best_head.pth``.

    Args:
        data_root: dataset root directory.
        train_limit_real / train_limit_fake: per-class training image caps.
        test_limit_real / test_limit_fake: per-class evaluation image caps.
        epochs: number of training epochs (default 2, as before).
        batch_size: batch size for both loaders (default 8, as before).
        lr: Adam learning rate (default 2e-4, as before).
    """
    device = get_device()
    print("Using device:", device)

    # Context managers so neither input file is left open.
    with open("captions.json") as fh:
        captions = json.load(fh)
    with np.load("nullspace.npz") as null_npz:
        P = torch.from_numpy(null_npz["P"]).float().to(device)

    model_clip,_,_ = open_clip.create_model_and_transforms("ViT-L-14", pretrained="openai")
    model_clip.to(device).eval()
    head = NSNetHead(model_clip.visual.output_dim).to(device)
    opt = torch.optim.Adam(head.parameters(), lr=lr)

    tfm = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.48145466,0.4578275,0.40821073),
                             std=(0.26862954,0.26130258,0.27577711))
    ])

    limits = dict(
        train_limit_real=train_limit_real, train_limit_fake=train_limit_fake,
        test_limit_real=test_limit_real, test_limit_fake=test_limit_fake,
    )
    train_ds = DALLEDataset(data_root, "train", captions, tfm, **limits)
    test_ds = DALLEDataset(data_root, "test", captions, tfm, **limits)

    train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
    # Built once instead of once per epoch.
    test_dl = DataLoader(test_ds, batch_size=batch_size)

    best_ap = 0
    for ep in range(1, epochs + 1):
        head.train(); tot = 0
        for x, y, _ in tqdm(train_dl, desc=f"Epoch {ep}"):
            x, y = x.to(device), y.to(device)
            # Backbone is frozen: no gradients through CLIP.
            with torch.no_grad():
                f_img = model_clip.encode_image(x)
            f_null = f_img @ P
            f, logit = head(f_null)
            loss = nt_xent(f, y) + 0.2 * F.binary_cross_entropy_with_logits(logit, y.float())
            opt.zero_grad(); loss.backward(); opt.step()
            tot += loss.item()
        # max(1, ...) avoids a ZeroDivisionError when the loader is empty.
        print(f"Epoch {ep} loss {tot/max(1, len(train_dl)):.4f}")

        # Eval
        head.eval(); logits = []; ys = []
        with torch.no_grad():
            for x, y, _ in test_dl:
                x = x.to(device)
                f = model_clip.encode_image(x) @ P
                _, log = head(f)
                logits += log.cpu().tolist(); ys += y.tolist()
        m = metrics(ys, logits)
        print("Val metrics", m)
        if m["ap"] > best_ap:
            best_ap = m["ap"]
            torch.save(head.state_dict(), "best_head.pth")
    print("Training done. Best AP:", best_ap)

Writing nsnet_cpu.py


## Loading the dataset and captioning images

In [None]:
# Reload the module written by the %%writefile cell so edits to nsnet_cpu.py
# are picked up without restarting the kernel.
import importlib, nsnet_cpu
importlib.reload(nsnet_cpu)

# Dataset location inside the kagglehub cache (version 7 of the dataset).
data_root = "/root/.cache/kagglehub/datasets/superpotato9/dalle-recognition-dataset/versions/7"

# Caption up to 2200 real and 2200 fake images with BLIP; the captions are
# returned and also written to captions.json.
caps = nsnet_cpu.gen_captions1(
    data_root,
    out_json="captions.json",
    limit_real=2200,
    limit_fake=2200
)

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


✅ open_clip patching is active.
✅ open_clip patching is active.


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/506 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

Total selected images for captioning: 4400 (2200 real + 2200 fake)



BLIP captioning:   0%|          | 0/4400 [00:00<?, ?it/s][A
BLIP captioning:   0%|          | 1/4400 [00:05<7:16:43,  5.96s/it][A
BLIP captioning:   0%|          | 2/4400 [00:11<6:40:28,  5.46s/it][A
BLIP captioning:   0%|          | 3/4400 [00:14<5:19:00,  4.35s/it][A
BLIP captioning:   0%|          | 4/4400 [00:16<4:31:21,  3.70s/it][A
BLIP captioning:   0%|          | 5/4400 [00:19<4:08:32,  3.39s/it][A
BLIP captioning:   0%|          | 6/4400 [00:23<4:21:36,  3.57s/it][A
BLIP captioning:   0%|          | 7/4400 [00:25<3:53:43,  3.19s/it][A
BLIP captioning:   0%|          | 8/4400 [00:28<3:44:51,  3.07s/it][A
BLIP captioning:   0%|          | 9/4400 [00:31<3:35:01,  2.94s/it][A
BLIP captioning:   0%|          | 10/4400 [00:34<3:40:34,  3.01s/it][A
BLIP captioning:   0%|          | 11/4400 [00:37<3:42:52,  3.05s/it][A
BLIP captioning:   0%|          | 12/4400 [00:40<3:34:07,  2.93s/it][A
BLIP captioning:   0%|          | 13/4400 [00:43<3:35:32,  2.95s/it][A
BLIP captio

⚠️ Error on /root/.cache/kagglehub/datasets/superpotato9/dalle-recognition-dataset/versions/7/fakeV2/fake-v2/12479.jpg: Image size (232748750 pixels) exceeds limit of 178956970 pixels, could be decompression bomb DOS attack.



BLIP captioning:  83%|████████▎ | 3662/4400 [3:04:01<34:19,  2.79s/it][A
BLIP captioning:  83%|████████▎ | 3663/4400 [3:04:04<33:18,  2.71s/it][A
BLIP captioning:  83%|████████▎ | 3664/4400 [3:04:06<33:33,  2.74s/it][A
BLIP captioning:  83%|████████▎ | 3665/4400 [3:04:09<33:09,  2.71s/it][A
BLIP captioning:  83%|████████▎ | 3666/4400 [3:04:13<36:43,  3.00s/it][A
BLIP captioning:  83%|████████▎ | 3667/4400 [3:04:16<36:58,  3.03s/it][A
BLIP captioning:  83%|████████▎ | 3668/4400 [3:04:19<36:04,  2.96s/it][A
BLIP captioning:  83%|████████▎ | 3669/4400 [3:04:21<35:20,  2.90s/it][A
BLIP captioning:  83%|████████▎ | 3670/4400 [3:04:25<36:35,  3.01s/it][A
BLIP captioning:  83%|████████▎ | 3671/4400 [3:04:27<35:36,  2.93s/it][A
BLIP captioning:  83%|████████▎ | 3672/4400 [3:04:30<34:04,  2.81s/it][A
BLIP captioning:  83%|████████▎ | 3673/4400 [3:04:34<36:50,  3.04s/it][A
BLIP captioning:  84%|████████▎ | 3674/4400 [3:04:37<38:25,  3.18s/it][A
BLIP captioning:  84%|████████▎ | 367

✅ Saved 4399 captions to captions.json
⚠️ Skipped 1 problematic images.
Skipped image list saved to skipped_images.txt





In [None]:
!mkdir -p "/content/drive/MyDrive/DALLE_NSNet"
!cp /content/captions.json "/content/drive/MyDrive/DALLE_NSNet/captions.json"
print("✅ Captions saved to Drive at: /content/drive/MyDrive/DALLE_NSNet/captions.json")

import json

with open("/content/drive/MyDrive/DALLE_NSNet/captions.json", "r") as f:
    caps = json.load(f)

for k, v in list(caps.items()):
    if isinstance(v, list):
        caps[k] = v[0] if len(v) > 0 else ""
    elif not isinstance(v, str):
        caps[k] = str(v)

json.dump(caps, open("captions_fixed.json", "w"))
print(f"✅ Fixed captions: {len(caps)} entries saved to captions_fixed.json")

✅ Captions saved to Drive at: /content/drive/MyDrive/DALLE_NSNet/captions.json
✅ Fixed captions: 4399 entries saved to captions_fixed.json


## Building the Nullspace from all the captions generated from the images

In [None]:
# Encode the cleaned captions with the CLIP text encoder and build the
# projection matrix; build_null also writes it to nullspace.npz.
P = nsnet_cpu.build_null(caps)

✅ open_clip patching is active.


100%|███████████████████████████████████████| 933M/933M [00:11<00:00, 78.2MiB/s]


Encoding 4399 captions for NULL-space...


Text enc: 100%|██████████| 138/138 [17:45<00:00,  7.72s/it]


✅ NULL-space saved. Shape: (768, 768)


## Training the model
Due to limited computational resources, we trained on 2,000 real and 2,000 fake images for 2 epochs.

In [None]:
# Train the NS-Net head: 2000 real + 2000 fake training images, with
# 200 real + 200 fake images held for the per-epoch evaluation.
nsnet_cpu.train_nsnet(
    data_root=data_root,
    train_limit_real=2000,
    train_limit_fake=2000,
    test_limit_real=200,
    test_limit_fake=200
)

Using device: cpu
[TRAIN] Loaded 2000 REAL and 2000 FAKE images from '/root/.cache/kagglehub/datasets/superpotato9/dalle-recognition-dataset/versions/7'
[TEST] Loaded 200 REAL and 200 FAKE images from '/root/.cache/kagglehub/datasets/superpotato9/dalle-recognition-dataset/versions/7'


Epoch 1: 100%|██████████| 500/500 [3:34:37<00:00, 25.76s/it]


Epoch 1 loss 1.1379
Val metrics {'acc': 0.5675, 'r_acc': 0.855, 'f_acc': 0.28, 'ap': np.float64(0.5879819635671724)}


Epoch 2: 100%|██████████| 500/500 [3:36:31<00:00, 25.98s/it]


Epoch 2 loss 1.1697


## Testing our Model

In [None]:
import importlib, nsnet_cpu
importlib.reload(nsnet_cpu)
print("✅ nsnet_cpu imported successfully")

import torch, json, numpy as np
# Imported here so the cell also runs in a fresh kernel (e.g. after a restart),
# instead of relying on an earlier cell having imported open_clip.
import open_clip
from torch.utils.data import DataLoader
from torchvision import transforms
from tqdm import tqdm
from sklearn.metrics import accuracy_score, confusion_matrix, average_precision_score

device = nsnet_cpu.get_device()
print("Using device:", device)

# Load saved captions and null-space (context managers close both files)
with open("captions.json") as fh:
    captions = json.load(fh)
with np.load("nullspace.npz") as null_npz:
    P = torch.from_numpy(null_npz["P"]).float().to(device)

# Load CLIP backbone
model_clip, _, _ = open_clip.create_model_and_transforms("ViT-L-14", pretrained="openai")
model_clip.to(device).eval()

# Load trained NSNet head
head = nsnet_cpu.NSNetHead(model_clip.visual.output_dim).to(device)
head.load_state_dict(torch.load("best_head.pth", map_location=device))
head.eval()
print("✅ Model and head loaded successfully.")

data_root = "/root/.cache/kagglehub/datasets/superpotato9/dalle-recognition-dataset/versions/7"

# Define normalization transform (CLIP mean / std)
tfm = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.48145466, 0.4578275, 0.40821073),
                         std=(0.26862954, 0.26130258, 0.27577711))
])

test_ds = nsnet_cpu.DALLEDataset(
    root=data_root,
    split="test",
    captions=captions,
    transform=tfm,
    train_limit_real=2000,
    train_limit_fake=2000,
    test_limit_real=200,
    test_limit_fake=200
)
test_dl = DataLoader(test_ds, batch_size=8, shuffle=False)

print(f"\n🧾 Number of test images loaded: {len(test_ds)}")

# Collect one logit per test image from the frozen backbone + trained head.
logits, ys = [], []
with torch.no_grad():
    for x, y, _ in tqdm(test_dl, desc="Testing"):
        x = x.to(device)
        f_img = model_clip.encode_image(x)
        f_null = f_img @ P
        _, logit = head(f_null)
        logits += logit.cpu().tolist()
        ys += y.tolist()

y_true = np.array(ys)
probs = torch.sigmoid(torch.tensor(logits)).numpy()
y_pred = (probs >= 0.5).astype(int)

acc = accuracy_score(y_true, y_pred)
r_acc = accuracy_score(y_true[y_true==0], y_pred[y_true==0])
f_acc = accuracy_score(y_true[y_true==1], y_pred[y_true==1])
ap = average_precision_score(y_true, probs)
# labels=[0, 1] pins the matrix to 2x2 (rows/cols: real, fake) even if one
# class is never predicted or is absent, so the indexing below cannot fail.
cm = confusion_matrix(y_true, y_pred, labels=[0, 1])

print("\n✅ Evaluation Results on DALLE Recognition Test Set:")
print(f"Overall Accuracy : {acc:.4f}")
print(f"Real Accuracy    : {r_acc:.4f}")
print(f"Fake Accuracy    : {f_acc:.4f}")
print(f"Average Precision: {ap:.4f}")

print("\n📊 Confusion Matrix:")
print("        Pred Real | Pred Fake")
print(f"Real | {cm[0][0]:5d}       | {cm[0][1]:5d}")
print(f"Fake | {cm[1][0]:5d}       | {cm[1][1]:5d}")

print("\n✅ Testing completed successfully.")

✅ open_clip patching is active.
✅ nsnet_cpu imported successfully
Using device: cpu
✅ Model and head loaded successfully.
[TEST] Loaded 200 REAL and 200 FAKE images from '/root/.cache/kagglehub/datasets/superpotato9/dalle-recognition-dataset/versions/7'

🧾 Number of test images loaded: 400


Testing: 100%|██████████| 50/50 [27:03<00:00, 32.48s/it]


✅ Evaluation Results on DALLE Recognition Test Set:
Overall Accuracy : 0.5575
Real Accuracy    : 0.5550
Fake Accuracy    : 0.5600
Average Precision: 0.5724

📊 Confusion Matrix:
        Pred Real | Pred Fake
Real |   111       |    89
Fake |    88       |   112

✅ Testing completed successfully.





As can be seen, the performance is quite poor (fake accuracy = 0.560, overall accuracy = 0.557). In the original paper, training was done on AIGIBench, which consists of 144k images, for 2 epochs. However, we could not use it due to limited resources.