## Zero-shot without classes incoporated

In [None]:
import torch
from PIL import Image
import requests
from transformers import CLIPProcessor, CLIPModel
import os
from torcheval.metrics import BinaryAccuracy, BinaryF1Score, BinaryConfusionMatrix, BinaryPrecisionRecallCurve

def clip_pred(imgs):
    model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
    processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
    inputs = processor(
        text=["a synthetic image created by AI", "a real image taken by a human"],
        images=imgs,
        return_tensors="pt",
        padding=True
    )

    outputs = model(**inputs)
    logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
    prob = logits_per_image.softmax(dim=1)  
    return prob

In [None]:
def load_images_from_folder(folder):
    images = []
    
    for filename in os.listdir(folder):
        img = Image.open(os.path.join(folder, filename))
        if img is not None:
            images.append(img)
        
    
    labels = torch.zeros(len(images), dtype=torch.int)
    return images, labels

In [None]:
def metrics(ys, ts):
    acc = BinaryAccuracy()
    f1 = BinaryF1Score()
    cm = BinaryConfusionMatrix()
    acc.update(ys, ts)
    f1.update(ys, ts)
    cm.update(ys, ts)
    return acc.compute(), f1.compute(), cm.compute()


In [None]:
folder_path = r'C:\Users\vikto\Documents\GitHub\DTU_repo\deep_learning\4_Convolutional\images'
images, labels = load_images_from_folder(folder_path)
probs = clip_pred(images)
metrics(probs[:,1], labels)

### HPC adjusted zero-shot no classes

In [None]:
import torch
from PIL import Image
from transformers import CLIPProcessor, CLIPModel
import os
from torcheval.metrics import BinaryAccuracy, BinaryF1Score, BinaryConfusionMatrix

# Enable debugging
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
os.environ["NCCL_DEBUG"] = "INFO"
os.environ["PYTHONFAULTHANDLER"] = "1"

# Log GPU details
if torch.cuda.is_available():
    print(f"CUDA is available. Device: {torch.cuda.get_device_name(0)}")
    print(f"CUDA Memory Allocated: {torch.cuda.memory_allocated() / (1024 ** 3):.2f} GB")
    print(f"CUDA Memory Reserved: {torch.cuda.memory_reserved() / (1024 ** 3):.2f} GB")
else:
    print("CUDA is not available. Check your GPU setup.")

def clip_pred(imgs, model, processor):
    """
    Perform prediction using the CLIP model.
    """
    inputs = processor(
        text=["a synthetic image created by AI", "a real image taken by a human"],
        images=imgs,
        return_tensors="pt",
        padding=True
    )
    
    inputs = {k: v.to("cuda") for k, v in inputs.items()}  # Move inputs to GPU
    with torch.no_grad():  # Disable gradient calculation for inference
        outputs = model(**inputs)
    logits_per_image = outputs.logits_per_image  # Image-text similarity score
    prob = logits_per_image.softmax(dim=1)  # Probability over classes
    return prob

def load_images_from_folders(fake_folder, real_folder):
    """
    Load images and their corresponding labels from specified folders.
    """
    images = []
    labels = []
    
    # Load FAKE images
    for filename in os.listdir(fake_folder):
        img_path = os.path.join(fake_folder, filename)
        if os.path.isfile(img_path):
            img = Image.open(img_path).convert("RGB")
            images.append(img)
            labels.append(0)  # Label for "FAKE"
    
    # Load REAL images
    for filename in os.listdir(real_folder):
        img_path = os.path.join(real_folder, filename)
        if os.path.isfile(img_path):
            img = Image.open(img_path).convert("RGB")
            images.append(img)
            labels.append(1)  # Label for "REAL"
    
    return images, torch.tensor(labels, dtype=torch.int)

def evaluate_model(images, labels, batch_size=64):
    """
    Evaluate the CLIP model using mini-batch processing and calculate metrics.
    """
    # Load the CLIP model and processor
    model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to("cuda")
    processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
    
    # Process images in mini-batches
    probs = []
    for i in range(0, len(images), batch_size):
        batch_imgs = images[i:i+batch_size]
        batch_probs = clip_pred(batch_imgs, model, processor)
        probs.append(batch_probs)
    
    probs = torch.cat(probs, dim=0)  # Combine all batches
    preds = torch.argmax(probs, dim=1)  # Predicted labels
    
    # Calculate metrics
    acc = BinaryAccuracy()
    f1 = BinaryF1Score()
    cm = BinaryConfusionMatrix()
    
    acc.update(preds, labels)
    f1.update(preds, labels)
    cm.update(preds, labels)
    
    print(f"Accuracy: {acc.compute().item():.4f}")
    print(f"F1 Score: {f1.compute().item():.4f}")
    print(f"Confusion Matrix: {cm.compute()}")
    return acc.compute(), f1.compute(), cm.compute()

# Paths to the datasets
fake_folder = r'/dtu/blackhole/18/160664/test/FAKE/'
real_folder = r'/dtu/blackhole/18/160664/test/REAL/'

# Load the images and labels
images, labels = load_images_from_folders(fake_folder, real_folder)
print(f"Loaded {len(images)} images.")
print(f"Labels: {labels}")

# Evaluate the model
evaluate_model(images, labels)

## Zero-shot with classes implemented

In [None]:
import torch
from PIL import Image
import requests
from transformers import CLIPProcessor, CLIPModel
import os
from torcheval.metrics import BinaryAccuracy, BinaryF1Score, BinaryConfusionMatrix, BinaryAUROC

def clip_pred(imgs_class, class_type, model, processor):

    # Process all images and prompts in a single batch
    inputs = processor(
        text= ['A human-made photo of a' + str(class_type), 'A synthetic computer-generated photo of a' + str(class_type)],  # Prompts for each image
        images=imgs_class,  # Duplicate images to match the number of prompts
        return_tensors="pt",
        padding=True
    )

    inputs = {k: v.to("cpu") for k, v in inputs.items()}  # Move inputs to GPU
    with torch.no_grad():  # Disable gradient calculation for inference
        outputs = model(**inputs)
    logits_per_image = outputs.logits_per_image  # Image-text similarity score
    prob = logits_per_image.softmax(dim=1)  # Probability over classes
    return prob[:,1]


def load_images_from_folder(folder):
    images_class = {'airplane': [[],[]], 'automobile': [[],[]], 'bird': [[],[]], 'cat': [[],[]], 'deer': [[],[]], 'dog': [[],[]], 'frog': [[],[]], 'horse': [[],[]], 'ship': [[],[]], 'truck': [[],[]]}
    class_types = {'(2)': 'automobile', '(3)': 'bird', '(4)': 'cat', '(5)': 'deer', '(6)': 'dog', '(7)': 'frog', '(8)': 'horse', '(9)': 'ship', '(10)': 'truck'}

    for filename in os.listdir(folder):
        img_path = os.path.join(folder, filename)
        if os.path.isfile(img_path):
            img = Image.open(img_path).convert("RGB")

            for c in class_types.keys():
                if c in filename:
                    images_class[class_types.get(c)][0].append(img)
                    images_class[class_types.get(c)][1].append(0)
                    break
            if all(c not in filename for c in class_types.keys()):
                images_class['airplane'][0].append(img)
                images_class['airplane'][1].append(0)

    return images_class

def evaluate_model(imgs_class):
    """
    Evaluate the CLIP model using mini-batch processing and calculate metrics.
    """
    # Load the CLIP model and processor
    model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to("cpu")
    processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
    
    # Process images in mini-batches
    probs = []
    labels = []
    for i in imgs_class:
        batch_probs = clip_pred(imgs_class[i][0], i,  model, processor)
        probs.extend(batch_probs)
        labels.extend(imgs_class[i][1])

    probs = torch.tensor(probs)
    labels = torch.tensor(labels, dtype=torch.int)
    
    # Calculate metrics
    acc = BinaryAccuracy()
    f1 = BinaryF1Score()
    cm = BinaryConfusionMatrix()
    auroc = BinaryAUROC()
    
    acc.update(probs, labels)
    f1.update(probs, labels)
    cm.update(probs, labels)
    auroc.update(probs, labels)
    
    accuracy = acc.compute()
    f1_score = f1.compute()
    confusion_matrix = cm.compute()
    auroc_score = auroc.compute()
    
    print(f"Accuracy: {accuracy}")
    print(f"F1 Score: {f1_score}")
    print(f"Confusion Matrix: \n{confusion_matrix}")
    print(f"AUROC: {auroc_score}")


folder_path = r'C:\Users\vikto\Documents\GitHub\Feature-analysis-using-CLIP-model\images'
imgs_class = load_images_from_folder(folder_path)
# Evaluate the model
evaluate_model(imgs_class)



### 

### Non HPC adjusted zero-shot with classes

In [None]:
import torch
from PIL import Image
import os
from transformers import CLIPProcessor, CLIPModel
from torcheval.metrics import BinaryAccuracy, BinaryF1Score, BinaryConfusionMatrix, BinaryAUROC

def clip_pred(imgs, imgs_class):
    model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
    processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

    # Create prompts for all images
    prompts = []
    for img_class in imgs_class:
        prompts.append("a synthetic image of a " + str(img_class))
        prompts.append("a real image of a " + str(img_class))

    # Process all images and prompts in a single batch
    inputs = processor(
        text=prompts,
        images=imgs * 2,  # Duplicate images to match the number of prompts
        return_tensors="pt",
        padding=True
    )

    # Pass the batch to the model
    outputs = model(**inputs)
    logits = outputs.logits_per_image  # this is the image-text similarity score

    # Reshape logits to separate synthetic and real image logits
    logits = logits.view(len(imgs), 2, -1).mean(dim=2)

    prob = logits.softmax(dim=1)
    return prob

def load_images_from_folder(folder):
    images = []
    class_types = {'(2)': 'automobile', '(3)': 'bird', '(4)': 'cat', '(5)': 'deer', '(6)': 'dog', '(7)': 'frog', '(8)': 'horse', '(9)': 'ship', '(10)': 'truck'}
    imgs_class = []

    for filename in os.listdir(folder):
        img = Image.open(os.path.join(folder, filename))
        if img is not None:
            images.append(img)

            for c in class_types.keys():
                if c in filename:
                    imgs_class.append(class_types.get(c))
                    break
            else:
                imgs_class.append('airplane')

    labels = torch.zeros(len(images), dtype=torch.int)
    return images, labels, imgs_class

def metrics(ys, ts):
    acc = BinaryAccuracy()
    f1 = BinaryF1Score()
    cm = BinaryConfusionMatrix()
    auroc = BinaryAUROC()
    acc.update(ys, ts)
    f1.update(ys, ts)
    cm.update(ys, ts)
    auroc.update(ys, ts)

    return acc.compute(), f1.compute(), cm.compute(), auroc.compute()

folder_path = r'C:\Users\vikto\Documents\GitHub\Feature-analysis-using-CLIP-model\images'
images, labels, imgs_class = load_images_from_folder(folder_path)
probs = clip_pred(images, imgs_class)
metrics(probs[:, 1], labels)

In [None]:
import torch
from PIL import Image
import os
from transformers import CLIPProcessor, CLIPModel
from torcheval.metrics import BinaryAccuracy, BinaryF1Score, BinaryConfusionMatrix, BinaryAUROC
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve


def clip_pred(imgs, imgs_class):
    model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
    processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

    # Create prompts for all images
    prompts = []
    for img_class in imgs_class:
        prompts.append("a synthetic image of a " + str(img_class))
        prompts.append("a real image of a " + str(img_class))

    # Process all images and prompts in a single batch
    inputs = processor(
        text=prompts,
        images=imgs * 2,  # Duplicate images to match the number of prompts
        return_tensors="pt",
        padding=True
    )

    # Pass the batch to the model
    outputs = model(**inputs)
    logits = outputs.logits_per_image  # this is the image-text similarity score

    # Reshape logits to separate synthetic and real image logits
    logits = logits.view(len(imgs), 2, -1).mean(dim=2)

    prob = logits.softmax(dim=1)
    return prob

def load_images_from_folder(folder):
    images = []
    class_types = {'(2)': 'automobile', '(3)': 'bird', '(4)': 'cat', '(5)': 'deer', '(6)': 'dog', '(7)': 'frog', '(8)': 'horse', '(9)': 'ship', '(10)': 'truck'}
    imgs_class = []

    for filename in os.listdir(folder):
        img = Image.open(os.path.join(folder, filename))
        if img is not None:
            images.append(img)

            for c in class_types.keys():
                if c in filename:
                    imgs_class.append(class_types.get(c))
                    break
            else:
                imgs_class.append('airplane')

    labels = torch.zeros(len(images), dtype=torch.int)
    return images, labels, imgs_class

def evaluate_metrics(probs, labels):
    acc = BinaryAccuracy()
    f1 = BinaryF1Score()
    cm = BinaryConfusionMatrix()
    auroc = BinaryAUROC()

    acc.update(probs[:, 1], labels)
    f1.update(probs[:, 1], labels)
    cm.update(probs[:, 1], labels)
    auroc.update(probs[:, 1], labels)

    accuracy = acc.compute()
    f1_score = f1.compute()
    confusion_matrix = cm.compute()
    auroc_score = auroc.compute()

    print(f"Accuracy: {accuracy}")
    print(f"F1 Score: {f1_score}")
    print(f"Confusion Matrix: \n{confusion_matrix}")
    print(f"AUROC: {auroc_score}")

    # Plot ROC Curve
    fpr, tpr, _ = roc_curve(labels.detach().numpy(), probs[:, 1].detach().numpy())
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, marker='.', label='ROC Curve (AUC = {:.2f})'.format(auroc_score))
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.legend()
    plt.grid(True)
    plt.show()

folder_path = r'C:\Users\vikto\Documents\GitHub\Feature-analysis-using-CLIP-model\images'
images, labels, imgs_class = load_images_from_folder(folder_path)
probs = clip_pred(images, imgs_class)
evaluate_metrics(probs, labels)

### Adjusted class code for HPC

#### For running fake and real seperately:

In [None]:
import torch
from PIL import Image
import os
from transformers import CLIPProcessor, CLIPModel
from torcheval.metrics import BinaryAccuracy, BinaryF1Score, BinaryConfusionMatrix, BinaryAUROC

# Enable debugging
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
os.environ["NCCL_DEBUG"] = "INFO"
os.environ["PYTHONFAULTHANDLER"] = "1"

# Log GPU details
if torch.cuda.is_available():
    print(f"CUDA is available. Device: {torch.cuda.get_device_name(0)}")
    print(f"CUDA Memory Allocated: {torch.cuda.memory_allocated() / (1024 ** 3):.2f} GB")
    print(f"CUDA Memory Reserved: {torch.cuda.memory_reserved() / (1024 ** 3):.2f} GB")
else:
    print("CUDA is not available. Check your GPU setup.")

def clip_pred(imgs_class, class_type, model, processor):

    # Process all images and prompts in a single batch
    inputs = processor(
        text= ['A human-made photo of a' + str(class_type), 'A synthetic computer-generated photo of a' + str(class_type)],  # Prompts for each image
        images=imgs_class,  # Duplicate images to match the number of prompts
        return_tensors="pt",
        padding=True
    )

    inputs = {k: v.to("cpu") for k, v in inputs.items()}  # Move inputs to GPU
    with torch.no_grad():  # Disable gradient calculation for inference
        outputs = model(**inputs)
        image_features = outputs.get_image_features(outputs['pixel_values'], hidden_states=False)
    logits_per_image = outputs.logits_per_image  # Image-text similarity score
    prob = logits_per_image.softmax(dim=1)  # Probability over classes
    return prob[:,1], image_features

def load_images_from_folder(folder):
    images_class = {'airplane': [[],[]], 'automobile': [[],[]], 'bird': [[],[]], 'cat': [[],[]], 'deer': [[],[]], 'dog': [[],[]], 'frog': [[],[]], 'horse': [[],[]], 'ship': [[],[]], 'truck': [[],[]]}
    class_types = {'(2)': 'automobile', '(3)': 'bird', '(4)': 'cat', '(5)': 'deer', '(6)': 'dog', '(7)': 'frog', '(8)': 'horse', '(9)': 'ship', '(10)': 'truck'}

    for filename in os.listdir(folder):
        img_path = os.path.join(folder, filename)
        if os.path.isfile(img_path):
            img = Image.open(img_path).convert("RGB")

            for c in class_types.keys():
                if c in filename:
                    images_class[class_types.get(c)][0].append(img)
                    images_class[class_types.get(c)][1].append(0)
                    break
            if all(c not in filename for c in class_types.keys()):
                images_class['airplane'][0].append(img)
                images_class['airplane'][1].append(0)

    return images_class

def evaluate_model(imgs_class):
    """
    Evaluate the CLIP model using mini-batch processing and calculate metrics.
    """
    # Load the CLIP model and processor
    model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to("cpu")
    processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
    
    # Process images in mini-batches
    probs = []
    labels = []
    for i in imgs_class:
        batch_probs = clip_pred(imgs_class[i][0], i,  model, processor)
        probs.extend(batch_probs)
        labels.extend(imgs_class[i][1])

    probs = torch.tensor(probs)
    labels = torch.tensor(labels, dtype=torch.int)
    
    # Calculate metrics
    acc = BinaryAccuracy()
    f1 = BinaryF1Score()
    cm = BinaryConfusionMatrix()
    auroc = BinaryAUROC()
    
    acc.update(probs, labels)
    f1.update(probs, labels)
    cm.update(probs, labels)
    auroc.update(probs, labels)
    
    accuracy = acc.compute()
    f1_score = f1.compute()
    confusion_matrix = cm.compute()
    auroc_score = auroc.compute()
    
    print(f"Accuracy: {accuracy}")
    print(f"F1 Score: {f1_score}")
    print(f"Confusion Matrix: \n{confusion_matrix}")
    print(f"AUROC: {auroc_score}")



# Paths to the datasets
folder_path = r'C:\Users\vikto\Documents\GitHub\Feature-analysis-using-CLIP-model\images'

# Load the images and labels
imgs_class= load_images_from_folder(folder_path)

# Evaluate the model
evaluate_model(imgs_class)

#### Fake and real at the same time:

In [None]:
import torch
from PIL import Image
import os
from transformers import CLIPProcessor, CLIPModel
from torcheval.metrics import BinaryAccuracy, BinaryF1Score, BinaryConfusionMatrix, BinaryAUROC

# Enable debugging
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
os.environ["NCCL_DEBUG"] = "INFO"
os.environ["PYTHONFAULTHANDLER"] = "1"

# Log GPU details
if torch.cuda.is_available():
    print(f"CUDA is available. Device: {torch.cuda.get_device_name(0)}")
    print(f"CUDA Memory Allocated: {torch.cuda.memory_allocated() / (1024 ** 3):.2f} GB")
    print(f"CUDA Memory Reserved: {torch.cuda.memory_reserved() / (1024 ** 3):.2f} GB")
else:
    print("CUDA is not available. Check your GPU setup.")

def clip_pred(imgs, imgs_class, model, processor):
    """
    Perform prediction using the CLIP model.
    """
    # Create prompts for all images
    prompts = []
    for img_class in imgs_class:
        prompts.append("a synthetic image of a " + str(img_class))
        prompts.append("a real image of a " + str(img_class))

    # Process all images and prompts in a single batch
    inputs = processor(
        text=prompts,
        images=imgs * 2,  # Duplicate images to match the number of prompts
        return_tensors="pt",
        padding=True
    )

    inputs = {k: v.to("cuda") for k, v in inputs.items()}  # Move inputs to GPU
    with torch.no_grad():  # Disable gradient calculation for inference
        outputs = model(**inputs)
    logits = outputs.logits_per_image  # this is the image-text similarity score

    # Reshape logits to separate synthetic and real image logits
    logits = logits.view(len(imgs), 2, -1).mean(dim=2)

    prob = logits.softmax(dim=1)
    return prob

def load_images_from_folders(fake_folder, real_folder):
    images = []
    labels = []
    class_types = {'(2)': 'automobile', '(3)': 'bird', '(4)': 'cat', '(5)': 'deer', '(6)': 'dog', '(7)': 'frog', '(8)': 'horse', '(9)': 'ship', '(10)': 'truck'}
    imgs_class = []

    # Load FAKE images
    for filename in os.listdir(fake_folder):
        img_path = os.path.join(fake_folder, filename)
        if os.path.isfile(img_path):
            img = Image.open(img_path).convert("RGB")
            images.append(img)
            labels.append(0)  # Label for "FAKE"

            for c in class_types.keys():
                if c in filename:
                    imgs_class.append(class_types.get(c))
                    break
            else:
                imgs_class.append('airplane')

    # Load REAL images
    for filename in os.listdir(real_folder):
        img_path = os.path.join(real_folder, filename)
        if os.path.isfile(img_path):
            img = Image.open(img_path).convert("RGB")
            images.append(img)
            labels.append(1)  # Label for "REAL"

            for c in class_types.keys():
                if c in filename:
                    imgs_class.append(class_types.get(c))
                    break
            else:
                imgs_class.append('airplane')

    return images, torch.tensor(labels, dtype=torch.int), imgs_class

def evaluate_model(images, labels, imgs_class, batch_size=64):
    """
    Evaluate the CLIP model using mini-batch processing and calculate metrics.
    """
    # Load the CLIP model and processor
    model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to("cuda")
    processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
    
    # Process images in mini-batches
    probs = []
    for i in range(0, len(images), batch_size):
        batch_imgs = images[i:i+batch_size]
        batch_classes = imgs_class[i:i+batch_size]
        batch_probs = clip_pred(batch_imgs, batch_classes, model, processor)
        probs.append(batch_probs)
    
    probs = torch.cat(probs, dim=0)  # Combine all batches
    preds = torch.argmax(probs, dim=1)  # Predicted labels
    
    # Calculate metrics
    acc = BinaryAccuracy()
    f1 = BinaryF1Score()
    cm = BinaryConfusionMatrix()
    auroc = BinaryAUROC()
    
    acc.update(preds, labels)
    f1.update(preds, labels)
    cm.update(preds, labels)
    auroc.update(preds, labels)
    
    accuracy = acc.compute()
    f1_score = f1.compute()
    confusion_matrix = cm.compute()
    auroc_score = auroc.compute()
    
    print(f"Accuracy: {accuracy}")
    print(f"F1 Score: {f1_score}")
    print(f"Confusion Matrix: \n{confusion_matrix}")
    print(f"AUROC: {auroc_score}")

# Paths to the datasets
fake_folder = r'/dtu/blackhole/18/160664/test/FAKE/'
real_folder = r'/dtu/blackhole/18/160664/test/REAL/'

# Load the images and labels
images, labels, imgs_class = load_images_from_folders(fake_folder, real_folder)
print(f"Loaded {len(images)} images.")
print(f"Labels: {labels}")

# Evaluate the model
evaluate_model(images, labels, imgs_class)

## Extracting image features and applying FFNN

In [1]:
import torch
from PIL import Image
import requests
from transformers import CLIPProcessor, CLIPModel
import os
from torcheval.metrics import BinaryAccuracy, BinaryF1Score, BinaryConfusionMatrix, BinaryAUROC

def clip_pred(imgs_class, class_type, model, processor):

    # Process all images and prompts in a single batch
    inputs = processor(
        text= ['A human-made photo of a' + str(class_type), 'A synthetic computer-generated photo of a' + str(class_type)],  # Prompts for each image
        images=imgs_class,  # Duplicate images to match the number of prompts
        return_tensors="pt",
        padding=True
    )

    inputs = {k: v.to("cpu") for k, v in inputs.items()}  # Move inputs to GPU
    with torch.no_grad():  # Disable gradient calculation for inference
        outputs = model(**inputs)
        img_features = model.get_image_features(pixel_values=inputs['pixel_values'])
    logits_per_image = outputs.logits_per_image  # Image-text similarity score
    prob = logits_per_image.softmax(dim=1)  # Probability over classes
    return prob[:,1], img_features


def load_images_from_folder(folder):
    images_class = {'airplane': [[],[]], 'automobile': [[],[]], 'bird': [[],[]], 'cat': [[],[]], 'deer': [[],[]], 'dog': [[],[]], 'frog': [[],[]], 'horse': [[],[]], 'ship': [[],[]], 'truck': [[],[]]}
    class_types = {'(2)': 'automobile', '(3)': 'bird', '(4)': 'cat', '(5)': 'deer', '(6)': 'dog', '(7)': 'frog', '(8)': 'horse', '(9)': 'ship', '(10)': 'truck'}

    for filename in os.listdir(folder):
        img_path = os.path.join(folder, filename)
        if os.path.isfile(img_path):
            img = Image.open(img_path).convert("RGB")

            for c in class_types.keys():
                if c in filename:
                    images_class[class_types.get(c)][0].append(img)
                    images_class[class_types.get(c)][1].append(0)
                    break
            if all(c not in filename for c in class_types.keys()):
                images_class['airplane'][0].append(img)
                images_class['airplane'][1].append(0)

    return images_class

def evaluate_model(imgs_class):
    """
    Evaluate the CLIP model using mini-batch processing and calculate metrics.
    """
    # Load the CLIP model and processor
    model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to("cpu")
    processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
    
    # Process images in mini-batches
    probs = []
    labels = []
    image_features = []
    for i in imgs_class:
        batch_probs, batch_features = clip_pred(imgs_class[i][0], i,  model, processor)
        probs.extend(batch_probs)
        labels.extend(imgs_class[i][1])
        for f in batch_features:
            image_features.append(f)
    
    probs = torch.tensor(probs)
    labels = torch.tensor(labels, dtype=torch.int)
    dataset = {'features': image_features, 'labels': labels}
    
    # Calculate metrics
    acc = BinaryAccuracy()
    f1 = BinaryF1Score()
    cm = BinaryConfusionMatrix()
    auroc = BinaryAUROC()
    
    acc.update(probs, labels)
    f1.update(probs, labels)
    cm.update(probs, labels)
    auroc.update(probs, labels)
    
    accuracy = acc.compute()
    f1_score = f1.compute()
    confusion_matrix = cm.compute()
    auroc_score = auroc.compute()
    
    print(f"Accuracy: {accuracy}")
    print(f"F1 Score: {f1_score}")
    print(f"Confusion Matrix: \n{confusion_matrix}")
    print(f"AUROC: {auroc_score}")

    return dataset

folder_path = r'C:\Users\vikto\Documents\GitHub\Feature-analysis-using-CLIP-model\images'
imgs_class = load_images_from_folder(folder_path)
# Evaluate the model
feature_data = evaluate_model(imgs_class)





Accuracy: 0.13333334028720856
F1 Score: 0.0
Confusion Matrix: 
tensor([[ 4., 26.],
        [ 0.,  0.]])
AUROC: 0.5


### Feed foward neural network on image features from encoder:

In [2]:
from torch.utils.data import DataLoader, Dataset

class dict_to_data(Dataset):
    def __init__(self, features, labels):
        self.features = features
        self.labels = labels.float()

    def __len__(self):
        return len(self.features)

    def __getitem__(self, index):
        return self.features[index], self.labels[index]

In [3]:
dataset = dict_to_data(feature_data['features'], feature_data['labels'])
train_loader = DataLoader(dataset, batch_size=10, shuffle=True)

In [4]:
from torch import nn
import torch.optim as optim

num_features = 512
num_hidden = 256

class network(nn.Module):

    def __init__(self, num_features, num_hidden):

        super(network, self).__init__()
        
        self.net = nn.Sequential(
            nn.Linear(num_features, num_hidden),
            nn.ReLU(),
            nn.Linear(num_hidden, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.net(x)
    

ffnn = network(num_features, num_hidden)
ffnn.to("cpu")
print(ffnn)

network(
  (net): Sequential(
    (0): Linear(in_features=512, out_features=256, bias=True)
    (1): ReLU()
    (2): Linear(in_features=256, out_features=1, bias=True)
    (3): Sigmoid()
  )
)


#### train loop

In [5]:
optimizer = optim.Adam(ffnn.parameters(), lr=0.0001)
loss = nn.BCELoss()

In [None]:
p = torch.Tensor([0.7, 0.5, 0.1])
t = torch.Tensor([1, 0, 1])
preds = p > 0.5
torch.sum(preds == t)/len(t)

In [8]:
num_epochs = 10


for epoch in range(num_epochs):
    ffnn.train()
    acc = BinaryAccuracy()
    for batch_features, batch_labels in train_loader:  
        batch_features, batch_labels = batch_features.to("cpu"), batch_labels.to("cpu")

        batch_output = ffnn(batch_features)
        batch_loss = loss(batch_output.squeeze(), batch_labels)

        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        acc.update(batch_output.squeeze(), batch_labels)
        batch_accuracy = acc.compute()
        acc.reset()
        
    print(f"Epoch {epoch+1}, Loss: {batch_loss.item()}, Accuracy: {batch_accuracy}")

Epoch 1, Loss: 0.0875934511423111, Accuracy: 1.0
Epoch 2, Loss: 0.08007360994815826, Accuracy: 1.0
Epoch 3, Loss: 0.07435925304889679, Accuracy: 1.0
Epoch 4, Loss: 0.0705740824341774, Accuracy: 1.0
Epoch 5, Loss: 0.06528390944004059, Accuracy: 1.0
Epoch 6, Loss: 0.05868624895811081, Accuracy: 1.0
Epoch 7, Loss: 0.05837278813123703, Accuracy: 1.0
Epoch 8, Loss: 0.05107269808650017, Accuracy: 1.0
Epoch 9, Loss: 0.048593632876873016, Accuracy: 1.0
Epoch 10, Loss: 0.04712023586034775, Accuracy: 1.0


#### test network

In [15]:
def test_network(model, test_loader):
    model.eval()
    test_loss = 0
    test_acc = BinaryAccuracy()
    loss_func = nn.BCELoss()
    with torch.no_grad():
        for batch_features, batch_labels in test_loader:
            batch_features, batch_labels = batch_features.to('cpu'), batch_labels.to('cpu')

            # Forward pass
            outputs = model(batch_features)
            loss = loss_func(outputs.squeeze(), batch_labels)
            test_loss += loss
            test_acc.update(outputs.squeeze(), batch_labels)
            accuracy = test_acc.compute()
            

    avg_loss = test_loss / len(test_loader)

    print(f'Test Loss: {avg_loss:.4f}, Test Accuracy: {accuracy:.4f}')
    return avg_loss, accuracy

test_network(ffnn, train_loader)

Test Loss: 0.0459, Test Accuracy: 1.0000


(tensor(0.0459), tensor(1.))