In [None]:
# load clip model from lavis library
!pip install salesforce-lavis -U



In [None]:
import torch
import numpy as np
import random
from PIL import Image
from tqdm.notebook import tqdm
from lavis.models import load_model_and_preprocess
from lavis.processors import load_processor
import torch.nn.functional as F
from torch import nn

In [None]:
# Make reproducible code
GLOBAL_SEED = 10

np.random.seed(GLOBAL_SEED)
random.seed(GLOBAL_SEED)
torch.manual_seed(GLOBAL_SEED)
torch.use_deterministic_algorithms(True)
%env CUBLAS_WORKSPACE_CONFIG=:4096:8

torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

env: CUBLAS_WORKSPACE_CONFIG=:4096:8


In [None]:
# Build a torch.device in both branches so downstream code always sees the
# same type, whether or not a GPU is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# **Load Dataset**

In [None]:
!pip install huggingface_hub -q

In [None]:
from huggingface_hub import login
from google.colab import userdata

# Fetch the token stored under 'HF_TOKEN_ALL' in Colab userdata and use it to
# authenticate against the Hugging Face Hub.
access_token = userdata.get('HF_TOKEN_ALL')
login(token=access_token)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
# Hugging Face access token and account name kept as notebook-level constants.
ACCESS_TOKEN = access_token
USERNAME = userdata.get('HUGGINGFACE_USERNAME')

In [None]:
# Where the test dataset lives: local parent directory, dataset name, and the
# dataset's repo path on the Hub.
local = '/content/'
ds_test = 'FashionMnist_test'
hf_path_test = 'VQA-Illusion/' + ds_test
# hf_path_model = 'FashionMNIST_CLIP'
# hf_path_weights = 'CLIP_FashionMnist_train'

In [None]:
%cd {local}
!git clone 'https://{USERNAME}:{ACCESS_TOKEN}@huggingface.co/datasets/{hf_path_test}'

/content
Cloning into 'FashionMnist_test'...
remote: Enumerating objects: 5781, done.[K
remote: Counting objects: 100% (6/6), done.[K
remote: Compressing objects: 100% (6/6), done.[K
remote: Total 5781 (delta 3), reused 0 (delta 0), pack-reused 5775 (from 1)[K
Receiving objects: 100% (5781/5781), 888.61 KiB | 4.96 MiB/s, done.
Resolving deltas: 100% (3/3), done.
Updating files: 100% (5765/5765), done.
Filtering content: 100% (5761/5761), 1.34 GiB | 6.82 MiB/s, done.


In [None]:
import pandas as pd

# Read the dataset's metadata table (image names, prompts, illusion strength
# and ground-truth label) and display it.
metadata_csv = f'{local + ds_test}/df_data.csv'
df = pd.read_csv(metadata_csv)
df

Unnamed: 0,image_name,Pprompt,Nprompt,illusion_strength,label
0,FashionMnist_1,Two friends hiking on a mountain during sunset,low quality,1.5,9
1,FashionMnist_2,A starry night sky with constellations glowing,low quality,1.5,2
2,FashionMnist_3,A Thanksgiving feast giving thanks and love,low quality,1.5,1
3,FashionMnist_4,A gecko on a large leaf in a tropical rainforest,low quality,1.5,1
4,FashionMnist_5,A Hanukkah menorah lighting symbolizing faith,low quality,1.5,6
...,...,...,...,...,...
1147,FashionMnist_1148,"Electrified waterfalls cascade, neon rivers re...",low quality,1.5,2
1148,FashionMnist_1149,A rocky desert landscape with cacti and tumble...,low quality,1.5,2
1149,FashionMnist_1150,A majestic waterfall plunging into a crystal-c...,low quality,1.5,5
1150,FashionMnist_1151,"Eerie silence surrounds snowy mountains, glowi...",low quality,1.5,3


# **Load Model**

In [None]:
# def load_model(model_path, device):
#   loaded_model, loaded_vis_processors, loaded_text_processors = load_model_and_preprocess("clip_feature_extractor", "ViT-B-32", is_eval=True, device = device)
#   fine_tuned_weights = torch.load(model_path)
#   loaded_model.load_state_dict(fine_tuned_weights)
#   return loaded_model, loaded_vis_processors, loaded_text_processors

In [None]:
# %cd {local}
# !git clone 'https://{USERNAME}:{ACCESS_TOKEN}@huggingface.co/VQA-Illusion/{hf_path_model}'

/content
Cloning into 'FashionMNIST_CLIP'...
remote: Enumerating objects: 6, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Compressing objects: 100% (3/3), done.[K
remote: Total 6 (delta 0), reused 0 (delta 0), pack-reused 3 (from 1)[K
Unpacking objects: 100% (6/6), 2.12 KiB | 2.12 MiB/s, done.


In [None]:
# model, vis_processors, text_processors = load_model(f"/content/{hf_path_model}/{hf_path_weights}.pth", device)

In [None]:
# Load the CLIP ViT-B-32 feature extractor in eval mode, together with the
# image and text processors that go with it.
model, vis_processors, text_processors = load_model_and_preprocess(
    "clip_feature_extractor",
    "ViT-B-32",
    is_eval=True,
    device=device,
)

100%|███████████████████████████████████████| 354M/354M [00:04<00:00, 76.8MiB/s]


# **Inference**

In [None]:
# Plain FashionMNIST class names; list position is the class index.
raw_labels = [
    'T-shirt/top',
    'Trouser',
    'Pullover',
    'Dress',
    'Coat',
    'Sandal',
    'Shirt',
    'Sneaker',
    'Bag',
    'Ankle boot',
]

# Prompts for the generated images: one "illusion object <class>" prompt per
# class, followed by a final "no illusion object" option at index 10.
labels = [f"illusion object {class_name}" for class_name in raw_labels]
labels.append("no illusion object")

In [None]:
# Pass every prompt through the "eval" text processor before it is given to
# the model. Both lists are replaced in place by their processed versions.
process_text = text_processors["eval"]
labels = list(map(process_text, labels))
raw_labels = list(map(process_text, raw_labels))

In [None]:
# Add one None-filled prediction column per image variant; the evaluation
# loop fills them in row by row.
for answer_column in (
    "raw_answer",
    "ill_answer",
    "illless_answer",
    "ill_filter_answer",
    "illless_filter_answer",
):
    df[answer_column] = None

In [None]:
def inference(img, labels, model, vis_processors, device):
    """Pick the text prompt that best matches an image.

    Args:
        img: Image accepted by ``vis_processors["eval"]``.
        labels: Sequence of candidate text prompts, handed to the model as
            ``text_input``.
        model: Object exposing ``extract_features(sample)`` whose result has
            ``image_embeds_proj`` and ``text_embeds_proj`` attributes.
        vis_processors: Mapping whose ``"eval"`` entry converts ``img`` into a
            tensor (a batch dimension is added here).
        device: Device the processed image tensor is moved to.

    Returns:
        int: Index into ``labels`` of the prompt with the highest image-text
        similarity.
    """
    image = vis_processors["eval"](img).unsqueeze(0).to(device)
    sample = {"image": image, "text_input": labels}

    # Inference only: no gradients are needed, so skip autograd bookkeeping.
    with torch.no_grad():
        clip_features = model.extract_features(sample)
        image_features = clip_features.image_embeds_proj
        text_features = clip_features.text_embeds_proj
        # Similarity of the single image (row 0) to every prompt.
        sims = (image_features @ text_features.t())[0]

    # Temperature scaling and softmax are both monotonic, so the arg-max of
    # the raw similarities is the arg-max of the resulting probabilities.
    return int(torch.argmax(sims).item())

In [None]:
%cd '/content/FashionMnist_test'

/content/FashionMnist_test


In [None]:
# One entry per image variant: (result column, image sub-directory, prompts).
# Raw images are scored against the plain class names; every generated variant
# is scored against the "illusion object ..." prompts plus "no illusion object".
IMAGE_VARIANTS = [
    ("raw_answer", "raw_images", raw_labels),
    ("ill_answer", "ill_images", labels),
    ("illless_answer", "illusionless_images", labels),
    ("ill_filter_answer", "illusion_images_filtered", labels),
    ("illless_filter_answer", "illusionless_images_filtered", labels),
]

for index, row in tqdm(df.iterrows(), total=len(df)):
    for answer_column, image_dir, candidate_labels in IMAGE_VARIANTS:
        # The context manager closes the image file as soon as the RGB copy
        # exists, instead of leaving the handle open.
        with Image.open(f"./{image_dir}/{row['image_name']}.jpg") as image_file:
            rgb_image = image_file.convert("RGB")

        # Store the index of the best-matching prompt for this row and variant.
        df.loc[index, answer_column] = inference(
            rgb_image, candidate_labels, model, vis_processors, device
        )

  0%|          | 0/1152 [00:00<?, ?it/s]

In [None]:
# Display the table with the five prediction columns filled in by the loop above.
df

Unnamed: 0,image_name,Pprompt,Nprompt,illusion_strength,label,raw_answer,ill_answer,illless_answer,ill_filter_answer,illless_filter_answer
0,FashionMnist_1,Two friends hiking on a mountain during sunset,low quality,1.5,9,7,10,10,7,10
1,FashionMnist_2,A starry night sky with constellations glowing,low quality,1.5,2,8,6,10,6,10
2,FashionMnist_3,A Thanksgiving feast giving thanks and love,low quality,1.5,1,1,1,10,1,10
3,FashionMnist_4,A gecko on a large leaf in a tropical rainforest,low quality,1.5,1,1,10,10,1,10
4,FashionMnist_5,A Hanukkah menorah lighting symbolizing faith,low quality,1.5,6,6,3,10,4,10
...,...,...,...,...,...,...,...,...,...,...
1147,FashionMnist_1148,"Electrified waterfalls cascade, neon rivers re...",low quality,1.5,2,6,10,10,4,10
1148,FashionMnist_1149,A rocky desert landscape with cacti and tumble...,low quality,1.5,2,6,10,10,0,10
1149,FashionMnist_1150,A majestic waterfall plunging into a crystal-c...,low quality,1.5,5,5,10,10,5,10
1150,FashionMnist_1151,"Eerie silence surrounds snowy mountains, glowi...",low quality,1.5,3,1,10,10,4,10


In [None]:
# Save the predictions next to the dataset directory, using the configured
# `local` root rather than a literal path. The DataFrame index is not written.
df.to_csv(f"{local}CLIP_FashionMNIST_test.csv", index=False)