In [None]:
# Mount Google Drive at /content/drive; later cells read the model checkpoint
# from it and write the evaluation CSV to it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Install and import packages

In [None]:
# Install the notebook's dependencies into the Colab runtime.
# Only peft, torch/torchvision/torchaudio and transformers are version-pinned;
# every other package resolves to whatever is current at install time.
# NOTE(review): the pinned torch install runs after the unpinned installs, so it
# presumably decides which torch build ends up active — keep it late in the list.
!pip install accelerate -U
!pip install omegaconf
!pip install iopath
!pip install peft==0.2.0
!pip install timm
!pip install decord
!pip install webdataset
!pip install wandb
!pip install visual_genome
!pip install bitsandbytes
!pip install torch==2.1.2 torchvision==0.16.2 torchaudio==2.1.2
!pip install transformers==4.37.2

Collecting accelerate
  Downloading accelerate-0.30.1-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.w

Collecting iopath
  Downloading iopath-0.1.10.tar.gz (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.2/42.2 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting portalocker (from iopath)
  Downloading portalocker-2.8.2-py3-none-any.whl (17 kB)
Building wheels for collected packages: iopath
  Building wheel for iopath (setup.py) ... [?25l[?25hdone
  Created wheel for iopath: filename=iopath-0.1.10-py3-none-any.whl size=31532 sha256=77c5681a86f40fd7c28df797bad239de6d819253acb884f50301281c62f1b0af
  Stored in directory: /root/.cache/pip/wheels/9a/a3/b6/ac0fcd1b4ed5cfeb3db92e6a0e476cfd48ed0df92b91080c1d
Successfully built iopath
Installing collected packages: portalocker, iopath
Successfully installed iopath-0.1.10 portalocker-2.8.2
Collecting peft==0.2.0
  Downloading peft-0.2.0-py3-none-any.whl (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.3/40.3 kB[0m [31m2.2 MB/s[

In [None]:
import torch
import numpy as np
import random
from PIL import Image
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
from huggingface_hub import login
from google.colab import userdata
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import json
import os

In [None]:
# Pick the compute device once. Always a torch.device (the original produced a
# plain "cpu" string on machines without CUDA), so the value has one type.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Make reproducible code
GLOBAL_SEED = 10

# Seed each random number generator the notebook imports: NumPy, Python's
# `random`, and PyTorch.
np.random.seed(GLOBAL_SEED)
random.seed(GLOBAL_SEED)
torch.manual_seed(GLOBAL_SEED)
# Ask PyTorch to use deterministic algorithms only.
torch.use_deterministic_algorithms(True)
# cuBLAS workspace setting that goes together with the deterministic mode above.
# NOTE(review): this presumably has to be set before the first CUDA matmul — confirm.
%env CUBLAS_WORKSPACE_CONFIG=:4096:8

# Same intent for cuDNN: deterministic kernels, no autotuning benchmark.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

env: CUBLAS_WORKSPACE_CONFIG=:4096:8


### Prepare Model



Download the Llama-2 7B chat weights

In [None]:
%cd /content
! git clone https://huggingface.co/daryl149/llama-2-7b-chat-hf

/content
fatal: destination path 'llama-2-7b-chat-hf' already exists and is not an empty directory.


In [None]:
# Report the on-disk size of the downloaded Llama-2 directory.
! du -sh /content/llama-2-7b-chat-hf

26G	/content/llama-2-7b-chat-hf


Clone the MiniGPT-4 repository

In [None]:
! git clone https://github.com/Vision-CAIR/MiniGPT-4.git

Cloning into 'MiniGPT-4'...
remote: Enumerating objects: 1797, done.[K
remote: Counting objects: 100% (884/884), done.[K
remote: Compressing objects: 100% (226/226), done.[K
remote: Total 1797 (delta 719), reused 658 (delta 658), pack-reused 913[K
Receiving objects: 100% (1797/1797), 65.21 MiB | 27.87 MiB/s, done.
Resolving deltas: 100% (1047/1047), done.


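Before loading the model, edit the MiniGPT-4 config files so they point at your local copies: set the Llama-2 path (`llama_model`, in `minigpt4/configs/models/minigpt_v2.yaml`) to `/content/llama-2-7b-chat-hf`, and set the checkpoint path (`ckpt`, in `eval_configs/minigptv2_eval.yaml`) to your MiniGPT-v2 checkpoint file on Drive.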

### Import dataset

In [None]:
# Dataset selection; these names are read by the download, loading and saving cells.
datasetName = "Mnist" # dataset folder name; also part of the results CSV file name
huggigface_repository_path = "VQA-Illusion/Mnist" # hugging-face dataset path (identifier spelling kept: other cells reference it)
local_dataset_path = "/content" # directory under which the dataset folder is placed

# for evaluation inference
batchSize = 4 # batch size handed to the DataLoader

In [None]:
# Read the Hugging Face token from Colab's secret store (secret name
# 'HF_TOKEN_ALL') and log in with it; the token itself is never written here.
access_token = userdata.get('HF_TOKEN_ALL')
login(token = access_token)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
USERNAME = userdata.get('HUGGINGFACE_USERNAME')
ACCESS_TOKEN = access_token
%cd {local_dataset_path}
!git clone 'https://{USERNAME}:{ACCESS_TOKEN}@huggingface.co/datasets/{huggigface_repository_path}'
# move to dataset directory
%cd {datasetName}

/content
Cloning into 'Mnist'...
remote: Enumerating objects: 5559, done.[K
remote: Counting objects: 100% (5556/5556), done.[K
remote: Compressing objects: 100% (5554/5554), done.[K
remote: Total 5559 (delta 2), reused 5555 (delta 2), pack-reused 3 (from 1)[K
Receiving objects: 100% (5559/5559), 813.88 KiB | 5.50 MiB/s, done.
Resolving deltas: 100% (2/2), done.
Updating files: 100% (5548/5548), done.
Filtering content: 100% (5545/5545), 952.21 MiB | 6.30 MiB/s, done.
/content/Mnist


In [None]:
%cd /content/Mnist
df = pd.read_csv('./df_data.csv')
df

/content/Mnist


Unnamed: 0,image_name,Pprompt,Nprompt,illusion_strength,label
0,Mnist_1,A field of blooming sunflowers swaying in the ...,low quality,1.5,7
1,Mnist_2,A peaceful countryside scene with grazing shee...,low quality,1.5,2
2,Mnist_3,A tranquil pond with lily pads floating on the...,low quality,1.5,1
3,Mnist_4,A sunny vineyard with rows of ripe grapes,low quality,1.5,0
4,Mnist_5,A picturesque vineyard at sunset with the sky ...,low quality,1.5,4
...,...,...,...,...,...
1104,Mnist_1105,Misty jungle surrounded by vibrant flowers and...,low quality,1.5,8
1105,Mnist_1106,A forest with blooming flowers,low quality,1.5,0
1106,Mnist_1107,"Desolate desert landscape, shifting sands illu...",low quality,1.5,5
1107,Mnist_1108,A vast desert with a towering canyon in the di...,low quality,1.5,0


In [None]:
class Illusion_Dataset(Dataset):
    """Dataset that yields every image variant of a sample together with the prompts.

    For the row at a given position in ``df`` it loads five JPEGs that share the
    row's ``image_name`` but live in different sub-directories of
    ``<root_dir>/<datasetname>/`` (see ``IMAGE_SUBDIRS``), converts each to RGB
    and, if a transform was given, applies it.

    Args:
        df: table with at least the columns ``image_name`` and ``label``.
        datasetname: name of the dataset folder below ``root_dir``.
        ill_prompt: prompt string returned unchanged with every item.
        raw_prompt: prompt string returned unchanged with every item.
        transform: optional callable applied to each loaded PIL image.
        root_dir: directory that contains the dataset folder. Defaults to
            ``"/content"``, the location that used to be hard-coded.
    """

    # Sub-directories holding the image variants, in the order they are returned.
    IMAGE_SUBDIRS = (
        "raw_images",
        "ill_images",
        "illusionless_images",
        "illusion_images_filtered",
        "illusionless_images_filtered",
    )

    def __init__(self, df, datasetname, ill_prompt, raw_prompt, transform=None, root_dir="/content"):
        self.df = df
        self.transform = transform
        self.datasetname = datasetname
        self.ill_prompt = ill_prompt
        self.raw_prompt = raw_prompt
        self.root_dir = root_dir

    def __len__(self):
        """Number of rows in the metadata table."""
        return len(self.df)

    def _load_image(self, subdir, image_name):
        """Open ``<subdir>/<image_name>.jpg`` as RGB and apply the transform, if any."""
        path = os.path.join(self.root_dir, self.datasetname, subdir, f"{image_name}.jpg")
        # convert() returns a new, fully loaded image, so the file can be closed
        # right away instead of staying open until garbage collection.
        with Image.open(path) as source:
            image = source.convert("RGB")
        if self.transform:
            image = self.transform(image)
        return image

    def __getitem__(self, idx):
        """Return the prompts, the five image variants, the label and the image name.

        Returns:
            Tuple ``(ill_prompt, raw_prompt, raw_image, ill_image, ill_less_image,
            ill_filtered_image, ill_less_filtered_image, true_label, image_name)``.
        """
        # Positional lookup, done once for both fields.
        row = self.df.iloc[idx]
        image_name = row['image_name']
        true_label = row['label']

        (
            raw_image,
            ill_image,
            ill_less_image,
            ill_filtered_image,
            ill_less_filtered_image,
        ) = [self._load_image(subdir, image_name) for subdir in self.IMAGE_SUBDIRS]

        return (
            self.ill_prompt,
            self.raw_prompt,
            raw_image,
            ill_image,
            ill_less_image,
            ill_filtered_image,
            ill_less_filtered_image,
            true_label,
            image_name,
        )

In [None]:
# Label vocabularies offered to the model, and the two prompts built from them.

# Classes for the plain images: "digit 0" ... "digit 9".
raw_labels = [f"digit {digit}" for digit in range(10)]

# Classes for the illusion images: a "nothing hidden" option followed by one
# class per digit.
ill_labels = ["no illusion digit"] + [f"illusion digit {digit}" for digit in range(10)]

# Each label is wrapped in single quotes and the labels are comma-separated.
raw_class_names_str = ", ".join(f"'{label}'" for label in raw_labels)
raw_prompt = f"what digit is placed in the image? {raw_class_names_str}"

ill_class_names_str = ", ".join(f"'{label}'" for label in ill_labels)
ill_prompt = f"There might be a hidden digit in the image. Choose one of these labels: {ill_class_names_str}"

In [None]:
from torchvision import transforms
import numpy as np

# Preprocessing applied to every image variant before it is batched.
transform = transforms.Compose([
    transforms.Resize((448, 448)),  # Resize images to 448x448
    transforms.ToTensor(),  # Convert PIL image to tensor
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),  # Normalize with ImageNet mean and std
])

# Dataset over the metadata table `df`, carrying both prompts with every item.
dataset = Illusion_Dataset(df, datasetName, ill_prompt, raw_prompt, transform)

# Create a data loader; shuffle is off, so batches follow the row order of `df`.
data_loader = DataLoader(dataset, batch_size = batchSize, shuffle = False)

Optional: a helper for displaying a normalized image tensor.

In [None]:
def show_image(tensor):
    """Display a single normalized image tensor with matplotlib.

    Args:
        tensor: one image laid out channel-first as (3, H, W), normalized with
            the ImageNet mean/std used by ``transform``. It may live on any
            device and have any floating dtype; it is not modified.
    """
    # Work on a detached float32 CPU copy: the statistics below are CPU float32
    # tensors, so a CUDA input would otherwise raise a device-mismatch error and
    # a float16 input would be de-normalized in half precision.
    tensor = tensor.detach().to(device="cpu", dtype=torch.float32)

    # Reverse the normalization
    mean = torch.tensor([0.485, 0.456, 0.406]).view(-1, 1, 1)
    std = torch.tensor([0.229, 0.224, 0.225]).view(-1, 1, 1)
    tensor = tensor * std + mean

    # Clamp the values in the tensor to be between 0 and 1
    tensor = torch.clamp(tensor, 0, 1)

    # Convert to numpy and move the channel dimension last: (H, W, 3)
    numpy_array = tensor.numpy().transpose(1, 2, 0)

    # Convert the numpy array to an 8-bit PIL Image
    image = Image.fromarray((numpy_array * 255).astype(np.uint8))

    # Display the image using matplotlib, without axes
    plt.imshow(image)
    plt.axis('off')
    plt.show()

### Import model

In [None]:
# clear cuda
# del model
# del vis_processor

In [None]:
# torch.cuda.empty_cache()

In [None]:
# Work from the MiniGPT-4 checkout: the next cell imports the `minigpt4`
# package and passes a config path relative to this directory.
%cd /content/MiniGPT-4

/content/MiniGPT-4


In [None]:
import os
import re
import json
import argparse
from collections import defaultdict
import random
import numpy as np
from PIL import Image
from tqdm import tqdm
import torch
from torch.utils.data import DataLoader
from minigpt4.common.config import Config
from minigpt4.common.eval_utils import prepare_texts, init_model, eval_parser, computeIoU
from minigpt4.conversation.conversation import CONV_VISION_minigptv2


def list_of_str(arg):
    """Split a comma-separated string into a list of its string pieces."""
    return [str(piece) for piece in arg.split(',')]

# Build MiniGPT-4's evaluation argument parser and hand it an explicit argument
# list (a notebook has no command line) selecting the MiniGPT-v2 eval config.
parser = eval_parser()

args = parser.parse_args([
    "--cfg-path", "eval_configs/minigptv2_eval.yaml",
])

In [None]:
# Build the model and its image processor from the parsed arguments.
model, vis_processor = init_model(args)
# Private copy of the MiniGPT-v2 conversation template with the system prompt
# blanked, so the shared template object itself is left untouched.
conv_temp = CONV_VISION_minigptv2.copy()
conv_temp.system = ""

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Initialization Model


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

  return self.fget.__get__(instance, owner)()


trainable params: 33554432 || all params: 6771970048 || trainable%: 0.49548996469513035


100%|██████████| 1.89G/1.89G [00:24<00:00, 83.9MB/s]


Position interpolate from 16x16 to 32x32
Load Minigpt-4-LLM Checkpoint: /content/drive/MyDrive/checkpoint_stage3.pth
Initialization Finished


### Evaluate model

In [None]:
# Add one empty (None) answer column to the metadata table per evaluated
# image variant.
for answer_column in (
    "raw_answer",
    "ill_answer",
    "ill_less_answer",
    "ill_filtered_answer",
    "ill_less_filtered_answer",
):
    df[answer_column] = None

In [None]:
# Run the model over the whole data loader and collect one record per image.
#
# Every batch is answered five times: once for the raw images with the raw
# prompt, and once for each of the four illusion variants with the illusion
# prompt. Decoding is greedy (do_sample=False) and capped at 20 new tokens.
model.eval()

model_predicts = []

# Decoding settings shared by every generate call.
generation_kwargs = {"max_new_tokens": 20, "do_sample": False}
partial_results_path = f'/content/drive/MyDrive/Final_project/Evaluations/{datasetName}_minigptv2_.csv'

try:
    for (ill_prompts, raw_prompts, raw_images, ill_images, ill_less_images,
         ill_filtered_images, ill_less_filtered_images, true_labels, image_ids) in tqdm(data_loader, total=len(data_loader)):

        raw_texts = prepare_texts(raw_prompts, conv_temp)
        ill_texts = prepare_texts(ill_prompts, conv_temp)

        # answer column -> (image batch, prompt batch). Only the raw images are
        # paired with the raw prompt. Dict order fixes both the order of the
        # generate calls and the column order of the resulting DataFrame.
        variants = {
            "raw_answer": (raw_images, raw_texts),
            "ill_answer": (ill_images, ill_texts),
            "ill_less_answer": (ill_less_images, ill_texts),
            "ill_filtered_answer": (ill_filtered_images, ill_texts),
            "ill_less_filtered_answer": (ill_less_filtered_images, ill_texts),
        }

        # Images are cast to float16 right before being handed to the model.
        answers = {
            column: model.generate(images.to(dtype=torch.float16), texts, **generation_kwargs)
            for column, (images, texts) in variants.items()
        }

        # Regroup the per-variant answer lists into one record per image.
        for true_label, img_id, *row_answers in zip(true_labels, image_ids, *answers.values()):
            record = {"image_name": img_id, "label": true_label.item()}
            record.update(zip(answers.keys(), row_answers))
            model_predicts.append(record)

except Exception:
    # Keep whatever was produced before the failure, then re-raise so the
    # failure is visible and later cells do not pass off a truncated result
    # set as a complete run.
    resultDf = pd.DataFrame(model_predicts)
    os.makedirs(os.path.dirname(partial_results_path), exist_ok=True)
    resultDf.to_csv(partial_results_path, index=False)
    print(f"Evaluation stopped after {len(model_predicts)} images; partial results saved to {partial_results_path}")
    raise


100%|██████████| 278/278 [1:49:42<00:00, 23.68s/it]


In [None]:
# Collect the per-image prediction records into a DataFrame and display it.
resultDf = pd.DataFrame(model_predicts)
resultDf

Unnamed: 0,image_name,label,raw_answer,ill_answer,ill_less_answer,ill_filtered_answer,ill_less_filtered_answer
0,Mnist_1,7,<p>digit 7</p> {<20><24><80><,illusion digit 8,no illusion digit,illusion digit 7,illusion digit 0
1,Mnist_2,2,<p>digit 2</p> {<27><10><96><,illusion digit 1,illusion digit 1,illusion digit 2,illusion digit 0
2,Mnist_3,1,<p>digit 1</p> {<37><12><67><,illusion digit 8,no illusion digit,no illusion digit,no illusion digit
3,Mnist_4,0,<p>digit 5</p> {<36><14><63><,no illusion digit,illusion digit 5,no illusion digit,illusion digit 5
4,Mnist_5,4,<p>digit 4</p> {<22><13><77><,illusion digit 5,illusion digit 5,illusion digit 5,illusion digit 5
...,...,...,...,...,...,...,...
1104,Mnist_1105,8,<p>digit 8</p> {<30><14><79><,illusion digit 5,illusion digit 5,no illusion digit,no illusion digit
1105,Mnist_1106,0,<p>digit 8</p> {<27><16><79><,illusion digit 8,illusion digit 5,illusion digit 8,illusion digit 5
1106,Mnist_1107,5,<p>digit 5</p> {<24><14><91><,illusion digit 5,no illusion digit,no illusion digit,no illusion digit
1107,Mnist_1108,0,<p>digit 5</p> {<15><33><86><,illusion digit 8,illusion digit 5,illusion digit 8,illusion digit 5


In [None]:
# Peek at the first few raw-prompt answers. The result table has no
# 'model answer' column (its answer columns are raw_answer, ill_answer, ...),
# so the previous lookup raised KeyError on a fresh run.
resultDf['raw_answer'].head(10).to_list()

['<p>The digit</p> {<20><24><80><97>} in the image is the number 7',
 'The digit in the image is the number 2.',
 '<p>The digit</p> {<37><12><68><87>} is 1',
 '<p>The digit</p> {<24><14><82><86>} in the image is a zero',
 '<p>The digit</p> {<22><15><77><90>} in the image is the number 4',
 '<p>The digit</p> {<37><16><69><89>} is 1',
 'The digit in the image is the number 4.',
 '<p>The digit</p> {<28><21><83><95>} in the image is a zero',
 '<p>The digit</p> {<16><14><91><85>} is 6',
 'The digit in the image is the number 8.']

In [None]:
# Save the results table to Drive. The target folder is created first so that
# the write does not fail on a missing directory.
results_csv_path = f'/content/drive/MyDrive/Final_project/Evaluations/{datasetName}_minigptv2_.csv'
os.makedirs(os.path.dirname(results_csv_path), exist_ok=True)
resultDf.to_csv(results_csv_path, index=False)