In [None]:
# Mount Google Drive at /content/drive (Colab only). Later cells read the model
# checkpoint from, and write result CSVs to, paths under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Install and import packages

In [None]:
# Install runtime dependencies. Only peft, the torch/torchvision/torchaudio trio and
# transformers are pinned; every other package resolves to whatever is current at install time.
# NOTE(review): the pinned torch stack is installed after packages that depend on torch
# (accelerate, timm, bitsandbytes, ...), so pip may swap torch underneath them — consider
# installing the pinned torch stack first and pinning the remaining packages.
!pip install accelerate -U
!pip install omegaconf
!pip install iopath
!pip install peft==0.2.0
!pip install timm
!pip install decord
!pip install webdataset
!pip install wandb
!pip install visual_genome
!pip install bitsandbytes
!pip install torch==2.1.2 torchvision==0.16.2 torchaudio==2.1.2
!pip install transformers==4.37.2

In [None]:
import torch
import numpy as np
import random
from PIL import Image
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
from huggingface_hub import login
from google.colab import userdata
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import json
import os

In [None]:
# Build a torch.device in both cases so `device` has the same type (and attributes
# such as `device.type`) whether or not a GPU is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Make runs reproducible: one seed shared by every random number generator used here.
GLOBAL_SEED = 10

# Seed NumPy, Python's `random` module and PyTorch with the same value.
np.random.seed(GLOBAL_SEED)
random.seed(GLOBAL_SEED)
torch.manual_seed(GLOBAL_SEED)
# Request deterministic PyTorch algorithms; the CUBLAS_WORKSPACE_CONFIG variable set on
# the next line goes together with this setting (see the PyTorch reproducibility notes).
torch.use_deterministic_algorithms(True)
%env CUBLAS_WORKSPACE_CONFIG=:4096:8

# cuDNN: deterministic mode on, benchmark mode off.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

env: CUBLAS_WORKSPACE_CONFIG=:4096:8


### Prepare Model



Download Llama2 7b chat

In [None]:
# Fetch the Llama-2 7B chat repository from the Hugging Face Hub into /content.
%cd /content
! git clone https://huggingface.co/daryl149/llama-2-7b-chat-hf

/content
Cloning into 'llama-2-7b-chat-hf'...
remote: Enumerating objects: 36, done.[K
remote: Total 36 (delta 0), reused 0 (delta 0), pack-reused 36 (from 1)[K
Unpacking objects: 100% (36/36), 484.50 KiB | 2.82 MiB/s, done.
Filtering content: 100% (3/3), 4.55 GiB | 4.16 MiB/s, done.
Encountered 1 file(s) that may not have been copied correctly on Windows:
	pytorch_model-00001-of-00002.bin

See: `git lfs help smudge` for more details.


In [None]:
# Report the on-disk size of the cloned Llama directory.
! du -sh /content/llama-2-7b-chat-hf

26G	/content/llama-2-7b-chat-hf


Clone the MiniGPT-4 repository

In [None]:
# Clone the MiniGPT-4 source into the current working directory.
! git clone https://github.com/Vision-CAIR/MiniGPT-4.git

Cloning into 'MiniGPT-4'...
remote: Enumerating objects: 1797, done.[K
remote: Counting objects: 100% (884/884), done.[K
remote: Compressing objects: 100% (226/226), done.[K
remote: Total 1797 (delta 719), reused 658 (delta 658), pack-reused 913[K
Receiving objects: 100% (1797/1797), 65.21 MiB | 31.51 MiB/s, done.
Resolving deltas: 100% (1047/1047), done.


**Now add the locations of the Llama weights and the MiniGPT-v2 checkpoint to the following config files:**

MiniGPT-4/minigpt4/configs/models/minigpt_v2.yaml -> line 14: set the Llama path (here: /content/llama-2-7b-chat-hf)

MiniGPT-4/eval_configs/minigptv2_eval.yaml
 --> set the checkpoint path to
  /content/drive/MyDrive/Final_project/checkpoint_stage3.pth

### Import dataset

In [None]:
# Dataset selection. The commented-out pair below switches the notebook to the MNIST variant.
# datasetName = "Mnist" # dataset name
# huggigface_repository_path = "VQA-Illusion/Mnist" # hugging-face dataset path
datasetName = "FashionMnist_test" # dataset name; also used as the directory name of the clone and in the result file name
huggigface_repository_path = "VQA-Illusion/FashionMnist_test" # hugging-face dataset path
local_dataset_path = "/content"  # directory the dataset repository is cloned into

# for evaluation inference
batchSize = 4  # batch size passed to the DataLoader
# batchSize = 8

In [None]:
# Read the Hugging Face token from the Colab secret store and log in with it.
# access_token = userdata.get('HF_TOKEN_ALL')
access_token = userdata.get('HUGGINGFACE_WRITE_ACCESS_TOKEN')
login(token = access_token)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
# Clone the dataset repository from the Hugging Face Hub into local_dataset_path.
USERNAME = userdata.get('HUGGINGFACE_USERNAME')
ACCESS_TOKEN = access_token
%cd {local_dataset_path}
# NOTE(review): the access token is interpolated into the clone URL, so it can end up in
# the clone's git remote configuration and in shell output — clear outputs before sharing
# and consider an authenticated download that keeps the token out of the URL.
!git clone 'https://{USERNAME}:{ACCESS_TOKEN}@huggingface.co/datasets/{huggigface_repository_path}'
# move to dataset directory
%cd {datasetName}

/content
Cloning into 'FashionMnist_test'...
remote: Enumerating objects: 5781, done.[K
remote: Counting objects: 100% (6/6), done.[K
remote: Compressing objects: 100% (6/6), done.[K
remote: Total 5781 (delta 3), reused 0 (delta 0), pack-reused 5775 (from 1)[K
Receiving objects: 100% (5781/5781), 888.61 KiB | 10.71 MiB/s, done.
Resolving deltas: 100% (3/3), done.
Updating files: 100% (5765/5765), done.
Filtering content: 100% (5761/5761), 1.34 GiB | 15.13 MiB/s, done.
[Errno 2] No such file or directory: 'FashionMnist'
/content


In [None]:
# NOTE(review): this path is hard-coded; it has to be edited by hand whenever
# datasetName / local_dataset_path change.
# %cd /content/Mnist
%cd /content/FashionMnist_test
# Per-image metadata table; the image_name and label columns are read by Illusion_Dataset.
df = pd.read_csv('./df_data.csv')
df

/content/FashionMnist_test


Unnamed: 0,image_name,Pprompt,Nprompt,illusion_strength,label
0,FashionMnist_1,Two friends hiking on a mountain during sunset,low quality,1.5,9
1,FashionMnist_2,A starry night sky with constellations glowing,low quality,1.5,2
2,FashionMnist_3,A Thanksgiving feast giving thanks and love,low quality,1.5,1
3,FashionMnist_4,A gecko on a large leaf in a tropical rainforest,low quality,1.5,1
4,FashionMnist_5,A Hanukkah menorah lighting symbolizing faith,low quality,1.5,6
...,...,...,...,...,...
1147,FashionMnist_1148,"Electrified waterfalls cascade, neon rivers re...",low quality,1.5,2
1148,FashionMnist_1149,A rocky desert landscape with cacti and tumble...,low quality,1.5,2
1149,FashionMnist_1150,A majestic waterfall plunging into a crystal-c...,low quality,1.5,5
1150,FashionMnist_1151,"Eerie silence surrounds snowy mountains, glowi...",low quality,1.5,3


In [None]:
class Illusion_Dataset(Dataset):
    """Dataset yielding five renderings of one image, two prompts and the label.

    For the ``image_name`` of a row in ``df`` the file ``{image_name}.jpg`` is
    loaded from each of the sub-folders ``raw_images``, ``ill_images``,
    ``illusionless_images``, ``illusion_images_filtered`` and
    ``illusionless_images_filtered`` of ``{root_dir}/{datasetname}`` and
    converted to RGB.

    Args:
        df: table with at least the columns ``image_name`` and ``label``.
        datasetname: name of the dataset directory under ``root_dir``.
        ill_prompt: prompt returned unchanged as the first element of every item.
        raw_prompt: prompt returned unchanged as the second element of every item.
        transform: optional callable applied to each of the five images.
        root_dir: directory that contains the dataset directory
            (default ``"/content"``).
    """

    # Image sub-folders, in the order their images appear in the returned tuple.
    _IMAGE_DIRS = (
        "raw_images",
        "ill_images",
        "illusionless_images",
        "illusion_images_filtered",
        "illusionless_images_filtered",
    )

    def __init__(self, df, datasetname, ill_prompt, raw_prompt, transform=None, root_dir="/content"):
        self.df = df
        self.transform = transform
        self.datasetname = datasetname
        self.ill_prompt = ill_prompt
        self.raw_prompt = raw_prompt
        self.root_dir = root_dir

    def __len__(self):
        """Return the number of rows in ``df``."""
        return len(self.df)

    def _load_rgb(self, subdir, image_name):
        """Open one image from ``subdir`` and return an RGB copy of it."""
        path = f"{self.root_dir}/{self.datasetname}/{subdir}/{image_name}.jpg"
        # The context manager closes the underlying file as soon as the RGB
        # copy exists, instead of leaving the handle open until garbage collection.
        with Image.open(path) as source_image:
            return source_image.convert("RGB")

    def __getitem__(self, idx):
        """Return the item at positional index ``idx``.

        Returns:
            Tuple ``(ill_prompt, raw_prompt, raw_image, ill_image,
            ill_less_image, ill_filtered_image, ill_less_filtered_image,
            true_label, image_name)``.
        """
        row = self.df.iloc[idx]  # one positional lookup serves both fields
        image_name = row['image_name']
        true_label = row['label']

        images = [self._load_rgb(subdir, image_name) for subdir in self._IMAGE_DIRS]

        if self.transform:
            images = [self.transform(image) for image in images]

        raw_image, ill_image, ill_less_image, ill_filtered_image, ill_less_filtered_image = images

        return self.ill_prompt, self.raw_prompt, raw_image, ill_image, ill_less_image, ill_filtered_image, ill_less_filtered_image, true_label, image_name

In [None]:
# prepare your datasets' labels:

# raw_labels = [
#     "digit 0",
#     "digit 1",
#     "digit 2",
#     "digit 3",
#     "digit 4",
#     "digit 5",
#     "digit 6",
#     "digit 7",
#     "digit 8",
#     "digit 9",
# ]
# Class names offered to the model for the raw (non-illusion) images.
raw_labels = [
    'T-shirt/top',
    'Trouser',
    'Pullover',
    'Dress',
    'Coat',
    'Sandal',
    'Shirt',
    'Sneaker',
    'Bag',
    'Ankle boot',
]

# ill_labels = [
#     "illusion digit 0",
#     "illusion digit 1",
#     "illusion digit 2",
#     "illusion digit 3",
#     "illusion digit 4",
#     "illusion digit 5",
#     "illusion digit 6",
#     "illusion digit 7",
#     "illusion digit 8",
#     "illusion digit 9",
# ]
# Illusion labels are the raw labels, each prefixed with "illusion ".
ill_labels = [f"illusion {class_name}" for class_name in raw_labels]
# ill_labels = ['illusion T-shirt/top', 'illusion Trouser', 'illusion Pullover', 'illusion Dress', 'illusion Coat', 'illusion Sandal',
#               'illusion Shirt', 'illusion Sneaker', 'illusion Bag', 'illusion Ankle boot', 'No illusion']


# Single-quote every raw label and separate them with ", " for use inside the prompt.
raw_class_names_str = "'{}'".format("', '".join(raw_labels))
# raw_prompt = f"what digit is placed in the image? {raw_class_names_str}"
raw_prompt = f"what object is placed in the image? choose one of these labels: {raw_class_names_str}"
# raw_prompt = f"Which class is in the picture: {raw_class_names_str}\nJust answer the correct class in a single word."
# raw_prompt = f"what cloth is placed in the image? {raw_class_names_str}"
# raw_prompt = f"This image contains an object in it. \
#     The object classes are {raw_class_names_str} . \
#     Please just generate answer in a single word and select one of the object classes mentioned that is in the image. \
#     For example, if you detect a 'T-shirt/top' just generate 'T-shirt/top'."

# raw_prompt = f"what object is placed in the image? The object is one of theses objects: {raw_class_names_str}. \
# Please just generate answer in a single word. The answer must be one of the objects mentioned."

# Same quoting for the illusion labels.
ill_class_names_str = "'{}'".format("', '".join(ill_labels))
# ill_prompt = f"There might be a hidden digit in the image. if it doesn't exist say No illusion digits. if you can determine a digit choose one of these labels for it: {ill_class_names_str}"
ill_prompt = f"There might be a hidden object in the image. if it doesn't exist say 'No illusion'. if you can determine a object choose one of these labels for it: {ill_class_names_str}"
# ill_prompt = f"There might be an illusion of something in the image or not. these are classes that illusion of them might be in\
    # the picture: {class_names_str}\nJust answer the correct class in a single word."
# ill_prompt = f"This image may contains illusion of a hidden object in it or not. \
#     The hidden object classes are {ill_class_names_str} . \
#     Please just generate answer in a single word and select one of hidden object classes mentioned. \
#     For example, if you detect a 'T-shirt/top' as a hidden object just generate 'T-shirt/top'.  if you do not detect a hidden object just generate 'No hidden object'."

# ill_prompt = "The image may contains an illusion hidden object in it or not. \
#     Either it contains an illusion hidden object, or it does not contain an illusion hidden object.\
#     If it contains an illusion hidden object, the illusion hidden object is from Fashion MNIST dataset.\
#     The dataset has 10 different objects. These objects are: 'T-shirt/top' , 'Trouser' , 'Pullover' , 'Dress' , 'Coat' , 'Sandal' , 'Shirt' , 'Sneaker' , 'Bag' , 'Ankle boot' . \
#     Please just generate answer in a single word. If it does not contain a hidden object just generate 'No Illusion' .\
#     Otherwise if it contains an illusion hidden object just generate one of the 10 classes above in one word. \
#     For example, if you detect a 'T-shirt/top' just generate 'T-shirt/top'. if you do not detect any hidden obeject just generate 'No Illusion' .
# "

# ill_prompt = "The image may contains an illusion hidden object in it or not. Either it contains an illusion hidden object, or it does not contain an illusion hidden object. If it contains an illusion hidden object, the illusion hidden object is from Fashion MNIST dataset. The dataset has 10 different objects. These objects are: 'T-shirt/top' , 'Trouser' , 'Pullover' , 'Dress' , 'Coat' , 'Sandal' , 'Shirt' , 'Sneaker' , 'Bag' , 'Ankle boot' . Please just generate answer in a single word. If it does not contain a hidden object just generate 'No Illusion' .Otherwise if it contains an illusion hidden object just generate one of the 10 classes above in one word. For example, if you detect a 'T-shirt/top' just generate 'T-shirt/top'. if you do not detect any hidden obeject just generate 'No Illusion' ."

# ill_prompt = f"The image might contain be an illusion object. If it doesn't contain say 'No illusion'. \
# If you can determine an illusion object choose one of these labels for it: {ill_class_names_str}. Otherwise just say\
# 'No illisuion'. Please just generate answer in a single word. The answer must be one of the labels or 'No illusion'."

In [None]:
from torchvision import transforms
import numpy as np

# Preprocessing applied to every image returned by the dataset.
# NOTE(review): the `vis_processor` returned by init_model is not used in the visible
# cells; confirm that this resize + ImageNet normalization matches what the checkpoint expects.
transform = transforms.Compose([
    transforms.Resize((448, 448)),  # Resize images to 448x448
    transforms.ToTensor(),  # Convert PIL image to tensor
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),  # Normalize with ImageNet mean and std
])

dataset = Illusion_Dataset(df, datasetName, ill_prompt, raw_prompt, transform)

# Create a data loader; shuffle is off, so batches follow the row order of df
data_loader = DataLoader(dataset, batch_size = batchSize, shuffle = False)

Optional: a helper for displaying a normalized image tensor.

In [None]:
def show_image(tensor):
    """Display a normalized image tensor with matplotlib.

    Reverses the mean/std normalization, clamps the values to [0, 1], converts
    the result to an 8-bit PIL image and shows it without axes.

    Args:
        tensor: channel-first image tensor with 3 channels (it is broadcast
            against mean/std of shape (3, 1, 1) and transposed with (1, 2, 0)).
            It may live on any device and may require grad.
    """
    # Detach and move to CPU before any arithmetic: mean/std below are CPU
    # tensors, so combining them with a CUDA tensor would raise a device
    # mismatch, and `.numpy()` refuses tensors that require grad.
    tensor = tensor.detach().cpu()

    # Reverse the normalization
    mean = torch.tensor([0.485, 0.456, 0.406]).view(-1, 1, 1)
    std = torch.tensor([0.229, 0.224, 0.225]).view(-1, 1, 1)
    tensor = tensor * std + mean

    # Clamp the values in the tensor to be between 0 and 1
    tensor = torch.clamp(tensor, 0, 1)

    # Convert to numpy and put the channel dimension last
    numpy_array = tensor.numpy().transpose(1, 2, 0)

    # Convert the numpy array to a PIL Image
    image = Image.fromarray((numpy_array * 255).astype(np.uint8))

    # Display the image using matplotlib
    plt.imshow(image)
    plt.axis('off')  # Remove axis
    plt.show()

### Import model

In [None]:
# clear cuda
# del model
# del vis_processor

In [None]:
# torch.cuda.empty_cache()

In [None]:
# Work from the MiniGPT-4 checkout: the relative --cfg-path used below is resolved
# against the current directory (presumably the `minigpt4` imports rely on it as well).
%cd /content/MiniGPT-4

/content/MiniGPT-4


In [None]:
import os
import re
import json
import argparse
from collections import defaultdict
import random
import numpy as np
from PIL import Image
from tqdm import tqdm
import torch
from torch.utils.data import DataLoader
from minigpt4.common.config import Config
from minigpt4.common.eval_utils import prepare_texts, init_model, eval_parser, computeIoU
from minigpt4.conversation.conversation import CONV_VISION_minigptv2


def list_of_str(arg):
    """Split a comma-separated string into a list of its string pieces."""
    return [str(piece) for piece in arg.split(',')]

# Build the evaluation argument parser and parse a fixed argument list that points
# at the MiniGPT-v2 eval config (path relative to the current directory).
parser = eval_parser()
args = parser.parse_args(["--cfg-path", "eval_configs/minigptv2_eval.yaml"])

In [None]:
# Build the model and its visual processor from the parsed eval arguments.
model, vis_processor = init_model(args)
# Use a copy of the MiniGPT-v2 conversation template with its system text blanked out.
conv_temp = CONV_VISION_minigptv2.copy()
conv_temp.system = ""

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Initialization Model


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

  return self.fget.__get__(instance, owner)()


trainable params: 33554432 || all params: 6771970048 || trainable%: 0.49548996469513035


100%|██████████| 1.89G/1.89G [00:22<00:00, 90.3MB/s]


Position interpolate from 16x16 to 32x32
Load Minigpt-4-LLM Checkpoint: /content/drive/MyDrive/Final_project/checkpoint_stage3.pth
Initialization Finished


### Evaluate model

In [None]:
# Add the five answer columns to df, each initialised to None.
for answer_column in (
    "raw_answer",
    "ill_answer",
    "ill_less_answer",
    "ill_filtered_answer",
    "ill_less_filtered_answer",
):
    df[answer_column] = None

In [None]:
import traceback

# Run the model over every batch and collect one prediction record per image.
# Each image variant (raw, illusion, illusion-less, and the two filtered versions)
# is answered separately; only the raw images are asked the raw prompt, every
# other variant is asked the illusion prompt.
model.eval()

model_predicts = []

# One path for both the periodic checkpoints and the final write.
results_csv_path = f'/content/drive/MyDrive/Final_project/Experiments/{datasetName}_minigptv2_.csv'

# Answer columns, in the order they appear in each prediction record.
answer_columns = ["raw_answer", "ill_answer", "ill_less_answer", "ill_filtered_answer", "ill_less_filtered_answer"]

try:
    for batch_idx, (ill_prompts, raw_prompts, raw_images, ill_images, ill_less_images, ill_filtered_images, ill_less_filtered_images, true_labels, image_ids) in tqdm(enumerate(data_loader), total=len(data_loader)):

        raw_texts = prepare_texts(raw_prompts, conv_temp)
        ill_texts = prepare_texts(ill_prompts, conv_temp)

        # (images, prompt texts) per variant, aligned with answer_columns.
        variants = [
            (raw_images, raw_texts),
            (ill_images, ill_texts),
            (ill_less_images, ill_texts),
            (ill_filtered_images, ill_texts),
            (ill_less_filtered_images, ill_texts),
        ]

        # Images are cast to float16 before generation; decoding is greedy
        # (do_sample=False) and capped at 20 new tokens.
        batch_answers = [
            model.generate(images.to(dtype=torch.float16), texts, max_new_tokens=20, do_sample=False)
            for images, texts in variants
        ]

        # zip stops at the shortest sequence, so a record is only written when
        # every variant produced an answer for that image.
        for true_label, img_id, *row_answers in zip(true_labels, image_ids, *batch_answers):
            record = {"image_name": img_id, "label": true_label.item()}
            record.update(zip(answer_columns, row_answers))
            model_predicts.append(record)

        # Checkpoint every 10th batch so a long run can be inspected or recovered.
        if batch_idx % 10 == 0:
            pd.DataFrame(model_predicts).to_csv(results_csv_path, index=False)

except Exception:
    # Best effort: report the failure with its full traceback instead of only
    # the message; whatever was collected so far is still written below.
    traceback.print_exc()

finally:
    # Always persist the collected predictions: after a complete run (whose last
    # batches are not covered by the every-10th-batch checkpoint), after an
    # error, and after a manual interrupt.
    resultDf = pd.DataFrame(model_predicts)
    resultDf.to_csv(results_csv_path, index=False)


100%|██████████| 288/288 [2:05:38<00:00, 26.18s/it]


In [None]:
# Rebuild the result table from all collected prediction records and display it.
resultDf = pd.DataFrame(model_predicts)
resultDf

Unnamed: 0,image_name,label,raw_answer,ill_answer,ill_less_answer,ill_filtered_answer,ill_less_filtered_answer
0,FashionMnist_1,9,<p>T-shirt/top</p> {<1><21><9,illusion T-shirt/top,illusion T-shirt/top,illusion T-shirt/top,No illusion
1,FashionMnist_2,2,<p>T-shirt/top</p> {<14><2><8,illusion T-shirt/top,illusion T-shirt/top,illusion T-shirt/top,illusion T-shirt/top
2,FashionMnist_3,1,<p>T-shirt/top</p> {<32><1><7,illusion T-shirt/top,illusion T-shirt/top,illusion T-shirt/top,illusion T-shirt/top
3,FashionMnist_4,1,<p>T-shirt/top</p> {<33><1><7,illusion T-shirt/top,illusion T-shirt/top,illusion T-shirt/top,illusion T-shirt/top
4,FashionMnist_5,6,T-shirt/top,illusion T-shirt/top,illusion T-shirt/top,illusion T-shirt/top,illusion T-shirt/top
...,...,...,...,...,...,...,...
1147,FashionMnist_1148,2,<p>T-shirt/top</p> {<15><1><8,illusion T-shirt/top,illusion T-shirt/top,illusion T-shirt/top,illusion T-shirt/top
1148,FashionMnist_1149,2,T-shirt/top,illusion T-shirt/top,illusion T-shirt/top,illusion T-shirt/top,illusion T-shirt/top
1149,FashionMnist_1150,5,<p>T-shirt/top</p> {<1><28><9,illusion T-shirt/top,illusion T-shirt/top,illusion T-shirt/top,illusion T-shirt/top
1150,FashionMnist_1151,3,<p>T-shirt/top</p> {<30><1><6,illusion T-shirt/top,illusion T-shirt/top,illusion T-shirt/top,No illusion


In [None]:
# Save the full result table to Drive; this is the same file the evaluation loop
# writes its checkpoints to, so it is overwritten with the complete results.
# resultDf.to_csv(f'/content/drive/MyDrive/Final_project/Evaluations/{datasetName}_minigptv2_.csv', index=False)
resultDf.to_csv(f'/content/drive/MyDrive/Final_project/Experiments/{datasetName}_minigptv2_.csv', index=False)