In [1]:
# Number of rows taken from the start of each dataset for evaluation.
n_evaluation_samples = 3 # 3 samples are used for evaluation
# n_evaluation_samples = None # all samples are used for evaluation (None leaves the `[:n]` row slice unbounded)

In [2]:
# Move to the evaluation data directory; the relative paths used below (images/, texts/, options_*.json) are resolved from here
%cd ../../data/evaluation_data

/home/pzou3/1_ResearchProjects/5_CodeRelease/3_ImplictAVE/ImplicitAVE/data/evaluation_data


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation import GenerationConfig
from sklearn.metrics import f1_score, accuracy_score
import torch
import numpy as np
import random
import gc
import os
import json
import pickle as pkl
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
def seed_everything(seed=42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's `random`, NumPy and PyTorch (CPU, the current CUDA device and
    all CUDA devices), then switches cuDNN to deterministic, non-benchmarking
    mode.

    Args:
        seed: Integer seed handed to each generator.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,  # covers the multi-GPU case
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # Give up cuDNN autotuning in exchange for run-to-run reproducibility.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


seed_everything(42)

In [5]:
# Load the tokenizer of the "Qwen/Qwen-VL" checkpoint. trust_remote_code=True
# lets transformers run the tokenizer code shipped with the checkpoint
# (presumably where `from_list_format`, used during inference, comes from).
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-VL", trust_remote_code=True)



In [6]:
# Load the "Qwen/Qwen-VL" checkpoint onto the GPU (device_map="cuda") and put it
# in eval mode, since the notebook only runs inference with it.
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-VL", device_map="cuda", trust_remote_code=True).eval()

  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(
The model is automatically converting to bf16 for faster inference. If you want to disable the automatic precision, please manually add bf16/fp16/fp32=True to "AutoModelForCausalLM.from_pretrained".
Loading checkpoint shards: 100%|██████████| 10/10 [00:10<00:00,  1.07s/it]


In [7]:
def preprocess(df, name_to_value):
  """Build the evaluation prompt and the gold label for every row of `df`.

  Args:
    df: DataFrame with the columns `id`, `attribute_names`, `texts` and
      `attribute_values`. It is only read; no column is added to it.
    name_to_value: Mapping from attribute name to the answer options that are
      inserted into the prompt.

  Returns:
    A tuple `(id_to_prompt, id_to_label)` of dicts keyed by each row's `id`.

  Raises:
    KeyError: If a required column is missing or an attribute name has no
      entry in `name_to_value`.
  """
  id_to_prompt = {}
  id_to_label = {}

  # Walk plain row dicts in a single pass instead of writing a `prompt` column
  # into `df`, so the caller's DataFrame is left untouched.
  for record in df.to_dict('records'):
    attribute = record['attribute_names']
    id_to_prompt[record['id']] = f"What is the {attribute} of this product: {record['texts']}? Answer with the option from the given choices directly: {name_to_value[attribute]}.\nAnswer:"
    id_to_label[record['id']] = record['attribute_values']

  return id_to_prompt, id_to_label

In [8]:
def extract_answer(text):
    """Return the text that follows the first "Answer:" marker in `text`.

    The answer starts right after the marker, skipping one separator character
    if that character is whitespace or a double quote. It ends at the next
    double quote, or at the end of `text` when no double quote follows. Anything
    else that trails the answer is returned unchanged.

    Args:
        text: String to search for the "Answer:" marker.

    Returns:
        The extracted answer, or the string "Answer not found" when the marker
        does not occur in `text`.
    """
    start_keyword = "Answer:"
    end_char = '"'

    # Locate the marker; without it there is nothing to extract.
    start_index = text.find(start_keyword)
    if start_index == -1:
        return "Answer not found"

    start_index += len(start_keyword)

    # Skip the character after the marker only when it is a real separator.
    # Skipping it unconditionally would eat the first letter of "Answer:Red".
    separator = text[start_index:start_index + 1]
    if separator.isspace() or separator == end_char:
        start_index += 1

    # The answer runs up to the next double quote, or to the end of the text.
    end_index = text.find(end_char, start_index)
    if end_index == -1:
        return text[start_index:]

    return text[start_index:end_index]

In [9]:
def inference(model, tokenizer, id_to_prompts, id_to_labels, img_dir):
  """Run the model on every sample that has an image and collect the answers.

  Args:
    model: Generation model exposing `.device` and `.generate(**inputs)`.
    tokenizer: Tokenizer exposing `from_list_format`, `__call__` and `decode`.
    id_to_prompts: Mapping from sample id to its text prompt.
    id_to_labels: Mapping from sample id to its gold label.
    img_dir: Directory expected to hold one `<id>.jpg` image per sample.

  Returns:
    A tuple `(id_to_pred, preds, labels)`. `preds` and `labels` are parallel
    lists; a sample whose image file is missing is skipped and appears in none
    of the three.
  """
  # The original code blindly cut the last 13 characters, which is the length
  # of this marker. NOTE(review): presumably the model's end-of-text token;
  # confirm against the tokenizer's special tokens.
  end_of_text = "<|endoftext|>"

  id_to_pred = {}
  preds = []
  labels = []

  for sample_id in id_to_prompts:
    img_path = os.path.join(img_dir, f'{sample_id}.jpg')

    # Skip before touching `labels`, otherwise labels and preds drift apart.
    if not os.path.exists(img_path):
      continue

    query = tokenizer.from_list_format([
      {'image': img_path},
      {'text': id_to_prompts[sample_id]},
    ])
    inputs = tokenizer(query, return_tensors='pt')
    inputs = inputs.to(model.device)
    pred = model.generate(**inputs)
    out = tokenizer.decode(pred.cpu()[0], skip_special_tokens=False)

    output = extract_answer(out)
    # Remove the end marker only when it is really there, so outputs without
    # it (e.g. "Answer not found") are not truncated.
    if output.endswith(end_of_text):
      output = output[:-len(end_of_text)]

    preds.append(output)
    labels.append(id_to_labels[sample_id])
    id_to_pred[sample_id] = output

    # Free cached GPU memory between samples.
    torch.cuda.empty_cache()
    gc.collect()

  return id_to_pred, preds, labels

In [10]:
def calculate_metrics(predictions, targets):
  """Score predictions by case-insensitive containment of the target string.

  A prediction counts as a hit when the lower-cased target occurs anywhere in
  the lower-cased prediction.

  Args:
    predictions: Sequence of predicted answer strings.
    targets: Sequence of gold answer strings, parallel to `predictions`.

  Returns:
    A tuple `(accuracy, micro_f1)`. `accuracy` is the fraction of hits (not the
    raw hit count). `micro_f1` is the micro-averaged F1 of the hit/miss vector
    against an all-ones reference. Both are 0.0 when there is nothing to score.

  Raises:
    ValueError: If `predictions` and `targets` differ in length.
  """
  if len(predictions) != len(targets):
    raise ValueError(
      f"predictions ({len(predictions)}) and targets ({len(targets)}) must have the same length"
    )
  if len(targets) == 0:
    return 0.0, 0.0

  binary_predictions = [1 if target.lower() in prediction.lower() else 0 for target, prediction in zip(targets, predictions)]
  # Every sample has exactly one correct answer, so the reference is all ones.
  binary_targets = [1] * len(targets)

  accuracy = sum(binary_predictions) / len(binary_predictions)
  micro_f1 = f1_score(binary_targets, binary_predictions, average='micro')
  return accuracy, micro_f1

In [11]:
# Dataset TSV files, read from the `texts/` directory by the evaluation loop.
# NOTE(review): "Jewlery" is presumably the spelling of the file on disk;
# confirm before correcting it.
file_names = [
    'Clothing_annotated_final.tsv',
    'Jewlery_and_General_Apparel_annotated_final.tsv',
    'Footwear_annotated_final.tsv',
    'Food_annotated_final.tsv',
    'Home_annotated_final.tsv'
]

In [12]:
# Evaluate the model on each dataset and report micro-F1 plus a few examples.
img_dir = "images"

# None or a negative count means "use every row". Slicing with a raw -1 would
# silently drop the last row instead.
if n_evaluation_samples is None or n_evaluation_samples < 0:
  sample_limit = None
else:
  sample_limit = n_evaluation_samples

for dataset in file_names:
  # Pick the answer-option file that matches the dataset's domain.
  if "Food" in dataset:
    options_file = "options_Food.json"
  elif "Home" in dataset:
    options_file = "options_Home.json"
  else:
    options_file = "options_Clothing_Shoes_and_Jewelry.json"

  with open(options_file, "r") as f:
    name_to_val = json.load(f)

  data = pd.read_csv(f'texts/{dataset}', sep='\t')[:sample_limit]

  id_to_prompts, id_to_labels = preprocess(data, name_to_val)
  id_to_pred, preds, labels = inference(model, tokenizer, id_to_prompts, id_to_labels, img_dir)
  both_acc, both_micro_f1 = calculate_metrics(preds, labels)

  # save predictions:
  # with open(f'results/Qwen_Vl_7B_preds_both_{dataset[:-4]}.pkl', 'wb') as f:
  #     pkl.dump(id_to_pred, f)

  print(f'For {dataset[:-4]} multi modalities micro_f1 was {both_micro_f1}')
  print('Prediction Examples: ', preds[:3])
  print('Label Examples: ', labels[:3])

  torch.cuda.empty_cache()
  gc.collect()

For Clothing_annotated_final multi modalities micro_f1 was 0.3333333333333333
Prediction Examples:  ['Short Sleeve', "'3/4 Sleeve'", 'Short Sleeve']
Label Examples:  ['3/4 Sleeve', '3/4 Sleeve', '3/4 Sleeve']
For Jewlery_and_General_Apparel_annotated_final multi modalities micro_f1 was 1.0
Prediction Examples:  ["'Argyle'", 'Argyle', "'Argyle'"]
Label Examples:  ['Argyle', 'Argyle', 'Argyle']
For Footwear_annotated_final multi modalities micro_f1 was 0.3333333333333333
Prediction Examples:  ['Mid Calf', "'Ankle Boot'", 'Mid Calf']
Label Examples:  ['Ankle Boot', 'Ankle Boot', 'Ankle Boot']
For Food_annotated_final multi modalities micro_f1 was 0.3333333333333333
Prediction Examples:  ["'Sliced'", "'Powder'", "['Bags/Packets']"]
Label Examples:  ['Sliced', 'Rub', 'Sliced']
For Home_annotated_final multi modalities micro_f1 was 0.6666666666666666
Prediction Examples:  ["'All Seasons'", "'All Seasons'", "'Autumn'"]
Label Examples:  ['All Seasons', 'All Seasons', 'All Seasons']


# Prompt Testing Code

In [13]:
def preprocess(df, name_to_value, custom_prompt=None):
    """Build a prompt from a template and collect the gold label for every row.

    Args:
        df: DataFrame with the columns `id`, `attribute_names`, `texts` and
            `attribute_values`. A `category` column is needed only when the
            template uses `{category}`. The frame is only read, never modified.
        name_to_value: Mapping from attribute name to the answer options that
            fill the `{options}` placeholder.
        custom_prompt: `str.format` template that may use `{attribute_names}`,
            `{category}`, `{texts}` and `{options}`. When omitted, the baseline
            prompt of the two-argument `preprocess` defined earlier in the
            notebook is used, so calls written for that version keep working.

    Returns:
        A tuple `(id_to_prompt, id_to_label)` of dicts keyed by each row's `id`.

    Raises:
        KeyError: If a required column is missing, the template references
            `{category}` but the frame has no such column, or an attribute name
            has no entry in `name_to_value`.
    """
    if custom_prompt is None:
        custom_prompt = (
            "What is the {attribute_names} of this product: {texts}? "
            "Answer with the option from the given choices directly: {options}.\nAnswer:"
        )

    id_to_prompt = {}
    id_to_label = {}

    # Walk plain row dicts instead of writing a `prompt` column into `df`, so
    # the caller's DataFrame is left untouched.
    for record in df.to_dict('records'):
        fields = {
            'attribute_names': record['attribute_names'],
            'texts': record['texts'],
            'options': name_to_value[record['attribute_names']],
        }
        # Pass `category` only when the data has it. A template that needs it
        # then still fails loudly with KeyError if the column is absent.
        if 'category' in record:
            fields['category'] = record['category']

        id_to_prompt[record['id']] = custom_prompt.format(**fields)
        id_to_label[record['id']] = record['attribute_values']

    return id_to_prompt, id_to_label

In [14]:
# Prompt templates compared in the prompt-testing loop. Each one is filled via
# `str.format` by the three-argument `preprocess`, which supplies the
# placeholders {attribute_names}, {category}, {texts} and {options}. The index
# in this list is the "prompt i" number printed with the results.
custom_prompts = [
    "Question: What is {attribute_names} of this product?\nContext: [Category] {category} {texts}.\nYou must only answer the question with exactly one of the following options {options}.\nAnswer:",
    "What is {attribute_names} of this product?[Category] {category} {texts}.Answer with the option from the given choices directly: {options}.\nAnswer:",
    "[Category] {category} {texts}. What is {attribute_names} of this product? Answer with the option from the given choices directly: {options}.\nAnswer:",
    "[Category] {category} {texts}. What is {attribute_names} of this product based on the given information and the given image? Answer with the option from the given choices directly: {options}.\nAnswer:",
    "[Category] {category} {texts}. Which one of {options} is the {attribute_names} of this product? Answer with the option from the given choices directly.\nAnswer:",
    "{texts}. What is the {attribute_names} of this product? Answer with the option from the given choices directly: {options}.\nAnswer:",
    "{texts}. Based on the description and the image, what is the {attribute_names} of this product? Answer with the option from the given choices directly: {options}.\nAnswer:",
    "What is the {attribute_names} of this product: {texts}? Answer with the option from the given choices directly: {options}.\nAnswer:"
]

In [15]:
# Compare the prompt templates on the Clothing dataset and report micro-F1 for each.
img_dir = "images"
dataset = "texts/Clothing_annotated_final.tsv"
with open("options_Clothing_Shoes_and_Jewelry.json", "r") as f:
    name_to_val = json.load(f)

# None or a negative count means "use every row". Slicing with a raw -1 would
# silently drop the last row instead.
if n_evaluation_samples is None or n_evaluation_samples < 0:
  sample_limit = None
else:
  sample_limit = n_evaluation_samples

# The rows are the same for every prompt, so read the file once.
eval_rows = pd.read_csv(dataset, sep='\t')[:sample_limit]

for i, prompt in enumerate(custom_prompts):
  # Hand each run its own copy so no prompt sees changes made for another.
  data = eval_rows.copy()

  id_to_prompts, id_to_labels = preprocess(data, name_to_val, prompt)
  id_to_pred, preds, labels = inference(model, tokenizer, id_to_prompts, id_to_labels, img_dir)
  both_acc, both_micro_f1 = calculate_metrics(preds, labels)

  print(f'For prompt {i} the micro_f1 was {both_micro_f1}')

  torch.cuda.empty_cache()
  gc.collect()


For prompt 0 the micro_f1 was 0.3333333333333333
For prompt 1 the micro_f1 was 0.3333333333333333
For prompt 2 the micro_f1 was 0.6666666666666666
For prompt 3 the micro_f1 was 0.6666666666666666
For prompt 4 the micro_f1 was 1.0
For prompt 5 the micro_f1 was 0.3333333333333333
For prompt 6 the micro_f1 was 0.0
For prompt 7 the micro_f1 was 0.6666666666666666
