In [None]:
# %pip install --upgrade pip
# %git pull
# %pip install -e .

In [None]:
# Move to directory with data
%cd ../data/evaluation_data

In [2]:
from llava.model.builder import load_pretrained_model
from llava.mm_utils import get_model_name_from_path
from llava.eval.run_llava import eval_model, load_images

import argparse
import torch
import random
import pickle as pklb

from llava.constants import (
    IMAGE_TOKEN_INDEX,
    DEFAULT_IMAGE_TOKEN,
    DEFAULT_IM_START_TOKEN,
    DEFAULT_IM_END_TOKEN,
    IMAGE_PLACEHOLDER,
)
from llava.conversation import conv_templates, SeparatorStyle
from llava.model.builder import load_pretrained_model
from llava.utils import disable_torch_init
from llava.mm_utils import (
    process_images,
    tokenizer_image_token,
    get_model_name_from_path,
)

import pandas as pd
import gc
import numpy as np

import requests
from PIL import Image
from io import BytesIO
import json
import re
import os
from sklearn.metrics import f1_score, accuracy_score

ModuleNotFoundError: No module named 'llava'

In [None]:
# Identifier of the LLaVA checkpoint to evaluate; it is also reused later to
# derive the model name (and from that the conversation template).
model_path = "liuhaotian/llava-v1-0719-336px-lora-merge-vicuna-13b-v1.3"

# Load everything needed for generation in one call. `model_base=None` is passed
# because the path points at an already merged checkpoint (per its name), and
# `load_4bit=True` asks the loader for 4-bit weights -- presumably to fit the
# 13B model in GPU memory; confirm against the LLaVA loader.
# `context_len` is returned by the loader but not used elsewhere in this notebook.
tokenizer, model, image_processor, context_len = load_pretrained_model(
    model_path=model_path,
    model_base=None,
    load_4bit=True,
    model_name=get_model_name_from_path(model_path)
)

In [None]:
def eval(tokenizer,
         model,
         image_processor,
         model_path,
         qs,
         image_files,
         temperature=1,
         top_p=0.8,
         num_beams=5,
         max_new_tokens=17):
  """Generate the model's answer for one text query about the given image(s).

  NOTE: this function shadows Python's builtin ``eval`` for the rest of the
  notebook; the name is kept because other cells call it.

  Args:
    tokenizer: Tokenizer used to encode the prompt and decode the output ids.
    model: Loaded LLaVA model; must expose ``config``, ``device`` and ``generate``.
    image_processor: Image processor handed to ``process_images``.
    model_path: Checkpoint path; only used to derive the model name that
      selects the conversation template.
    qs: Question text. If it contains ``IMAGE_PLACEHOLDER`` the placeholder is
      replaced by the image token(s); otherwise the image token(s) are
      prepended on their own line.
    image_files: List of image locations passed to ``load_images``.
    temperature: Sampling temperature; sampling is enabled only when it is > 0.
    top_p: Nucleus-sampling threshold forwarded to ``model.generate``.
    num_beams: Number of beams forwarded to ``model.generate``.
    max_new_tokens: Upper bound on the number of generated tokens.

  Returns:
    The list of strings produced by ``tokenizer.batch_decode`` on the
    generated ids (special tokens skipped).
  """
  disable_torch_init()

  model_name = get_model_name_from_path(model_path)
  image_token_se = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN

  # The model config decides whether the image token is wrapped in start/end markers.
  image_token = image_token_se if model.config.mm_use_im_start_end else DEFAULT_IMAGE_TOKEN
  if IMAGE_PLACEHOLDER in qs:
    qs = re.sub(IMAGE_PLACEHOLDER, image_token, qs)
  else:
    qs = image_token + "\n" + qs

  # Choose the conversation template from the model name. Order matters:
  # the more specific "v1.6-34b" must be tested before the generic "v1".
  model_name_lower = model_name.lower()
  if "llama-2" in model_name_lower:
    conv_mode = "llava_llama_2"
  elif "mistral" in model_name_lower:
    conv_mode = "mistral_instruct"
  elif "v1.6-34b" in model_name_lower:
    conv_mode = "chatml_direct"
  elif "v1" in model_name_lower:
    conv_mode = "llava_v1"
  elif "mpt" in model_name_lower:
    conv_mode = "mpt"
  else:
    conv_mode = "llava_v0"

  # Build a single-turn prompt: the user message plus an empty assistant slot.
  conv = conv_templates[conv_mode].copy()
  conv.append_message(conv.roles[0], qs)
  conv.append_message(conv.roles[1], None)

  prompt = conv.get_prompt()

  images = load_images(image_files)
  image_sizes = [x.size for x in images]
  images_tensor = process_images(
      images,
      image_processor,
      model.config
  ).to(model.device, dtype=torch.float16)

  # Place the prompt ids on the same device as the model (and the image tensor)
  # instead of the default CUDA device, so the inputs always agree with each other.
  input_ids = (
      tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt")
      .unsqueeze(0)
      .to(model.device)
  )

  with torch.inference_mode():
    output_ids = model.generate(
        input_ids,
        images=images_tensor,
        image_sizes=image_sizes,
        do_sample=temperature > 0,
        temperature=temperature,
        top_p=top_p,
        num_beams=num_beams,
        max_new_tokens=max_new_tokens,
        use_cache=True,
    )

    outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)

  return outputs

In [None]:
def preprocess(df, name_to_value):
  """Build the per-sample prompt and gold-label lookups for one dataset.

  A ``prompt`` column is written onto ``df`` itself (the caller's frame is
  modified), then two dictionaries keyed by the ``id`` column are returned.

  Args:
    df: DataFrame with ``id``, ``attribute_names``, ``category``, ``texts``
      and ``attribute_values`` columns.
    name_to_value: Mapping from attribute name to the answer options that are
      interpolated into the prompt.

  Returns:
    ``(id_to_prompt, id_to_label)``.
  """
  def build_prompt(record):
    # One prompt per row: the question, the product context, then the allowed options.
    attribute = record['attribute_names']
    return f"Question: What is {attribute} of this product?{record['category']} {record['texts']}.You must only answer the question with exactly one of the following options {name_to_value[attribute]}. \nAnswer:"

  df['prompt'] = df.apply(build_prompt, axis=1)

  # For duplicated ids the last row wins, exactly as with sequential assignment.
  id_to_prompt = dict(zip(df['id'], df['prompt']))
  id_to_label = dict(zip(df['id'], df['attribute_values']))

  return id_to_prompt, id_to_label


In [None]:
# Annotated TSV files to evaluate, in processing order; each name is the
# dataset stem followed by the shared "_annotated_final.tsv" suffix.
file_names = [
    f'{stem}_annotated_final.tsv'
    for stem in (
        'Clothing',
        'Jewlery_and_General_Apparel',
        'Footwear',
        'Food',
        'Home',
    )
]

In [None]:
def inference(model, model_path, tokenizer, image_processor, id_to_prompts, id_to_labels, img_dir, temperature=0.9, top_p=0.9, num_beams=5, max_new_tokens=17):
  """Run the model on every sample whose image exists in ``img_dir``.

  Samples without a ``<id>.jpg`` file are skipped entirely: they contribute
  neither a prediction nor a label, so ``preds`` and ``labels`` stay aligned
  index by index.

  Args:
    model: Loaded model forwarded to ``eval``.
    model_path: Checkpoint path forwarded to ``eval``.
    tokenizer: Tokenizer forwarded to ``eval``.
    image_processor: Image processor forwarded to ``eval``.
    id_to_prompts: Mapping from sample id to prompt text.
    id_to_labels: Mapping from sample id to gold label.
    img_dir: Directory holding one ``<id>.jpg`` per sample.
    temperature: Sampling temperature forwarded to ``eval``.
    top_p: Nucleus-sampling threshold forwarded to ``eval``.
    num_beams: Number of beams forwarded to ``eval``.
    max_new_tokens: Generation length limit forwarded to ``eval``.

  Returns:
    ``(id_to_pred, preds, labels)`` where ``preds[i]`` and ``labels[i]``
    belong to the same sample.
  """
  id_to_pred = {}
  preds = []
  labels = []

  for sample_id, text in id_to_prompts.items():
    img_path = os.path.join(img_dir, f'{sample_id}.jpg')

    if not os.path.exists(img_path):
      continue

    # `eval` here is the notebook's generation helper; only the first decoded
    # string of its result is kept.
    output = eval(tokenizer,
                  model,
                  image_processor,
                  model_path,
                  text,
                  [img_path],
                  temperature=temperature,
                  top_p=top_p,
                  num_beams=num_beams,
                  max_new_tokens=max_new_tokens)[0]

    # Record the label only once a prediction exists, so both lists have the
    # same length and order.
    preds.append(output)
    labels.append(id_to_labels[sample_id])
    id_to_pred[sample_id] = output

    # Free cached GPU memory and Python garbage between samples.
    torch.cuda.empty_cache()
    gc.collect()

  return id_to_pred, preds, labels


In [None]:
def calculate_metrics(predictions, targets):
  """Score predictions by case-insensitive containment of the gold label.

  A prediction counts as a hit when the target text appears anywhere inside
  it, ignoring case. Both values are converted with ``str`` first so that
  non-string labels (e.g. numbers read from a TSV) do not raise.

  Args:
    predictions: Sequence of model outputs.
    targets: Sequence of gold labels, aligned with ``predictions``.

  Returns:
    ``(accuracy, micro_f1)``: ``accuracy`` is the fraction of hits in [0, 1];
    ``micro_f1`` is ``f1_score`` of the hit vector against an all-ones target
    vector with ``average='micro'``. Both are ``0.0`` for empty input.

  Raises:
    ValueError: If ``predictions`` and ``targets`` differ in length.
  """
  if len(predictions) != len(targets):
    raise ValueError(
        f"predictions and targets must have the same length, "
        f"got {len(predictions)} and {len(targets)}"
    )

  # Nothing to score: avoid dividing by zero and calling the metric on empty input.
  if len(targets) == 0:
    return 0.0, 0.0

  binary_predictions = [
      int(str(target).lower() in str(prediction).lower())
      for target, prediction in zip(targets, predictions)
  ]
  binary_targets = [1] * len(targets)

  # Fraction of hits, not the raw hit count.
  accuracy = sum(binary_predictions) / len(binary_predictions)
  micro_f1 = f1_score(binary_targets, binary_predictions, average='micro')
  return accuracy, micro_f1

In [None]:
def seed_everything(seed=42):
    """Seed Python, NumPy and PyTorch RNGs and pin the cuDNN flags.

    Args:
        seed: Integer seed applied to every generator.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,  # covers the multi-GPU case
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # Disable cuDNN autotuning and request deterministic kernels.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

seed_everything(42)

In [None]:
img_dir = "images"

# Make sure the output directory exists before the first dump; otherwise the
# open(..., 'wb') below fails after a full (expensive) inference pass.
os.makedirs("results", exist_ok=True)

for dataset in file_names:
  # Pick the answer-options file from the dataset name; everything that is
  # neither Food nor Home uses the clothing/shoes/jewelry options.
  if "Food" in dataset:
    options_file = "options_Food.json"
  elif "Home" in dataset:
    options_file = "options_Home.json"
  else:
    options_file = "options_Clothing_Shoes_and_Jewelry.json"

  with open(options_file, "r") as f:
    name_to_val = json.load(f)

  data = pd.read_csv(f'{dataset}', sep='\t')

  id_to_prompts, id_to_labels = preprocess(data, name_to_val)
  id_to_pred, preds, labels = inference(model, model_path, tokenizer, image_processor, id_to_prompts, id_to_labels, img_dir, temperature=0.9, top_p=0.9)
  both_acc, both_micro_f1 = calculate_metrics(preds, labels)

  # Dataset file name without its extension, used in the output name and the log line.
  dataset_stem = os.path.splitext(dataset)[0]

  # The notebook imports pickle under the alias `pklb`, so that is the name to use here.
  with open(f'results/llava_v1_13b_preds_both_{dataset_stem}.pkl', 'wb') as f:
      pklb.dump(id_to_pred, f)

  print(f'For {dataset_stem} multi modalities micro_f1 was {both_micro_f1}')

  # Release GPU cache and Python garbage before moving to the next dataset.
  torch.cuda.empty_cache()
  gc.collect()