## IR question

## Data preprocessing

In [1]:
import os
# Create the step-3 output directory (and any missing parents) up front;
# exist_ok=True makes re-running this cell a no-op when it already exists.
os.makedirs("./data/mol_figures/step3/", exist_ok=True)

## MASS question

## Data sampling

In [2]:
import pandas as pd

# Number of independent sampling rounds; one CSV is written per round.
iteration = 3
# Base seed. Round i uses SEED + i, so rounds differ from each other while a
# fresh "Restart & Run All" reproduces exactly the same samples.
SEED = 42
data = pd.read_csv("./data/mol_figures/step2/H-NMR_questions.csv")
# Fraction of each round's sample that is drawn from every question class.
ratios = {
        'H-NMR spectrum structure elucidation': 1,
    }
# Loop-invariant sizes are computed once instead of on every round.
total_samples = 100
samples_per_class = {clss: int(total_samples * ratio) for clss, ratio in ratios.items()}

for i in range(0, iteration):
    print(samples_per_class)

    class_samples = []
    for clss, n_samples in samples_per_class.items():
        print(clss)
        class_rows = data[data['cls'] == clss]
        # sample() raises if asked for more rows than exist, so cap the request.
        class_samples.append(
            class_rows.sample(n=min(n_samples, len(class_rows)), random_state=SEED + i)
        )
    sampled_data = pd.concat(class_samples) if class_samples else pd.DataFrame()

    # Top up from the rows not yet chosen when the per-class quotas fall short.
    if len(sampled_data) < total_samples:
        remaining = data[~data.index.isin(sampled_data.index)]
        additional_samples = remaining.sample(
            n=min(total_samples - len(sampled_data), len(remaining)),
            random_state=SEED + i,
        )
        sampled_data = pd.concat([sampled_data, additional_samples])

    output_path = f'./data/mol_figures/step2/H-NMR_sampled_questions_answers_{i}.csv'
    sampled_data.to_csv(output_path, index=False)
    # Report the file that was actually written (the round index is part of the name).
    print(f"Sampled data saved to '{output_path}'")

{'H-NMR spectrum structure elucidation': 100}
H-NMR spectrum structure elucidation
Sampled data saved to 'H-NMR_sampled_questions_answers.csv'
{'H-NMR spectrum structure elucidation': 100}
H-NMR spectrum structure elucidation
Sampled data saved to 'H-NMR_sampled_questions_answers.csv'
{'H-NMR spectrum structure elucidation': 100}
H-NMR spectrum structure elucidation
Sampled data saved to 'H-NMR_sampled_questions_answers.csv'


## Zero-shot learning: H-NMR

In [3]:
import openai
import anthropic
from openai import OpenAI
import os
import pandas as pd
import base64
from transformers import AutoProcessor, LlavaForConditionalGeneration
from transformers import pipeline
from PIL import Image
from transformers import InstructBlipProcessor, InstructBlipForConditionalGeneration
import torch
from utils import *
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModel
from transformers.generation import GenerationConfig
import torchvision.transforms as T
from torchvision.transforms.functional import InterpolationMode


# API credentials come from the environment. setdefault() keeps a key that is
# already exported in the shell instead of overwriting it with an empty
# placeholder; do not paste real keys into the notebook.
os.environ.setdefault('OPENAI_API_KEY', '')
os.environ.setdefault('ANTHROPIC_API_KEY', '')
# Value passed as cache_dir= to the from_pretrained() calls in the model-loading cell.
cache_dir = ""

# Instruction text used as the default `sys_prompt` argument of every model
# wrapper defined in this cell. The trailing backslashes are line continuations
# inside the string literal, so those lines are joined without a newline.
# NOTE(review): the text contains a typo ("specturm") and a stray double quote
# after "functional group"; left untouched here because editing the prompt
# changes what the models receive.
sys_prompt = """
    As an expert organic chemist, your task is to analyze and determine the potential structures that can be derived from a given molecule's H-NMR specturm image.\
    Utilize your knowledge to systematically explore and identify plausible structural configurations based on these spectrum images provided and answer the question.\
    Identify and list possible molecular fragments that match the spectral data and Ensure the fragments are chemically feasible and consistent with the H-NMR data.
    Analyze the problem step-by-step internally, but do not include the analysis in your output. 
    Respond with ONLY ‘Yes’ or ‘No’ to indicate whether the molecule could potentially contain the functional group". 
    Example output: Yes.
    """
def encode_image(image_path):
    """Read the file at ``image_path`` in binary mode and return its base64 text form."""
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')
    
def analyze_image_with_prompt_gpt(image_paths, prompt_text, sys_prompt=sys_prompt):
    """Send a text prompt plus one or more PNG images to the ``gpt-4o`` chat model.

    Args:
        image_paths: A single image path or an iterable of image paths. A bare
            string / PathLike is treated as one image; iterating it directly
            would yield individual characters.
        prompt_text: Question text.
        sys_prompt: Instruction text; it is prepended to ``prompt_text`` (joined
            by a newline) inside the single user message.

    Returns:
        ``response.choices[0]`` -- the first choice object of the completion,
        not a plain string.
    """
    if isinstance(image_paths, (str, os.PathLike)):
        image_paths = [image_paths]
    base64_images = [encode_image(image_path) for image_path in image_paths]

    # No key is passed explicitly (the original read OPENAI_API_KEY into an
    # unused local). NOTE(review): the SDK is expected to pick the key up from
    # the environment -- confirm for the installed version.
    client = OpenAI()

    # One user message: the text part first, then one image part per file.
    content = [{"type": "text", "text": sys_prompt + "\n" + prompt_text}]
    content.extend(
        {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{base64_image}"}}
        for base64_image in base64_images
    )

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": content}],
        max_tokens=500,
        temperature=0.7,
    )
    return response.choices[0]

def analyze_image_with_prompt_claude(image_paths, prompt_text, sys_prompt=sys_prompt):
    """Send the prompt plus one or more PNG images to ``claude-3-opus-20240229``.

    The previous body read an undefined name (``image_path``), ignored both
    ``prompt_text`` and ``sys_prompt`` (it always asked "Describe this image.")
    and hard-coded an empty API key. This version uses its own arguments and
    reads the key from the ``ANTHROPIC_API_KEY`` environment variable.

    Args:
        image_paths: A single image path or an iterable of image paths.
        prompt_text: Question text sent as the user's text part.
        sys_prompt: Instruction text sent as the request's system prompt.

    Returns:
        The message object returned by ``client.messages.create``.
    """
    if isinstance(image_paths, (str, os.PathLike)):
        image_paths = [image_paths]

    client = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))

    # One base64 image part per file, followed by the question text.
    content = [
        {
            "type": "image",
            "source": {
                "type": "base64",
                "media_type": 'image/png',
                "data": encode_image(image_path),
            },
        }
        for image_path in image_paths
    ]
    content.append({"type": "text", "text": prompt_text})

    message = client.messages.create(
        model="claude-3-opus-20240229",
        max_tokens=1024,
        system=sys_prompt,
        messages=[{"role": "user", "content": content}],
    )

    return message


def analyze_image_with_prompt_llava_8B(model, processor, image_path, prompt_text, sys_prompt=sys_prompt):
    """Generate an answer for one image with a LLaVA model.

    Args:
        model: Model exposing ``generate(**inputs, ...)``.
        processor: Processor used to build the model inputs and decode the output.
        image_path: Path of the image file to open.
        prompt_text: Question text placed after the ``<image>`` placeholder.
        sys_prompt: Instruction text placed in the system section of the prompt.

    Returns:
        The decoded text of the newly generated tokens only (prompt tokens are
        sliced off before decoding).
    """
    images = Image.open(image_path)

    # Prompt assembled by hand from header/end-of-turn markers plus an <image> placeholder.
    prompt = (f"<|start_header_id|>system<|end_header_id|>\n\n{sys_prompt}<|eot_id|>"
              f"<|start_header_id|>user<|end_header_id|>\n\n<image>\n{prompt_text}<|eot_id|>"
              "<|start_header_id|>assistant<|end_header_id|>\n\n")
    # Keyword arguments avoid depending on the processor's positional order
    # (text first vs. images first), which differs between transformers releases.
    inputs = processor(text=prompt, images=images, return_tensors='pt').to("cuda", torch.float16)
    outputs = model.generate(**inputs, max_new_tokens=128, do_sample=False)
    # generate() output starts with the prompt tokens; keep only what follows them.
    prompt_length = len(inputs['input_ids'][0])
    generated_text = processor.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    return generated_text
    
def analyze_image_with_prompt_instructBlip(model, processor, image_path, prompt_text, sys_prompt=sys_prompt):
    """Generate an answer for one image with an InstructBLIP model.

    The image at ``image_path`` and the text ``sys_prompt + prompt_text`` are run
    through ``processor``, moved to CUDA and passed to ``model.generate`` with
    sampling enabled and ``num_beams=10``. The first generated sequence is decoded
    without special tokens and returned stripped of surrounding whitespace.
    """
    picture = Image.open(image_path)
    full_prompt = sys_prompt + prompt_text
    model_inputs = processor(images=picture, text=full_prompt, return_tensors="pt").to("cuda")

    # Decoding settings, collected in one place.
    generation_kwargs = dict(
        do_sample=True,
        num_beams=10,
        max_length=512,
        min_length=4,
        top_p=0.9,
        repetition_penalty=1.5,
        length_penalty=1.0,
        temperature=1,
    )
    token_ids = model.generate(**model_inputs, **generation_kwargs)

    decoded = processor.decode(token_ids[0], skip_special_tokens=True)
    return decoded.strip()

def analyze_image_with_prompt_Qwen(model, tokenizer, image_path, prompt_text, sys_prompt=sys_prompt):
    """Ask a Qwen-VL chat model a single-turn question about one image.

    The query is built with ``tokenizer.from_list_format`` from an image entry
    and a text entry (``sys_prompt + prompt_text``), then sent to ``model.chat``
    with no prior history. Only the response is returned; the history that
    ``model.chat`` also returns is discarded.
    """
    image_part = {'image': image_path}
    text_part = {'text': sys_prompt + prompt_text}
    query = tokenizer.from_list_format([image_part, text_part])

    reply, _history = model.chat(tokenizer, query=query, history=None)
    return reply

def analyze_image_with_prompt_InternVL(model, tokenizer, image_path, prompt_text, sys_prompt=sys_prompt):
    """Ask an InternVL chat model a single-turn question about one image.

    The image is split into square tiles by the nested helpers, converted to
    bfloat16 on the GPU, and passed to ``model.chat`` together with the text
    ``sys_prompt + prompt_text``.

    Args:
        model: Object exposing ``chat(tokenizer, pixel_values, question, generation_config)``.
        tokenizer: Tokenizer forwarded unchanged to ``model.chat``.
        image_path: Path of the image file to load.
        prompt_text: Question text appended to ``sys_prompt``.
        sys_prompt: Instruction text placed before the question.

    Returns:
        Whatever ``model.chat`` returns (presumably the generated text -- confirm
        against the model's remote code).
    """
    # Per-channel normalisation statistics applied to every tile.
    IMAGENET_MEAN = (0.485, 0.456, 0.406)
    IMAGENET_STD = (0.229, 0.224, 0.225)


    def build_transform(input_size):
        """Return the per-tile transform: force RGB, bicubic resize to a square of
        side ``input_size``, convert to tensor, then normalise."""
        MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
        transform = T.Compose([
            T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
            T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
            T.ToTensor(),
            T.Normalize(mean=MEAN, std=STD)
        ])
        return transform


    def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
        """Pick the (cols, rows) grid whose cols/rows ratio is closest to ``aspect_ratio``.

        On an exact tie the later candidate replaces the current best only when
        the original image area exceeds half the area of that candidate grid.
        """
        best_ratio_diff = float('inf')
        best_ratio = (1, 1)
        area = width * height
        for ratio in target_ratios:
            target_aspect_ratio = ratio[0] / ratio[1]
            ratio_diff = abs(aspect_ratio - target_aspect_ratio)
            if ratio_diff < best_ratio_diff:
                best_ratio_diff = ratio_diff
                best_ratio = ratio
            elif ratio_diff == best_ratio_diff:
                # Tie-break: prefer this grid only if the image is large enough for it.
                if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
                    best_ratio = ratio
        return best_ratio


    def dynamic_preprocess(image, min_num=1, max_num=6, image_size=448, use_thumbnail=False):
        """Resize ``image`` to the best-matching tile grid and cut it into square tiles.

        Returns a list of PIL images: the tiles in row-major order, followed by a
        whole-image thumbnail when ``use_thumbnail`` is set and there is more
        than one tile.
        """
        orig_width, orig_height = image.size
        aspect_ratio = orig_width / orig_height

        # Enumerate every (cols, rows) grid whose tile count lies in
        # [min_num, max_num], ordered by tile count.
        target_ratios = set(
            (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
            i * j <= max_num and i * j >= min_num)
        target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])

        # Choose the grid closest to the image's own aspect ratio.
        target_aspect_ratio = find_closest_aspect_ratio(
            aspect_ratio, target_ratios, orig_width, orig_height, image_size)

        # Pixel size of the resized image and the number of tiles it yields.
        target_width = image_size * target_aspect_ratio[0]
        target_height = image_size * target_aspect_ratio[1]
        blocks = target_aspect_ratio[0] * target_aspect_ratio[1]

        # Resize the image so it divides evenly into image_size x image_size tiles.
        resized_img = image.resize((target_width, target_height))
        processed_images = []
        for i in range(blocks):
            # (left, upper, right, lower): column is i % cols, row is i // cols,
            # where cols = target_width // image_size.
            box = (
                (i % (target_width // image_size)) * image_size,
                (i // (target_width // image_size)) * image_size,
                ((i % (target_width // image_size)) + 1) * image_size,
                ((i // (target_width // image_size)) + 1) * image_size
            )
            # Cut out this tile.
            split_img = resized_img.crop(box)
            processed_images.append(split_img)
        assert len(processed_images) == blocks
        # A single tile already shows the whole image, so no thumbnail is added then.
        if use_thumbnail and len(processed_images) != 1:
            thumbnail_img = image.resize((image_size, image_size))
            processed_images.append(thumbnail_img)
        return processed_images


    def load_image(image_file, input_size=448, max_num=6):
        """Open ``image_file`` as RGB, tile it (thumbnail enabled), transform each
        tile and stack the results into one tensor with one entry per tile."""
        image = Image.open(image_file).convert('RGB')
        transform = build_transform(input_size=input_size)
        images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
        pixel_values = [transform(image) for image in images]
        pixel_values = torch.stack(pixel_values)
        return pixel_values



    # `max_num` caps the number of tiles; the stacked tiles go to the GPU as bfloat16.
    pixel_values = load_image(image_path, max_num=6).to(torch.bfloat16).cuda()

    # Decoding settings passed to model.chat: no sampling, a single beam.
    generation_config = dict(
        num_beams=1,
        max_new_tokens=200,
        do_sample=False,
    )

    # Single-round, single-image conversation.
    question = sys_prompt + prompt_text
    response = model.chat(tokenizer, pixel_values, question, generation_config)
    return response


def analyze_image_with_prompt(model_name, image_path, prompt_text, sys_prompt=sys_prompt, model=None, processor=None):
    """Route one image question to the wrapper that matches ``model_name``.

    Matching is by substring, checked in this order: 'gpt-4o', 'claude',
    'llava', 'instructBlip', 'Qwen', 'InternVL'.

    Args:
        model_name: Name used to select the backend.
        image_path: Path of the image (a collection of paths is also accepted
            for the two API backends).
        prompt_text: Question text.
        sys_prompt: Instruction text forwarded to the backend.
        model: Loaded local model; unused by the API backends.
        processor: Processor or tokenizer belonging to ``model``; unused by the
            API backends.

    Returns:
        Whatever the selected backend returns.

    Raises:
        ValueError: If ``model_name`` matches none of the supported backends.
    """
    if 'gpt-4o' in model_name or "claude" in model_name:
        # The API wrappers take a collection of paths; wrap a single path so it
        # is not iterated character by character.
        image_paths = [image_path] if isinstance(image_path, (str, os.PathLike)) else image_path
        if 'gpt-4o' in model_name:
            return analyze_image_with_prompt_gpt(image_paths, prompt_text, sys_prompt=sys_prompt)
        return analyze_image_with_prompt_claude(image_paths, prompt_text, sys_prompt=sys_prompt)

    # Local backends share one call signature; order matches the original chain.
    local_backends = (
        ("llava", analyze_image_with_prompt_llava_8B),
        ("instructBlip", analyze_image_with_prompt_instructBlip),
        ('Qwen', analyze_image_with_prompt_Qwen),
        ('InternVL', analyze_image_with_prompt_InternVL),
    )
    for tag, backend in local_backends:
        if tag in model_name:
            return backend(model, processor, image_path, prompt_text, sys_prompt=sys_prompt)

    raise ValueError(
        f"Invalid model name {model_name!r}. Expected a name containing one of: "
        "'gpt-4o', 'claude', 'llava', 'instructBlip', 'Qwen', 'InternVL'."
    )
    
    
def is_open_source(model_name):
    """Return False when ``model_name`` contains 'claude', 'gemini' or 'gpt'; True otherwise."""
    proprietary_markers = ('claude', 'gemini', 'gpt')
    return not any(marker in model_name for marker in proprietary_markers)

In [None]:
iteration = 3
model_names = ['instructBlip-7B', 'instructBlip-13B']
for model_name in model_names:
    # Start each model from a clean slate so API-only models (which load no
    # local weights) and the final `del` never touch an undefined or stale name.
    model = None
    processor = None
    if is_open_source(model_name):
        if 'llava' in model_name:
            model = LlavaForConditionalGeneration.from_pretrained(
                model_paths[model_name],
                torch_dtype=torch.float16,
                low_cpu_mem_usage=True,
                cache_dir=cache_dir,
                ).to("cuda")

            processor = AutoProcessor.from_pretrained(model_paths[model_name])
        elif 'instructBlip' in model_name:
            model = InstructBlipForConditionalGeneration.from_pretrained(model_paths[model_name], cache_dir=cache_dir).to("cuda")
            processor = InstructBlipProcessor.from_pretrained(model_paths[model_name], cache_dir=cache_dir)
        elif 'Qwen' in model_name:
            processor = AutoTokenizer.from_pretrained(model_paths[model_name], trust_remote_code=True, cache_dir=cache_dir)
            model = AutoModelForCausalLM.from_pretrained(model_paths[model_name], device_map="cuda", trust_remote_code=True, cache_dir=cache_dir).eval()
        elif 'InternVL' in model_name:
            os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
            model = AutoModel.from_pretrained(
                model_paths[model_name],
                torch_dtype=torch.bfloat16,
                low_cpu_mem_usage=True,
                trust_remote_code=True,
                cache_dir=cache_dir,
                device_map='auto').eval()

            # The tokenizer is stored in `processor` because that is the name the
            # loop below forwards; the original assigned it to `tokenizer` and
            # loaded it from an undefined `path`.
            processor = AutoTokenizer.from_pretrained(model_paths[model_name], trust_remote_code=True, cache_dir=cache_dir)

    for i in range(0, iteration):
        # Collect rows in a list and build the frame once per round.
        records = []
        data = pd.read_csv(f"./data/mol_figures/step2/H-NMR_sampled_questions_answers_{i}.csv")
        for _, row in data.iterrows():
            try:
                # 'Molecule Index' is split on '_' into the two parts used in the image path.
                index_parts = [part.strip() for part in row['Molecule Index'].split('_')]
                image_path = f'./data/mol_figures/{index_parts[0]}th_specs/Problem {index_parts[1]}_4.png'

                generated_response = analyze_image_with_prompt(model_name, image_path, row['Question'], sys_prompt=sys_prompt, model=model, processor=processor)
                print(generated_response)
                records.append([row['Question'], row['cls'], row['Answer'], generated_response])

            except Exception as e:
                # Best effort: a failing question is reported with context and
                # skipped so the rest of the round still runs.
                print(f"Skipping question for molecule {row.get('Molecule Index')!r}: {e!r}")
                continue
        data_frame = pd.DataFrame(records, columns=["question", "cls", "Answer", "Generated Response"])
        data_frame.to_csv(f'./data/mol_figures/step2/H-NMR_{model_name}_generated_responses_{i}.csv', index=False)

    # Drop the references before the next model is loaded.
    del model, processor

In [5]:
from sklearn.metrics import accuracy_score, f1_score
import re

def evaluate_responses(df):
    """Score generated Yes/No answers against the ground truth, per category.

    Args:
        df: DataFrame with the columns 'cls', 'Answer' and 'Generated Response'.

    Returns:
        dict mapping each category name to ``{'Accuracy': ..., 'F1 Score': ...}``.
        Labels are 1 (yes), 0 (no) and, for predictions only, -1 when the
        response contains neither word. The -1 label takes part in the macro F1
        average whenever it occurs.
    """
    categories = ['H-NMR spectrum structure elucidation']
    results = {}

    for category in categories:
        # Rows belonging to the current category.
        category_data = df[df['cls'] == category]

        # Ground truth: tolerate case, surrounding spaces and a trailing period.
        answer = [
            1 if str(ans).strip().rstrip('.').lower() == 'yes' else 0
            for ans in category_data['Answer']
        ]
        generated_response = [_parse_yes_no(ans) for ans in category_data['Generated Response']]

        accuracy = accuracy_score(answer, generated_response)
        f1 = f1_score(answer, generated_response, average='macro')

        results[category] = {'Accuracy': accuracy, 'F1 Score': f1}

    return results


def _parse_yes_no(response):
    """Map a free-text response to 1 (yes), 0 (no) or -1 (neither word present).

    Whole-word matching keeps words such as "not", "know", "cannot" or "None"
    from being counted as "no". ``str()`` lets missing values (NaN read back
    from CSV) fall through to -1 instead of raising.
    """
    text = str(response).lower()
    if re.search(r'\byes\b', text):
        return 1
    if re.search(r'\bno\b', text):
        return 0
    return -1

In [None]:
import numpy as np
import pandas as pd
models = ['instructBlip-7B', 'instructBlip-13B']
# Category whose scores are aggregated across the sampling rounds.
category = 'H-NMR spectrum structure elucidation'

# The loop variable is deliberately not called `model`, so it cannot overwrite
# a loaded model object held under that name by the inference cell.
for evaluated_model in models:
    print()
    print(evaluated_model)

    f1_scores = []
    accuracies = []
    for i in range(0, 3):
        generated_answers = pd.read_csv(f'./data/mol_figures/step2/H-NMR_{evaluated_model}_generated_responses_{i}.csv')
        results = evaluate_responses(generated_answers)

        f1_scores.append(results[category]['F1 Score'])
        accuracies.append(results[category]['Accuracy'])
    accuracies = np.array(accuracies)
    f1_scores = np.array(f1_scores)
    # Each statistic is printed once, with its label.
    print("f1 mean", f1_scores.mean())
    print("f1 std", f1_scores.std())
    print("acc mean", accuracies.mean())
    print("acc std", accuracies.std())

## C_NMR

In [4]:
import openai
import anthropic
from openai import OpenAI
import os
import pandas as pd
import base64
from transformers import AutoProcessor, LlavaForConditionalGeneration
from transformers import pipeline
from PIL import Image
from transformers import InstructBlipProcessor, InstructBlipForConditionalGeneration
import torch
from utils import *
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModel
from transformers.generation import GenerationConfig
import torchvision.transforms as T
from torchvision.transforms.functional import InterpolationMode


# Instruction text for the C-NMR section; it rebinds `sys_prompt` and is used as
# the default `sys_prompt` argument of the wrappers defined below in this cell.
# The trailing backslashes are line continuations inside the string literal, so
# those lines are joined without a newline.
# NOTE(review): the text contains a typo ("specturm") and a stray double quote
# after "functional group"; left untouched here because editing the prompt
# changes what the models receive.
sys_prompt = """
    As an expert organic chemist, your task is to analyze and determine the potential structures that can be derived from a given molecule's C-NMR specturm image.\
    Utilize your knowledge to systematically explore and identify plausible structural configurations based on these spectrum images provided and answer the question.\
    Identify and list possible molecular fragments that match the spectral data and Ensure the fragments are chemically feasible and consistent with the C-NMR data.
    Analyze the problem step-by-step internally, but do not include the analysis in your output. 
    Respond with ONLY ‘Yes’ or ‘No’ to indicate whether the molecule could potentially contain the functional group". Example output:Yes.
    """
def encode_image(image_path):
    """Return the base64 text (UTF-8 decoded) of the bytes stored at ``image_path``."""
    with open(image_path, "rb") as source:
        payload = source.read()
    return base64.b64encode(payload).decode('utf-8')
    
def analyze_image_with_prompt_gpt(image_paths, prompt_text, sys_prompt=sys_prompt):
    """Send a text prompt plus one or more PNG images to the ``gpt-4o`` chat model.

    Args:
        image_paths: A single image path or an iterable of image paths. A bare
            string / PathLike is treated as one image; iterating it directly
            would yield individual characters.
        prompt_text: Question text.
        sys_prompt: Instruction text; it is prepended to ``prompt_text`` (joined
            by a newline) inside the single user message.

    Returns:
        ``response.choices[0]`` -- the first choice object of the completion,
        not a plain string.
    """
    if isinstance(image_paths, (str, os.PathLike)):
        image_paths = [image_paths]
    base64_images = [encode_image(image_path) for image_path in image_paths]

    # No key is passed explicitly (the original read OPENAI_API_KEY into an
    # unused local). NOTE(review): the SDK is expected to pick the key up from
    # the environment -- confirm for the installed version.
    client = OpenAI()

    # One user message: the text part first, then one image part per file.
    content = [{"type": "text", "text": sys_prompt + "\n" + prompt_text}]
    content.extend(
        {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{base64_image}"}}
        for base64_image in base64_images
    )

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": content}],
        max_tokens=500,
        temperature=0.7,
    )
    return response.choices[0]

def analyze_image_with_prompt_claude(image_paths, prompt_text, sys_prompt=sys_prompt):
    """Send the prompt plus one or more PNG images to ``claude-3-opus-20240229``.

    The previous body read an undefined name (``image_path``), ignored both
    ``prompt_text`` and ``sys_prompt`` (it always asked "Describe this image.")
    and hard-coded an empty API key. This version uses its own arguments and
    reads the key from the ``ANTHROPIC_API_KEY`` environment variable.

    Args:
        image_paths: A single image path or an iterable of image paths.
        prompt_text: Question text sent as the user's text part.
        sys_prompt: Instruction text sent as the request's system prompt.

    Returns:
        The message object returned by ``client.messages.create``.
    """
    if isinstance(image_paths, (str, os.PathLike)):
        image_paths = [image_paths]

    client = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))

    # One base64 image part per file, followed by the question text.
    content = [
        {
            "type": "image",
            "source": {
                "type": "base64",
                "media_type": 'image/png',
                "data": encode_image(image_path),
            },
        }
        for image_path in image_paths
    ]
    content.append({"type": "text", "text": prompt_text})

    message = client.messages.create(
        model="claude-3-opus-20240229",
        max_tokens=1024,
        system=sys_prompt,
        messages=[{"role": "user", "content": content}],
    )

    return message


def analyze_image_with_prompt_llava_8B(model, processor, image_path, prompt_text, sys_prompt=sys_prompt):
    """Generate an answer for one image with a LLaVA model (sampling enabled).

    Args:
        model: Model exposing ``generate(**inputs, ...)``.
        processor: Processor used to build the model inputs and decode the output.
        image_path: Path of the image file to open.
        prompt_text: Question text placed after the ``<image>`` placeholder.
        sys_prompt: Instruction text placed in the system section of the prompt.

    Returns:
        The decoded text of the newly generated tokens only (prompt tokens are
        sliced off before decoding).
    """
    images = Image.open(image_path)

    # Prompt assembled by hand from header/end-of-turn markers plus an <image> placeholder.
    prompt = (f"<|start_header_id|>system<|end_header_id|>\n\n{sys_prompt}<|eot_id|>"
              f"<|start_header_id|>user<|end_header_id|>\n\n<image>\n{prompt_text}<|eot_id|>"
              "<|start_header_id|>assistant<|end_header_id|>\n\n")
    # Keyword arguments avoid depending on the processor's positional order
    # (text first vs. images first), which differs between transformers releases.
    inputs = processor(text=prompt, images=images, return_tensors='pt').to("cuda", torch.float16)
    # NOTE(review): this definition samples (do_sample=True) while the earlier
    # H-NMR definition of the same function uses do_sample=False -- confirm which is intended.
    outputs = model.generate(**inputs, max_new_tokens=128, do_sample=True)
    # generate() output starts with the prompt tokens; keep only what follows them.
    prompt_length = len(inputs['input_ids'][0])
    generated_text = processor.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    return generated_text
    
def analyze_image_with_prompt_instructBlip(model, processor, image_path, prompt_text, sys_prompt=sys_prompt):
    """Generate an answer for one image with an InstructBLIP model.

    The image at ``image_path`` and the text ``sys_prompt + prompt_text`` are run
    through ``processor``, moved to CUDA and passed to ``model.generate`` with
    sampling enabled and ``num_beams=5``. The first generated sequence is decoded
    without special tokens and returned stripped of surrounding whitespace.
    """
    picture = Image.open(image_path)
    full_prompt = sys_prompt + prompt_text
    model_inputs = processor(images=picture, text=full_prompt, return_tensors="pt").to("cuda")

    # Decoding settings, collected in one place.
    generation_kwargs = dict(
        do_sample=True,
        num_beams=5,
        max_length=512,
        min_length=4,
        top_p=0.9,
        repetition_penalty=1.5,
        length_penalty=1.0,
        temperature=1,
    )
    token_ids = model.generate(**model_inputs, **generation_kwargs)

    decoded = processor.decode(token_ids[0], skip_special_tokens=True)
    return decoded.strip()

def analyze_image_with_prompt_Qwen(model, tokenizer, image_path, prompt_text, sys_prompt=sys_prompt):
    """Ask a Qwen-VL chat model a single-turn question about one image.

    The query is built with ``tokenizer.from_list_format`` from an image entry
    and a text entry (``sys_prompt + prompt_text``), then sent to ``model.chat``
    with no prior history. Only the response is returned; the history that
    ``model.chat`` also returns is discarded.
    """
    query_parts = [
        {'image': image_path},
        {'text': sys_prompt + prompt_text},
    ]
    query = tokenizer.from_list_format(query_parts)

    chat_result = model.chat(tokenizer, query=query, history=None)
    answer, _unused_history = chat_result
    return answer

def analyze_image_with_prompt_InternVL(model, tokenizer, image_path, prompt_text, sys_prompt=sys_prompt):
    """Ask an InternVL chat model a single-turn question about one image.

    The image is split into square tiles by the nested helpers, converted to
    bfloat16 on the GPU, and passed to ``model.chat`` together with the text
    ``sys_prompt + prompt_text``.

    Args:
        model: Object exposing ``chat(tokenizer, pixel_values, question, generation_config)``.
        tokenizer: Tokenizer forwarded unchanged to ``model.chat``.
        image_path: Path of the image file to load.
        prompt_text: Question text appended to ``sys_prompt``.
        sys_prompt: Instruction text placed before the question.

    Returns:
        Whatever ``model.chat`` returns (presumably the generated text -- confirm
        against the model's remote code).
    """
    # Per-channel normalisation statistics applied to every tile.
    IMAGENET_MEAN = (0.485, 0.456, 0.406)
    IMAGENET_STD = (0.229, 0.224, 0.225)


    def build_transform(input_size):
        """Return the per-tile transform: force RGB, bicubic resize to a square of
        side ``input_size``, convert to tensor, then normalise."""
        MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
        transform = T.Compose([
            T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
            T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
            T.ToTensor(),
            T.Normalize(mean=MEAN, std=STD)
        ])
        return transform


    def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
        """Pick the (cols, rows) grid whose cols/rows ratio is closest to ``aspect_ratio``.

        On an exact tie the later candidate replaces the current best only when
        the original image area exceeds half the area of that candidate grid.
        """
        best_ratio_diff = float('inf')
        best_ratio = (1, 1)
        area = width * height
        for ratio in target_ratios:
            target_aspect_ratio = ratio[0] / ratio[1]
            ratio_diff = abs(aspect_ratio - target_aspect_ratio)
            if ratio_diff < best_ratio_diff:
                best_ratio_diff = ratio_diff
                best_ratio = ratio
            elif ratio_diff == best_ratio_diff:
                # Tie-break: prefer this grid only if the image is large enough for it.
                if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
                    best_ratio = ratio
        return best_ratio


    def dynamic_preprocess(image, min_num=1, max_num=6, image_size=448, use_thumbnail=False):
        """Resize ``image`` to the best-matching tile grid and cut it into square tiles.

        Returns a list of PIL images: the tiles in row-major order, followed by a
        whole-image thumbnail when ``use_thumbnail`` is set and there is more
        than one tile.
        """
        orig_width, orig_height = image.size
        aspect_ratio = orig_width / orig_height

        # Enumerate every (cols, rows) grid whose tile count lies in
        # [min_num, max_num], ordered by tile count.
        target_ratios = set(
            (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
            i * j <= max_num and i * j >= min_num)
        target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])

        # Choose the grid closest to the image's own aspect ratio.
        target_aspect_ratio = find_closest_aspect_ratio(
            aspect_ratio, target_ratios, orig_width, orig_height, image_size)

        # Pixel size of the resized image and the number of tiles it yields.
        target_width = image_size * target_aspect_ratio[0]
        target_height = image_size * target_aspect_ratio[1]
        blocks = target_aspect_ratio[0] * target_aspect_ratio[1]

        # Resize the image so it divides evenly into image_size x image_size tiles.
        resized_img = image.resize((target_width, target_height))
        processed_images = []
        for i in range(blocks):
            # (left, upper, right, lower): column is i % cols, row is i // cols,
            # where cols = target_width // image_size.
            box = (
                (i % (target_width // image_size)) * image_size,
                (i // (target_width // image_size)) * image_size,
                ((i % (target_width // image_size)) + 1) * image_size,
                ((i // (target_width // image_size)) + 1) * image_size
            )
            # Cut out this tile.
            split_img = resized_img.crop(box)
            processed_images.append(split_img)
        assert len(processed_images) == blocks
        # A single tile already shows the whole image, so no thumbnail is added then.
        if use_thumbnail and len(processed_images) != 1:
            thumbnail_img = image.resize((image_size, image_size))
            processed_images.append(thumbnail_img)
        return processed_images


    def load_image(image_file, input_size=448, max_num=6):
        """Open ``image_file`` as RGB, tile it (thumbnail enabled), transform each
        tile and stack the results into one tensor with one entry per tile."""
        image = Image.open(image_file).convert('RGB')
        transform = build_transform(input_size=input_size)
        images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
        pixel_values = [transform(image) for image in images]
        pixel_values = torch.stack(pixel_values)
        return pixel_values



    # `max_num` caps the number of tiles; the stacked tiles go to the GPU as bfloat16.
    pixel_values = load_image(image_path, max_num=6).to(torch.bfloat16).cuda()

    # Decoding settings passed to model.chat: no sampling, a single beam.
    generation_config = dict(
        num_beams=1,
        max_new_tokens=200,
        do_sample=False,
    )

    # Single-round, single-image conversation.
    question = sys_prompt + prompt_text
    response = model.chat(tokenizer, pixel_values, question, generation_config)
    return response


def analyze_image_with_prompt(model_name, image_path, prompt_text, sys_prompt=sys_prompt, model=None, processor=None):
    """Route one image question to the wrapper that matches ``model_name``.

    Matching is by substring, checked in this order: 'gpt-4o', 'claude',
    'llava', 'instructBlip', 'Qwen', 'InternVL'.

    Args:
        model_name: Name used to select the backend.
        image_path: Path of the image (a collection of paths is also accepted
            for the two API backends).
        prompt_text: Question text.
        sys_prompt: Instruction text forwarded to the backend.
        model: Loaded local model; unused by the API backends.
        processor: Processor or tokenizer belonging to ``model``; unused by the
            API backends.

    Returns:
        Whatever the selected backend returns.

    Raises:
        ValueError: If ``model_name`` matches none of the supported backends.
    """
    if 'gpt-4o' in model_name or "claude" in model_name:
        # The API wrappers take a collection of paths; wrap a single path so it
        # is not iterated character by character.
        image_paths = [image_path] if isinstance(image_path, (str, os.PathLike)) else image_path
        if 'gpt-4o' in model_name:
            return analyze_image_with_prompt_gpt(image_paths, prompt_text, sys_prompt=sys_prompt)
        return analyze_image_with_prompt_claude(image_paths, prompt_text, sys_prompt=sys_prompt)

    # Local backends share one call signature; order matches the original chain.
    local_backends = (
        ("llava", analyze_image_with_prompt_llava_8B),
        ("instructBlip", analyze_image_with_prompt_instructBlip),
        ('Qwen', analyze_image_with_prompt_Qwen),
        ('InternVL', analyze_image_with_prompt_InternVL),
    )
    for tag, backend in local_backends:
        if tag in model_name:
            return backend(model, processor, image_path, prompt_text, sys_prompt=sys_prompt)

    raise ValueError(
        f"Invalid model name {model_name!r}. Expected a name containing one of: "
        "'gpt-4o', 'claude', 'llava', 'instructBlip', 'Qwen', 'InternVL'."
    )
    
    
def is_open_source(model_name):
    """Return True unless ``model_name`` contains a hosted-API marker.

    Names containing 'claude', 'gemini' or 'gpt' (case-sensitive substring
    match) yield False; every other name yields True.
    """
    hosted_markers = ('claude', 'gemini', 'gpt')
    return not any(marker in model_name for marker in hosted_markers)

In [None]:
# Run each listed model over every sampled C-NMR question file and write its
# raw answers to one CSV per (model, iteration).
iteration = 3
model_names = ['llava', 'instructBlip-7B', 'instructBlip-13B', 'Qwen-VL-Chat']
for model_name in model_names:
    # Reset per model: a name that loads nothing locally must not inherit the
    # previous model's objects, and the `del` at the bottom needs both names bound.
    model = None
    processor = None
    if is_open_source(model_name):
        if 'llava' in model_name:
            model = LlavaForConditionalGeneration.from_pretrained(
                model_paths[model_name],
                torch_dtype=torch.float16,
                low_cpu_mem_usage=True,
                cache_dir=cache_dir,
            ).to("cuda")

            processor = AutoProcessor.from_pretrained(model_paths[model_name])
        elif 'instructBlip' in model_name:
            model = InstructBlipForConditionalGeneration.from_pretrained(model_paths[model_name], cache_dir=cache_dir).to("cuda")
            processor = InstructBlipProcessor.from_pretrained(model_paths[model_name], cache_dir=cache_dir)
        elif 'Qwen' in model_name:
            # For Qwen the "processor" slot carries the tokenizer.
            processor = AutoTokenizer.from_pretrained(model_paths[model_name], trust_remote_code=True, cache_dir=cache_dir)
            model = AutoModelForCausalLM.from_pretrained(model_paths[model_name], device_map="cuda", trust_remote_code=True, cache_dir=cache_dir).eval()
        elif 'InternVL' in model_name:
            os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
            model = AutoModel.from_pretrained(
                model_paths[model_name],
                torch_dtype=torch.bfloat16,
                low_cpu_mem_usage=True,
                trust_remote_code=True,
                cache_dir=cache_dir,
                device_map='auto').eval()

            # Load the tokenizer from the same checkpoint as the model and bind
            # it to `processor`, which is what gets handed to
            # analyze_image_with_prompt below. `tokenizer` stays bound as an
            # alias in case other cells read that name.
            processor = tokenizer = AutoTokenizer.from_pretrained(model_paths[model_name], trust_remote_code=True, cache_dir=cache_dir)

    for i in range(0, iteration):
        # Collect rows in a list and build the frame once instead of growing
        # a DataFrame one row at a time.
        records = []
        data = pd.read_csv(f"./data/mol_figures/step2/C-NMR_sampled_questions_answers_{i}.csv")
        for row_idx, row in data.iterrows():
            try:
                # 'Molecule Index' is split on '_' into the two parts used to
                # build the image path.
                index_parts = row['Molecule Index'].split('_')
                index_parts = [part.strip() for part in index_parts]
                image_path = f'./data/mol_figures/{index_parts[0]}th_specs/Problem {index_parts[1]}_3.png'

                generated_response = analyze_image_with_prompt(model_name, image_path, row['Question'], sys_prompt=sys_prompt, model=model, processor=processor)
                print(generated_response)
                records.append([row['Question'], row['cls'], row['Answer'], generated_response])

            except Exception as e:
                # Best effort: a failing row is skipped, not fatal. Say which
                # model, run and row failed so the skip can be traced afterwards.
                print(f"[{model_name}] iteration {i}, row {row_idx} skipped: {e!r}")
                continue
        data_frame = pd.DataFrame(records, columns=["question", "cls", "Answer", "Generated Response"])
        data_frame.to_csv(f'./data/mol_figures/step2/C-NMR_{model_name}_generated_responses_{i}.csv', index=False)

    # Drop the references before the next model is loaded.
    del model, processor

In [7]:
from sklearn.metrics import accuracy_score, f1_score
import re

def evaluate_responses(df):
    """Score yes/no model answers against the ground truth, per question category.

    Args:
        df: DataFrame with columns 'cls', 'Answer' and 'Generated Response'.

    Returns:
        dict mapping each evaluated category to
        ``{'Accuracy': ..., 'F1 Score': ...}``; F1 is macro-averaged.
    """
    categories = ['C-NMR spectrum structure elucidation']

    # Whole-word, case-insensitive matching. A plain substring test misses a
    # capitalised "Yes" and reads the "no" inside words such as "phenol",
    # "known" or "not" as a negative answer.
    yes_pattern = re.compile(r'\byes\b', re.IGNORECASE)
    no_pattern = re.compile(r'\bno\b', re.IGNORECASE)

    def _to_label(text):
        # "yes" takes precedence when both words occur; -1 marks a response
        # with neither word, which can never equal a 0/1 ground-truth label.
        if yes_pattern.search(text):
            return 1
        if no_pattern.search(text):
            return 0
        return -1

    results = {}

    for category in categories:
        # Keep only the rows of the current category.
        category_data = df[df['cls'] == category]

        # Ground truth: 1 for "Yes" (ignoring case and surrounding whitespace), else 0.
        answer = [1 if str(ans).strip().lower() == 'yes' else 0 for ans in category_data['Answer']]
        # str() first so missing values (NaN) become text and parse as -1.
        generated_response = [_to_label(text) for text in category_data['Generated Response'].apply(str)]

        accuracy = accuracy_score(answer, generated_response)

        # NOTE(review): when any response parses to -1 the macro average also
        # covers that extra label - confirm this is the intended metric.
        f1 = f1_score(answer, generated_response, average='macro')

        results[category] = {'Accuracy': accuracy, 'F1 Score': f1}

    return results

In [None]:
import numpy as np
import pandas as pd
# Summarise each model's C-NMR scores over the repeated sampling runs.
# NOTE(review): 'llava' is in the generation cell's model list but is not
# summarised here - confirm this is intentional.
models = ['instructBlip-7B', 'instructBlip-13B', 'Qwen-VL-Chat']

for model in models:
    print()
    print(model)

    ar_f1 = []
    ar_acc = []
    # Three runs: one generated-responses CSV per sampling iteration.
    for i in range(0, 3):
        generated_answers = pd.read_csv(f'./data/mol_figures/step2/C-NMR_{model}_generated_responses_{i}.csv')
        results = evaluate_responses(generated_answers)

        ar_f1.append(results['C-NMR spectrum structure elucidation']['F1 Score'])
        ar_acc.append(results['C-NMR spectrum structure elucidation']['Accuracy'])
    ar_acc = np.array(ar_acc)
    ar_f1 = np.array(ar_f1)
    # ndarray.std() defaults to the population standard deviation (ddof=0).
    print("f1 mean", ar_f1.mean())
    print("f1 std", ar_f1.std())
    print("acc mean", ar_acc.mean())
    print("acc std", ar_acc.std())