In [5]:
import torch
import json



import time
import openai
import datetime
import os

class LLMPrompter:
    """Small helper for sending prompts to OpenAI models with automatic retries.

    All requests go through the module-level ``openai`` client (the 1.x
    interface: ``openai.chat.completions`` / ``openai.completions``). A failed
    request is retried after a 2 second pause; by default it is retried
    indefinitely, which callers can bound with ``max_retries``.
    """

    # Sampling parameters applied when the caller passes none. They live here
    # (instead of as mutable default arguments) and are copied on every call.
    _GPT4_SAMPLING_DEFAULTS = {
        "temperature": 0.7,
        "max_tokens": 200,
        "top_p": 1.0,
        "frequency_penalty": 0.0,
        "presence_penalty": 0.0,
    }
    _TURBO_SAMPLING_DEFAULTS = {
        "temperature": 0.7,
        "max_tokens": 2000,
        "top_p": 1.0,
        "frequency_penalty": 0.0,
        "presence_penalty": 0.0,
    }

    def __init__(self, gpt_version, api_key) -> None:
        """Remember the model name and register the API key with ``openai``.

        Args:
            gpt_version: Model identifier sent as ``model`` on every request.
            api_key: OpenAI API key; stored on the ``openai`` module.

        Raises:
            ValueError: If ``api_key`` is ``None``.
        """
        if api_key is None:
            raise ValueError("OpenAI API key is not provided.")
        self.gpt_version = gpt_version
        openai.api_key = api_key

    def _call_with_retry(self, request, max_retries=None):
        """Call ``request()`` until it succeeds, pausing 2 seconds after each failure.

        Args:
            request: Zero-argument callable that performs one API call.
            max_retries: Number of retries allowed after the first failure.
                ``None`` retries forever.

        Returns:
            Whatever ``request()`` returns on its first successful call.

        Raises:
            Exception: The last error from ``request()`` once ``max_retries``
                retries have been used up.
        """
        failures = 0
        while True:
            try:
                return request()
            except Exception as e:
                failures += 1
                if max_retries is not None and failures > max_retries:
                    raise
                print("Request failed, sleep 2 secs and try again...", e)
                time.sleep(2)

    # Prompting for GPT-4
    def query_gpt4(
        self,
        prompt: dict,
        # Adjust the params
        sampling_params: dict = None,
        max_retries=None,
    ) -> str:
        """Send a system/user prompt to the chat completions endpoint.

        Args:
            prompt: Mapping with a ``"system"`` and a ``"user"`` entry.
            sampling_params: Keyword arguments forwarded to the API call.
                When ``None``, the GPT-4 defaults of this class are used;
                when given, it replaces the defaults entirely.
            max_retries: Retries allowed after the first failed request;
                ``None`` (default) retries forever.

        Returns:
            The message content of the first choice, unmodified.

        Raises:
            KeyError: If ``prompt`` lacks ``"system"`` or ``"user"``.
        """
        params = dict(
            self._GPT4_SAMPLING_DEFAULTS if sampling_params is None else sampling_params
        )
        # Built outside the retry loop so a malformed prompt fails immediately
        # instead of being retried forever.
        messages = [
            {"role": "system", "content": prompt["system"]},
            {"role": "user", "content": prompt["user"]},
        ]
        # A system/user prompt is only meaningful as chat messages, and the
        # result is read from `.message.content`, so every model name goes
        # through the chat endpoint (the former non-"gpt-4" branch passed
        # `prompt=` to this same endpoint and could never succeed).
        response = self._call_with_retry(
            lambda: openai.chat.completions.create(
                model=self.gpt_version, messages=messages, **params
            ),
            max_retries,
        )
        return response.choices[0].message.content

    # Prompting for GPT3.5-Turbo
    def query_gpt_turbo(
        self,
        prompt: str,
        sampling_params: dict = None,
        max_retries=None,
    ) -> str:
        """Send a plain-text prompt and return the stripped reply.

        Models whose name contains ``"turbo"`` are queried through the chat
        completions endpoint with a single user message; any other model is
        queried through the text completions endpoint.

        Args:
            prompt: The prompt text.
            sampling_params: Keyword arguments forwarded to the API call.
                When ``None``, the turbo defaults of this class are used;
                when given, it replaces the defaults entirely.
            max_retries: Retries allowed after the first failed request;
                ``None`` (default) retries forever.

        Returns:
            The text of the first choice with surrounding whitespace removed.
        """
        params = dict(
            self._TURBO_SAMPLING_DEFAULTS if sampling_params is None else sampling_params
        )

        if "turbo" in self.gpt_version:
            # Chat models: same 1.x client interface that query_gpt4 uses.
            def request():
                response = openai.chat.completions.create(
                    model=self.gpt_version,
                    messages=[
                        {"role": "user", "content": prompt}
                    ],
                    **params
                )
                return response.choices[0].message.content.strip()
        else:
            # Non-chat models: text completions endpoint.
            def request():
                response = openai.completions.create(
                    model=self.gpt_version,
                    prompt=prompt,
                    **params
                )
                return response.choices[0].text.strip()

        # Reading the reply happens inside `request`, so a response that
        # cannot be read is retried like a failed call.
        return self._call_with_retry(request, max_retries)

    def make_key(self):
        """Return the current local time formatted as ``YYYYMMDD-HHMMSS``."""
        return datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    
if __name__=="__main__":
    # Score model responses against ground-truth answers with GPT-4 and
    # collect the raw score strings in `results`.

    # Upper bound on the number of (response, ground-truth) pairs to score.
    num_samples = 300

    # .get() returns None when the variable is unset, so LLMPrompter raises its
    # descriptive ValueError instead of a bare KeyError surfacing here.
    api_key = os.environ.get("OPENAI_API_KEY")
    prompter = LLMPrompter("gpt-4", api_key)
    prompt = {
        "system":"""You are a kind score giver."""
    }

    # NOTE(security): torch.load unpickles the file and can execute arbitrary
    # code; only load result files from a trusted source.
    res = torch.load('result-7b-warehouse.pt')

    # Read the dataset and close the file before doing any further work.
    with open('test.json', 'r') as file:
        dataset = json.load(file)

    # Ground truth for an entry is its first conversation turn from 'gpt'.
    # NOTE(review): entries without a 'gpt' turn are skipped, which would shift
    # all later answers relative to `res` if `res` holds one item per dataset
    # entry -- confirm how `res` was generated.
    all_answers = []
    for entry in dataset:
        if len(all_answers) >= num_samples:
            break
        for conversation in entry['conversations']:
            if conversation['from'] == 'gpt':
                all_answers.append(conversation['value'])
                break

    # Score only as many pairs as both sides provide, so a short dataset or
    # result file cannot raise IndexError midway through the paid API calls.
    # Assumes `res` supports len() and integer indexing -- TODO confirm.
    num_scored = min(num_samples, len(all_answers), len(res))
    if num_scored < num_samples:
        print(f"Only {num_scored} response/ground-truth pairs available; scoring those instead of {num_samples}.")

    results = []
    for i in range(num_scored):
        prompt["user"]=f'Given the response generated by a language model and the ground truth for a task on spatial reasoning, evaluate the quality of the response according to correctness, logic and relevance. Score the response on a scale from 0 to 10, where 0 is the lowest and 10 is the highest quality, in comparison to the ground truth. Only output the numerical score. Response:{res[i]}. Ground-truth:{all_answers[i]}'
        response = prompter.query_gpt4(prompt)
        results.append(response)
    print(results)


['4', '6.5', '10', '4', '9', '9.5', '8', '4', '6', '10', '4', '9.5', '10', '8', '7.5', '6.5', '8.5', '6', '5', '3', '10', '9.5', '6.5', '9', '10', '8', '7.5', '10', '5', '10', '9.5', '8.5', '8', '4', '9', '7', '7', '4', '8', '9.5', '7', '5', '9.5', '9', '10', '4', '6.5', '8.5', '2', '7.5', '9', '6', '8', '7', '9.5', '8', '7.5', '4', '9.5', '6', '10', '8.5', '4', '7', '8.5', '7', '8.5', '6.5', '10', '7', '4', '7', '7', '6', '6', '10', '8.5', '6.5', '8.5', '7.5', '3', '9.5', '4', '8', '6', '4', '10', '6.5', '10', '7', '7', '10', '7', '8', '7.5', '4', '5', '10', '10', '9.5', '7', '9.5', '8', '5', '9.5', '6', '4', '6.5', '7', '9', '7.5', '9', '9', '4', '7.5', '8', '10', '7.5', '6.5', '7.5', '8', '9.5', '7.5', '9', '2', '9.5', '8', '4', '9.5', '7.5', '6', '6.5', '7', '9.5', '7', '8', '7', '3', '9.5', '8', '7.5', '7', '6.5', '8.5', '6', '4', '8.5', '10', '8', '5', '6.5', '8', '4', '7', '10', '7.5', '4', '4', '4', '8', '6.5', '10', '7.5', '4', '8.5', '8', '8', '9', '10', '5', '7', '6.5', '7',

In [6]:
# Convert the model's replies to floats. A reply that is not a bare number
# is set aside and reported instead of aborting the cell.
numbers = []
unparsed = []
for num in results:
    try:
        numbers.append(float(num))
    except (TypeError, ValueError):
        unparsed.append(num)
if unparsed:
    print(f"Skipped {len(unparsed)} non-numeric scores:", unparsed)

# Calculate the mean
if not numbers:
    # Fail with a clear message rather than an opaque ZeroDivisionError.
    raise ValueError("No numeric scores available to average.")
mean_value = sum(numbers) / len(numbers)

print("The mean is:", mean_value)

The mean is: 7.215
