In [1]:
import ast
import base64
import json
import os
import re
import time

import fitz  # PyMuPDF
import openai
import pandas as pd
import tiktoken
from openai import OpenAI
from tqdm.notebook import tqdm


In [2]:
def _parse_sample_line(line):
    """Parse one record of data.json without executing it.

    Each line is tried as JSON first; lines written as Python literals
    (e.g. single-quoted dicts) fall back to ``ast.literal_eval``, which only
    accepts literals and therefore cannot run arbitrary code the way
    ``eval`` would.
    """
    try:
        return json.loads(line)
    except json.JSONDecodeError:
        return ast.literal_eval(line)


# One sample record per non-empty line of data.json.
samples = []
with open("data.json", "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            # Tolerate blank / trailing lines instead of failing to parse them.
            continue
        samples.append(_parse_sample_line(line))


In [3]:
def read_excel(file_path):
    """Read every sheet of an Excel workbook.

    Args:
        file_path: Path of the workbook to open.

    Returns:
        dict mapping each sheet name to the DataFrame parsed from that sheet,
        in the workbook's sheet order.
    """
    # The context manager closes the underlying file handle once all sheets
    # are parsed; a bare pd.ExcelFile(...) would keep it open.
    with pd.ExcelFile(file_path) as xls:
        return {sheet_name: xls.parse(sheet_name) for sheet_name in xls.sheet_names}

def dataframe_to_text(df):
    """Render a DataFrame as plain text without its index column."""
    return df.to_string(index=False)

def combine_sheets_text(sheets):
    """Concatenate the text of every sheet into one string.

    Each sheet contributes a "Sheet name: ..." header line, the sheet rendered
    by dataframe_to_text, and a blank separator line.
    """
    sections = [
        f"Sheet name: {sheet_name}\n{dataframe_to_text(df)}\n\n"
        for sheet_name, df in sheets.items()
    ]
    return "".join(sections)


def find_jpg_files(directory):
    """List the image files (.jpg / .png, case-insensitive) in a directory.

    Args:
        directory: Directory to scan (not recursive).

    Returns:
        Sorted list of matching file names, or None when there are none.
    """
    image_suffixes = ('.jpg', '.png')
    # os.listdir returns entries in arbitrary order; sort so that callers which
    # pick the first image get the same file on every run and every platform.
    matches = sorted(
        name for name in os.listdir(directory)
        if name.lower().endswith(image_suffixes)
    )
    return matches or None

def encode_image(image_path):
    """Return the contents of the file at image_path as a base64 string."""
    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')
def read_pdf(file_path):
    """Extract the text of every page of a PDF.

    Args:
        file_path: Path of the PDF to open with PyMuPDF.

    Returns:
        The text of all pages concatenated in page order.
    """
    # Open as a context manager so the document is closed even if a page
    # fails to load; previously it was left open.
    with fitz.open(file_path) as document:
        page_texts = [
            document.load_page(page_num).get_text()
            for page_num in range(len(document))
        ]
    return "".join(page_texts)
def read_txt(path, encoding="utf-8"):
    """Return the full contents of a text file.

    Args:
        path: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.

    Returns:
        The file contents as a single string.
    """
    with open(path, "r", encoding=encoding) as f:
        return f.read()

def find_excel_files(directory):
    """List the Excel workbooks in a directory, skipping answer files.

    A file matches when its lower-cased name ends with xlsx, xlsb or xlsm and
    does not contain "answer".

    Args:
        directory: Directory to scan (not recursive).

    Returns:
        Sorted list of matching file names, or None when there are none.
    """
    excel_suffixes = ('xlsx', 'xlsb', 'xlsm')
    # os.listdir order is arbitrary; sort so the upload order is reproducible.
    excel_files = sorted(
        name for name in os.listdir(directory)
        if name.lower().endswith(excel_suffixes) and "answer" not in name.lower()
    )
    return excel_files or None

In [4]:
# Per-model limits, keyed by model identifier.
# NOTE(review): presumably the maximum context window in tokens; not referenced
# by the cells visible here — confirm before relying on it.
MODEL_LIMITS = {
    "gpt-3.5-turbo-0125": 16_385,
    "gpt-4-turbo-2024-04-09": 128_000,
    "gpt-4o-2024-05-13": 128_000,
    "gpt-4o-mini-2024-07-18": 128_000
}

# The cost per token for each model input.
# NOTE(review): presumably USD per prompt token — confirm against current pricing.
MODEL_COST_PER_INPUT = {
    "gpt-3.5-turbo-0125": 0.0000005,
    "gpt-4-turbo-2024-04-09": 0.00001,
    "gpt-4o-2024-05-13": 0.000005,
    "gpt-4o-mini-2024-07-18": 0.00000015
}

# The cost per token for each model output.
# NOTE(review): presumably USD per completion token — confirm against current pricing.
MODEL_COST_PER_OUTPUT = {
    "gpt-3.5-turbo-0125": 0.0000015,
    "gpt-4-turbo-2024-04-09": 0.00003,
    "gpt-4o-2024-05-13": 0.000015,
    "gpt-4o-mini-2024-07-18": 0.0000006
}



In [5]:
# Read the key from the OPENAI_API_KEY environment variable so that a real key
# never has to be pasted into (and committed with) the notebook. The original
# placeholder is kept as the fallback value for backward compatibility.
api_key = os.environ.get("OPENAI_API_KEY", "Your API key")

client = OpenAI(api_key=api_key)

# Root directory holding one sub-directory per sample id.
data_path = './data'

# Model used for the run; it must be a key of the MODEL_COST_PER_* tables.
# model = "gpt-3.5-turbo-0125"
# model = "gpt-4-turbo-2024-04-09"
model = "gpt-4o-2024-05-13"
# model = "gpt-4o-mini-2024-07-18"

In [None]:
# Fixed amount added to the cost of every question.
# NOTE(review): presumably the per-session Code Interpreter fee in USD — confirm
# against current pricing.
CODE_INTERPRETER_SESSION_COST = 0.03


def _upload_for_assistants(path):
    """Upload one local file with the 'assistants' purpose and return its file id."""
    # The context manager closes the local handle once the upload has finished.
    with open(path, "rb") as handle:
        return client.files.create(file=handle, purpose='assistants').id


def _first_text(message):
    """Return the first text part of a thread message, or "" if it has none."""
    # A message may start with (or consist only of) non-text parts such as
    # generated images, so content[0].text cannot be assumed to exist.
    for part in message.content:
        if getattr(part, "type", None) == "text":
            return part.text.value
    return ""


error_cout = 0  # number of runs that did not finish with status "completed"
total_cost = 0
for sample in tqdm(samples[0:]):
    if len(sample["questions"]) == 0:
        # Nothing to ask for this sample; no result file is written.
        continue

    sample_dir = os.path.join(data_path, sample["id"])
    image = find_jpg_files(sample_dir)
    excels = find_excel_files(sample_dir)

    introduction = read_txt(os.path.join(sample_dir, "introduction.txt"))
    questions = [
        read_txt(os.path.join(sample_dir, question_name + ".txt"))
        for question_name in sample["questions"]
    ]

    # Files attached to every question: the first image (if any) followed by
    # every workbook of the sample.
    attachment_paths = []
    if image:
        attachment_paths.append(os.path.join(sample_dir, image[0]))
    if excels:
        attachment_paths.extend(os.path.join(sample_dir, excel) for excel in excels)

    answers = []

    for question in tqdm(questions):
        start = time.time()

        text = f"The introduction is detailed as follows. \n {introduction} \nThe questions are detailed as follows. \n {question} \n\nPlease answer the above question. "

        assistant = None
        thread = None
        file_ids = []
        try:
            assistant = client.beta.assistants.create(
                instructions="You are a data analyst. I will give  you a background introduction and data analysis question. You must answer the question. If the question is a multi-choice question and you are unsure which one is correct, you must guess an option.  Don't ask me any questions and give me the answer in the response. ",
                model=model,
                tools=[{"type": "code_interpreter"}],
            )

            print("Upload a file with an assistants purpose")
            for attachment_path in attachment_paths:
                file_ids.append(_upload_for_assistants(attachment_path))

            print("start a messages")
            print("file_ids: ", file_ids)
            user_message = {"role": "user", "content": text}
            if file_ids:
                # The "attachments" key is only sent when there is something to attach.
                user_message["attachments"] = [
                    {"file_id": file_id, "tools": [{"type": "code_interpreter"}]}
                    for file_id in file_ids
                ]
            thread = client.beta.threads.create(messages=[user_message])

            print("start client running")
            run = client.beta.threads.runs.create_and_poll(
                thread_id=thread.id,
                assistant_id=assistant.id,
                max_prompt_tokens=100000
            )
            print("read  messages")

            if run.status != "completed":
                error_cout += 1
                print("Run finished with status: ", run.status)

            # usage can be missing when the run did not complete; count such
            # runs as zero tokens instead of crashing the whole evaluation.
            usage = run.usage
            prompt_tokens = usage.prompt_tokens if usage else 0
            completion_tokens = usage.completion_tokens if usage else 0
            cost = (
                CODE_INTERPRETER_SESSION_COST
                + completion_tokens * MODEL_COST_PER_OUTPUT[model]
                + prompt_tokens * MODEL_COST_PER_INPUT[model]
            )

            messages = client.beta.threads.messages.list(thread_id=thread.id)
            save_content = str([_first_text(message) for message in messages.data])
        finally:
            # Always remove the remote resources created for this question, even
            # when a request above raised, so they do not accumulate in the account.
            for file_id in file_ids:
                client.files.delete(file_id)
            if thread is not None:
                client.beta.threads.delete(thread.id)
            if assistant is not None:
                client.beta.assistants.delete(assistant.id)

        answers.append({"id": sample["id"], "model": "code_interpreter_"+model, "input": prompt_tokens,
                        "output": completion_tokens, "cost": cost, "time": time.time()-start, "response": save_content})
        total_cost += cost
        print("Total cost: ", total_cost)

    # One JSON-lines file per sample, one line per answered question.
    save_path = os.path.join("./evaluation/save_process", "code_interpreter_"+model)
    os.makedirs(save_path, exist_ok=True)
    with open(os.path.join(save_path, sample['id']+".json"), "w") as f:
        for answer in answers:
            json.dump(answer, f)
            f.write("\n")

end = time.time()

In [None]:
# Debug view: the message list fetched for the last question processed by the
# evaluation loop; this cell only works after that loop has run at least once.
messages