In [1]:
import glob
import json
import os
import traceback
import pandas as pd
import numpy as np
from collections import defaultdict

from langchain_core.messages import AIMessage, HumanMessage, SystemMessage, ToolMessage

from pathlib import Path

import ast

from masterthesis.agent.GitAgent import GitAgent
from masterthesis.agent.MavenReproducerAgent import MavenReproducerAgent
from masterthesis.dataset.load_dataset import cleanup_dataset, load_dataset

def load_persisted_str_data(file_path):
    """Load data that was persisted as the text of a Python expression.

    The file content is evaluated with ``AIMessage``, ``HumanMessage``,
    ``SystemMessage`` and ``ToolMessage`` available as names, so an expression
    that constructs those message objects can be turned back into live objects.

    Args:
        file_path: Location of the file to read (``str`` or path-like).

    Returns:
        Whatever object the expression in the file evaluates to.

    Raises:
        ValueError: If the content is not a single valid Python expression
            (this includes statements such as assignments or imports).

    Warning:
        The content is executed with ``eval``. The syntax check below does NOT
        make that safe: any valid expression, including arbitrary function
        calls, will run. Only use this on files you created yourself.
    """
    file_path = Path(file_path)
    with file_path.open('r') as file:
        content = file.read()

    # Parse in "eval" mode so that only a single expression is accepted.
    # The default "exec" mode would also accept statements, which eval()
    # then rejects with a raw SyntaxError instead of the ValueError below.
    try:
        expression = ast.parse(content, filename=str(file_path), mode='eval')
    except SyntaxError as err:
        raise ValueError("The file content is not a valid Python expression") from err

    # Names the persisted expression is allowed to reference directly.
    namespace = {
        'AIMessage': AIMessage,
        'HumanMessage': HumanMessage,
        'SystemMessage': SystemMessage,
        'ToolMessage': ToolMessage
    }

    # SECURITY: eval executes code from the file - see the warning above.
    # The already-parsed tree is compiled so the source is only parsed once.
    return eval(compile(expression, str(file_path), 'eval'), namespace)

def get_language_model(file_path):
    """Return the name of the directory that directly contains ``file_path``.

    For the ``dataset/*/*/agent_protocol.json`` layout globbed in this file,
    that directory is the language-model folder.

    Args:
        file_path: Path to a file, as ``str`` or path-like object.

    Returns:
        The parent directory's name, or ``""`` when the path has no named
        parent directory (e.g. a bare file name).
    """
    # Path handles the platform's native separator (glob returns backslashes
    # on Windows) and path-like inputs, unlike splitting on a literal "/".
    return Path(file_path).parent.name

# Collect every agent protocol file found two directory levels below
# "dataset/" (relative to the current working directory).
agent_protocol_files = glob.glob("dataset/*/*/agent_protocol.json")

# Per-language-model counters. A missing key is created on first access with
# all four counters set to zero (each key gets its own fresh dict).
data = defaultdict(
    lambda: dict.fromkeys(
        ('Attempts', 'Compilation Errors', 'Compile Success', 'Test Success'),
        0,
    )
)

# Token accumulators for the disabled protocol scan further down:
# total_input_tokens =0
# total_output_tokens = 0

# Counters for reproduced results; nothing in the visible code updates them.
reproduced_compilation_success = reproduced_test_success = 0

# for file in agent_protocol_files:
#     lm = get_language_model(file)
#     commit_hash = file.split("/")[-3]
#     data[lm]['Attempts'] += 1


#     final_state = load_persisted_str_data(Path(file).parent / "final_state")





#     for message in final_state["messages"]:
#         if isinstance(message, AIMessage):
           
#            if message.usage_metadata:
#             total_input_tokens += message.usage_metadata['input_tokens']
#             total_output_tokens += message.usage_metadata['output_tokens']


# total_input_tokens = 30544504
# total_output_tokens = 768981
# num_invocations = 1800

# Hard-coded token totals and invocation count that feed the cost estimate
# (digit groups added for readability only).
total_input_tokens, total_output_tokens = 30_357_473, 975_658
num_invocations = 1_578


# Both JSON files are read from the current working directory.

# Model metadata; not referenced by the cost computation visible in this file.
with open("model_metadata.json", "r") as file:
    model_metadata = json.loads(file.read())

# Price table, indexed first by model name and then by price kind.
with open("model_prices.json", "r") as file:
    prices = json.loads(file.read())


# Average number of input / output tokens per invocation (floor division, so
# any fractional token is dropped).
average_input_tokens_per_invocation = total_input_tokens // num_invocations
average_output_tokens_per_invocation = total_output_tokens // num_invocations

# NOTE: despite "rounded" in the names, no rounding is applied here - these
# are plain aliases of the exact totals. The names are kept because later
# code refers to them.
total_rounded_input_tokens = total_input_tokens
total_rounded_output_tokens = total_output_tokens

# NOTE: named "per day", but this is simply input tokens + output tokens of
# the totals above.
total_tokens_per_day = total_rounded_input_tokens + total_rounded_output_tokens

# Token volumes in millions: every price below is applied per 1,000,000 tokens.
input_millions = total_rounded_input_tokens / 1_000_000
output_millions = total_rounded_output_tokens / 1_000_000

# Estimated cost per model (label -> cost) for the whole token volume.
rounded_costs = {}

# Models with separate input and output prices. The Claude models are priced
# unconditionally, exactly like the others.
for model in ["gpt-4o", "gpt-4o-mini", "mistral-nemo", "claude-3.5-sonnet", "claude-3-haiku"]:
    model_prices = prices[model]
    rounded_costs[model] = (
        input_millions * model_prices["input"]
        + output_millions * model_prices["output"]
    )

# Models with a single combined price for input and output tokens.
for model in ["Llama-3.1-70B", "Llama-3.1-405B"]:
    rounded_costs[model] = (total_tokens_per_day / 1_000_000) * prices[model]["combined"]

# Gemini has a "short" and a "long" price tier; only the tier selected by the
# average input size per invocation is priced and reported.
# NOTE(review): the 500_000 cut-off is taken over from the original code;
# confirm it matches the provider's actual long-context pricing boundary.
for model in ["gemini-1.5-pro"]:
    tier = "short" if average_input_tokens_per_invocation <= 500_000 else "long"
    model_prices = prices[model]
    rounded_costs[f"{model} ({tier})"] = (
        input_millions * model_prices[f"input_{tier}"]
        + output_millions * model_prices[f"output_{tier}"]
    )

# Build a table of the estimated cost per model, most expensive first.
cost_rows = list(rounded_costs.items())
costs_df = pd.DataFrame(cost_rows, columns=["Model", "Cost (USD)"])
costs_df = costs_df.sort_values("Cost (USD)", ascending=False)

# Format as currency strings only after sorting, so the ordering above is
# numeric rather than lexical.
costs_df["Cost (USD)"] = [f"${cost:.2f}" for cost in costs_df["Cost (USD)"]]

# Bare expression: in a notebook cell this displays the token totals and the
# cost table as the cell result.
total_rounded_input_tokens, total_rounded_output_tokens, total_tokens_per_day, costs_df

(30357473,
 975658,
 31333131,
                     Model Cost (USD)
 0                  gpt-4o    $166.42
 6          Llama-3.1-405B    $156.67
 7  gemini-1.5-pro (short)    $116.50
 3       claude-3.5-sonnet    $105.71
 5           Llama-3.1-70B     $27.57
 2            mistral-nemo      $9.40
 4          claude-3-haiku      $8.81
 1             gpt-4o-mini      $5.14)