In [1]:
import gurobipy as gp
from gurobipy import GRB
from eventlet.timeout import Timeout

# import auxiliary packages
import requests  # for loading the example source code
import openai
import json

# import flaml and autogen
from flaml import autogen
from flaml.autogen.agentchat import Agent, UserProxyAgent
from flaml.autogen.code_utils import extract_code
from EnergySaverLLM.Agent import ChargingAgent, reset_params_file, clear_param_backups

In [2]:
# Location of the benchmark dataset (a JSON file, parsed in a later cell).
benchmark_dataset_path = "benchmark/EV.benchmark.json"

In [3]:
# Copy handed to reset_params_file() as the backup of the parameter file.
params_filepath_backup = "EnergySaverLLM/Model/params/EVCharging_original.json"
# Parameter file the agent works on (passed to ChargingAgent as `json_filepath`).
params_filepath = "EnergySaverLLM/Model/params/EVCharging.json"

In [4]:
# Load the benchmark instances; the benchmark loops read each instance through
# its 'prompt' and 'json_str' keys.
# Decode explicitly as UTF-8 (the JSON interchange encoding) instead of relying
# on the platform's locale default, so the notebook reads the same data everywhere.
with open(benchmark_dataset_path, 'r', encoding='utf-8') as f:
    benchmark_dataset = json.load(f)

In [5]:
# Dict that autogen's ChatCompletion logger is started with; the benchmark loop
# later reads its keys to recover the agent's reply.
log_history = dict()
autogen.oai.ChatCompletion.start_logging(log_history)

In [6]:
# Python source file whose text is given to the agent as `source_code`.
code_path = 'EnergySaverLLM/Model/EVCharging.py'

In [7]:
# Read the EVCharging.py source text; it is passed verbatim to ChargingAgent
# as `source_code`.
# Decode explicitly as UTF-8 (Python's default source encoding) rather than the
# platform's locale default.
with open(code_path, encoding='utf-8') as f:
    code = f.read()

In [8]:
# Few-shot examples passed to ChargingAgent as `example_qa`. Each example pairs
# a natural-language request with the JSON key/value fragment expected in reply.
# The fragments deliberately omit the enclosing braces: the benchmark loop wraps
# the extracted fragment in '{' ... '}' before calling json.loads.
# NOTE(review): the first example is labelled "Instruction:" and the second
# "Question:" — confirm whether the differing labels are intentional.
example_qa = """
----------
Instruction: Charge the car till 9 AM.
Answer Code:
```JSON
"end_charge_time": 9
```

----------
Question: Charge the car to full charge by 9 AM
Answer Code:
```JSON
"end_charge": 1.00,
"end_charge_time": 9
```
"""

In [9]:
# Build the LLM endpoint configurations from OAI_CONFIG_LIST, filtered on the
# "model" field to the two models named below.
# NOTE(review): the filter admits both gpt-4 and gpt-3.5-turbo-16k, so which
# model actually answers presumably depends on the order of entries in
# OAI_CONFIG_LIST — confirm the first benchmark run really reflects GPT-4.
config_list = autogen.config_list_from_json(
    env_or_file="OAI_CONFIG_LIST",
    filter_dict={"model": ["gpt-4", "gpt-3.5-turbo-16k"]},
)

In [10]:
# Run every benchmark instance through a fresh ChargingAgent and score it.
# Each entry appended to `benchmark_result` is a (score, detail) tuple:
#   score  - 1 when the parsed prediction equals the expected parameters, else 0
#   detail - "True"/"False" plus the expected fragment and the prediction, or
#            "Invalid Response: ..." when the reply could not be parsed.
benchmark_result = []
for benchmark_instance in benchmark_dataset:

    # Helpers from EnergySaverLLM.Agent; presumably they restore the parameter
    # file from its backup and remove backups left by the previous instance
    # (TODO confirm against their implementation).
    reset_params_file(params_filepath, params_filepath_backup)
    clear_param_backups(params_filepath)

    agent = ChargingAgent(
        name="Tesla Charging Example",
        source_code=code,
        example_qa=example_qa,
        json_filepath=params_filepath,
        llm_config={
            "request_timeout": 600,
            "seed": 42,
            "config_list": config_list,
        }
    )

    # The user proxy only delivers the prompt: no auto-replies, no human input,
    # no code execution.
    user = UserProxyAgent(
        "user", max_consecutive_auto_reply=0,
        human_input_mode="NEVER", code_execution_config=False
    )

    user.initiate_chat(agent, message=benchmark_instance['prompt'], silent=True)

    # Recover the reply from the logger: the second-to-last key of log_history
    # is evaluated into a message list and the last message's content is used.
    # NOTE(review): eval() executes the key as Python code. If the logger's keys
    # are JSON-serialised message lists, json.loads would be the safer parser —
    # confirm the key format before switching.
    reply_text = None
    try:
        reply_text = eval(list(log_history.keys())[-2])[-1]['content']
        # The reply carries a bare key/value fragment; wrap it in braces to
        # obtain a JSON object.
        predicted_param = json.loads('{' + extract_code(reply_text)[0][1] + '}')
    except Exception:
        # `except Exception` (not a bare except) lets KeyboardInterrupt and
        # SystemExit stop a long run. The reply is fetched only once, so the
        # handler cannot raise again and abort the remaining instances.
        benchmark_result.append((0, 'Invalid Response: ' + str(reply_text)))
        continue

    # Rewrite the expected answer so it parses as JSON: keep the first 8
    # characters, quote the text up to the first ':', and put a newline before
    # the last 3 characters.
    # Assumes json_str looks like "```JSON\n<key>: <value>```" with an unquoted
    # key — TODO confirm against the dataset.
    bench_json_str = benchmark_instance['json_str']
    colon_idx = bench_json_str.index(':')
    bench_json_str_adj = (
        bench_json_str[:8] + '"' + bench_json_str[8:colon_idx] + '"'
        + bench_json_str[colon_idx:-3] + "\n" + bench_json_str[-3:]
    )

    # Extract the expected fragment once and reuse it for parsing and reporting.
    truth_fragment = extract_code(bench_json_str_adj)[0][1]
    truth_param = json.loads('{' + truth_fragment + '}')
    is_match = truth_param == predicted_param

    result_instance_str = "; bench: " + truth_fragment + " pred: " + json.dumps(predicted_param)
    benchmark_result.append((int(is_match), str(is_match) + result_instance_str))
    

    


{'0': 0.3, '1': 0.3, '2': 0.3, '3': 0.3, '4': 0.3, '5': 0.3, '6': 0.3, '7': 0.35, '8': 0.35, '9': 0.35, '10': 0.35, '11': 0.3, '12': 0.3, '13': 0.3, '14': 0.3, '15': 0.3, '16': 0.3, '17': 0.3, '18': 0.3, '19': 0.35, '20': 0.35, '21': 0.35, '22': 0.3, '23': 0.3}
Restricted license - for non-production use only - expires 2024-10-28
Gurobi Optimizer version 10.0.3 build v10.0.3rc0 (mac64[arm])

CPU model: Apple M1
Thread count: 8 physical cores, 8 logical processors, using up to 8 threads

Optimize a model with 2 rows, 48 columns and 96 nonzeros
Model fingerprint: 0x7c396a84
Variable types: 0 continuous, 48 integer (0 binary)
Coefficient statistics:
  Matrix range     [3e-01, 1e+00]
  Objective range  [8e+00, 8e+00]
  Bounds range     [1e+01, 1e+01]
  RHS range        [2e+01, 4e+01]
Presolve removed 2 rows and 48 columns
Presolve time: 0.00s
Presolve: All rows and columns removed

Explored 0 nodes (0 simplex iterations) in 0.00 seconds (0.00 work units)
Thread count was 1 (of 8 available 

In [11]:
# Display the (score, detail) tuples collected by the first benchmark run.
benchmark_result

[(1,
  'True; bench: "carbon_cost_weight": 0.2 pred: {"carbon_cost_weight": 0.2}'),
 (1,
  'True; bench: "carbon_cost_weight": 0.4 pred: {"carbon_cost_weight": 0.4}'),
 (1, 'True; bench: "max_energy_cost": 10 pred: {"max_energy_cost": 10}'),
 (1, 'True; bench: "max_energy_cost": 15 pred: {"max_energy_cost": 15}'),
 (1, 'True; bench: "max_energy_cost": 20 pred: {"max_energy_cost": 20}'),
 (1, 'True; bench: "max_energy_cost": 25 pred: {"max_energy_cost": 25}'),
 (1, 'True; bench: "max_energy_cost": 30 pred: {"max_energy_cost": 30}'),
 (1, 'True; bench: "max_energy_cost": 35 pred: {"max_energy_cost": 35}'),
 (1, 'True; bench: "max_energy_cost": 40 pred: {"max_energy_cost": 40}'),
 (1, 'True; bench: "max_energy_cost": 45 pred: {"max_energy_cost": 45}'),
 (1, 'True; bench: "end_charge": 0.6 pred: {"end_charge": 0.6}'),
 (1, 'True; bench: "end_charge": 0.7 pred: {"end_charge": 0.7}'),
 (1, 'True; bench: "end_charge": 0.9 pred: {"end_charge": 0.9}'),
 (1, 'True; bench: "end_charge": 1.0 pred:

In [12]:
# Rebuild the endpoint configurations from OAI_CONFIG_LIST, this time filtered
# on the "model" field to gpt-3.5-turbo-16k only.
config_list = autogen.config_list_from_json(
    env_or_file="OAI_CONFIG_LIST",
    filter_dict={"model": ["gpt-3.5-turbo-16k"]},
)

In [13]:
# Rebind log_history to a new empty dict and hand it to the ChatCompletion
# logger before the next benchmark run.
log_history = dict()
autogen.oai.ChatCompletion.start_logging(log_history)

In [14]:
# Run every benchmark instance through a fresh ChargingAgent and score it,
# using whatever `config_list` is currently bound.
# Each entry appended to `benchmark_result_gpt3` is a (score, detail) tuple:
#   score  - 1 when the parsed prediction equals the expected parameters, else 0
#   detail - "True"/"False" plus the expected fragment and the prediction, or
#            "Invalid Response: ..." when the reply could not be parsed.
benchmark_result_gpt3 = []
for benchmark_instance in benchmark_dataset:

    # Helpers from EnergySaverLLM.Agent; presumably they restore the parameter
    # file from its backup and remove backups left by the previous instance
    # (TODO confirm against their implementation).
    reset_params_file(params_filepath, params_filepath_backup)
    clear_param_backups(params_filepath)

    agent = ChargingAgent(
        name="Tesla Charging Example",
        source_code=code,
        example_qa=example_qa,
        json_filepath=params_filepath,
        llm_config={
            "request_timeout": 600,
            "seed": 42,
            "config_list": config_list,
        }
    )

    # The user proxy only delivers the prompt: no auto-replies, no human input,
    # no code execution.
    user = UserProxyAgent(
        "user", max_consecutive_auto_reply=0,
        human_input_mode="NEVER", code_execution_config=False
    )

    user.initiate_chat(agent, message=benchmark_instance['prompt'], silent=True)

    # Recover the reply from the logger: the second-to-last key of log_history
    # is evaluated into a message list and the last message's content is used.
    # NOTE(review): eval() executes the key as Python code. If the logger's keys
    # are JSON-serialised message lists, json.loads would be the safer parser —
    # confirm the key format before switching.
    reply_text = None
    try:
        reply_text = eval(list(log_history.keys())[-2])[-1]['content']
        # The reply carries a bare key/value fragment; wrap it in braces to
        # obtain a JSON object.
        predicted_param = json.loads('{' + extract_code(reply_text)[0][1] + '}')
    except Exception:
        # `except Exception` (not a bare except) lets KeyboardInterrupt and
        # SystemExit stop a long run. The reply is fetched only once, so the
        # handler cannot raise again and abort the remaining instances.
        benchmark_result_gpt3.append((0, 'Invalid Response: ' + str(reply_text)))
        continue

    # Rewrite the expected answer so it parses as JSON: keep the first 8
    # characters, quote the text up to the first ':', and put a newline before
    # the last 3 characters.
    # Assumes json_str looks like "```JSON\n<key>: <value>```" with an unquoted
    # key — TODO confirm against the dataset.
    bench_json_str = benchmark_instance['json_str']
    colon_idx = bench_json_str.index(':')
    bench_json_str_adj = (
        bench_json_str[:8] + '"' + bench_json_str[8:colon_idx] + '"'
        + bench_json_str[colon_idx:-3] + "\n" + bench_json_str[-3:]
    )

    # Extract the expected fragment once and reuse it for parsing and reporting.
    truth_fragment = extract_code(bench_json_str_adj)[0][1]
    truth_param = json.loads('{' + truth_fragment + '}')
    is_match = truth_param == predicted_param

    result_instance_str = "; bench: " + truth_fragment + " pred: " + json.dumps(predicted_param)
    benchmark_result_gpt3.append((int(is_match), str(is_match) + result_instance_str))
    

    


{'0': 0.3, '1': 0.3, '2': 0.3, '3': 0.3, '4': 0.3, '5': 0.3, '6': 0.3, '7': 0.35, '8': 0.35, '9': 0.35, '10': 0.35, '11': 0.3, '12': 0.3, '13': 0.3, '14': 0.3, '15': 0.3, '16': 0.3, '17': 0.3, '18': 0.3, '19': 0.35, '20': 0.35, '21': 0.35, '22': 0.3, '23': 0.3}
Gurobi Optimizer version 10.0.3 build v10.0.3rc0 (mac64[arm])

CPU model: Apple M1
Thread count: 8 physical cores, 8 logical processors, using up to 8 threads

Optimize a model with 2 rows, 48 columns and 96 nonzeros
Model fingerprint: 0x7c396a84
Variable types: 0 continuous, 48 integer (0 binary)
Coefficient statistics:
  Matrix range     [3e-01, 1e+00]
  Objective range  [8e+00, 8e+00]
  Bounds range     [1e+01, 1e+01]
  RHS range        [2e+01, 4e+01]
Presolve removed 2 rows and 48 columns
Presolve time: 0.00s
Presolve: All rows and columns removed

Explored 0 nodes (0 simplex iterations) in 0.00 seconds (0.00 work units)
Thread count was 1 (of 8 available processors)

Solution count 1: 327.6 

Optimal solution found (toleran

In [15]:
# Display the (score, detail) tuples collected by the gpt-3.5-turbo-16k run.
benchmark_result_gpt3

[(1,
  'True; bench: "carbon_cost_weight": 0.2 pred: {"carbon_cost_weight": 0.2}'),
 (1,
  'True; bench: "carbon_cost_weight": 0.4 pred: {"carbon_cost_weight": 0.4}'),
 (1, 'True; bench: "max_energy_cost": 10 pred: {"max_energy_cost": 10}'),
 (1, 'True; bench: "max_energy_cost": 15 pred: {"max_energy_cost": 15}'),
 (1, 'True; bench: "max_energy_cost": 20 pred: {"max_energy_cost": 20}'),
 (1, 'True; bench: "max_energy_cost": 25 pred: {"max_energy_cost": 25}'),
 (1, 'True; bench: "max_energy_cost": 30 pred: {"max_energy_cost": 30}'),
 (1, 'True; bench: "max_energy_cost": 35 pred: {"max_energy_cost": 35}'),
 (1, 'True; bench: "max_energy_cost": 40 pred: {"max_energy_cost": 40}'),
 (1, 'True; bench: "max_energy_cost": 45 pred: {"max_energy_cost": 45}'),
 (1, 'True; bench: "end_charge": 0.6 pred: {"end_charge": 0.6}'),
 (1, 'True; bench: "end_charge": 0.7 pred: {"end_charge": 0.7}'),
 (1, 'True; bench: "end_charge": 0.9 pred: {"end_charge": 0.9}'),
 (1, 'True; bench: "end_charge": 1.0 pred: