In [1]:
import gurobipy as gp
from gurobipy import GRB
from eventlet.timeout import Timeout

# import auxillary packages
import requests  # for loading the example source code
import openai
import json
import time
import random

# import flaml and autogen
from flaml import autogen
from flaml.autogen.agentchat import Agent, UserProxyAgent
from flaml.autogen.code_utils import extract_code
from EnergySaverLLM.Agent import ChargingAgent, reset_params_file, clear_param_backups

In [2]:
benchmark_dataset_path = "EV_combined.benchmark.json"

In [3]:
# global params_filepath
params_filepath = "EnergySaverLLM/Model/params/EVCharging.json"
params_filepath_backup = "EnergySaverLLM/Model/params/EVCharging_original.json"

In [4]:
with open(benchmark_dataset_path, 'r') as f:
    benchmark_dataset = json.loads(f.read())

In [5]:
log_history = {}
autogen.oai.ChatCompletion.start_logging(log_history)

In [6]:
code_path = "EnergySaverLLM/Model/EVCharging.py"

In [7]:
with open(code_path) as f:
    code = f.read()

In [8]:
example_qa = """
----------
Instruction: Charge the car till 9 AM.
Answer Code:
```JSON
"end_charge_time": 9
```

----------
Question: Charge the car to full charge by 9 AM
Answer Code:
```JSON
"end_charge": 1.00,
"end_charge_time": 9
```
"""

In [22]:
config_list = autogen.config_list_from_json(
    env_or_file = "OAI_CONFIG_LIST",
    filter_dict={
        "model": ["gpt-4"],
    },
)

In [23]:
agent = ChargingAgent(
    name="Tesla Charging Example",
    source_code=code,
    example_qa=example_qa,
    json_filepath=params_filepath,
    llm_config={
        "request_timeout": 600,
        "seed": 42,
        "config_list": config_list,
    },
    evaluate=True
)

user = UserProxyAgent(
    "user", max_consecutive_auto_reply=0,
    human_input_mode="NEVER", code_execution_config=False
)

{'0': 0.3, '1': 0.3, '2': 0.3, '3': 0.3, '4': 0.3, '5': 0.3, '6': 0.3, '7': 0.35, '8': 0.35, '9': 0.35, '10': 0.35, '11': 0.3, '12': 0.3, '13': 0.3, '14': 0.3, '15': 0.3, '16': 0.3, '17': 0.3, '18': 0.3, '19': 0.35, '20': 0.35, '21': 0.35, '22': 0.3, '23': 0.3}
Gurobi Optimizer version 10.0.3 build v10.0.3rc0 (mac64[arm])

CPU model: Apple M1
Thread count: 8 physical cores, 8 logical processors, using up to 8 threads

Optimize a model with 2 rows, 48 columns and 96 nonzeros
Model fingerprint: 0x7c396a84
Variable types: 0 continuous, 48 integer (0 binary)
Coefficient statistics:
  Matrix range     [3e-01, 1e+00]
  Objective range  [8e+00, 8e+00]
  Bounds range     [1e+01, 1e+01]
  RHS range        [2e+01, 4e+01]
Presolve removed 2 rows and 48 columns
Presolve time: 0.00s
Presolve: All rows and columns removed

Explored 0 nodes (0 simplex iterations) in 0.00 seconds (0.00 work units)
Thread count was 1 (of 8 available processors)

Solution count 1: 327.6 

Optimal solution found (toleran

In [24]:
len(benchmark_dataset)

853

In [25]:
def parse_json(json_str):
    
    
    try:    
        return json.loads(json_str[7:-4])
    except:
        pass

    try:
        return json.loads('{' + json_str[7:-4] + '}')
    except:
        raise TypeError("Invalid String, unable to convert to json") 


In [26]:
benchmark_result = []
sampled_benchmark = random.sample(benchmark_dataset, 100)
count_correct = 0
for benchmark_instance in sampled_benchmark:
    
    reset_params_file(params_filepath, params_filepath_backup)
    clear_param_backups(params_filepath)
    
    agent = ChargingAgent(
        name="Tesla Charging Example",
        source_code=code,
        example_qa=example_qa,
        json_filepath=params_filepath,
        llm_config={
            "request_timeout": 600,
            "seed": 42,
            "config_list": config_list,
        },
        evaluate=True
    )

    user = UserProxyAgent(
        "user", max_consecutive_auto_reply=0,
        human_input_mode="NEVER", code_execution_config=False
    )

    user.initiate_chat(agent, message=benchmark_instance['prompt'], silent = True, clear_history=True)

    try:
        predicted_param = parse_json(eval(list(log_history.keys())[-1])[-1]['content'])
    except:
        benchmark_result.append((0, 'Invalid Response: ' + eval(list(log_history.keys())[-1])[-1]['content']))

    bench_json_str = benchmark_instance['json_str']
    
    truth_param = json.loads('{'+extract_code(bench_json_str)[0][1]+'}')
    result_instance_str = "; bench: " + extract_code(bench_json_str)[0][1] + " pred: " + json.dumps(predicted_param)
    result_instance = {'result' : truth_param == predicted_param, 
                       'index' : benchmark_instance['index'], 
                       'bench' : '{' + extract_code(bench_json_str)[0][1] + '}',
                       'pred' : json.dumps(predicted_param)}

    count_correct = count_correct + int(truth_param == predicted_param)
    benchmark_result.append(result_instance)

    time.sleep(0.5)


{'0': 0.3, '1': 0.3, '2': 0.3, '3': 0.3, '4': 0.3, '5': 0.3, '6': 0.3, '7': 0.35, '8': 0.35, '9': 0.35, '10': 0.35, '11': 0.3, '12': 0.3, '13': 0.3, '14': 0.3, '15': 0.3, '16': 0.3, '17': 0.3, '18': 0.3, '19': 0.35, '20': 0.35, '21': 0.35, '22': 0.3, '23': 0.3}
Gurobi Optimizer version 10.0.3 build v10.0.3rc0 (mac64[arm])

CPU model: Apple M1
Thread count: 8 physical cores, 8 logical processors, using up to 8 threads

Optimize a model with 2 rows, 48 columns and 96 nonzeros
Model fingerprint: 0x7c396a84
Variable types: 0 continuous, 48 integer (0 binary)
Coefficient statistics:
  Matrix range     [3e-01, 1e+00]
  Objective range  [8e+00, 8e+00]
  Bounds range     [1e+01, 1e+01]
  RHS range        [2e+01, 4e+01]
Presolve removed 2 rows and 48 columns
Presolve time: 0.00s
Presolve: All rows and columns removed

Explored 0 nodes (0 simplex iterations) in 0.00 seconds (0.00 work units)
Thread count was 1 (of 8 available processors)

Solution count 1: 327.6 

Optimal solution found (toleran

In [27]:
count_correct

80

In [28]:
len(benchmark_result)

100

In [29]:
for result in benchmark_result:
    if not result['result']:
        print(benchmark_dataset[result['index']])

{'json_str': '```JSON\n"battery_capacity": 70,\n"end_charge_time": 0\n```', 'prompt': 'Begin charging immediately.', 'index': 828}
{'json_str': '```JSON\n"carbon_cost_weight": 0.1,\n"end_charge_time": 21\n```', 'prompt': 'The change is that the "carbon_cost_weight" parameter has been updated to 0.1, and the "end_charge_time" parameter has been updated to 21.', 'index': 349}
{'json_str': '```JSON\n"carbon_cost_weight": 0.15,\n"end_charge": 1.15\n```', 'prompt': 'Penalize carbon emissions less during charging and stop charging at 40% battery level.', 'index': 214}
{'json_str': '```JSON\n"carbon_cost_weight": 0.65,\n"end_charge": 1.15\n```', 'prompt': 'Increase the factor of carbon emission to the cost by 65% and only charge the car to 40% this time.', 'index': 193}
{'json_str': '```JSON\n"carbon_cost_weight": 0.0,\n"battery_capacity": 40\n```', 'prompt': 'I want to not consider carbon emissions while charging and change the battery capacity to 90 KWH.', 'index': 297}
{'json_str': '```JSO

In [46]:
benchmark_result_second_pass = []
sampled_benchmark = [benchmark_dataset[result['index']] for result in benchmark_result if not result['result']]
count_correct_second_pass = 0
for benchmark_instance in sampled_benchmark:
    
    # reset_params_file(params_filepath, params_filepath_backup)
    # clear_param_backups(params_filepath)
    
    agent = ChargingAgent(
        name="Tesla Charging Example",
        source_code=code,
        example_qa=example_qa,
        json_filepath=params_filepath,
        evaluate=True,
        llm_config={
            "request_timeout": 600,
            "seed": 42,
            "config_list": config_list,
        }
    )

    user = UserProxyAgent(
        "user", max_consecutive_auto_reply=0,
        human_input_mode="NEVER", code_execution_config=False
    )

    print(benchmark_instance['prompt'])
    user.initiate_chat(agent, message=benchmark_instance['prompt'], silent = True, clear_history=True)

    try:
        predicted_param = parse_json(eval(list(log_history.keys())[-1])[-1]['content'])
    except:
        benchmark_result.append((0, 'Invalid Response: ' + eval(list(log_history.keys())[-1])[-1]['content']))

    bench_json_str = benchmark_instance['json_str']
    
    truth_param = json.loads('{'+extract_code(bench_json_str)[0][1]+'}')
    result_instance_str = "; bench: " + extract_code(bench_json_str)[0][1] + " pred: " + json.dumps(predicted_param)
    result_instance = {'result' : truth_param == predicted_param, 
                       'index' : benchmark_instance['index'], 
                       'bench' : '{' + extract_code(bench_json_str)[0][1] + '}',
                       'pred' : json.dumps(predicted_param)}

    count_correct_second_pass = count_correct_second_pass + int(truth_param == predicted_param)
    benchmark_result_second_pass.append(result_instance)

    time.sleep(0.5)


{'0': 0.3, '1': 0.3, '2': 0.3, '3': 0.3, '4': 0.3, '5': 0.3, '6': 0.3, '7': 0.35, '8': 0.35, '9': 0.35, '10': 0.35, '11': 0.3, '12': 0.3, '13': 0.3, '14': 0.3, '15': 0.3, '16': 0.3, '17': 0.3, '18': 0.3, '19': 0.35, '20': 0.35, '21': 0.35, '22': 0.3, '23': 0.3}
Gurobi Optimizer version 10.0.3 build v10.0.3rc0 (mac64[arm])

CPU model: Apple M1
Thread count: 8 physical cores, 8 logical processors, using up to 8 threads

Optimize a model with 2 rows, 48 columns and 96 nonzeros
Model fingerprint: 0x7c396a84
Variable types: 0 continuous, 48 integer (0 binary)
Coefficient statistics:
  Matrix range     [3e-01, 1e+00]
  Objective range  [8e+00, 8e+00]
  Bounds range     [1e+01, 1e+01]
  RHS range        [2e+01, 4e+01]
Presolve removed 2 rows and 48 columns
Presolve time: 0.00s
Presolve: All rows and columns removed

Explored 0 nodes (0 simplex iterations) in 0.00 seconds (0.00 work units)
Thread count was 1 (of 8 available processors)

Solution count 1: 327.6 

Optimal solution found (toleran

In [47]:
count_correct_second_pass

0

In [49]:
sampled_benchmark

[{'json_str': '```JSON\n"battery_capacity": 70,\n"end_charge_time": 0\n```',
  'prompt': 'Begin charging immediately.',
  'index': 828},
 {'json_str': '```JSON\n"carbon_cost_weight": 0.1,\n"end_charge_time": 21\n```',
  'prompt': 'The change is that the "carbon_cost_weight" parameter has been updated to 0.1, and the "end_charge_time" parameter has been updated to 21.',
  'index': 349},
 {'json_str': '```JSON\n"carbon_cost_weight": 0.15,\n"end_charge": 1.15\n```',
  'prompt': 'Penalize carbon emissions less during charging and stop charging at 40% battery level.',
  'index': 214},
 {'json_str': '```JSON\n"carbon_cost_weight": 0.65,\n"end_charge": 1.15\n```',
  'prompt': 'Increase the factor of carbon emission to the cost by 65% and only charge the car to 40% this time.',
  'index': 193},
 {'json_str': '```JSON\n"carbon_cost_weight": 0.0,\n"battery_capacity": 40\n```',
  'prompt': 'I want to not consider carbon emissions while charging and change the battery capacity to 90 KWH.',
  'inde

In [36]:
benchmark_result_second_pass

[{'result': False,
  'index': 828,
  'bench': '{"battery_capacity": 70,\n"end_charge_time": 0}',
  'pred': '{"end_charge": 0.65, "max_power": 12.3}'},
 {'result': False,
  'index': 349,
  'bench': '{"carbon_cost_weight": 0.1,\n"end_charge_time": 21}',
  'pred': '{"end_charge": 0.65, "max_power": 12.3}'},
 {'result': False,
  'index': 214,
  'bench': '{"carbon_cost_weight": 0.15,\n"end_charge": 1.15}',
  'pred': '{"end_charge": 0.65, "max_power": 12.3}'},
 {'result': False,
  'index': 193,
  'bench': '{"carbon_cost_weight": 0.65,\n"end_charge": 1.15}',
  'pred': '{"end_charge": 0.65, "max_power": 12.3}'},
 {'result': False,
  'index': 297,
  'bench': '{"carbon_cost_weight": 0.0,\n"battery_capacity": 40}',
  'pred': '{"end_charge": 0.65, "max_power": 12.3}'},
 {'result': False,
  'index': 292,
  'bench': '{"carbon_cost_weight": 0.9,\n"battery_capacity": 85}',
  'pred': '{"end_charge": 0.65, "max_power": 12.3}'},
 {'result': False,
  'index': 182,
  'bench': '{"carbon_cost_weight": 0.7,\n

In [50]:
eval(list(log_history.keys())[-1])

[{'content': 'You are a chatbot to \n(1) write JSON to edit parameters as per user request for Electric Vehicle Charging.\n\n--- JSON ---\n{\n    "elec_cost": {\n        "0": 0.30,\n        "1": 0.30,\n        "2": 0.30,\n        "3": 0.30,\n        "4": 0.30,\n        "5": 0.30,\n        "6": 0.30,\n        "7": 0.35,\n        "8": 0.35,\n        "9": 0.35,\n        "10": 0.35,\n        "11": 0.30,\n        "12": 0.30,\n        "13": 0.30,\n        "14": 0.30,\n        "15": 0.30,\n        "16": 0.30,\n        "17": 0.30,\n        "18": 0.30,\n        "19": 0.35,\n        "20": 0.35,\n        "21": 0.35,\n        "22": 0.30,\n        "23": 0.30\n    },\n    "elec_cost_desc": "USD per KWH each hour of day",\n    "carbon_cost": {\n        "0": 25,\n        "1": 25,\n        "2": 25,\n        "3": 25,\n        "4": 25,\n        "5": 25,\n        "6": 25,\n        "7": 25,\n        "8": 25,\n        "9": 25,\n        "10": 25,\n        "11": 25,\n        "12": 25,\n        "13": 25,\n    

In [51]:
len(list(log_history.keys()))

164

In [40]:
extract_code(eval(list(log_history.keys())[-1])[-1]['content'])[0][-1]

'{\n    "end_charge_time": 6,\n    "battery_capacity": 70\n}'

In [34]:
benchmark_result[-1]

(0,
 'Invalid Response: ```JSON\n{\n    "end_charge_time": 6,\n    "battery_capacity": 70\n}\n```')

In [17]:
len(benchmark_result)

3

In [25]:
eval(list(log_history.keys())[-1])

[{'content': 'You are a chatbot to \n(1) write JSON to edit parameters as per user request for Electric Vehicle Charging.\n\n--- JSON ---\n{\n    "elec_cost": {\n        "0": 0.30,\n        "1": 0.30,\n        "2": 0.30,\n        "3": 0.30,\n        "4": 0.30,\n        "5": 0.30,\n        "6": 0.30,\n        "7": 0.35,\n        "8": 0.35,\n        "9": 0.35,\n        "10": 0.35,\n        "11": 0.30,\n        "12": 0.30,\n        "13": 0.30,\n        "14": 0.30,\n        "15": 0.30,\n        "16": 0.30,\n        "17": 0.30,\n        "18": 0.30,\n        "19": 0.35,\n        "20": 0.35,\n        "21": 0.35,\n        "22": 0.30,\n        "23": 0.30\n    },\n    "elec_cost_desc": "USD per KWH each hour of day",\n    "carbon_cost": {\n        "0": 25,\n        "1": 25,\n        "2": 25,\n        "3": 25,\n        "4": 25,\n        "5": 25,\n        "6": 25,\n        "7": 25,\n        "8": 25,\n        "9": 25,\n        "10": 25,\n        "11": 25,\n        "12": 25,\n        "13": 25,\n    

In [63]:
eval(list(log_history.keys())[-1])

[{'content': "You are a chatbot to:\n(1) explain solutions from a Gurobi/Python solver.\n\nThe execution result of the original source code is below.\n--- Original Result ---\nOptimization problem solved. The objective value is: 327.6\n\n\nHere are the history of discussions:\n[{'content': 'Change the end charge time to 1 AM.', 'role': 'user'}]",
  'role': 'system'},
 {'content': 'Here are the execution results: Optimization problem solved. The objective value is: 380.55\n\nCan you organize these information to a human readable answer?\nRemember to compare the new results to the original results you obtained in the\nbeginning. A lower objective value is better here.\n\n--- HUMAN READABLE ANSWER ---\n',
  'role': 'user'},
 {'content': "The adjustment of the end charge time to 1 AM resulted in solving the optimization problem with an objective value of 380.55. Comparatively, this is higher than the original objective value of 327.6, indicating a less optimal solution. Remember, in this c

In [64]:
eval(list(log_history.keys())[-2])

[{'content': 'You are a chatbot to \n(1) write JSON to edit parameters as per user request for Electric Vehicle Charging.\n\n--- JSON ---\n{\n    "elec_cost": {\n        "0": 0.3,\n        "1": 0.3,\n        "2": 0.3,\n        "3": 0.3,\n        "4": 0.3,\n        "5": 0.3,\n        "6": 0.3,\n        "7": 0.35,\n        "8": 0.35,\n        "9": 0.35,\n        "10": 0.35,\n        "11": 0.3,\n        "12": 0.3,\n        "13": 0.3,\n        "14": 0.3,\n        "15": 0.3,\n        "16": 0.3,\n        "17": 0.3,\n        "18": 0.3,\n        "19": 0.35,\n        "20": 0.35,\n        "21": 0.35,\n        "22": 0.3,\n        "23": 0.3\n    },\n    "elec_cost_desc": "USD per KWH each hour of day",\n    "carbon_cost": {\n        "0": 25,\n        "1": 25,\n        "2": 25,\n        "3": 25,\n        "4": 25,\n        "5": 25,\n        "6": 25,\n        "7": 25,\n        "8": 25,\n        "9": 25,\n        "10": 25,\n        "11": 25,\n        "12": 25,\n        "13": 25,\n        "14": 25,\n  

In [12]:
config_list = autogen.config_list_from_json(
    env_or_file = "OAI_CONFIG_LIST",
    filter_dict={
        "model": ["gpt-3.5-turbo-16k"],
    },
)

In [13]:
log_history = {}
autogen.oai.ChatCompletion.start_logging(log_history)

In [14]:
benchmark_result_gpt3 = []
for benchmark_instance in benchmark_dataset:
    
    reset_params_file(params_filepath, params_filepath_backup)
    clear_param_backups(params_filepath)

    agent = ChargingAgent(
        name="Tesla Charging Example",
        source_code=code,
        example_qa=example_qa,
        json_filepath=params_filepath,
        llm_config={
            "request_timeout": 600,
            "seed": 42,
            "config_list": config_list,
        }
    )

    user = UserProxyAgent(
        "user", max_consecutive_auto_reply=0,
        human_input_mode="NEVER", code_execution_config=False
    )
    
    user.initiate_chat(agent, message=benchmark_instance['prompt'], silent=True)

    try:
        predicted_param = json.loads('{'+extract_code(eval(list(log_history.keys())[-2])[-1]['content'])[0][1]+'}')
    except:
        benchmark_result_gpt3.append((0, 'Invalid Response: ' + eval(list(log_history.keys())[-2])[-1]['content']))
        continue


    bench_json_str = benchmark_instance['json_str']
    colon_idx = bench_json_str.index(':')
    bench_json_str_adj = bench_json_str[:8] + '"' + bench_json_str[8:colon_idx] + '"' + bench_json_str[colon_idx:-3] + "\n" + bench_json_str[-3:]

    truth_param = json.loads('{'+extract_code(bench_json_str_adj)[0][1]+'}')
    result_instance_str = "; bench: " + extract_code(bench_json_str_adj)[0][1] + " pred: " + json.dumps(predicted_param)
    benchmark_result_gpt3.append((int(truth_param == predicted_param), str(truth_param == predicted_param) + result_instance_str))

    # break
    

    


{'0': 0.3, '1': 0.3, '2': 0.3, '3': 0.3, '4': 0.3, '5': 0.3, '6': 0.3, '7': 0.35, '8': 0.35, '9': 0.35, '10': 0.35, '11': 0.3, '12': 0.3, '13': 0.3, '14': 0.3, '15': 0.3, '16': 0.3, '17': 0.3, '18': 0.3, '19': 0.35, '20': 0.35, '21': 0.35, '22': 0.3, '23': 0.3}
Gurobi Optimizer version 10.0.3 build v10.0.3rc0 (mac64[arm])

CPU model: Apple M1
Thread count: 8 physical cores, 8 logical processors, using up to 8 threads

Optimize a model with 2 rows, 48 columns and 96 nonzeros
Model fingerprint: 0x7c396a84
Variable types: 0 continuous, 48 integer (0 binary)
Coefficient statistics:
  Matrix range     [3e-01, 1e+00]
  Objective range  [8e+00, 8e+00]
  Bounds range     [1e+01, 1e+01]
  RHS range        [2e+01, 4e+01]
Presolve removed 2 rows and 48 columns
Presolve time: 0.00s
Presolve: All rows and columns removed

Explored 0 nodes (0 simplex iterations) in 0.00 seconds (0.00 work units)
Thread count was 1 (of 8 available processors)

Solution count 1: 327.6 

Optimal solution found (toleran

In [15]:
benchmark_result_gpt3

[(1,
  'True; bench: "carbon_cost_weight": 0.2 pred: {"carbon_cost_weight": 0.2}'),
 (1,
  'True; bench: "carbon_cost_weight": 0.4 pred: {"carbon_cost_weight": 0.4}'),
 (1, 'True; bench: "max_energy_cost": 10 pred: {"max_energy_cost": 10}'),
 (1, 'True; bench: "max_energy_cost": 15 pred: {"max_energy_cost": 15}'),
 (1, 'True; bench: "max_energy_cost": 20 pred: {"max_energy_cost": 20}'),
 (1, 'True; bench: "max_energy_cost": 25 pred: {"max_energy_cost": 25}'),
 (1, 'True; bench: "max_energy_cost": 30 pred: {"max_energy_cost": 30}'),
 (1, 'True; bench: "max_energy_cost": 35 pred: {"max_energy_cost": 35}'),
 (1, 'True; bench: "max_energy_cost": 40 pred: {"max_energy_cost": 40}'),
 (1, 'True; bench: "max_energy_cost": 45 pred: {"max_energy_cost": 45}'),
 (1, 'True; bench: "end_charge": 0.6 pred: {"end_charge": 0.6}'),
 (1, 'True; bench: "end_charge": 0.7 pred: {"end_charge": 0.7}'),
 (1, 'True; bench: "end_charge": 0.9 pred: {"end_charge": 0.9}'),
 (1, 'True; bench: "end_charge": 1.0 pred: