In [1]:
import ast
import base64
import json
import os
import re
import time

import autogen
import pandas as pd
from autogen import AssistantAgent, UserProxyAgent
from autogen.coding import LocalCommandLineCodeExecutor
from IPython.display import Image, display
from tqdm.notebook import tqdm

# import fitz  # PyMuPDF


In [2]:
# Prefer a key that is already exported in the environment so the real secret never
# has to be written into (and committed with) the notebook. The literal is only a
# placeholder fallback, kept so that `api_key` is always defined for later cells.
api_key = os.environ.get("OPENAI_API_KEY") or 'your_api-key'
os.environ["OPENAI_API_KEY"] = api_key

In [7]:
# Per-model token limit (presumably the context-window size -- TODO confirm).
MODEL_LIMITS = {
    "gpt-3.5-turbo-0125": 16385,
    "gpt-4-turbo-2024-04-09": 128000,
    "gpt-4o-2024-05-13": 128000,
    "gpt-4o-mini-2024-07-18": 128000,
}

# Price charged for a single input (prompt) token, keyed by model name.
MODEL_COST_PER_INPUT = {
    "gpt-3.5-turbo-0125": 5e-7,       # 0.50 per 1M tokens
    "gpt-4-turbo-2024-04-09": 1e-5,   # 10.00 per 1M tokens
    "gpt-4o-2024-05-13": 5e-6,        # 5.00 per 1M tokens
    "gpt-4o-mini-2024-07-18": 1.5e-7, # 0.15 per 1M tokens
}

# Price charged for a single output (completion) token, keyed by model name.
MODEL_COST_PER_OUTPUT = {
    "gpt-3.5-turbo-0125": 1.5e-6,     # 1.50 per 1M tokens
    "gpt-4-turbo-2024-04-09": 3e-5,   # 30.00 per 1M tokens
    "gpt-4o-2024-05-13": 1.5e-5,      # 15.00 per 1M tokens
    "gpt-4o-mini-2024-07-18": 6e-7,   # 0.60 per 1M tokens
}



In [4]:
# Load the benchmark task list: one record is parsed from every non-empty line.
data = []

with open("./data_filted_csv/kaggle_data_eval.json", "r") as f:
    for line in f:
        line = line.strip()
        if not line:
            # A blank (e.g. trailing) line is not a record; skip it instead of crashing.
            continue
        # SECURITY: this used to be eval(line), which executes arbitrary code found in
        # the file. ast.literal_eval only accepts Python literals (dict/list/str/...).
        # NOTE(review): lines are assumed to be Python-literal dicts; if the file is
        # strict JSON containing true/false/null, switch to json.loads(line).
        data.append(ast.literal_eval(line))

len(data)

74

In [20]:

# Smoke test: run a single two-agent conversation and time it.
config_list = autogen.config_list_from_json(
    "OAI_CONFIG_LIST",
    filter_dict={"tags": ["gpt-4"]},  # comment out to get all
)

# create an AssistantAgent named "assistant"
assistant = autogen.AssistantAgent(
    name="assistant",
    llm_config={
        "cache_seed": 41,  # seed for caching and reproducibility
        "config_list": config_list,  # a list of OpenAI API configurations
        "temperature": 0,  # temperature for sampling
    },  # configuration for autogen's enhanced inference API which is compatible with OpenAI API
)

# create a UserProxyAgent instance named "user_proxy"
user_proxy = autogen.UserProxyAgent(
    name="user_proxy",
    human_input_mode="NEVER",
    max_consecutive_auto_reply=10,
    # `.get("content", "")` only covers a missing key; a message whose content is
    # explicitly None (or not a string) would make `.rstrip()` raise, so such
    # messages are treated as "not a termination message" instead.
    is_termination_msg=lambda x: isinstance(x.get("content"), str)
    and x["content"].rstrip().endswith("TERMINATE"),
    code_execution_config={
        # the executor to run the generated code
        "executor": LocalCommandLineCodeExecutor(work_dir="coding"),
    },
)
start = time.time()
# the assistant receives a message from the user_proxy, which contains the task description
chat_res = user_proxy.initiate_chat(
    assistant,
    message="""What date is today? Compare the year-to-date gain for META and TESLA.""",
    summary_method="reflection_with_llm",
)
# Wall-clock seconds spent inside initiate_chat.
consume = time.time() - start

[33muser_proxy[0m (to assistant):

What date is today? Compare the year-to-date gain for META and TESLA.

--------------------------------------------------------------------------------
[33massistant[0m (to user_proxy):

To get today's date, you can use Python to print the current date. Then, we can proceed to compare the year-to-date gain for META and TESLA. Let's start by getting today's date.

```python
# Get today's date
import datetime
today = datetime.date.today()
print(today)
```

After getting today's date, we need to find the year-to-date gain for META and TESLA. To do this, we would typically need access to the stock prices at the beginning of the year and the current stock prices for META and TESLA. Since this information is not provided, we cannot directly calculate the year-to-date gain for these stocks. You can check financial news websites or stock market platforms to find the necessary stock price information for META and TESLA to calculate the year-to-date gain.



In [21]:
# Inspect the object returned by user_proxy.initiate_chat for the demo conversation.
chat_res

ChatResult(chat_id=None, chat_history=[{'content': 'What date is today? Compare the year-to-date gain for META and TESLA.', 'role': 'assistant'}, {'content': "To get today's date, you can use Python to print the current date. Then, we can proceed to compare the year-to-date gain for META and TESLA. Let's start by getting today's date.\n\n```python\n# Get today's date\nimport datetime\ntoday = datetime.date.today()\nprint(today)\n```\n\nAfter getting today's date, we need to find the year-to-date gain for META and TESLA. To do this, we would typically need access to the stock prices at the beginning of the year and the current stock prices for META and TESLA. Since this information is not provided, we cannot directly calculate the year-to-date gain for these stocks. You can check financial news websites or stock market platforms to find the necessary stock price information for META and TESLA to calculate the year-to-date gain.\n\nIf you can provide the stock prices at the beginning of 

In [26]:
# Unpack the usage record that the demo chat stored under this model name.
model = "gpt-3.5-turbo-0125"
model_usage = chat_res.cost['usage_including_cached_inference'][model]
prompt_tokens = model_usage['prompt_tokens']
completion_tokens = model_usage['completion_tokens']
cost = model_usage['cost']
# Conversation summary and full message history of the same chat.
summary, history = chat_res.summary, chat_res.chat_history

In [15]:
# Show the usage/cost breakdown stored on the chat result under the
# 'usage_including_cached_inference' key (a total plus one entry per model name).
chat_res.cost['usage_including_cached_inference']

{'total_cost': 0.0007615,
 'gpt-3.5-turbo-0125': {'cost': 0.0007615,
  'prompt_tokens': 752,
  'completion_tokens': 257,
  'total_tokens': 1009}}

In [11]:
# Elapsed wall-clock seconds measured around initiate_chat in the demo cell.
# NOTE(review): a sub-second value presumably means the reply was served from the
# cache_seed cache rather than a live API call -- confirm.
consume

0.039936065673828125

In [5]:
def get_response(text, config_list):
    """Run one assistant / user-proxy conversation on ``text`` and return its result.

    A fresh ``AssistantAgent`` and ``UserProxyAgent`` are created on every call. The
    proxy never asks for human input, auto-replies at most 10 times in a row and
    executes generated code through a ``LocalCommandLineCodeExecutor`` whose working
    directory is ``coding``.

    Args:
        text: The task description sent as the opening message of the chat.
        config_list: List of LLM endpoint configurations placed in the assistant's
            ``llm_config``.

    Returns:
        The value returned by ``user_proxy.initiate_chat`` (callers in this notebook
        read ``.cost``, ``.summary`` and ``.chat_history`` from it).
    """

    def _is_termination_msg(message):
        # `.get("content", "")` only guards against a missing key. A message whose
        # content is explicitly None (or not a string) would make `.rstrip()` raise,
        # so such messages are treated as "keep going".
        content = message.get("content")
        return isinstance(content, str) and content.rstrip().endswith("TERMINATE")

    assistant = autogen.AssistantAgent(
        name="assistant",
        llm_config={
            "cache_seed": 41,  # seed for caching and reproducibility
            "config_list": config_list,  # a list of OpenAI API configurations
            "temperature": 0,  # temperature for sampling
        },  # configuration for autogen's enhanced inference API which is compatible with OpenAI API
    )

    # create a UserProxyAgent instance named "user_proxy"
    user_proxy = autogen.UserProxyAgent(
        name="user_proxy",
        human_input_mode="NEVER",
        max_consecutive_auto_reply=10,
        is_termination_msg=_is_termination_msg,
        code_execution_config={
            # the executor to run the generated code
            "executor": LocalCommandLineCodeExecutor(work_dir="coding"),
        },
    )
    # the assistant receives a message from the user_proxy, which contains the task description
    chat_res = user_proxy.initiate_chat(
        assistant,
        message=text,
        summary_method="reflection_with_llm",
    )
    return chat_res
    

In [6]:
# Benchmark run: send every task in `data` to the agent pair and write one JSON
# record per task (token usage, cost, elapsed time, error, summary, history).
# Other models this cell has been run with:
# model = "gpt-3.5-turbo-0125"
# model = "gpt-4o-2024-05-13"
model = "gpt-4-turbo-2024-04-09"
config_list = [{"model": model, "api_key": api_key}]
total_cost = 0
# Stop starting new tasks once the accumulated reported cost exceeds this amount.
MAX_TOTAL_COST = 100

instruction = "You are a data scientist. I have a data modeling task. You must give me the predicted results as a CSV file as detailed in the following content. You should try your best to predict the answer. I provide you with three files. One is training data, one is test data. There is also a sample file for submission"

save_path = "./output_model/"
# Create the per-model output folder once, before any agent runs, instead of after
# the first task has already finished.
# NOTE(review): the prompt below points the agent at an absolute path that looks like
# this same folder when the notebook runs from kaggle_data/ -- confirm.
output_dir = f"{save_path}{model}-autoagent/"
os.makedirs(output_dir, exist_ok=True)

# Iterate over the task records directly (the old index variable shadowed builtin `id`).
for task in tqdm(data):
    name = task['name']
    # The task descriptions contain emoji, so read them as UTF-8 rather than relying
    # on the platform's default encoding.
    with open(f"./data_filted_csv/cleaned_task/{name}.txt", "r", encoding="utf-8") as f:
        description = f.read()

    # These strings are the prompt sent to the model and are kept verbatim.
    text = (f"\n \n All three data files can be found in the folder: /Users/tencentintern/PycharmProjects/DSBench/kaggle_data/data_filted_csv/data_resplit/{name}/. After the data modeling, please give me the prediction resutls for the test file. You must"
            f" save the answer as a csv file. I won't run your code and you must run the code for the predicted results and give the submission file. The file should be saved in the path /Users/tencentintern/PycharmProjects/DSBench/kaggle_data/output_model/{model}-autoagent/{name}.csv")

    all_context = instruction + "\n" + description + "\n" + text
    input_t = all_context

    start = time.time()
    cost = 0
    error = ""
    prompt_tokens = completion_tokens = 0
    try:
        response = get_response(input_t, config_list)
        usage = response.cost['usage_including_cached_inference'][model]
        prompt_tokens = usage['prompt_tokens']
        completion_tokens = usage['completion_tokens']
        cost = usage['cost']
        summary = response.summary
        history = response.chat_history
        elapsed = time.time() - start
    except Exception as e:
        # Best effort: record the failure for this task and continue with the next.
        # Elapsed time is taken before the back-off sleep so the 3 s pause does not
        # inflate the recorded task duration.
        elapsed = time.time() - start
        error = str(e)
        history = "I cannot solve this task."
        summary = "I cannot solve this task."
        print(e)
        print(history)
        time.sleep(3)
    total_cost += cost
    print("Total cost: ", total_cost)

    record = {"name": name, "model": model, "input": prompt_tokens,
              "output": completion_tokens, "cost": cost, "time": elapsed,
              "error": error, 'summary': summary, "history": history}
    with open(f"{output_dir}{name}.json", "w") as f:
        json.dump(record, f)

    if total_cost > MAX_TOTAL_COST:
        break

  0%|          | 0/74 [00:00<?, ?it/s]

[33muser_proxy[0m (to assistant):

You are a data scientist. I have a data modeling task. You must give me the predicted results as a CSV file as detailed in the following content. You should try your best to predict the answer. I provide you with three files. One is training data, one is test data. There is also a sample file for submission
Description

👋🛳️ Ahoy, welcome to Kaggle! You’re in the right place. This is the legendary Titanic ML competition – the best, first challenge for you to dive into ML competitions and familiarize yourself with how the Kaggle platform works.

If you want to talk with other users about this competition, come join our Discord! We've got channels for competitions, job postings and career discussions, resources, and socializing with your fellow data scientists. Follow the link here: https://discord.gg/kaggle

The competition is simple: use machine learning to create a model that predicts which passengers survived the Titanic shipwreck.

Read on or watc

In [8]:
# Benchmark run: send every task in `data` to the agent pair and write one JSON
# record per task (token usage, cost, elapsed time, error, summary, history).
# Other models this cell has been run with:
# model = "gpt-3.5-turbo-0125"
# model = "gpt-4o-2024-05-13"
# model = "gpt-4-turbo-2024-04-09"
model = "gpt-4o-mini-2024-07-18"
config_list = [{"model": model, "api_key": api_key}]
total_cost = 0
# Stop starting new tasks once the accumulated estimated cost exceeds this amount
# (same units as the MODEL_COST_PER_* tables, presumably USD).
MAX_TOTAL_COST = 100

# Per-token prices for the selected model. Looked up before the loop so an unknown
# model fails immediately instead of after the first (paid) API call.
input_rate = MODEL_COST_PER_INPUT[model]
output_rate = MODEL_COST_PER_OUTPUT[model]

instruction = "You are a data scientist. I have a data modeling task. You must give me the predicted results as a CSV file as detailed in the following content. You should try your best to predict the answer. I provide you with three files. One is training data, one is test data. There is also a sample file for submission"

save_path = "./output_model/"
# Create the per-model output folder once, before any agent runs, instead of after
# the first task has already finished.
# NOTE(review): the prompt below points the agent at an absolute path that looks like
# this same folder when the notebook runs from kaggle_data/ -- confirm.
output_dir = f"{save_path}{model}-autoagent/"
os.makedirs(output_dir, exist_ok=True)

# Iterate over the task records directly (the old index variable shadowed builtin `id`).
for task in tqdm(data):
    name = task['name']
    # The task descriptions contain emoji, so read them as UTF-8 rather than relying
    # on the platform's default encoding.
    with open(f"/Users/tencentintern/PycharmProjects/DSBench/kaggle_data/data_filted_csv/cleaned_task/{name}.txt", "r", encoding="utf-8") as f:
        description = f.read()

    # These strings are the prompt sent to the model and are kept verbatim.
    text = (f"\n \n All three data files can be found in the folder: /Users/tencentintern/PycharmProjects/DSBench/kaggle_data/data_filted_csv/data_resplit/{name}/. After the data modeling, please give me the prediction resutls for the test file. You must"
            f" save the answer as a csv file. I won't run your code and you must run the code for the predicted results and give the submission file. The file should be saved in the path /Users/tencentintern/PycharmProjects/DSBench/kaggle_data/output_model/{model}-autoagent/{name}.csv")

    all_context = instruction + "\n" + description + "\n" + text
    input_t = all_context

    start = time.time()
    cost = 0
    error = ""
    prompt_tokens = completion_tokens = 0
    try:
        response = get_response(input_t, config_list)
        usage = response.cost['usage_including_cached_inference'][model]
        prompt_tokens = usage['prompt_tokens']
        completion_tokens = usage['completion_tokens']
        # The cost figure reported by autogen is not used in this cell (it was already
        # disabled here). The previous stand-in, prompt_tokens + completion_tokens, is
        # a token COUNT: compared against the budget of 100 it stopped the loop after
        # the first task and wrote tokens into the "cost" field. Price the tokens with
        # the per-token tables instead.
        cost = prompt_tokens * input_rate + completion_tokens * output_rate
        summary = response.summary
        history = response.chat_history
        elapsed = time.time() - start
    except Exception as e:
        # Best effort: record the failure for this task and continue with the next.
        # Elapsed time is taken before the back-off sleep so the 3 s pause does not
        # inflate the recorded task duration.
        elapsed = time.time() - start
        error = str(e)
        history = "I cannot solve this task."
        summary = "I cannot solve this task."
        print(e)
        print(history)
        time.sleep(3)
    total_cost += cost
    print("Total cost: ", total_cost)

    record = {"name": name, "model": model, "input": prompt_tokens,
              "output": completion_tokens, "cost": cost, "time": elapsed,
              "error": error, 'summary': summary, "history": history}
    with open(f"{output_dir}{name}.json", "w") as f:
        json.dump(record, f)

    if total_cost > MAX_TOTAL_COST:
        break

  0%|          | 0/74 [00:00<?, ?it/s]

[33muser_proxy[0m (to assistant):

You are a data scientist. I have a data modeling task. You must give me the predicted results as a CSV file as detailed in the following content. You should try your best to predict the answer. I provide you with three files. One is training data, one is test data. There is also a sample file for submission
Description

👋🛳️ Ahoy, welcome to Kaggle! You’re in the right place. This is the legendary Titanic ML competition – the best, first challenge for you to dive into ML competitions and familiarize yourself with how the Kaggle platform works.

If you want to talk with other users about this competition, come join our Discord! We've got channels for competitions, job postings and career discussions, resources, and socializing with your fellow data scientists. Follow the link here: https://discord.gg/kaggle

The competition is simple: use machine learning to create a model that predicts which passengers survived the Titanic shipwreck.

Read on or watc