### Minimal ensemble code

This code builds a simple bagging ensemble based on Kfold train/val split. Use it for quick idea testing with GPT-5-assisted tree generation.

In [16]:
# Local dataset settings
DATA_PATH = "mydata/data.csv"
TARGET_COL = "PM2.5"
TASK_TYPE = "regression"  # regression | binary | multiclass
RANDOM_STATE = 42
TEST_SIZE = 0.2
VAL_SIZE = 0.2
N_REPEATS = 1

dataset_name = "pm25_local"
results_path = "data/tree_scores.pm25_local.json"

AUTH_SCHEME = "Bearer"  # e.g., DeepSeek uses Bearer; set to "OAuth" if required

In [17]:
from Token import *

In [18]:
import os
import re
import json
import numpy as np
import pandas as pd
import smolagents
from huggingface_hub import login
import proxy_api_model
import prompting
import tree_agent
from sklearn.model_selection import train_test_split
from task import metric_func_by_task
from smolagents import LiteLLMModel

# Debug: confirm which proxy_api_model is loaded
print("proxy_api_model file:", proxy_api_model.__file__)

login(token=HF_TOKEN_HERE)
# Use the OpenAI-like proxy API wrapper (supports callback)
model = LiteLLMModel(
    model_id="deepseek-reasoner",
    api_base=API_ENDPOINT_HERE,  # <-- https://api.deepseek.com/v1/chat/completions
    api_key=API_TOKEN_HERE,  # <-- raw sk- token only
    max_new_tokens=1024 * 8,
    verify_ssl=True,
    timeout=120,
    auth_scheme=AUTH_SCHEME,
    # callback=lambda msg, **etc: print(  # print model thoughts before code
    #     re.sub(r"<code>.*?</code>", "<code omitted>", msg.content, flags=re.DOTALL)
    # ),
)

# Load local dataset
if not os.path.exists(DATA_PATH):
    raise FileNotFoundError(f"DATA_PATH not found: {DATA_PATH}")

df = pd.read_csv(DATA_PATH)

if TARGET_COL not in df.columns:
    matches = [c for c in df.columns if c.lower() == TARGET_COL.lower()]
    if matches:
        TARGET_COL = matches[0]
    else:
        raise ValueError(
            f"Target column '{TARGET_COL}' not found in data columns: {list(df.columns)}"
        )

X = df.drop(columns=[TARGET_COL])
y = df[TARGET_COL].to_numpy()

task_type = TASK_TYPE
metric_func = metric_func_by_task[task_type]
metric_name = prompting.metrics_by_task[task_type]

cat_cols = [c for c in X.columns if X[c].dtype == "object"]
num_cols = [c for c in X.columns if c not in cat_cols]

# Dataset description to help the LLM form hypotheses
feature_list = ", ".join(X.columns)
cat_list = ", ".join(cat_cols) if cat_cols else "none"
num_list = ", ".join(num_cols) if num_cols else "none"

dataset_desc = f"""
<dataset>
Your task is to predict PM2.5.
Size: {len(X)} total (full dataset)
Feature columns: {feature_list}
Categorical columns: {cat_list}
Numeric columns: {num_list}
Targets: {TARGET_COL} (float)
Metric: {metric_name}
</dataset>
""".strip()

proxy_api_model file: c:\Users\CaoBo\OneDrive\VScode\Python_workspace\Python_Project_B\TalkingTrees\proxy_api_model.py


In [None]:
import os
import requests

# Ensure API config is available even if earlier cells were not run
if "API_ENDPOINT_HERE" not in globals():
    API_ENDPOINT_HERE = (
        os.getenv("API_ENDPOINT_HERE") or os.getenv("API_ENDPOINT") or ""
    )
if "API_TOKEN_HERE" not in globals():
    API_TOKEN_HERE = os.getenv("API_TOKEN_HERE") or os.getenv("API_TOKEN") or ""
if not API_ENDPOINT_HERE or not API_TOKEN_HERE:
    raise ValueError(
        "Missing API config. Run the config cell or set API_ENDPOINT_HERE/API_TOKEN_HERE env vars."
    )

API_ENDPOINT = API_ENDPOINT_HERE
API_TOKEN = API_TOKEN_HERE

r = requests.post(
    API_ENDPOINT,
    headers={
        "authorization": f"Bearer {API_TOKEN}",
        "content-type": "application/json",
    },
    json={
        "model": "deepseek-reasoner",
        "messages": [{"role": "user", "content": "ping"}],
        "max_tokens": 1,
    },
    timeout=30,
)
print("status:", r.status_code)
print("body:", r.text[:300])


test_scores = []
r2_scores = []
rmse_scores = []
last_y_test = None
last_y_pred = None
for repeat_index in range(N_REPEATS):
    print("Beginning repeat", repeat_index)
    split_seed = RANDOM_STATE if N_REPEATS == 1 else RANDOM_STATE + repeat_index

    X_train_full, X_test, y_train_full, y_test = train_test_split(
        X,
        y,
        test_size=TEST_SIZE,
        random_state=split_seed,
        stratify=None if task_type == "regression" else y,
    )
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_full,
        y_train_full,
        test_size=VAL_SIZE,
        random_state=split_seed,
        stratify=None if task_type == "regression" else y_train_full,
    )

    result = tree_agent.TreeAgent(model=model).run(
        task=f"""
Build the optimal decision tree for the '{dataset_name}' dataset.
You are given access to 4 data variables in your python environment:
 - X_train, X_val are pandas dataframes with named feature columns (see below) that may need preprocessing;
 - y_train, y_val are numpy arrays (1d) with targets, also described below;

Dataset description (use it to form hypotheses):
{dataset_desc}

Here's one way you could construct before you begin editing it manually:
{prompting.starter_snippets_by_task[task_type]}

Now begin: view the data variables, preprocess as necessary, train a baseline tree, then propose the first hypothesis and start improving.
Focus on drawing conclusions from data, looking at the tree (e.g. via print) and using your own intuition about the problem for manual tree edits.
Quality is more important than speed: take as many steps as you need to get the best tree.
""".strip(),
        additional_args=dict(
            X_train=X_train.copy(),
            y_train=y_train.copy(),
            X_val=X_val.copy(),
            y_val=y_val.copy(),
        ),
    )

    y_pred_i = result["model"].predict(result["preprocess_features"](X_test.copy()))
    if task_type == "multiclass":  # normalize for logloss
        y_pred_i = y_pred_i / y_pred_i.sum(axis=-1, keepdims=True)
    test_score = metric_func(y_test, y_pred_i)
    if task_type == "regression":
        from sklearn.metrics import r2_score, root_mean_squared_error

        r2 = r2_score(y_test, y_pred_i)
        rmse = root_mean_squared_error(
            y_test,
            y_pred_i,
        )
        print(f"Test R2 score #{repeat_index}: {r2:.5f}")
        print(f"Test RMSE score #{repeat_index}: {rmse:.5f}")
        r2_scores.append(r2)
        rmse_scores.append(rmse)
    print(f"Test {metric_name} score #{repeat_index}: {test_score:.5f}")
    test_scores.append(test_score)
    with open(results_path, "w") as f:
        json.dump(test_scores, f, ensure_ascii=False, indent=2)

status: 200
body: {"id":"7dc00f1b-5d29-4850-9fc2-e833aacafe5c","object":"chat.completion","created":1770646606,"model":"deepseek-reasoner","choices":[{"index":0,"message":{"role":"assistant","content":"","reasoning_content":"H"},"logprobs":null,"finish_reason":"length"}],"usage":{"prompt_tokens":5,"completion_tokens"
Beginning repeat 0



[1;31mProvider List: https://docs.litellm.ai/docs/providers[0m




[1;31mProvider List: https://docs.litellm.ai/docs/providers[0m




[1;31mProvider List: https://docs.litellm.ai/docs/providers[0m




[1;31mProvider List: https://docs.litellm.ai/docs/providers[0m




[1;31mProvider List: https://docs.litellm.ai/docs/providers[0m




[1;31mProvider List: https://docs.litellm.ai/docs/providers[0m




[1;31mProvider List: https://docs.litellm.ai/docs/providers[0m




[1;31mProvider List: https://docs.litellm.ai/docs/providers[0m




[1;31mProvider List: https://docs.litellm.ai/docs/providers[0m



Test R2 score #0: 0.21667
Test RMSE score #0: 12.08273
Test RMSE (sqrt of sklearn.metrics.mean_squared_error) score #0: 12.08273


In [None]:
# Summary outputs
print(f"Final {metric_name} score list: {test_scores}")
if len(test_scores) > 0:
    print(f"Mean {metric_name}: {np.mean(test_scores):.5f}")
    if len(test_scores) > 1:
        print(f"Std {metric_name}: {np.std(test_scores, ddof=1):.5f}")

if task_type == "regression":
    if len(r2_scores) > 0:
        print(f"Mean R2: {np.mean(r2_scores):.5f}")
        if len(r2_scores) > 1:
            print(f"Std R2: {np.std(r2_scores, ddof=1):.5f}")
    if len(rmse_scores) > 0:
        print(f"Mean RMSE: {np.mean(rmse_scores):.5f}")
        if len(rmse_scores) > 1:
            print(f"Std RMSE: {np.std(rmse_scores, ddof=1):.5f}")

    if last_y_test is not None and last_y_pred is not None:
        from sklearn.metrics import r2_score

        try:
            from sklearn.metrics import root_mean_squared_error as _rmse

            final_rmse = _rmse(last_y_test, last_y_pred)
        except Exception:
            from sklearn.metrics import mean_squared_error

            final_rmse = mean_squared_error(last_y_test, last_y_pred) ** 0.5
        final_r2 = r2_score(last_y_test, last_y_pred)
        print(f"Final R2: {final_r2:.5f}")
        print(f"Final RMSE: {final_rmse:.5f}")