In [26]:
import numpy as np
import optuna
import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost.callback import EarlyStopping


optuna.logging.set_verbosity(optuna.logging.WARNING)

RANDOM_STATE = 42
N_TRIALS = 25



In [27]:
all_features_data = pd.read_csv("/Users/suhaniagarwal/Downloads/all_features_data.csv")
X = all_features_data.drop(
    columns=["Number of Admissions", "Timestamp"]
)
y = all_features_data["Number of Admissions"]


In [28]:
# Assumes X and y already exist

# 80% train, 20% temp
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=RANDOM_STATE,
)

# Split remaining 20% into 10% val, 10% test
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.50,   # 50% of 20% → 10%
    random_state=RANDOM_STATE,
)

print("Train / Val / Test sizes:", len(y_train), len(y_val), len(y_test))



Train / Val / Test sizes: 560 70 70


In [29]:

def evaluate(y_true, y_pred):
    return {
        "mae": mean_absolute_error(y_true, y_pred),
        "rmse": np.sqrt(mean_squared_error(y_true, y_pred)),
        "r2": r2_score(y_true, y_pred),
    }


In [34]:
import optuna
import xgboost as xgb
import numpy as np
from sklearn.metrics import r2_score

RANDOM_STATE = 42
N_TRIALS = 25

def xgb_objective(trial):
    try:
        params = {
            "objective": "reg:squarederror",
            "eval_metric": "rmse",
            "max_depth": trial.suggest_int("max_depth", 3, 8),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2, log=True),
            "subsample": trial.suggest_float("subsample", 0.6, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
            "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
            "seed": RANDOM_STATE,
        }

        num_boost_round = trial.suggest_int("n_estimators", 300, 1000)

        dtrain = xgb.DMatrix(X_train, label=y_train)
        dval = xgb.DMatrix(X_val, label=y_val)

        model = xgb.train(
            params,
            dtrain,
            num_boost_round=num_boost_round,
            evals=[(dval, "val")],
            early_stopping_rounds=30,
            verbose_eval=False,
        )

        preds = model.predict(dval)
        return r2_score(y_val, preds)

    except Exception as e:
        print("Trial failed:", e)
        return -np.inf




In [35]:
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE),
)

study.optimize(xgb_objective, n_trials=N_TRIALS)

print("Best hyperparameters:")
print(study.best_params)
print("Best validation R²:", study.best_value)


Best hyperparameters:
{'max_depth': 3, 'learning_rate': 0.10914458059255211, 'subsample': 0.9013940689940868, 'colsample_bytree': 0.8020507070016206, 'min_child_weight': 5, 'n_estimators': 847}
Best validation R²: 0.26381777170590404


In [36]:
best_params = study.best_params

params = {
    "objective": "reg:squarederror",
    "eval_metric": "rmse",
    "max_depth": best_params["max_depth"],
    "learning_rate": best_params["learning_rate"],
    "subsample": best_params["subsample"],
    "colsample_bytree": best_params["colsample_bytree"],
    "min_child_weight": best_params["min_child_weight"],
    "seed": RANDOM_STATE,
}

dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)
dtest = xgb.DMatrix(X_test, label=y_test)

final_model = xgb.train(
    params,
    dtrain,
    num_boost_round=best_params["n_estimators"],
    evals=[(dval, "val")],
    early_stopping_rounds=30,
    verbose_eval=False,
)

test_preds = final_model.predict(dtest)
print("Test R²:", r2_score(y_test, test_preds))


Test R²: 0.30965468326706513


In [37]:
# Create lagged features for days 3–5
lags = [3, 4, 5]

lag_data = all_features_data.copy()

# Columns to lag (exclude target + timestamp)
cols_to_lag = lag_data.columns.difference(
    ["Number of Admissions", "Timestamp"]
)

for lag in lags:
    X_shifted = lag_data[cols_to_lag].shift(lag)
    X_shifted.columns = [f"{col}_lag{lag}" for col in cols_to_lag]
    lag_data = pd.concat([lag_data, X_shifted], axis=1)

# Drop rows with NaNs introduced by lagging
lag_data = lag_data.dropna()

# Align target
y_lag_3_5 = y.loc[lag_data.index]

In [38]:
X_lag = lag_data.drop(columns=["Number of Admissions", "Timestamp"])
y_lag = y_lag_3_5

In [41]:
RANDOM_STATE = 42
N_TRIALS = 25

# 80% train, 20% temp (RANDOM)
X_train_l, X_temp_l, y_train_l, y_temp_l = train_test_split(
    X_lag,
    y_lag,
    test_size=0.20,
    random_state=RANDOM_STATE,
    shuffle=True,   # default, but explicit
)

# Split temp into 10% val, 10% test (RANDOM)
X_val_l, X_test_l, y_val_l, y_test_l = train_test_split(
    X_temp_l,
    y_temp_l,
    test_size=0.50,
    random_state=RANDOM_STATE,
    shuffle=True,
)

print("Train / Val / Test sizes:", len(y_train_l), len(y_val_l), len(y_test_l))


Train / Val / Test sizes: 556 69 70


In [42]:
def xgb_objective(trial):
    params = {
        "objective": "reg:squarederror",
        "eval_metric": "rmse",
        "max_depth": trial.suggest_int("max_depth", 3, 8),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2, log=True),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "seed": RANDOM_STATE,
    }

    num_boost_round = trial.suggest_int("n_estimators", 200, 800)

    dtrain = xgb.DMatrix(X_train_l, label=y_train_l)
    dval   = xgb.DMatrix(X_val_l, label=y_val_l)

    model = xgb.train(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        evals=[(dval, "val")],
        early_stopping_rounds=30,
        verbose_eval=False,
    )

    preds_l = model.predict(dval)
    return r2_score(y_val_l, preds_l)


In [43]:
study_l= optuna.create_study(direction="maximize")
study_l.optimize(xgb_objective, n_trials=N_TRIALS)

print("Best hyperparameters:")
print(study_l.best_params)
print("Best validation R²:", study_l.best_value)


Best hyperparameters:
{'max_depth': 7, 'learning_rate': 0.019828443100086447, 'subsample': 0.661630780700746, 'colsample_bytree': 0.6002590021901413, 'min_child_weight': 8, 'n_estimators': 361}
Best validation R²: 0.09579420373176495


In [45]:
best_l = study_l.best_params

params = {
    "objective": "reg:squarederror",
    "eval_metric": "rmse",
    "max_depth": best_l["max_depth"],
    "learning_rate": best_l["learning_rate"],
    "subsample": best_l["subsample"],
    "colsample_bytree": best_l["colsample_bytree"],
    "min_child_weight": best_l["min_child_weight"],
    "seed": RANDOM_STATE,
}

X_train_full = np.vstack([X_train_l, X_val_l])
y_train_full = np.concatenate([y_train_l, y_val_l])

dtrain_full = xgb.DMatrix(X_train_full, label=y_train_full)
dtest = xgb.DMatrix(X_test_l, label=y_test_l)

final_model = xgb.train(
    params,
    dtrain_full,
    num_boost_round=best_l["n_estimators"],
    verbose_eval=False,
)


In [46]:
test_preds = final_model.predict(dtest)
test_r2 = r2_score(y_test_l, test_preds)

print("Final Test R² (XGBoost, random split, lags 3–5):", test_r2)


Final Test R² (XGBoost, random split, lags 3–5): 0.22780380905700237
