In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split, ParameterSampler
from sklearn.metrics import mean_squared_error, mean_absolute_error
from tqdm import tqdm

# Location of the input CSV (absolute path on the author's machine).
DATA_PATH = r"C:\Users\Kasper Hassing\Desktop\Speciale_KryptoSentiment\data\combined\BTCUSDT_features_final_sentiment.csv"
# Name of the column the regressor is fitted to predict.
TARGET = "future_return_7d"
# How many hyperparameter combinations to draw for the search.
N_ITER = 4000

# === Load and prepare data ===
# Parse the "timestamp" column as datetimes and use it as the row index.
df = pd.read_csv(DATA_PATH, parse_dates=["timestamp"]).set_index("timestamp")

# Every column whose name does not start with "future_return" is a feature;
# this keeps the target (and any sibling "future_return*" columns) out of X.
feature_cols = [name for name in df.columns if not name.startswith("future_return")]

# Discard rows with a missing value in any feature or in the target.
required_cols = [*feature_cols, TARGET]
df = df.dropna(subset=required_cols)

X, y = df[feature_cols], df[TARGET]

# shuffle=False preserves row order, so the validation set is the last 20 % of
# the rows rather than a random sample.
# NOTE(review): this is a chronological hold-out only if the CSV is already
# sorted by timestamp -- confirm. If the target looks 7 days ahead, as its name
# suggests, the final training rows overlap the validation period; consider
# dropping a gap of rows at the boundary.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=False,
)

# === Parameter search space ===
# Each key is an XGBRegressor keyword argument and each list holds the
# candidate values for it. The full grid has 6 * 4 * 3**6 = 17,496 combinations.
param_dist = dict(
    learning_rate=[0.01, 0.02, 0.03, 0.05, 0.06, 0.07],
    max_depth=[3, 4, 5, 6],
    min_child_weight=[1, 2, 3],
    subsample=[0.5, 0.7, 0.8],
    colsample_bytree=[0.6, 0.7, 0.8],
    gamma=[0.0, 0.05, 0.1],
    reg_alpha=[0.01, 0.05, 0.1],
    reg_lambda=[0.5, 1.0, 1.5],
)

# Draw N_ITER combinations from the grid. Because every parameter is given as
# a list, scikit-learn samples without replacement; random_state makes the
# draw repeatable. The sampler is materialised so the progress bar knows its
# length.
sampler = ParameterSampler(param_dist, n_iter=N_ITER, random_state=42)
param_samples = list(sampler)

# === Tracking ===
# Settings shared by every candidate model. The final report reads them from
# this same dictionary, so the printed values cannot drift from what was
# actually trained.
fixed_params = {
    "objective": "reg:squarederror",
    "eval_metric": "rmse",
    "seed": 42,
}

best_score = float("inf")  # lowest validation RMSE seen so far
best_params = None         # sampled parameters that produced best_score
best_mae = None            # validation MAE of that same model
best_n_trees = None        # trees kept by early stopping for that same model

# Report the number of combinations actually drawn, which is what the loop
# below iterates over.
print(f"🔍 Starter tuning af {len(param_samples)} kombinationer...\n")
for params in tqdm(param_samples):
    model = xgb.XGBRegressor(
        n_estimators=1000,
        early_stopping_rounds=100,
        verbosity=0,
        **fixed_params,
        **params,
    )

    # The validation set is also the early-stopping monitor.
    # NOTE(review): the same data therefore drives early stopping, model
    # selection and the reported metrics, so the final RMSE/MAE are optimistic;
    # a separate test split would be needed for an unbiased estimate.
    model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)
    preds = model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, preds))
    mae = mean_absolute_error(y_val, preds)

    if rmse < best_score:
        best_score = rmse
        best_mae = mae
        best_params = params
        # best_iteration is 0-based; +1 gives the number of boosting rounds
        # needed to reproduce this score without early stopping.
        best_n_trees = model.best_iteration + 1

# Fail with a clear message instead of a TypeError in the report below when
# nothing was evaluated (e.g. N_ITER == 0).
if best_params is None:
    raise RuntimeError(
        "No parameter combination was evaluated; check N_ITER and param_dist."
    )

# === Print the result in copy-paste format ===
print("\n Best:\n")
ordered_keys = [
    "objective", "learning_rate", "max_depth", "min_child_weight",
    "subsample", "colsample_bytree", "reg_alpha", "reg_lambda", "gamma",
    "eval_metric", "seed"
]

# Tuned values take precedence over the fixed settings on a key collision.
report_values = {**fixed_params, **best_params}

print("params = {")
for key in ordered_keys:
    if key not in report_values:
        continue
    value = report_values[key]
    # Quote strings so the printed block is a valid Python dict literal.
    rendered = f'"{value}"' if isinstance(value, str) else value
    print(f'    "{key}": {rendered},')
print("}")
print(f"\n✅ RMSE på validering: {best_score:.5f}")
print(f"📊 MAE  på validering: {best_mae:.5f}")
print(f"🌲 Antal træer ved bedste iteration: {best_n_trees}")



🔍 Starter tuning af 2000 kombinationer...



100%|██████████| 2000/2000 [02:44<00:00, 12.18it/s]


🏆 Bedste parametre fundet efter 2000 forsøg (ordnet som din produktionsmodel):

params = {
    "objective": "reg:squarederror",
    "learning_rate": 0.07,
    "max_depth": 4,
    "min_child_weight": 1,
    "subsample": 0.5,
    "colsample_bytree": 0.6,
    "reg_alpha": 0.05,
    "reg_lambda": 0.5,
    "gamma": 0.0,
    "eval_metric": "rmse",
    "seed": 42,
}

✅ RMSE på validering: 0.08539
📊 MAE  på validering: 0.07025



