# In [None]:
# Notebook-like script: Model training, tuning, MLflow logging, evaluation
import sys
from pathlib import Path
import pandas as pd
import numpy as np

# Put the sibling ../src directory (resolved to an absolute path) at the front
# of the import search path, unless it is already listed there.
src_path = str(Path('..', 'src').resolve())
if sys.path.count(src_path) == 0:
    sys.path.insert(0, src_path)

from data_processing import DataLoader
from model_training import (
    prepare_data, build_default_models, default_param_grids,
    make_pipeline_with_scaler, fit_and_tune, evaluate_model,
    log_experiment_mlflow, save_model_local
)
from proxy_target import ProxyTargetEngineer
import json
import pprint

# 1. Load processed features (with is_high_risk)
# The CSV is looked up relative to the current working directory; a missing
# file is reported immediately instead of failing inside pandas.
data_path = Path('..', 'data', 'processed', 'features_with_proxy.csv')
if data_path.exists():
    df = pd.read_csv(data_path)
else:
    raise FileNotFoundError(f"Processed features not found: {data_path}")
print("Loaded processed features:", df.shape)

# 2. Prepare data
# prepare_data returns the train/test feature and target partitions; the
# target column, test_size and random_state are passed as keyword arguments.
split_kwargs = {"target_col": "is_high_risk", "test_size": 0.2, "random_state": 42}
X_train, X_test, y_train, y_test = prepare_data(df, **split_kwargs)
print("Train/Test shapes:", *(part.shape for part in (X_train, X_test)))

# 3. Build the candidate estimators and their hyper-parameter grids
# (both are looked up by model key in the training loop).
models, param_grids = build_default_models(random_state=42), default_param_grids()

# Model keys to train and compare (at least two)
candidates = [
    "logistic",
    "random_forest",
]

# Train, tune, evaluate, persist and log every candidate model.
# `results` collects one summary dict per model: its key, the best
# hyper-parameters found by the search, the test-set metrics, and the value
# returned by log_experiment_mlflow.
results = []

# Make sure the output directory exists before the first save: on a fresh
# checkout ../models may be missing, and a failure here would only surface
# after the (expensive) search has already run. This is redundant but harmless
# if save_model_local creates the directory itself.
models_dir = Path('..') / 'models'
models_dir.mkdir(parents=True, exist_ok=True)

for key in candidates:
    clf = models[key]
    # wrap in pipeline (scaler+clf) for logistic; RF can be left unscaled but pipeline is fine
    pipe = make_pipeline_with_scaler(clf)
    # A model key without a grid entry is searched with an empty grid.
    params = param_grids.get(key, {})
    print(f"Training & tuning {key} ...")
    search = fit_and_tune(pipe, params, X_train, y_train, cv=3, search_type="grid", scoring="roc_auc")
    best = search.best_estimator_
    best_params = search.best_params_
    # Metrics are computed on the held-out test split, not on the CV folds.
    metrics = evaluate_model(best, X_test, y_test)
    # save model locally
    model_path = f"../models/{key}_best.pkl"
    save_model_local(best, model_path)
    # log to mlflow (if available)
    # NOTE(review): run_id is presumably None/empty when MLflow is not
    # installed -- confirm against log_experiment_mlflow.
    run_id = log_experiment_mlflow(
        name="PTV_modeling",
        estimator=best,
        params=best_params,
        metrics=metrics,
        X_train=X_train, X_test=X_test,
        artifacts={"model_pkl": model_path},
        model_save_path=model_path,
        register_name=f"PTV-{key}"
    )
    results.append({"model": key, "best_params": best_params, "metrics": metrics, "run_id": run_id})
    print(f"Done {key}, metrics:")
    pprint.pprint(metrics)

# 4. Compare results
# Dump the per-model summaries collected by the training loop, pretty-printed
# under a single heading.
print("All results:", pprint.pformat(results), sep="\n")