In [None]:
# Notebook-like script: Model training, tuning, MLflow logging, evaluation (robust to Task 3/4 availability)
# Usage: run this in a Jupyter cell or as a script. Adjust src path if necessary.

import sys
from pathlib import Path
import pandas as pd
import numpy as np
import pprint
import os

# Make the sibling ../src directory importable by putting it first on sys.path
# (skipped when it is already registered, so re-running the cell is harmless).
src_path = str(Path('..').joinpath('src').resolve())
if src_path not in sys.path:
    sys.path[:0] = [src_path]

# Project imports (model training utilities)
from model_training import (
    prepare_data, build_default_models, default_param_grids,
    make_pipeline_with_scaler, fit_and_tune, evaluate_model,
    log_experiment_mlflow, save_model_local
)
from proxy_target import ProxyTargetEngineer  # Task 4 helper (used if target missing)

# Configuration
# Data locations, resolved relative to the current working directory.
RAW_TX_PATH = Path('..', 'data', 'raw', 'data.csv')
PROCESSED_DIR = Path('..', 'data', 'processed')
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)  # created up front so later CSV writes succeed
FEATURES_WITH_PROXY = PROCESSED_DIR.joinpath('features_with_proxy.csv')
FEATURES_BASE = PROCESSED_DIR.joinpath('features.csv')

# Experiment settings shared by the cells below.
RANDOM_STATE = 42
TEST_SIZE = 0.2
CANDIDATE_MODELS = ["logistic", "random_forest"]  # run at least two

# Helper: create proxy target and merge into feat_df
def ensure_features_with_proxy(feat_df: pd.DataFrame, raw_tx_path: Path) -> pd.DataFrame:
    """
    Return a feature frame that carries the 'is_high_risk' proxy target.

    If feat_df already has an 'is_high_risk' column it is returned unchanged
    (same object). Otherwise the raw transaction CSV at raw_tx_path is read,
    a ProxyTargetEngineer is fitted on it, and the result of
    ``proxy.assign_labels(feat_df)`` is returned.

    Parameters
    ----------
    feat_df : pd.DataFrame
        Feature table to label.
    raw_tx_path : Path
        Location of the raw transaction-level CSV. Only consulted when the
        target column is missing from feat_df.

    Returns
    -------
    pd.DataFrame
        feat_df itself when the target is already present, otherwise whatever
        ``ProxyTargetEngineer.assign_labels`` returns.

    Raises
    ------
    FileNotFoundError
        If the target is missing and raw_tx_path does not exist.
    """
    if "is_high_risk" in feat_df.columns:
        print("is_high_risk already present in features. No action needed.")
        return feat_df

    if not raw_tx_path.exists():
        raise FileNotFoundError(f"Raw transaction data required to create proxy target not found: {raw_tx_path}")

    # The original message contained a mis-decoded em dash ("â€”"); restored here.
    print("is_high_risk missing — computing proxy target using RFM + KMeans...")
    raw_tx = pd.read_csv(raw_tx_path)
    # Column names are those expected in the raw transaction file; the seed comes
    # from the module-level RANDOM_STATE.
    proxy = ProxyTargetEngineer(
        id_col="CustomerId",
        time_col="TransactionStartTime",
        amount_col="Amount",
        n_clusters=3,
        random_state=RANDOM_STATE,
        scale=True
    )
    proxy.fit(raw_tx)
    # NOTE(review): assumes assign_labels returns a frame that includes an
    # 'is_high_risk' column - confirm against proxy_target.ProxyTargetEngineer.
    feat_df_with_target = proxy.assign_labels(feat_df)
    print("Assigned proxy target; distribution:")
    print(feat_df_with_target["is_high_risk"].value_counts(dropna=False).to_string())
    return feat_df_with_target


In [None]:

# ---------- Load or build features ----------
# Sources are tried in order of preference:
#   1) features_with_proxy.csv - features that already carry the proxy target
#   2) features.csv (Task 3 output) - the proxy target (Task 4) is added if missing
#   3) raw transactions - features are built with feature_engineering_4_PTV, if importable
if FEATURES_WITH_PROXY.exists():
    print(f"Loading processed features with proxy target from: {FEATURES_WITH_PROXY}")
    df = pd.read_csv(FEATURES_WITH_PROXY)
elif FEATURES_BASE.exists():
    print(f"Loading base features from: {FEATURES_BASE}")
    df = pd.read_csv(FEATURES_BASE)
    # ensure target present or compute it, then persist the labelled frame
    if 'is_high_risk' not in df.columns:
        df = ensure_features_with_proxy(df, RAW_TX_PATH)
        df.to_csv(FEATURES_WITH_PROXY, index=False)
        print(f"Saved features_with_proxy.csv to: {FEATURES_WITH_PROXY}")
else:
    # Last resort. The pipeline module is optional, so it is imported lazily and
    # only its absence is reported as "pipeline unavailable".
    try:
        from feature_engineering_4_PTV import feature_engineering_pipeline
    except ImportError as exc:
        raise FileNotFoundError(
            "No processed features found, pipeline unavailable, and cannot continue. "
            "Provide ../data/processed/features_with_proxy.csv or ../data/processed/features.csv, "
            "or ensure feature_engineering_4_PTV is importable."
        ) from exc

    if not RAW_TX_PATH.exists():
        raise FileNotFoundError(
            f"No processed features found and raw transaction data is missing: {RAW_TX_PATH}. "
            "Provide ../data/processed/features_with_proxy.csv or ../data/processed/features.csv, "
            "or place the raw transactions at that path."
        )

    # Anything that fails from here on (pipeline bug, CSV write error, proxy-target
    # error) propagates with its real type instead of being re-labelled as a
    # missing-file problem.
    print("Building features using feature_engineering_4_PTV.feature_engineering_pipeline...")
    raw_tx = pd.read_csv(RAW_TX_PATH)
    df, feat_desc = feature_engineering_pipeline(raw_tx, categorical_cols=None, create_proxy_target=False)
    # save base features
    df.to_csv(FEATURES_BASE, index=False)
    print(f"Saved base features to {FEATURES_BASE}")
    # compute proxy target
    df = ensure_features_with_proxy(df, RAW_TX_PATH)
    df.to_csv(FEATURES_WITH_PROXY, index=False)
    print(f"Saved features_with_proxy.csv to: {FEATURES_WITH_PROXY}")

print("Final features shape:", df.shape)
print("Columns:", list(df.columns))
# Fail before announcing a distribution that cannot be shown.
if "is_high_risk" not in df.columns:
    raise RuntimeError("is_high_risk target not present after attempted creation.")
print("Target distribution (if present):")
print(df["is_high_risk"].value_counts(dropna=False).to_string())


In [None]:

# ---------- Data preparation ----------
# Delegate the train/test partitioning to the project helper, using the
# notebook-level TEST_SIZE and RANDOM_STATE settings.
X_train, X_test, y_train, y_test = prepare_data(
    df,
    target_col="is_high_risk",
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)
print("Train/Test shapes:", X_train.shape, X_test.shape)

# Relative class frequencies in the training target.
print("Train target distribution:")
train_target_share = y_train.value_counts(normalize=True)
print(train_target_share.to_string())

# ---------- Model selection, tuning, training ----------
# Candidate estimators and their hyperparameter grids come from the project helpers;
# both are looked up by model key in the training loop.
models = build_default_models(random_state=RANDOM_STATE)
param_grids = default_param_grids()

# Fitted models are written under ../models (created on first run).
MODELS_DIR = Path('..', 'models')
MODELS_DIR.mkdir(parents=True, exist_ok=True)

# One record per successfully trained candidate; read by the comparison cell.
results = []
for key in CANDIDATE_MODELS:
    clf = models.get(key)
    if clf is None:
        print(f"Model '{key}' not found in available models. Skipping.")
        continue

    print(f"\n=== Training & tuning model: {key} ===")
    pipe = make_pipeline_with_scaler(clf)
    params = param_grids.get(key, {})

    # Grid search scored on ROC-AUC with 3-fold CV. Any failure is reported and
    # the untuned pipeline is fitted instead, so one bad grid does not stop the run.
    try:
        search = fit_and_tune(
            pipe, params, X_train, y_train,
            cv=3, search_type="grid", scoring="roc_auc",
        )
        best, best_params = search.best_estimator_, search.best_params_
    except Exception as exc:
        print(f"Hyperparameter tuning failed for {key} (error: {exc}). Falling back to default fit.")
        best = pipe.fit(X_train, y_train)
        # NOTE(review): in this fallback the recorded "best_params" are the fitted
        # object's full get_params() output, not tuned values.
        best_params = best.get_params() if hasattr(best, "get_params") else {}

    # Score the selected estimator on the held-out split.
    metrics = evaluate_model(best, X_test, y_test)
    print(f"Evaluation metrics for {key}:")
    pprint.pprint(metrics)

    # Save model locally; the same path is then handed to MLflow as an artifact.
    model_path = MODELS_DIR / f"{key}_best.pkl"
    model_path_str = str(model_path)
    save_model_local(best, model_path_str)

    # Log to MLflow (if configured)
    run_id = log_experiment_mlflow(
        name="PTV_modeling",
        estimator=best,
        params=best_params,
        metrics=metrics,
        X_train=X_train, X_test=X_test,
        artifacts={"model_pkl": model_path_str},
        model_save_path=model_path_str,
        register_name=f"PTV-{key}",
    )

    results.append(dict(
        model=key,
        best_params=best_params,
        metrics=metrics,
        run_id=run_id,
        model_path=model_path_str,
    ))


In [None]:

# ---------- Compare and report ----------
# Dump every per-model record collected by the training loop.
print("\n=== All results ===")
print(pprint.pformat(results))

# choose best by roc_auc (fallback to f1)
def score_for(r):
    """
    Ranking key for a result record: ROC-AUC, else F1, else negative infinity.

    Parameters
    ----------
    r : dict
        Result record with a "metrics" mapping (as appended to ``results``).

    Returns
    -------
    The "roc_auc" value when it is usable, otherwise the "f1" value when that
    is usable, otherwise ``float("-inf")``. A metric is unusable when it is
    absent, ``None`` or NaN, so an entry that could not be computed no longer
    blocks the fallback or breaks the ``max()`` comparison.
    """
    m = r["metrics"]
    for metric_name in ("roc_auc", "f1"):
        value = m.get(metric_name)
        # pd.isna is True for both None and NaN; 0.0 is a legitimate score and is kept.
        if not pd.isna(value):
            return value
    return float("-inf")

# Report the top-ranked record, or say so when nothing was trained.
if not results:
    print("No model results to compare.")
else:
    best_result = max(results, key=score_for)
    print("\nBest model overall (by ROC-AUC or fallback):")
    pprint.pprint(best_result)
    print(f"Best model saved at: {best_result['model_path']}")

print("\nModel training script complete.")