In [None]:
from pathlib import Path
import pickle
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.utils.class_weight import compute_class_weight


In [None]:
class ModelEvaluation:

    def __init__(self, target_vars: list[str], model_specs: dict, random_state: int = 42):
        """
        target_vars: list of target column names
        model_specs: dict with keys as model names, values as dicts with:
                     - 'model': a scikit-learn estimator class or a callable returning an instance
                     - 'param_grid': dict of hyperparameter grid for grid search
        Example:
        model_specs = {
            'logreg': {'model': LogisticRegression, 'param_grid': {...}},
            'rf': {'model': RandomForestClassifier, 'param_grid': {...}}
        }
        """
        self.target_vars = target_vars
        self.model_specs = model_specs
        self.random_state = random_state
        self.best_params = {m: {t: None for t in target_vars} for m in model_specs}
        self.models = {m: {t: None for t in target_vars} for m in model_specs}
        self.cv_metrics = {}
        self.metrics_report = {}

    def load_data(self, df: pd.DataFrame, feature_cols: list[str]):
        self.df = df.copy()
        self.feature_cols = feature_cols

    def find_best_hyperparameters(
        self,
        param_grid: dict | None = None,
        cv_folds: int = 3,
        max_iter: int = 1000
    ):
        """
        Finds best hyperparameters using grid search CV for each
        target and stores them in self.best_params.
        """
        for model_name, spec in self.model_specs.items():

            ModelClass = spec['model']
            param_grid = spec['param_grid']

            for target in self.target_vars:
                print(f"\n[{model_name}] Hyperparameter tuning for target: {target}")
                X = self.df[self.feature_cols].values
                y = self.df[target].values
                fit_params = {}
                classes = np.unique(y)
                if 'class_weight' in param_grid or hasattr(ModelClass(), 'class_weight'):
                    class_weights = compute_class_weight('balanced', classes=classes, y=y)
                    cw_dict = {cls: w for cls, w in zip(classes, class_weights)}
                    fit_params['class_weight'] = cw_dict
                if 'random_state' in ModelClass().get_params():
                    fit_params['random_state'] = self.random_state
                if 'n_jobs' in ModelClass().get_params():
                    fit_params['n_jobs'] = -1
                if 'max_iter' in ModelClass().get_params():
                    fit_params['max_iter'] = max_iter
                model = ModelClass(**fit_params)
                skf = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=self.random_state)
                gs = GridSearchCV(
                    estimator=model,
                    param_grid=param_grid,
                    cv=skf,
                    scoring='roc_auc',
                    n_jobs=-1
                )
                gs.fit(X, y)
                self.best_params[model_name][target] = gs.best_params_
                print(f"Best params for {model_name} / {target}: {gs.best_params_}")

    def kfold_cv_with_best_params(self, k: int = 5, max_iter: int = 500):
        results = {}
        for model_name, spec in self.model_specs.items():
            ModelClass = spec['model']
            for target in self.target_vars:
                print(f"\n[{model_name}] {str(k)}-fold CV for target: {target}")
                X = self.df[self.feature_cols].values
                y = self.df[target].values
                aucs = []
                skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=self.random_state)
                # Try to inject class_weight and other params if possible
                classes = np.unique(y)
                fit_params = {}
                if 'class_weight' in self.best_params[model_name][target] or hasattr(ModelClass(), 'class_weight'):
                    class_weights = compute_class_weight('balanced', classes=classes, y=y)
                    cw_dict = {cls: w for cls, w in zip(classes, class_weights)}
                    fit_params['class_weight'] = cw_dict
                if 'random_state' in ModelClass().get_params():
                    fit_params['random_state'] = self.random_state
                if 'n_jobs' in ModelClass().get_params():
                    fit_params['n_jobs'] = -1
                if 'max_iter' in ModelClass().get_params():
                    fit_params['max_iter'] = max_iter
                best_params = self.best_params[model_name][target]
                for train_idx, test_idx in skf.split(X, y):
                    X_train, X_test = X[train_idx], X[test_idx]
                    y_train, y_test = y[train_idx], y[test_idx]
                    params = dict(best_params, **fit_params)
                    model = ModelClass(**params)
                    model.fit(X_train, y_train)
                    y_pred_proba = model.predict_proba(X_test)[:, 1]
                    auc = roc_auc_score(y_test, y_pred_proba)
                    aucs.append(auc)
                # Save
                results.setdefault(model_name, {})
                results[model_name][target] = aucs
                self.metrics_report.setdefault(model_name, {})
                self.metrics_report[model_name][target] = {
                    'mean_auc': np.mean(aucs),
                    'std_auc': np.std(aucs),
                    'all_aucs': aucs,
                    'best_params': best_params
                }
                print(f"{model_name}/{target}: mean AUC={np.mean(aucs):.3f}, std={np.std(aucs):.3f}")
        self.cv_metrics = results

    def fit_and_store_models(self, out_folder: Path, max_iter: int = 500):
        out_folder.mkdir(parents=True, exist_ok=True)
        for model_name, spec in self.model_specs.items():
            ModelClass = spec['model']
            for target in self.target_vars:
                X = self.df[self.feature_cols].values
                y = self.df[target].values
                best_params = self.best_params[model_name][target]
                classes = np.unique(y)
                fit_params = {}
                if 'class_weight' in best_params or hasattr(ModelClass(), 'class_weight'):
                    class_weights = compute_class_weight('balanced', classes=classes, y=y)
                    cw_dict = {cls: w for cls, w in zip(classes, class_weights)}
                    fit_params['class_weight'] = cw_dict
                if 'random_state' in ModelClass().get_params():
                    fit_params['random_state'] = self.random_state
                if 'n_jobs' in ModelClass().get_params():
                    fit_params['n_jobs'] = -1
                if 'max_iter' in ModelClass().get_params():
                    fit_params['max_iter'] = max_iter
                params = dict(best_params, **fit_params)
                model = ModelClass(**params)
                model.fit(X, y)
                self.models[model_name][target] = model
                model_path = out_folder / f"{model_name}_{target}.pkl"
                with open(model_path, "wb") as f:
                    pickle.dump(model, f)
                print(f"Saved model for {model_name}/{target} to {model_path}")

        # Save metrics and hyperparameters as pickle and CSV for convenience
        metrics_path = out_folder / "cv_metrics_report.pkl"
        with open(metrics_path, "wb") as f:
            pickle.dump(self.metrics_report, f)
        print(f"Saved metrics report to {metrics_path}")

        # Also save as readable CSV
        summary_rows = []
        for model_name, model_results in self.metrics_report.items():
            for target, d in model_results.items():
                summary_rows.append({
                    'model': model_name,
                    'target': target,
                    'mean_auc': d['mean_auc'],
                    'std_auc': d['std_auc'],
                    'best_params': d['best_params']
                })
        pd.DataFrame(summary_rows).to_csv(out_folder / "cv_metrics_report.csv", index=False)
        print(f"Saved metrics summary to {str(out_folder / 'cv_metrics_report.csv')}")

        best_params_path = out_folder, "best_hyperparameters.pkl"
        with open(best_params_path, "wb") as f:
            pickle.dump(self.best_params, f)
        print(f"Saved best hyperparameters to {best_params_path}")

In [None]:
# Paths a reader is expected to change.
DATA_PATH = Path("your_data.csv")
OUTPUT_DIR = Path("./multimodel_eval")  # a Path, because the class calls .mkdir() on it

# Target columns are declared once; every other column is treated as a feature.
target_vars = ['target1', 'target2', 'target3', 'target4']

model_specs = {
    'logreg': {
        'model': LogisticRegression,
        # A list of grids: l1_ratio is only meaningful for the elasticnet
        # penalty, so it is not crossed with l1/l2 (avoids redundant saga fits).
        'param_grid': [
            {
                'C': [0.01, 0.1, 1, 10],
                'penalty': ['l1', 'l2'],
                'solver': ['saga'],
            },
            {
                'C': [0.01, 0.1, 1, 10],
                'penalty': ['elasticnet'],
                'solver': ['saga'],
                'l1_ratio': [0.1, 0.5, 0.9],
            },
        ]
    },
    'rf': {
        'model': RandomForestClassifier,
        'param_grid': {
            'n_estimators': [100, 300],
            'max_depth': [None, 10, 30],
            'min_samples_split': [2, 5],
            'min_samples_leaf': [1, 2],
            'class_weight': ['balanced', 'balanced_subsample'],
        }
    }
}

df = pd.read_csv(DATA_PATH)
feature_cols = [col for col in df.columns if col not in target_vars]

meval = ModelEvaluation(target_vars, model_specs)
meval.load_data(df, feature_cols)

# 1. Find best hyperparameters for each model and target
meval.find_best_hyperparameters()

# 2. k-fold CV for each model and target using best params
meval.kfold_cv_with_best_params(k=5)

# 3. Fit and store final models, metrics, and best params
meval.fit_and_store_models(out_folder=OUTPUT_DIR)