## Setup (env, Spark, load features)

In [1]:
import os
from dotenv import load_dotenv, find_dotenv
from pyspark.sql import SparkSession

In [2]:
# 1) Env & paths
load_dotenv(find_dotenv(usecwd=True), override=True)
DATA_DIR = os.getenv("DATA_DIR")
assert DATA_DIR and os.path.isdir(DATA_DIR), f"DATA_DIR invalid: {DATA_DIR}"

# 2) Spark (used only to load parquet and convert to pandas)
spark = SparkSession.builder.appName("HealthClaims_TrainXGB").getOrCreate()

# 3) Load features saved by 02_label_features
feat_dir = os.path.abspath(os.path.join(DATA_DIR, "..", "processed", "features_parquet"))
features = spark.read.parquet(feat_dir).cache()
features.createOrReplaceTempView("features_v0")

print("Features rows:", features.count())
features.printSchema()
features.limit(5).show(truncate=False)


Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/10/28 16:46:01 WARN Utils: Your hostname, JINUTSA, resolves to a loopback address: 127.0.1.1; using 10.4.8.103 instead (on interface enp37s0f0)
25/10/28 16:46:01 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/10/28 16:46:01 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/10/28 16:46:02 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


Features rows: 1163
root
 |-- patient_id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- race: string (nullable = true)
 |-- ethnicity: string (nullable = true)
 |-- age_at_index: decimal(13,0) (nullable = true)
 |-- index_date: date (nullable = true)
 |-- last_enc_date: date (nullable = true)
 |-- n_encounters: long (nullable = true)
 |-- n_conditions: long (nullable = true)
 |-- n_procedures: long (nullable = true)
 |-- n_medications: long (nullable = true)
 |-- n_observations: long (nullable = true)
 |-- n_claims: long (nullable = true)
 |-- hist_total_cost: double (nullable = true)
 |-- n_unique_providers: long (nullable = true)
 |-- n_unique_departments: long (nullable = true)
 |-- n_claims_with_diag: long (nullable = true)
 |-- claim_span_days: integer (nullable = true)
 |-- cost_next_window: double (nullable = true)
 |-- label: integer (nullable = true)

+------------------------------------+------+-----+-----------+------------+----------+-------------+----

## Time-based split (index_date) & pandas conversion

In [4]:
import pandas as pd
from pyspark.sql import functions as F

# Time split (mimics production): train on earlier index dates, test on later ones
CUT = "2020-10-15"
train_df = spark.sql(f"SELECT * FROM features_v0 WHERE index_date < DATE('{CUT}')")
test_df  = spark.sql(f"SELECT * FROM features_v0 WHERE index_date >= DATE('{CUT}')")

print("Train rows:", train_df.count(), " Test rows:", test_df.count())

# Columns we’ll use (v0 features)
CAT = ["gender","race","ethnicity"]
NUM = [
    "age_at_index", "n_encounters", "n_conditions", "n_procedures", "n_medications",
    "n_observations", "n_claims", "hist_total_cost",
    "n_unique_providers", "n_unique_departments", "claim_span_days"
]
TARGET = "label"

train_pd = train_df.select(*(CAT + NUM + [TARGET])).toPandas()
test_pd  = test_df.select(*(CAT + NUM + [TARGET])).toPandas()

# Sanity on class balance (train/test)
print("Train label counts:\n", train_pd[TARGET].value_counts(dropna=False))
print("Test  label counts:\n", test_pd[TARGET].value_counts(dropna=False))


Train rows: 914  Test rows: 249
Train label counts:
 label
0    662
1    252
Name: count, dtype: int64
Test  label counts:
 label
0    152
1     97
Name: count, dtype: int64


## One-hot encoding, alignment, and class weight

In [5]:
# Categoricals → category dtype
for c in CAT:
    train_pd[c] = train_pd[c].astype("category")
    test_pd[c]  = test_pd[c].astype("category")

# One-hot (drop_first avoids perfect multicollinearity)
X_train = pd.get_dummies(train_pd[CAT + NUM], drop_first=True)
X_test  = pd.get_dummies(test_pd[CAT + NUM], drop_first=True)

# Align columns (add missing columns to test, in same order as train)
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

y_train = train_pd[TARGET].astype(int).values
y_test  = test_pd[TARGET].astype(int).values

# Optional: drop any constant columns (rare but possible)
const_cols = [c for c in X_train.columns if X_train[c].nunique() <= 1]
if const_cols:
    X_train = X_train.drop(columns=const_cols)
    X_test  = X_test.drop(columns=const_cols)
    print("Dropped constant columns:", const_cols)

# Compute scale_pos_weight to handle imbalance: (#neg / #pos) on train
import numpy as np
pos = np.sum(y_train == 1)
neg = np.sum(y_train == 0)
scale_pos_weight = (neg / pos) if pos > 0 else 1.0
print(f"scale_pos_weight (neg/pos): {scale_pos_weight:.2f} (neg={neg}, pos={pos})")

# Keep metadata for inference
feature_columns = X_train.columns.tolist()
cat_snapshot = {c: list(train_pd[c].cat.categories) for c in CAT}


scale_pos_weight (neg/pos): 2.63 (neg=662, pos=252)


## Logistic Regression with MLflow

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.metrics import (
    roc_auc_score, average_precision_score,
    precision_score, recall_score, f1_score,
    confusion_matrix, ConfusionMatrixDisplay
)
from sklearn.preprocessing import StandardScaler
from mlflow.tracking import MlflowClient
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import os
import mlflow

# -------------------- Scaling --------------------
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# -------------------- MLflow Setup --------------------
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
mlruns_clean_path = os.path.join(project_root, "mlruns_clean")
os.makedirs(mlruns_clean_path, exist_ok=True)
mlflow.set_tracking_uri(f"file://{mlruns_clean_path}")
print("Tracking URI set to:", mlflow.get_tracking_uri())

experiment_name = "health_claims_highcost_logreg"
client = MlflowClient()
if not client.get_experiment_by_name(experiment_name):
    client.create_experiment(name=experiment_name)
mlflow.set_experiment(experiment_name)

# -------------------- Cross-validation setup --------------------
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

C_values = [0.01, 0.1, 1.0, 10.0]

# -------------------- Run experiments --------------------
for i, C in enumerate(C_values):
    with mlflow.start_run(run_name=f"logreg_run_cv_{i}"):
        mlflow.set_tag("mlflow.runName", f"logreg_run_cv_{i}")
        mlflow.log_param("C", C)
        mlflow.log_param("class_weight", "balanced")

        model = LogisticRegression(
            C=C,
            class_weight="balanced",
            max_iter=1000,
            solver="lbfgs",
            random_state=42
        )

        # Cross-validation with added metrics
        scoring = {
            "roc_auc": "roc_auc",
            "pr_auc": "average_precision",
            "precision": "precision",
            "recall": "recall",
            "f1": "f1"
        }

        scores = cross_validate(
            model,
            X_train_scaled, y_train,
            scoring=scoring,
            cv=cv,
            return_train_score=False
        )

        # Log CV metrics
        for metric in scoring.keys():
            mlflow.log_metric(f"cv_{metric}", scores[f"test_{metric}"].mean())

        # -------------------- Fit and evaluate on test set --------------------
        model.fit(X_train_scaled, y_train)
        proba_test = model.predict_proba(X_test_scaled)[:, 1]
        pred_test = (proba_test >= 0.5).astype(int)

        roc_auc = roc_auc_score(y_test, proba_test)
        pr_auc = average_precision_score(y_test, proba_test)
        precision = precision_score(y_test, pred_test)
        recall = recall_score(y_test, pred_test)
        f1 = f1_score(y_test, pred_test)

        mlflow.log_metric("test_roc_auc", roc_auc)
        mlflow.log_metric("test_pr_auc", pr_auc)
        mlflow.log_metric("test_precision", precision)
        mlflow.log_metric("test_recall", recall)
        mlflow.log_metric("test_f1", f1)

        print(f"\nC={C}")
        print(f"Test Precision: {precision:.3f}")
        print(f"Test Recall:    {recall:.3f}")
        print(f"Test F1:        {f1:.3f}")
        print(f"Test ROC-AUC:   {roc_auc:.3f}")
        print(f"Test PR-AUC:    {pr_auc:.3f}")

        # -------------------- Confusion Matrix --------------------
        fig, ax = plt.subplots(figsize=(4, 4))
        ConfusionMatrixDisplay.from_predictions(y_test, pred_test, ax=ax)
        plt.title(f"Confusion Matrix (C={C})")
        cm_path = f"conf_matrix_C{C}.png"
        plt.savefig(cm_path)
        mlflow.log_artifact(cm_path)
        plt.close()

print("\n All runs complete. Check MLflow UI or printed results for best precision/F1.")


Tracking URI set to: file:///home/utsajinlab/health_claims_ml/notebooks/mlruns_clean

C=0.01
Test Precision: 0.509
Test Recall:    0.557
Test F1:        0.532
Test ROC-AUC:   0.638
Test PR-AUC:    0.512

C=0.1
Test Precision: 0.531
Test Recall:    0.536
Test F1:        0.533
Test ROC-AUC:   0.627
Test PR-AUC:    0.522

C=1.0
Test Precision: 0.532
Test Recall:    0.515
Test F1:        0.524
Test ROC-AUC:   0.624
Test PR-AUC:    0.527

C=10.0
Test Precision: 0.533
Test Recall:    0.505
Test F1:        0.519
Test ROC-AUC:   0.625
Test PR-AUC:    0.529

 All runs complete. Check MLflow UI or printed results for best precision/F1.


In [7]:
from mlflow.tracking import MlflowClient
import pandas as pd

# -------------------- Load experiment data --------------------
experiment_name = "health_claims_highcost_logreg"
client = MlflowClient()
experiment = client.get_experiment_by_name(experiment_name)

if experiment is None:
    raise ValueError(f"Experiment '{experiment_name}' not found. Check MLflow tracking URI.")

runs = client.search_runs(
    experiment_ids=[experiment.experiment_id],
    order_by=["metrics.test_f1 DESC"]
)

# -------------------- Extract relevant info --------------------
records = []
for run in runs:
    metrics = run.data.metrics
    params = run.data.params

    record = {
        "run_name": run.info.run_name,
        "C": params.get("C"),
        "class_weight": params.get("class_weight"),
        "cv_precision": metrics.get("cv_precision"),
        "cv_recall": metrics.get("cv_recall"),
        "cv_f1": metrics.get("cv_f1"),
        "test_precision": metrics.get("test_precision"),
        "test_recall": metrics.get("test_recall"),
        "test_f1": metrics.get("test_f1"),
        "test_roc_auc": metrics.get("test_roc_auc"),
        "test_pr_auc": metrics.get("test_pr_auc")
    }
    records.append(record)

df_results = pd.DataFrame(records)

# -------------------- Sort and format --------------------
df_results = df_results.sort_values(by="test_f1", ascending=False).reset_index(drop=True)
df_results = df_results.round(3)

# Display neatly
print("\n===== Logistic Regression MLflow Results (sorted by test F1) =====")
display(df_results)



===== Logistic Regression MLflow Results (sorted by test F1) =====


Unnamed: 0,run_name,C,class_weight,cv_precision,cv_recall,cv_f1,test_precision,test_recall,test_f1,test_roc_auc,test_pr_auc
0,logreg_run_cv_1,0.1,balanced,0.458,0.619,0.526,0.531,0.536,0.533,0.627,0.522
1,logreg_run_cv_0,0.01,balanced,0.447,0.602,0.512,0.509,0.557,0.532,0.638,0.512
2,logreg_run_cv_2,1.0,balanced,0.45,0.607,0.517,0.532,0.515,0.524,0.624,0.527
3,logreg_run_cv_3,10.0,balanced,0.44,0.591,0.504,0.533,0.505,0.519,0.625,0.529
