In [2]:
import os
import sys
import time
import random
import warnings
import collections
from dateutil.relativedelta import relativedelta
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
from sklearn.experimental import enable_hist_gradient_boosting, enable_halving_search_cv  
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, HalvingRandomSearchCV 
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, BaggingRegressor, StackingRegressor, HistGradientBoostingRegressor 

# from sklearn.linear_model import Ridge, Lasso, BayesianRidge, ElasticNet
from sklearn.preprocessing import OneHotEncoder

sys.path.append('../../src')
import cb_utils
import cb_model_utils

sns.set(style="darkgrid")
pd.options.display.max_columns = 500

%load_ext autoreload
%autoreload 2



### Load raw data

In [5]:
query = f"select * from junk.ml_training_samples_20250210;"
df = cb_utils.sql_query_to_df(query, use_cache=False, source='msh_analytics')

Pulling query from db


In [6]:
df.head()

Unnamed: 0,payer_id,period_number,member_id,pre_elg_days,age_ft,is_male_ft,is_female_ft,ip_tc_pre_pmpm_ft,ed_tc_pre_pmpm_ft,snf_tc_pre_pmpm_ft,icf_tc_pre_pmpm_ft,hh_tc_pre_pmpm_ft,out_tc_pre_pmpm_ft,pro_tc_pre_pmpm_ft,hcbs_tc_pre_pmpm_ft,sphs_tc_pre_pmpm_ft,amb_tc_pre_pmpm_ft,dme_tc_pre_pmpm_ft,hosp_tc_pre_pmpm_ft,dialysis_ddos_pre_pmpm_ft,pulmonar_ddos_pre_pmpm_ft,copd_ddos_pre_pmpm_ft,chf_ddos_pre_pmpm_ft,heart_ddos_pre_pmpm_ft,cancer_ddos_pre_pmpm_ft,ckd_ddos_pre_pmpm_ft,esrd_ddos_pre_pmpm_ft,hyperlipid_ddos_pre_pmpm_ft,diab_ddos_pre_pmpm_ft,alzh_ddos_pre_pmpm_ft,dementia_ddos_pre_pmpm_ft,neurocognitive_ddos_pre_pmpm_ft,stroke_ddos_pre_pmpm_ft,hypertension_ddos_pre_pmpm_ft,fall_ddos_pre_pmpm_ft,transplant_ddos_pre_pmpm_ft,liver_ddos_pre_pmpm_ft,hippfract_ddos_pre_pmpm_ft,depression_ddos_pre_pmpm_ft,psychosis_ddos_pre_pmpm_ft,drug_ddos_pre_pmpm_ft,alcohol_ddos_pre_pmpm_ft,paralysis_ddos_pre_pmpm_ft,hemophilia_ddos_pre_pmpm_ft,pressure_ulcer_ddos_pre_pmpm_ft,tbi_ddos_pre_pmpm_ft,obese_ddos_pre_pmpm_ft,post_elig_days,tc_tg,tc_pmpm_tg
0,81,19,20362,181,68.0,1,0,0.0,0.0,0.0,0.0,0.0,6.3,119.95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.17,0.0,0.0,0.0,0.0,0.0,0.17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,365,676.96,5.56
1,81,19,20363,181,90.0,0,1,2530.75,0.0,3756.4,0.0,387.25,0.0,434.1,0.0,0.0,100.83,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.83,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,365,27889.78,229.23
2,81,19,20368,181,76.0,0,1,0.0,0.0,0.0,0.0,1240.69,0.0,69.43,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.17,0.17,0.17,0.17,0.0,0.0,0.0,0.0,0.17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,365,13185.44,108.37
3,81,19,20371,181,73.0,1,0,0.0,0.0,0.0,0.0,0.0,1.21,123.45,0.0,0.0,0.0,7.05,0.0,0.0,0.0,0.0,0.33,1.49,0.0,0.0,0.0,0.99,0.0,0.0,0.0,0.0,0.17,0.33,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,365,31207.91,256.5
4,81,19,20374,181,71.0,1,0,0.0,0.0,0.0,0.0,583.64,0.0,46.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,365,4454.73,36.61


In [7]:
df.shape

(1765592, 50)

In [8]:
df.member_id.nunique()

146076

### Train test split

In [9]:
train_pct = 0.8
# Get unique member IDs and randomly split them
all_members = df.member_id.unique()
np.random.seed(42)
train_members = np.random.choice(all_members, size=int(len(all_members) * train_pct), replace=False)
test_members = np.setdiff1d(all_members, train_members)

# Split dataframes based on member lists
train_df = df[df.member_id.isin(train_members)].copy()
test_df = df[df.member_id.isin(test_members)].copy()

print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")
print(f"\nUnique members in train: {train_df.member_id.nunique()}")
print(f"Unique members in test: {test_df.member_id.nunique()}")



Train shape: (1411865, 50)
Test shape: (353727, 50)

Unique members in train: 116860
Unique members in test: 29216


In [10]:
# Get feature columns ending in _ft
feature_cols = [col for col in train_df.columns if col.endswith('_ft')]
target_col = 'tc_pmpm_tg'

# Create feature matrix X and target vector y
X_train = train_df[feature_cols]
y_train = train_df[target_col]
X_test = test_df[feature_cols]
y_test = test_df[target_col]

# Import required modules
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score

# Initialize scaler and model
scaler = StandardScaler()
model = LinearRegression()

# Setup k-fold cross validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = []

# Perform cross-validation
for fold, (train_idx, val_idx) in enumerate(kf.split(X_train)):
    # Split data
    X_fold_train = X_train.iloc[train_idx]
    y_fold_train = y_train.iloc[train_idx]
    X_fold_val = X_train.iloc[val_idx]
    y_fold_val = y_train.iloc[val_idx]
    
    # Scale features
    X_fold_train_scaled = scaler.fit_transform(X_fold_train)
    X_fold_val_scaled = scaler.transform(X_fold_val)
    
    # Train model
    model.fit(X_fold_train_scaled, y_fold_train)
    
    # Make predictions
    y_pred = model.predict(X_fold_val_scaled)
    
    # Calculate R2 score
    r2 = r2_score(y_fold_val, y_pred)
    cv_scores.append(r2)
    
    print(f"Fold {fold + 1} R² Score: {r2:.4f}")

print(f"\nMean CV R² Score: {np.mean(cv_scores):.4f}")
print(f"Std CV R² Score: {np.std(cv_scores):.4f}")

# Train final model on full training set
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model.fit(X_train_scaled, y_train)

# Get feature importance
feature_importance = pd.DataFrame({
    'feature': feature_cols,
    'coefficient': model.coef_
})
feature_importance['abs_coefficient'] = abs(feature_importance['coefficient'])
feature_importance = feature_importance.sort_values('abs_coefficient', ascending=False)

print("\nTop 10 Most Important Features:")
print(feature_importance.head(10))

# # Evaluate on test set
# y_pred_test = model.predict(X_test_scaled)
# test_r2 = r2_score(y_test, y_pred_test)
# test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))

# print(f"\nTest Set R² Score: {test_r2:.4f}")
# print(f"Test Set RMSE: {test_rmse:.4f}")


Fold 1 R² Score: 0.1867
Fold 2 R² Score: 0.1990
Fold 3 R² Score: 0.2046
Fold 4 R² Score: 0.1788
Fold 5 R² Score: 0.2042

Mean CV R² Score: 0.1946
Std CV R² Score: 0.0102

Top 10 Most Important Features:
                feature   coefficient  abs_coefficient
1            is_male_ft  4.439177e+11     4.439177e+11
2          is_female_ft  4.439177e+11     4.439177e+11
10  hcbs_tc_pre_pmpm_ft -2.854876e+08     2.854876e+08
6    icf_tc_pre_pmpm_ft -1.729264e+08     1.729264e+08
11  sphs_tc_pre_pmpm_ft  1.307570e+08     1.307570e+08
8    out_tc_pre_pmpm_ft  5.008530e+01     5.008530e+01
9    pro_tc_pre_pmpm_ft  2.495750e+01     2.495750e+01
7     hh_tc_pre_pmpm_ft  1.974131e+01     1.974131e+01
13   dme_tc_pre_pmpm_ft  1.693095e+01     1.693095e+01
0                age_ft  1.395207e+01     1.395207e+01
