In [2]:
import os
import sys
import time
import random
import warnings
import collections
from dateutil.relativedelta import relativedelta
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
from sklearn.experimental import enable_hist_gradient_boosting, enable_halving_search_cv  
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, HalvingRandomSearchCV 
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, BaggingRegressor, StackingRegressor, HistGradientBoostingRegressor 

# from sklearn.linear_model import Ridge, Lasso, BayesianRidge, ElasticNet
from sklearn.preprocessing import OneHotEncoder

# Make the project helper modules importable from this notebook. The entry is a
# relative path, so it is resolved against the kernel's current working
# directory when the imports below run.
# NOTE(review): re-running this cell appends the same entry again; harmless,
# but sys.path grows by one duplicate per re-run.
sys.path.append('../../src')
import cb_utils
import cb_model_utils

# Notebook-wide display settings: seaborn "darkgrid" plot style, and show up to
# 500 columns when a DataFrame is rendered.
# NOTE(review): seaborn documents `set` as an alias of `set_theme`, which is the
# preferred interface -- consider switching once the installed version is confirmed.
sns.set(style="darkgrid")
pd.options.display.max_columns = 500

# Load IPython's autoreload extension; mode 2 reloads modules before each cell
# executes, so edits to the helper modules above are picked up without a
# kernel restart.
%load_ext autoreload
%autoreload 2



### Load raw data

In [5]:
# Pull every column of the dated training-sample snapshot into `df`.
# The query has no interpolated values, so it is a plain string literal
# (the previous f-prefix had no placeholders to format).
# `use_cache=False` is passed to the project helper; the captured cell output
# ("Pulling query from db") suggests this goes to the database rather than a
# local cache -- presumably that is what the flag controls; confirm in cb_utils.
query = "select * from junk.ml_training_samples_20250210;"
df = cb_utils.sql_query_to_df(query, use_cache=False, source='msh_analytics')

Pulling query from db


In [6]:
# Preview the first five rows of the loaded frame (five is pandas' default,
# spelled out here for the reader).
# NOTE(review): the rendered output contains member-level rows; confirm that is
# acceptable to keep in a committed notebook.
df.head(n=5)

Unnamed: 0,payer_id,period_number,member_id,pre_elg_days,age_ft,is_male_ft,is_female_ft,ip_tc_pre_pmpm_ft,ed_tc_pre_pmpm_ft,snf_tc_pre_pmpm_ft,icf_tc_pre_pmpm_ft,hh_tc_pre_pmpm_ft,out_tc_pre_pmpm_ft,pro_tc_pre_pmpm_ft,hcbs_tc_pre_pmpm_ft,sphs_tc_pre_pmpm_ft,amb_tc_pre_pmpm_ft,dme_tc_pre_pmpm_ft,hosp_tc_pre_pmpm_ft,dialysis_ddos_pre_pmpm_ft,pulmonar_ddos_pre_pmpm_ft,copd_ddos_pre_pmpm_ft,chf_ddos_pre_pmpm_ft,heart_ddos_pre_pmpm_ft,cancer_ddos_pre_pmpm_ft,ckd_ddos_pre_pmpm_ft,esrd_ddos_pre_pmpm_ft,hyperlipid_ddos_pre_pmpm_ft,diab_ddos_pre_pmpm_ft,alzh_ddos_pre_pmpm_ft,dementia_ddos_pre_pmpm_ft,neurocognitive_ddos_pre_pmpm_ft,stroke_ddos_pre_pmpm_ft,hypertension_ddos_pre_pmpm_ft,fall_ddos_pre_pmpm_ft,transplant_ddos_pre_pmpm_ft,liver_ddos_pre_pmpm_ft,hippfract_ddos_pre_pmpm_ft,depression_ddos_pre_pmpm_ft,psychosis_ddos_pre_pmpm_ft,drug_ddos_pre_pmpm_ft,alcohol_ddos_pre_pmpm_ft,paralysis_ddos_pre_pmpm_ft,hemophilia_ddos_pre_pmpm_ft,pressure_ulcer_ddos_pre_pmpm_ft,tbi_ddos_pre_pmpm_ft,obese_ddos_pre_pmpm_ft,post_elig_days,tc_tg,tc_pmpm_tg
0,81,19,20362,181,68.0,1,0,0.0,0.0,0.0,0.0,0.0,6.3,119.95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.17,0.0,0.0,0.0,0.0,0.0,0.17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,365,676.96,5.56
1,81,19,20363,181,90.0,0,1,2530.75,0.0,3756.4,0.0,387.25,0.0,434.1,0.0,0.0,100.83,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.83,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,365,27889.78,229.23
2,81,19,20368,181,76.0,0,1,0.0,0.0,0.0,0.0,1240.69,0.0,69.43,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.17,0.17,0.17,0.17,0.0,0.0,0.0,0.0,0.17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,365,13185.44,108.37
3,81,19,20371,181,73.0,1,0,0.0,0.0,0.0,0.0,0.0,1.21,123.45,0.0,0.0,0.0,7.05,0.0,0.0,0.0,0.0,0.33,1.49,0.0,0.0,0.0,0.99,0.0,0.0,0.0,0.0,0.17,0.33,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,365,31207.91,256.5
4,81,19,20374,181,71.0,1,0,0.0,0.0,0.0,0.0,583.64,0.0,46.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,365,4454.73,36.61


In [7]:
# (row count, column count) of the frame loaded above.
df.shape

(1765592, 50)

In [8]:
# Count distinct members in the frame.
# NOTE(review): in the run captured in this notebook there are 1,765,592 rows
# for 146,076 members (about 12 rows per member), so rows are not one-per-member.
# If a later train/test split is row-wise, the same member can land on both
# sides -- confirm whether a member-grouped split is needed.
df["member_id"].nunique()

146076