In [None]:
import os
import sys
import time
import random
import collections
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, BaggingRegressor, StackingRegressor
from sklearn.linear_model import Ridge, Lasso, BayesianRidge, ElasticNet

# Put the project's source directory on the import path so the local
# helper module can be imported from this notebook.
sys.path.append('../src')
import cb_utils

# Plot theme, and let pandas display up to 500 columns of wide frames.
sns.set(style="darkgrid")
pd.set_option('display.max_columns', 500)

# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (e.g. cb_utils) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# configuration
scoring_run_id = 1
use_cache = False      # forwarded to cb_utils.sql_query_to_df
test_set_pct = 0.2     # fraction of members held out in every train/test split

# Draw a base seed for this run and report it. The `random` module is then
# seeded with it so that the per-split seeds drawn later with
# random.randint() are a deterministic function of the printed value;
# to reproduce a run, set `seed` to the printed number and re-run.
seed = random.randint(0, 100)
random.seed(seed)
print(f'Seed: {seed}')

In [None]:
# Load data: pull the training view once per MAB id and replace missing
# values with 0 in each resulting frame.
mab_queries = (
    ("mab4", "SELECT * FROM cb.vw_mab_training_data WHERE mab_id = 4;"),  # 2018
    ("mab3", "SELECT * FROM cb.vw_mab_training_data WHERE mab_id = 3;"),  # 2017
)

loaded_frames = {}
for frame_name, query in mab_queries:
    frame = cb_utils.sql_query_to_df(query, use_cache=use_cache)
    frame.fillna(0, inplace=True)
    loaded_frames[frame_name] = frame

mab4 = loaded_frames["mab4"]
mab3 = loaded_frames["mab3"]

In [None]:
# Candidate columns that are passed to cb_utils.features_annual() when the
# feature frames are built further down ("mom" presumably stands for
# month-over-month -- TODO confirm). Commented-out entries are excluded.
# NOTE(review): the grp_*_days entries skip grp_4_days -- confirm intentional.
mom_feature_columns = [
#     'lob_at_id'
#   , 'grp_at_id'
    'lob_1_days'
  , 'lob_2_days'
  , 'lob_3_days'
  , 'grp_1_days'
  , 'grp_2_days'
  , 'grp_3_days'
  , 'grp_5_days'
  , 'grp_6_days'
  , 'grp_7_days'
  , 'grp_8_days'
  , 'unaligned_days'
#   , 'is_unaligned'
  , 'tc'
  , 'hcbs_tc'
  , 'icf_tc'
  , 'ip_tc'
  , 'rx_tc'
  , 'ed_tc'
  , 'snf_tc'
  , 'out_tc'
  , 'pro_tc'
  , 'spfac_tc'
  , 'amb_tc'
  , 'hh_tc'
  , 'hosp_tc'
  , 'oth_tc'
  , 'p_mm'
  , 'mm'
  , 'hcbs_respite_tc'
  , 'hcbs_fam_care_stip_tc'
  , 'hcbs_com_trans_tc'
  , 'hcbs_educ_train_tc'
  , 'hcbs_com_liv_fam_tc'
  , 'hcbs_com_liv_tc'
  , 'hcbs_attend_care_tc'
  , 'hcbs_com_trans_waiv_tc'
  , 'hcbs_home_meal_tc'
  , 'hcbs_pers_care_tc'
  , 'hcbs_ther_behav_tc'
  , 'hcbs_unsk_respite_tc'
  , 'hcbs_waiv_svc_tc'
  , 'ddos'
  , 'hcbs_ddos'
  , 'icf_ddos'
  , 'ip_ddos'
  , 'rx_ddos'
  , 'ed_ddos'
  , 'snf_ddos'
  , 'out_ddos'
  , 'pro_ddos'
  , 'spfac_ddos'
  , 'amb_ddos'
  , 'hh_ddos'
  , 'hosp_ddos'
  , 'oth_ddos'
  , 'pcp_ddos'
  , 'chf_ddos'
  , 'heart_ddos'
  , 'copd_ddos'
  , 'pulmonar_ddos'
  , 'cancer_ddos'
  , 'ckd_ddos'
  , 'esrd_ddos'
  , 'hyperlipid_ddos'
  , 'diab_ddos'
  , 'alzh_ddos'
  , 'dementia_ddos'
  , 'stroke_ddos'
  , 'hypertension_ddos'
  , 'fall_ddos'
  , 'transplant_ddos'
  , 'liver_ddos'
  , 'hippfract_ddos'
  , 'depression_ddos'
  , 'psychosis_ddos'
  , 'drug_ddos'
  , 'alcohol_ddos'
  , 'paralysis_ddos'
]
# Candidate "_ft" columns. When the feature frames are built further down,
# these are read from rows with period < 0 and reduced to the first row per
# member_id. Commented-out entries are excluded.
annual_feature_columns = [
#   , 'lvl_ft'
#   , 'is_unaligned_ft'
    'unaligned_mm_ft'
  , 'is_self_directed_ft'
  , 'is_cat0_ft'
  , 'is_cat1_ft'
  , 'is_cat2_ft'
  , 'is_cat3_ft'
  , 'is_cat4_ft'
  , 'is_lob1_ft'
  , 'is_lob2_ft'
  , 'is_lob3_ft'
  , 'is_grp1_ft'
  , 'is_grp2_ft'
  , 'is_grp3_ft'
  , 'is_grp45678_ft'
  , 'sav_pct_ft'
  , 'raf_sav_pct_ft'
  , 'ds_sav_pct_ft'
  , 'ip_sav_pct_ft'
  , 'snf_sav_pct_ft'
  , 'icf_sav_pct_ft'
  , 'ed_sav_pct_ft'
  , 'hh_sav_pct_ft'
  , 'pro_sav_pct_ft'
  , 'out_sav_pct_ft'
  , 'savings_ft'
  , 'raf_savings_ft'
  , 'ds_savings_ft'
  , 'ip_savings_ft'
  , 'snf_savings_ft'
  , 'icf_savings_ft'
  , 'ed_savings_ft'
  , 'hh_savings_ft'
  , 'pro_savings_ft'
  , 'out_savings_ft'
  , 'tc_ft'
  , 'hcbs_atd_pcs_tc_ft'
  , 'ip_tc_ft'
  , 'snf_tc_ft'
  , 'icf_tc_ft'
  , 'ed_tc_ft'
  , 'hh_tc_ft'
  , 'pro_tc_ft'
  , 'out_tc_ft'
  , 'savings_pmpm_ft'
  , 'raf_sav_pmpm_ft'
  , 'ds_sav_pmpm_ft'
  , 'ip_sav_pmpm_ft'
  , 'snf_sav_pmpm_ft'
  , 'icf_sav_pmpm_ft'
  , 'ed_sav_pmpm_ft'
  , 'hh_sav_pmpm_ft'
  , 'pro_sav_pmpm_ft'
  , 'out_sav_pmpm_ft'
  , 'tc_pmpm_ft'
  , 'hcbs_attd_pmpm_ft'
  , 'ip_pmpm_ft'
  , 'snf_pmpm_ft'
  , 'icf_pmpm_ft'
  , 'ed_pmpm_ft'
  , 'hh_pmpm_ft'
  , 'pro_pmpm_ft'
  , 'out_pmpm_ft'
  , 'mm_ft'

]
# Column holding the regression target, and the lvl_tgt value used to
# filter the training rows.
target_col = 'savings_tgt'
lvl = '2. m'

# Full candidate set: the mom columns followed by the annual columns.
feature_columns = [*mom_feature_columns, *annual_feature_columns]
n_potential_features = len(feature_columns)
print(f'N Potential Features: {n_potential_features}')

### Features pulled from feature selection in a separate notebook
`bjp_multi_model_v4_feature_selection_chf_copd_added_in.ipynb`

In [None]:
# Features chosen by the feature-selection notebook.
# NOTE: per the original author, this selected subset actually performs
# worse than using all candidate features.
feature_cols = ['p_mm', 'alzh_ddos', 'amb_ddos', 'amb_tc', 'cancer_ddos', 'chf_ddos', 'copd_ddos', 'ddos', 'dementia_ddos', 'depression_ddos', 'diab_ddos', 'ds_sav_pct_ft', 'ds_sav_pmpm_ft', 'ds_savings_ft', 'ed_ddos', 'ed_sav_pct_ft', 'ed_sav_pmpm_ft', 'ed_savings_ft', 'ed_tc', 'esrd_ddos', 'fall_ddos', 'grp_1_days', 'hcbs_attd_pmpm_ft', 'hcbs_attend_care_tc', 'hcbs_com_liv_fam_tc', 'hcbs_com_trans_waiv_tc', 'hcbs_ddos', 'hcbs_home_meal_tc', 'hcbs_tc', 'hcbs_unsk_respite_tc', 'heart_ddos', 'hh_ddos', 'hh_sav_pct_ft', 'hh_sav_pmpm_ft', 'hh_savings_ft', 'hh_tc', 'hh_tc_ft', 'hosp_ddos', 'hyperlipid_ddos', 'hypertension_ddos', 'icf_ddos', 'icf_sav_pct_ft', 'icf_sav_pmpm_ft', 'icf_savings_ft', 'icf_tc', 'ip_ddos', 'ip_sav_pct_ft', 'ip_sav_pmpm_ft', 'ip_savings_ft', 'is_cat3_ft', 'is_cat4_ft', 'is_grp1_ft', 'is_grp2_ft', 'is_lob3_ft', 'is_self_directed_ft', 'liver_ddos', 'mm', 'oth_ddos', 'out_ddos', 'out_sav_pct_ft', 'out_sav_pmpm_ft', 'out_savings_ft', 'out_tc', 'pcp_ddos', 'pro_ddos', 'pro_sav_pct_ft', 'pro_sav_pmpm_ft', 'pro_savings_ft', 'psychosis_ddos', 'pulmonar_ddos', 'rx_ddos', 'sav_pct_ft', 'savings_ft', 'savings_pmpm_ft', 'snf_pmpm_ft', 'snf_sav_pct_ft', 'snf_sav_pmpm_ft', 'snf_savings_ft', 'snf_tc', 'stroke_ddos', 'unaligned_mm_ft', 'ckd_ddos', 'is_cat0_ft', 'is_lob2_ft', 'mm_ft', 'snf_ddos']

# Fail loudly if a selected name is in neither candidate list: the two
# filters below would otherwise drop it without any warning.
known_features = set(mom_feature_columns) | set(annual_feature_columns)
unknown_features = [c for c in feature_cols if c not in known_features]
assert not unknown_features, f'Selected features missing from candidate lists: {unknown_features}'

# Split the selection into the two groups, keeping the order of feature_cols.
mom_feats = [c for c in feature_cols if c in mom_feature_columns]
annual_feats = [c for c in feature_cols if c in annual_feature_columns]

In [None]:
# Build the training features/targets from the mab 4 (2018) data, restricted
# to rows at the configured target level.
df = mab4.loc[mab4.lvl_tgt == lvl]

# One target value per member: the first row of each member_id group.
targets_df = df[['member_id', target_col]].groupby('member_id').first()

# Swap in the two commented lines below to train on all candidate features
# instead of the selected subset.
# features_df = cb_utils.features_annual(df, mom_feature_columns)
# pre_annual = df.query("period < 0")[annual_feature_columns + ['member_id']].groupby('member_id').first()
features_df = cb_utils.features_annual(df, mom_feats)
pre_annual = df.query("period < 0")[annual_feats + ['member_id']].groupby('member_id').first()

# Index-on-index merge (pandas' default inner join), so a member missing
# from either side is dropped here. Remaining gaps are filled with 0.
features_df = features_df.merge(pre_annual, left_index=True, right_index=True)
features_df.fillna(0, inplace=True)

# Order features and targets by member id and make sure they line up exactly.
# Index.equals compares the indexes element by element; summing the index
# differences could cancel to 0 for misaligned ids and fails with an
# unrelated error when the lengths differ or the ids are not numeric.
features_df.sort_index(inplace=True)
targets_df.sort_index(inplace=True)
assert targets_df.index.equals(features_df.index), \
    'features_df and targets_df are not aligned on member_id'

# train test split

# Compare lift performance of lasso, ridge, elastic net, and Bayesian ridge regression

In [None]:
# lasso: 1000 models, each fit on its own random train/test split; the lift
# recorded per model is rule_misses - pred_misses from cb_utils.get_miss_ided.
# NOTE: `normalize=` was deprecated in scikit-learn 1.0 and removed in 1.2.
l_models = [Lasso(alpha=1.0, normalize=True) for _ in range(1000)]
l_lifts = []

for model in l_models:
    seed = random.randint(0, 10000)  # fresh split seed for this model
    split = train_test_split(
        features_df,
        targets_df[target_col],
        test_size=test_set_pct,
        random_state=seed,
    )
    X_train, X_test, y_train, y_test = split
    preds = model.fit(X_train, y_train).predict(X_test)
    pred_misses, rule_misses = cb_utils.get_miss_ided(X_test, y_test, preds, verbose=False)
    l_lifts.append(rule_misses - pred_misses)

In [None]:
# ridge: 1000 models, each fit on its own random train/test split; the lift
# recorded per model is rule_misses - pred_misses from cb_utils.get_miss_ided.
# NOTE: `normalize=` was deprecated in scikit-learn 1.0 and removed in 1.2.
r_models = [Ridge(alpha=1.0, normalize=True) for _ in range(1000)]
r_lifts = []

for model in r_models:
    seed = random.randint(0, 10000)  # fresh split seed for this model
    split = train_test_split(
        features_df,
        targets_df[target_col],
        test_size=test_set_pct,
        random_state=seed,
    )
    X_train, X_test, y_train, y_test = split
    preds = model.fit(X_train, y_train).predict(X_test)
    pred_misses, rule_misses = cb_utils.get_miss_ided(X_test, y_test, preds, verbose=False)
    r_lifts.append(rule_misses - pred_misses)

In [None]:
# mixed: 500 ridge models followed by 500 lasso models, each fit on its own
# random train/test split; the lift recorded per model is
# rule_misses - pred_misses from cb_utils.get_miss_ided.
# NOTE: `normalize=` was deprecated in scikit-learn 1.0 and removed in 1.2.
m_models = (
    [Ridge(alpha=1.0, normalize=True) for _ in range(500)]
    + [Lasso(alpha=1.0, normalize=True) for _ in range(500)]
)
m_lifts = []

for model in m_models:
    seed = random.randint(0, 10000)  # fresh split seed for this model
    split = train_test_split(
        features_df,
        targets_df[target_col],
        test_size=test_set_pct,
        random_state=seed,
    )
    X_train, X_test, y_train, y_test = split
    preds = model.fit(X_train, y_train).predict(X_test)
    pred_misses, rule_misses = cb_utils.get_miss_ided(X_test, y_test, preds, verbose=False)
    m_lifts.append(rule_misses - pred_misses)

In [None]:
# elastic net: 1000 models, each fit on its own random train/test split; the
# lift recorded per model is rule_misses - pred_misses from
# cb_utils.get_miss_ided.
# NOTE: `normalize=` was deprecated in scikit-learn 1.0 and removed in 1.2.
e_models = [ElasticNet(alpha=1.0, normalize=True) for _ in range(1000)]
e_lifts = []

for model in e_models:
    seed = random.randint(0, 10000)  # fresh split seed for this model
    split = train_test_split(
        features_df,
        targets_df[target_col],
        test_size=test_set_pct,
        random_state=seed,
    )
    X_train, X_test, y_train, y_test = split
    preds = model.fit(X_train, y_train).predict(X_test)
    pred_misses, rule_misses = cb_utils.get_miss_ided(X_test, y_test, preds, verbose=False)
    e_lifts.append(rule_misses - pred_misses)

In [None]:
# bayes: 1000 Bayesian ridge models, each fit on its own random train/test
# split; the lift recorded per model is rule_misses - pred_misses from
# cb_utils.get_miss_ided. These fitted models are reused later as an
# ensemble (their predictions are averaged) and are what gets published.
# NOTE: `normalize=` was deprecated in scikit-learn 1.0 and removed in 1.2.
b_models = [BayesianRidge(normalize=True) for _ in range(1000)]
b_lifts = []

for model in b_models:
    seed = random.randint(0, 10000)  # fresh split seed for this model
    split = train_test_split(
        features_df,
        targets_df[target_col],
        test_size=test_set_pct,
        random_state=seed,
    )
    X_train, X_test, y_train, y_test = split
    preds = model.fit(X_train, y_train).predict(X_test)
    pred_misses, rule_misses = cb_utils.get_miss_ided(X_test, y_test, preds, verbose=False)
    b_lifts.append(rule_misses - pred_misses)

In [None]:
# bayes lasso: 500 Bayesian ridge models followed by 500 lasso models, each
# fit on its own random train/test split; the lift recorded per model is
# rule_misses - pred_misses from cb_utils.get_miss_ided.
# NOTE: `normalize=` was deprecated in scikit-learn 1.0 and removed in 1.2.
bl_models = (
    [BayesianRidge(normalize=True) for _ in range(500)]
    + [Lasso(alpha=1.0, normalize=True) for _ in range(500)]
)
bl_lifts = []

for model in bl_models:
    seed = random.randint(0, 10000)  # fresh split seed for this model
    split = train_test_split(
        features_df,
        targets_df[target_col],
        test_size=test_set_pct,
        random_state=seed,
    )
    X_train, X_test, y_train, y_test = split
    preds = model.fit(X_train, y_train).predict(X_test)
    pred_misses, rule_misses = cb_utils.get_miss_ided(X_test, y_test, preds, verbose=False)
    bl_lifts.append(rule_misses - pred_misses)

In [None]:
# Mean of (rule_misses - pred_misses) over the lasso models.
np.mean(l_lifts)

In [None]:
# Mean of (rule_misses - pred_misses) over the ridge models.
np.mean(r_lifts)

In [None]:
# Mean of (rule_misses - pred_misses) over the mixed ridge + lasso models.
np.mean(m_lifts)

In [None]:
# Mean of (rule_misses - pred_misses) over the elastic net models.
np.mean(e_lifts)

In [None]:
# Mean of (rule_misses - pred_misses) over the Bayesian ridge models.
np.mean(b_lifts)

In [None]:
# Mean of (rule_misses - pred_misses) over the Bayesian ridge + lasso models.
np.mean(bl_lifts)

In [None]:
# test model on 2017 all features
# NOTE(review): when the notebook is run top to bottom, b_models were fit on
# frames built from mom_feats / annual_feats (the selected subset), while this
# cell builds its inputs from the full candidate lists -- confirm the models
# were refit on all features before running or trusting this cell.
df = mab3.loc[mab3.lvl_tgt == lvl]

# One target value per member: the first row of each member_id group.
targets_df = df[['member_id', target_col]].groupby('member_id').first()

# features_df = cb_utils.features_annual(df, mom_feats)
# pre_annual = df.query("period < 0")[annual_feats + ['member_id']].groupby('member_id').first()
features_df = cb_utils.features_annual(df, mom_feature_columns)
pre_annual = df.query("period < 0")[annual_feature_columns + ['member_id']].groupby('member_id').first()

# Index-on-index merge (pandas' default inner join), so a member missing
# from either side is dropped here. Remaining gaps are filled with 0.
features_df = features_df.merge(pre_annual, left_index=True, right_index=True)
features_df.fillna(0, inplace=True)

# Order features and targets by member id and make sure they line up exactly.
# Index.equals compares the indexes element by element; summing the index
# differences could cancel to 0 for misaligned ids and fails with an
# unrelated error when the lengths differ or the ids are not numeric.
features_df.sort_index(inplace=True)
targets_df.sort_index(inplace=True)
assert targets_df.index.equals(features_df.index), \
    'features_df and targets_df are not aligned on member_id'

# Ensemble prediction: average the per-model predictions for each member.
preds = np.mean([model.predict(features_df) for model in b_models], axis=0)
pred_misses, rule_misses = cb_utils.get_miss_ided(features_df, targets_df[target_col], preds)

In [None]:
# test model on 2017 features subset
df = mab3.loc[mab3.lvl_tgt == lvl]

# One target value per member: the first row of each member_id group.
targets_df = df[['member_id', target_col]].groupby('member_id').first()

features_df = cb_utils.features_annual(df, mom_feats)
pre_annual = df.query("period < 0")[annual_feats + ['member_id']].groupby('member_id').first()
# features_df = cb_utils.features_annual(df, mom_feature_columns)
# pre_annual = df.query("period < 0")[annual_feature_columns + ['member_id']].groupby('member_id').first()

# Index-on-index merge (pandas' default inner join), so a member missing
# from either side is dropped here. Remaining gaps are filled with 0.
features_df = features_df.merge(pre_annual, left_index=True, right_index=True)
features_df.fillna(0, inplace=True)

# Order features and targets by member id and make sure they line up exactly.
# Index.equals compares the indexes element by element; summing the index
# differences could cancel to 0 for misaligned ids and fails with an
# unrelated error when the lengths differ or the ids are not numeric.
features_df.sort_index(inplace=True)
targets_df.sort_index(inplace=True)
assert targets_df.index.equals(features_df.index), \
    'features_df and targets_df are not aligned on member_id'

# Ensemble prediction: average the per-model predictions for each member.
preds = np.mean([model.predict(features_df) for model in b_models], axis=0)
pred_misses, rule_misses = cb_utils.get_miss_ided(features_df, targets_df[target_col], preds)

In [None]:
# Metadata stored with the published ensemble; 'features' records the columns
# of the feature frame that is current when this cell runs.
meta = {
    'name': '4_m_savings_br_1000',
    'features': list(features_df.columns),
    'lvl_tgt': '2. m',
    'family': 'bayes ridge regr',
}

# Publish the fitted Bayesian ridge models under the same name as in meta.
cb_utils.publish_model(
    b_models,
    meta['name'],
    '1000 bayes ridge regressors m target',
    'brr',
    meta,
    inserted_by='bpierson',
)

In [None]:
# model_name = cb_utils.save_model(models, '4_m_savings_brr_1000', meta)
# model_name

In [None]:
# model_name = '3_m_savings_multi_ensemble'
# model, meta = cb_utils.load_model(model_name)

In [None]:
# Register a scoring run for mab 3 with model 4, described as the 2017 test.
# NOTE(review): the return value is discarded and the save_scores call that
# follows passes a literal run id -- confirm that id matches the run created here.
cb_utils.create_scoring_run(mab_id=3, model_id=4, description='2017 test', inserted_by='bpierson')

In [None]:
# Save the targets with the ensemble predictions attached as a `pred` column.
# NOTE(review): the second argument (5) is hard-coded; presumably it is the id
# of the scoring run created just before -- confirm before re-running.
cb_utils.save_scores(targets_df.assign(pred=preds), 5)

# Test on 2018

In [None]:
# Register a scoring run for mab 4 with model 4, described as the 2018 test.
# NOTE(review): the return value is discarded and the later save_scores call
# passes a literal run id -- confirm that id matches the run created here.
cb_utils.create_scoring_run(mab_id=4, model_id=4, description='2018 test', inserted_by='bpierson')

In [None]:
# Load the saved ensemble together with its metadata, then split the feature
# names recorded in that metadata into the mom and annual groups.
model_name = '20200805_135604_4_m_savings_br_1000'
models, meta = cb_utils.load_model(model_name)
feature_cols = meta['features']

mom_feats = []
annual_feats = []
for col in feature_cols:
    if col in mom_feature_columns:
        mom_feats.append(col)
    if col in annual_feature_columns:
        annual_feats.append(col)

In [None]:
# Score the mab 4 (2018) data with the loaded ensemble, using the target level
# recorded in the model metadata.
df = mab4.loc[mab4.lvl_tgt == meta['lvl_tgt']]

# One target value per member: the first row of each member_id group.
targets_df = df[['member_id', target_col]].groupby('member_id').first()

features_df = cb_utils.features_annual(df, mom_feats)
pre_annual = df.query("period < 0")[annual_feats + ['member_id']].groupby('member_id').first()

# Index-on-index merge (pandas' default inner join), so a member missing
# from either side is dropped here. Remaining gaps are filled with 0.
features_df = features_df.merge(pre_annual, left_index=True, right_index=True)
features_df.fillna(0, inplace=True)

# Order features and targets by member id and make sure they line up exactly.
# Index.equals compares the indexes element by element; summing the index
# differences could cancel to 0 for misaligned ids and fails with an
# unrelated error when the lengths differ or the ids are not numeric.
features_df.sort_index(inplace=True)
targets_df.sort_index(inplace=True)
assert targets_df.index.equals(features_df.index), \
    'features_df and targets_df are not aligned on member_id'

# Ensemble prediction: average the per-model predictions for each member.
preds = np.mean([model.predict(features_df) for model in models], axis=0)
pred_misses, rule_misses = cb_utils.get_miss_ided(features_df, targets_df[target_col], preds)

In [None]:
# Save the targets with the ensemble predictions attached as a `pred` column.
# NOTE(review): the second argument (6) is hard-coded; presumably it is the id
# of the 2018 scoring run created earlier -- confirm before re-running.
cb_utils.save_scores(targets_df.assign(pred=preds), 6)

In [None]:
# Display the first 10 ensemble predictions as a quick sanity check.
preds[:10]

In [None]:
# Deliberate hard stop: halts a "Run All" here so the old code that follows
# this cell is not executed.
raise Exception("Stop here, Old code below")