In [None]:
import os
import sys
import time
import random
import collections
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, BaggingRegressor
from sklearn.linear_model import Ridge

sys.path.append('../src')
import cb_utils

sns.set(style="darkgrid")
pd.options.display.max_columns = 500

%load_ext autoreload
%autoreload 2

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from pymc3 import  *
import theano
import pandas as pd
# from statsmodels.formula.api import glm as glm_sm
# import statsmodels.api as sm
from pandas.plotting import scatter_matrix

In [None]:
# configuration
# Notebook-wide settings; later cells read these as globals.
scoring_run_id = 1  # presumably the run id under which scores are persisted — TODO confirm
use_cache = True  # forwarded to cb_utils.sql_query_to_df when loading the training views
# A new seed is drawn on every run and printed; later cells draw their own seed again
# before calling train_test_split, so this value mainly serves as a logged reference.
seed = random.randint(0, 100)
test_set_pct = 0.2  # passed as test_size to train_test_split
print(f'Seed: {seed}')

In [None]:
# Load data
# `data` (mab_id = 1) is the population the models are fitted on in later cells;
# `mab2` (mab_id = 2) is the population they are evaluated on.
# Missing values are zero-filled in place right after loading.
query = "SELECT * FROM cb.vw_mab_training_data WHERE mab_id = 1;"
data = cb_utils.sql_query_to_df(query, use_cache=use_cache)
data.fillna(0, inplace=True)

query = "SELECT * FROM cb.vw_mab_training_data WHERE mab_id = 2;"
mab2 = cb_utils.sql_query_to_df(query, use_cache=use_cache)
mab2.fillna(0, inplace=True)

In [None]:
# Per-period columns. Later cells intersect a loaded model's feature names with this list
# and pass the result to features_annual, which aggregates them over pre-period rows.
mom_feature_columns = [
    'lob_1_days'
  , 'lob_2_days'
  , 'lob_3_days'
  , 'grp_1_days'
  , 'grp_2_days'
  , 'grp_3_days'
  , 'grp_5_days'
  , 'grp_6_days'
  , 'grp_7_days'
  , 'grp_8_days'
  , 'unaligned_days'
#   , 'is_unaligned'
  , 'tc'
  , 'hcbs_tc'
  , 'icf_tc'
  , 'ip_tc'
  , 'rx_tc'
  , 'ed_tc'
  , 'snf_tc'
  , 'out_tc'
  , 'pro_tc'
  , 'spfac_tc'
  , 'amb_tc'
  , 'hh_tc'
  , 'hosp_tc'
  , 'oth_tc'
  , 'p_mm'
  , 'mm'
  , 'hcbs_respite_tc'
  , 'hcbs_fam_care_stip_tc'
  , 'hcbs_com_trans_tc'
  , 'hcbs_educ_train_tc'
  , 'hcbs_com_liv_fam_tc'
  , 'hcbs_com_liv_tc'
  , 'hcbs_attend_care_tc'
  , 'hcbs_com_trans_waiv_tc'
  , 'hcbs_home_meal_tc'
  , 'hcbs_pers_care_tc'
  , 'hcbs_ther_behav_tc'
  , 'hcbs_unsk_respite_tc'
  , 'hcbs_waiv_svc_tc'
  , 'ddos'
  , 'hcbs_ddos'
  , 'icf_ddos'
  , 'ip_ddos'
  , 'rx_ddos'
  , 'ed_ddos'
  , 'snf_ddos'
  , 'out_ddos'
  , 'pro_ddos'
  , 'spfac_ddos'
  , 'amb_ddos'
  , 'hh_ddos'
  , 'hosp_ddos'
  , 'oth_ddos'
  , 'pcp_ddos'
  , 'pulmonar_ddos'
  , 'cancer_ddos'
  , 'ckd_ddos'
  , 'esrd_ddos'
  , 'hyperlipid_ddos'
  , 'diab_ddos'
  , 'alzh_ddos'
  , 'dementia_ddos'
  , 'stroke_ddos'
  , 'hypertension_ddos'
  , 'fall_ddos'
  , 'transplant_ddos'
  , 'liver_ddos'
  , 'hippfract_ddos'
  , 'depression_ddos'
  , 'psychosis_ddos'
  , 'drug_ddos'
  , 'alcohol_ddos'
  , 'paralysis_ddos'
]
# Member-level columns. Later cells intersect a loaded model's feature names with this list
# and take the first pre-period value per member instead of aggregating.
annual_feature_columns = [
#   , 'lvl_ft'
#   , 'is_unaligned_ft'
    'unaligned_mm_ft'
  , 'is_self_directed_ft'
  , 'is_cat0_ft'
  , 'is_cat1_ft'
  , 'is_cat2_ft'
  , 'is_cat3_ft'
  , 'is_cat4_ft'
  , 'is_lob1_ft'
  , 'is_lob2_ft'
  , 'is_lob3_ft'
  , 'is_grp1_ft'
  , 'is_grp2_ft'
  , 'is_grp3_ft'
  , 'is_grp45678_ft'
  , 'sav_pct_ft'
  , 'raf_sav_pct_ft'
  , 'ds_sav_pct_ft'
  , 'ip_sav_pct_ft'
  , 'snf_sav_pct_ft'
  , 'icf_sav_pct_ft'
  , 'ed_sav_pct_ft'
  , 'hh_sav_pct_ft'
  , 'pro_sav_pct_ft'
  , 'out_sav_pct_ft'
  , 'savings_ft'
  , 'raf_savings_ft'
  , 'ds_savings_ft'
  , 'ip_savings_ft'
  , 'snf_savings_ft'
  , 'icf_savings_ft'
  , 'ed_savings_ft'
  , 'hh_savings_ft'
  , 'pro_savings_ft'
  , 'out_savings_ft'
  , 'tc_ft'
  , 'hcbs_atd_pcs_tc_ft'
  , 'ip_tc_ft'
  , 'snf_tc_ft'
  , 'icf_tc_ft'
  , 'ed_tc_ft'
  , 'hh_tc_ft'
  , 'pro_tc_ft'
  , 'out_tc_ft'
  , 'savings_pmpm_ft' # start pmpms
  , 'raf_sav_pmpm_ft'
  , 'ds_sav_pmpm_ft'
  , 'ip_sav_pmpm_ft'
  , 'snf_sav_pmpm_ft'
  , 'icf_sav_pmpm_ft'
  , 'ed_sav_pmpm_ft'
  , 'hh_sav_pmpm_ft'
  , 'pro_sav_pmpm_ft'
  , 'out_sav_pmpm_ft'
  , 'tc_pmpm_ft'
  , 'hcbs_attd_pmpm_ft'
  , 'ip_pmpm_ft'
  , 'snf_pmpm_ft'
  , 'icf_pmpm_ft'
  , 'ed_pmpm_ft'
  , 'hh_pmpm_ft'
  , 'pro_pmpm_ft'
  , 'out_pmpm_ft'
  , 'mm_ft'
  , 'age'
  , 'is_male'
]
# Column every model in this notebook is trained to predict.
target_col = 'savings_tgt'
# target_col = 'savings_pmpm_tgt'

# Full candidate feature list; the bare expression below displays its length as the cell output.
feature_columns = mom_feature_columns + annual_feature_columns
len(feature_columns)

### Try aggregated features at the yearly and half-yearly level

In [None]:
# fully broken out month over month features
def features_mom(df, cols):
    """Return one row per member with a separate column for every (period, feature) pair.

    Only rows with ``period < 0`` are used. Missing values are treated as 0 both
    before and after pivoting, so a member without a row for some period gets 0
    in that period's columns. Output columns are named ``'<period>-<feature>'``.
    """
    history = df.fillna(0).query("period < 0")
    wide = history.pivot(index='member_id', columns='period', values=cols)
    # pivot yields (feature, period) column tuples; flatten them to plain strings
    wide.columns = ['{}-{}'.format(period, name) for name, period in wide.columns]
    return wide.fillna(0)
features_mom.name = 'MOM'

In [None]:
# agg semi yearly_features
def features_semi_annual(df, cols):
    """Build per-member features for each half of the pre-period, normalised by ``p_mm``.

    Rows with ``period < 0`` are split into an earlier half (``period < -6``) and a
    later half (``period >= -6``). Within each half every column in ``cols`` is summed
    per member and divided by that member's summed ``p_mm``.

    Parameters
    ----------
    df : DataFrame with at least ``member_id``, ``period``, ``p_mm`` and ``cols``.
    cols : list of column names to aggregate. ``p_mm`` does not need to be listed;
        it is always summed because it is the denominator.

    Returns
    -------
    DataFrame indexed by ``member_id`` with columns ``<col>_h2`` (later half) followed
    by ``<col>_h1`` (earlier half). Only members present in both halves are returned
    (inner merge). NaN and infinite ratios (zero ``p_mm``) are returned as 0.
    """
    feature_cols = list(cols)
    sum_cols = feature_cols if 'p_mm' in feature_cols else feature_cols + ['p_mm']
    pre = df.fillna(0).query("period < 0")

    def _rates(rows):
        # Sum per member, then divide row-wise by p_mm. axis=0 aligns the divisor on
        # member_id instead of on column labels (label alignment would blank every feature).
        sums = rows.groupby('member_id')[sum_cols].sum()
        rates = sums[feature_cols].div(sums['p_mm'], axis=0)
        return rates.replace([np.inf, -np.inf], np.nan)

    features_h1 = _rates(pre.query('period < -6'))
    features_h2 = _rates(pre.query('period >= -6'))
    res = features_h2.merge(features_h1, left_index=True, right_index=True, suffixes=('_h2', '_h1'))
    return res.fillna(0)
features_semi_annual.name = 'Semi Annual'

In [None]:
# agg yearly_features
def features_annual(df, cols):
    """Build per-member features over the whole pre-period, normalised by ``p_mm``.

    Every column in ``cols`` is summed per member over rows with ``period < 0`` and
    divided by that member's summed ``p_mm``.

    Parameters
    ----------
    df : DataFrame with at least ``member_id``, ``period``, ``p_mm`` and ``cols``.
    cols : list of column names to aggregate. ``p_mm`` does not need to be listed;
        it is always summed because it is the denominator.

    Returns
    -------
    DataFrame indexed by ``member_id`` with one column per entry of ``cols``.
    NaN and infinite ratios (zero ``p_mm``) are returned as 0.
    """
    feature_cols = list(cols)
    sum_cols = feature_cols if 'p_mm' in feature_cols else feature_cols + ['p_mm']
    pre = df.fillna(0).query("period < 0")
    pre_sums = pre.groupby('member_id')[sum_cols].sum()
    # axis=0 aligns the divisor on member_id; dividing by a one-column frame would align
    # on column labels instead and leave every feature except p_mm as NaN.
    res = pre_sums[feature_cols].div(pre_sums['p_mm'], axis=0)
    return res.replace([np.inf, -np.inf], np.nan).fillna(0)
features_annual.name = 'Annual'

In [None]:
def print_feature_importance(regr, cols, max_cols=20):
    """Print the most important features of a fitted estimator, largest first.

    Parameters
    ----------
    regr : fitted estimator exposing ``feature_importances_``.
    cols : feature names, in the same order as ``regr.feature_importances_``.
    max_cols : maximum number of rows to print.

    Only importances above 0.001 are printed, and never more than ``max_cols`` rows.
    """
    print('Feature Importance')
    ranked = sorted(zip(regr.feature_importances_, cols), reverse=True)
    shown = 0
    for imp, feat in ranked:
        # ranked is descending, so the first value at or below the threshold ends the listing
        if imp <= 0.001 or shown >= max_cols:
            break
        print('%0.3f: %s' % (imp, feat))
        shown += 1
            
def print_coef_importance(regr, cols, max_cols=20):
    """Print the largest-magnitude coefficients of a fitted linear model.

    Parameters
    ----------
    regr : fitted estimator exposing a one-dimensional ``coef_``.
    cols : feature names, in the same order as ``regr.coef_``.
    max_cols : maximum number of rows to print.

    Coefficients are ranked by absolute value so that strongly negative ones are
    listed too; the printed value keeps its sign. Coefficients whose magnitude is
    at or below 0.001 are skipped.
    """
    print('Feature Importance')
    ranked = sorted(zip(regr.coef_, cols), key=lambda pair: abs(pair[0]), reverse=True)
    shown = 0
    for coef, feat in ranked:
        # ranked is descending by magnitude, so the first small value ends the listing
        if abs(coef) <= 0.001 or shown >= max_cols:
            break
        print('%0.3f: %s' % (coef, feat))
        shown += 1
            

In [None]:
def get_miss_ided(X_test, y_test, preds, verbose=True, id_pop_size=100, rule_col='savings_ft'):
    """Count how many of the truly best members a ranking fails to identify.

    The ``id_pop_size`` members with the highest actual target form the ideal
    selection. It is compared with the top ``id_pop_size`` members ranked by the
    model prediction and with the top ``id_pop_size`` ranked by ``rule_col``.

    Parameters
    ----------
    X_test : feature DataFrame; must contain ``rule_col``.
    y_test : actual target values aligned with ``X_test``.
    preds : model predictions in the same row order as ``X_test``.
    verbose : print both miss rates as a percentage of ``id_pop_size``.
    id_pop_size : size of the selected population (default 100, i.e. 20% of 500).
    rule_col : column used by the baseline ranking rule.

    Returns
    -------
    (pred_misses, rule_misses) : members of the ideal selection missing from the
    model selection and from the rule selection, respectively.

    Raises
    ------
    ValueError if ``id_pop_size`` is not positive.
    """
    if id_pop_size <= 0:
        raise ValueError('id_pop_size must be a positive integer')

    test_df = X_test.assign(target=y_test, pred=preds)

    def _top_ids(col):
        # index labels of the id_pop_size rows with the largest value in `col`
        return test_df.sort_values(col, ascending=False).iloc[:id_pop_size].index

    perf_id = _top_ids('target')
    pred_misses = perf_id.difference(_top_ids('pred')).shape[0]
    rule_misses = perf_id.difference(_top_ids(rule_col)).shape[0]

    if verbose:
        print(f'Miss IDed: {pred_misses * 100.0 / id_pop_size}%')
        print(f'Rule Miss IDed: {rule_misses * 100.0 / id_pop_size}%')
    return pred_misses, rule_misses

# Multilevel modeling
Couldn't get pymc3 to work due to Theano issues.
Training separate models per LOB didn't improve performance.

In [None]:
# load model to get features
# This cell only reads the feature list stored in the saved model's metadata.
model_name = '20200803_121030_3_m_savings_lr_1000_bag_pre_processed'
models, meta = cb_utils.load_model(model_name)
feature_cols = meta['features']

# Split the stored feature names by the list they belong to; names in neither list are dropped.
mom_feats = [c for c in feature_cols if c in mom_feature_columns]
annual_feats = [c for c in feature_cols if c in annual_feature_columns]

In [None]:
# Build the training feature/target frames for one target level from `data`.
# lvl = '1. h'
lvl = '2. m'
df = data.loc[data.lvl_tgt == lvl]

# one target value per member
targets_df = df[['member_id', target_col]].groupby('member_id').first()

# aggregated per-period features plus the first pre-period value of the member-level features
features_df = features_annual(df, mom_feats)
pre_annual = df.query("period < 0")[annual_feats + ['member_id']].groupby('member_id').first()

features_df = features_df.merge(pre_annual, left_index=True, right_index=True)
features_df.fillna(0, inplace=True)

# add multi level model group column: lob 1..3 x grp 1..3 -> 0..8, 99 means "no group matched"
features_df = features_df.assign(model_grp=99)
for lob in (1, 2, 3):
    for grp in (1, 2, 3):
        in_grp = (features_df[f'is_lob{lob}_ft'] == 1) & (features_df[f'is_grp{grp}_ft'] == 1)
        features_df.loc[in_grp, 'model_grp'] = 3 * (lob - 1) + (grp - 1)
assert features_df.loc[features_df.model_grp > 8].shape[0] == 0

# order features and targets by member id, make sure they line up perfectly
features_df.sort_index(inplace=True)
targets_df.sort_index(inplace=True)
# compare the indexes directly; summing their differences can cancel out to 0 on a mismatch
assert targets_df.index.equals(features_df.index)

# train test split
# X_train, X_test, y_train, y_test = train_test_split(features_df, targets_df[target_col], test_size=test_set_pct, random_state=seed)

In [None]:
# Bagged ridge regression: 1000 Ridge estimators, out-of-bag scoring enabled, one job per CPU.
# NOTE(review): `base_estimator=` and Ridge's `normalize=` are not accepted by recent
# scikit-learn releases — confirm the version this notebook is pinned to.
model = BaggingRegressor(base_estimator=Ridge(alpha=1.0, normalize=True), n_estimators=1000, oob_score=True, n_jobs=os.cpu_count())

In [None]:
# Fit on the full training frame (no hold-out split; the train_test_split above is commented out).
model.fit(features_df, targets_df[target_col])

In [None]:
# test model on 2017
# Same preparation as for the training set, applied to `mab2`. The row filter must be
# built from mab2 itself: a mask taken from `data` refers to a different frame's rows.
df = mab2.loc[mab2.lvl_tgt == lvl]
targets_df = df[['member_id', target_col]].groupby('member_id').first()

features_df = features_annual(df, mom_feats)
pre_annual = df.query("period < 0")[annual_feats + ['member_id']].groupby('member_id').first()

features_df = features_df.merge(pre_annual, left_index=True, right_index=True)
features_df.fillna(0, inplace=True)

# add multi level model group column: lob 1..3 x grp 1..3 -> 0..8, 99 means "no group matched"
features_df = features_df.assign(model_grp=99)
for lob in (1, 2, 3):
    for grp in (1, 2, 3):
        in_grp = (features_df[f'is_lob{lob}_ft'] == 1) & (features_df[f'is_grp{grp}_ft'] == 1)
        features_df.loc[in_grp, 'model_grp'] = 3 * (lob - 1) + (grp - 1)
assert features_df.loc[features_df.model_grp > 8].shape[0] == 0

# order features and targets by member id, make sure they line up perfectly
features_df.sort_index(inplace=True)
targets_df.sort_index(inplace=True)
# compare the indexes directly; summing their differences can cancel out to 0 on a mismatch
assert targets_df.index.equals(features_df.index)

preds = model.predict(features_df)

In [None]:
# Compare the model's top selection and the savings_ft rule's top selection with the actual top members.
pred_misses, rule_misses = get_miss_ided(features_df, targets_df[target_col], preds)

In [None]:
# try separate models for lobs

In [None]:
# Build the training frames from `data`, then split them by model group so that
# separate models can be fitted.
df = data.loc[data.lvl_tgt == lvl]

# one target value per member
targets_df = df[['member_id', target_col]].groupby('member_id').first()

features_df = features_annual(df, mom_feats)
pre_annual = df.query("period < 0")[annual_feats + ['member_id']].groupby('member_id').first()

features_df = features_df.merge(pre_annual, left_index=True, right_index=True)
features_df.fillna(0, inplace=True)

# add multi level model group column: lob 1..3 x grp 1..3 -> 0..8, 99 means "no group matched"
features_df = features_df.assign(model_grp=99)
for lob in (1, 2, 3):
    for grp in (1, 2, 3):
        in_grp = (features_df[f'is_lob{lob}_ft'] == 1) & (features_df[f'is_grp{grp}_ft'] == 1)
        features_df.loc[in_grp, 'model_grp'] = 3 * (lob - 1) + (grp - 1)
assert features_df.loc[features_df.model_grp > 8].shape[0] == 0

# order features and targets by member id, make sure they line up perfectly
features_df.sort_index(inplace=True)
targets_df.sort_index(inplace=True)
# compare the indexes directly; summing their differences can cancel out to 0 on a mismatch
assert targets_df.index.equals(features_df.index)

# groups 0-5 are the lob 1 and lob 2 members, groups 6-8 are the lob 3 members
feats_lob_1 = features_df.query('model_grp < 6')
feats_lob_3 = features_df.query('model_grp >= 6')

targets_lob_1 = targets_df.loc[feats_lob_1.index]
targets_lob_3 = targets_df.loc[feats_lob_3.index]
assert feats_lob_1.index.difference(targets_lob_1.index).shape[0] == 0
assert feats_lob_3.index.difference(targets_lob_3.index).shape[0] == 0

# train test split
# X_train, X_test, y_train, y_test = train_test_split(features_df, targets_df[target_col], test_size=test_set_pct, random_state=seed)

In [None]:
# Two identically configured bagged ridge regressors, one per population split.
# NOTE(review): `base_estimator=` and Ridge's `normalize=` are not accepted by recent
# scikit-learn releases — confirm the version this notebook is pinned to.
lob_1_model = BaggingRegressor(base_estimator=Ridge(alpha=1.0, normalize=True), n_estimators=1000, oob_score=True, n_jobs=os.cpu_count())
lob_3_model = BaggingRegressor(base_estimator=Ridge(alpha=1.0, normalize=True), n_estimators=1000, oob_score=True, n_jobs=os.cpu_count())

In [None]:
# Fit each model on its own split (model_grp < 6 and model_grp >= 6 respectively).
lob_1_model.fit(feats_lob_1, targets_lob_1[target_col])
lob_3_model.fit(feats_lob_3, targets_lob_3[target_col])

In [None]:
# test model on 2017
# Same preparation as for the training set, applied to `mab2`. The row filter must be
# built from mab2 itself: a mask taken from `data` refers to a different frame's rows.
df = mab2.loc[mab2.lvl_tgt == lvl]
targets_df = df[['member_id', target_col]].groupby('member_id').first()

features_df = features_annual(df, mom_feats)
pre_annual = df.query("period < 0")[annual_feats + ['member_id']].groupby('member_id').first()

features_df = features_df.merge(pre_annual, left_index=True, right_index=True)
features_df.fillna(0, inplace=True)

# add multi level model group column: lob 1..3 x grp 1..3 -> 0..8, 99 means "no group matched"
features_df = features_df.assign(model_grp=99)
for lob in (1, 2, 3):
    for grp in (1, 2, 3):
        in_grp = (features_df[f'is_lob{lob}_ft'] == 1) & (features_df[f'is_grp{grp}_ft'] == 1)
        features_df.loc[in_grp, 'model_grp'] = 3 * (lob - 1) + (grp - 1)
assert features_df.loc[features_df.model_grp > 8].shape[0] == 0

# order features and targets by member id, make sure they line up perfectly
features_df.sort_index(inplace=True)
targets_df.sort_index(inplace=True)
# compare the indexes directly; summing their differences can cancel out to 0 on a mismatch
assert targets_df.index.equals(features_df.index)

# groups 0-5 are the lob 1 and lob 2 members, groups 6-8 are the lob 3 members
feats_lob_1 = features_df.query('model_grp < 6')
feats_lob_3 = features_df.query('model_grp >= 6')

targets_lob_1 = targets_df.loc[feats_lob_1.index]
targets_lob_3 = targets_df.loc[feats_lob_3.index]
assert feats_lob_1.index.difference(targets_lob_1.index).shape[0] == 0
assert feats_lob_3.index.difference(targets_lob_3.index).shape[0] == 0

lob_1_preds = lob_1_model.predict(feats_lob_1)
lob_3_preds = lob_3_model.predict(feats_lob_3)

In [None]:
# Evaluate both split models together: features, targets and predictions are stacked
# in the same order (first split, then second) so the rows stay aligned.
pred_misses, rule_misses = get_miss_ided(
    pd.concat([feats_lob_1, feats_lob_3], axis=0),
    pd.concat([targets_lob_1, targets_lob_3], axis=0),
    np.append(lob_1_preds, lob_3_preds)
)

In [None]:
from pymc3 import glm, Model, Metropolis, NUTS, sample

In [None]:
# Attempt at fitting a Bayesian GLM with pymc3 on the training split.
# NOTE(review): X_train / y_train are first assigned in a later cell, so this cell
# fails on a fresh kernel run top to bottom.
x = X_train
y = y_train
m_data = {'x': x, 'y': y}
# NOTE(review): `model` is rebound here to a pymc3 Model, replacing the regressor of the same name.
with Model() as model:
#     lm = glm.LinearComponent.from_formula('y ~ x', data)
#     sigma = Uniform('sigma', 0, 20)
#     y_obs = Normal('y_obs', mu=lm.y_est, sigma=sigma, observed=y)
    GLM.from_formula('y ~ x', m_data)
    trace = sample(200, cores=2)

# scatter of the raw points with posterior predictive regression lines drawn on the same figure
plt.figure(figsize=(5, 5))
plt.plot(x, y, 'x')
plot_posterior_predictive_glm(trace)

In [None]:
# example
# Self-contained pymc3 linear-regression example on synthetic data. It uses its own
# names (example_data / example_model) so that it does not overwrite the training
# DataFrame `data` or the fitted regressor `model` that other cells rely on.

size = 50
true_intercept = 1
true_slope = 2
x = np.linspace(0, 1, size)
y = true_intercept + x*true_slope + np.random.normal(scale=.5, size=size)
example_data = {'x': x, 'y': y}
with Model() as example_model:
    lm = glm.LinearComponent.from_formula('y ~ x', example_data)
    sigma = Uniform('sigma', 0, 20)
    y_obs = Normal('y_obs', mu=lm.y_est, sigma=sigma, observed=y)
    trace = sample(2000, cores=2)

# scatter of the synthetic points with posterior predictive regression lines drawn on top
plt.figure(figsize=(5, 5))
plt.plot(x, y, 'x')
plot_posterior_predictive_glm(trace)

In [None]:
# with Model() as pooled_model:
#     glm('log_radon ~ floor', srrs_mn)
#     pooled_trace = sample(1000, NUTS())

In [None]:
# train 2018
# Fit one bagged ridge model on a random split of `data` and report its hold-out metrics.

# The feature set comes from the loaded model's metadata. (The former `cnt`-based
# selection referenced a name that is never defined and was overwritten immediately.)
feature_cols = meta['features']
mom_feats = [c for c in feature_cols if c in mom_feature_columns]
annual_feats = [c for c in feature_cols if c in annual_feature_columns]

# lvl = '2. m'
lvl = '1. h'
df = data.loc[data.lvl_tgt == lvl]

# one target value per member
targets_df = df[['member_id', target_col]].groupby('member_id').first()

features_df = features_annual(df, mom_feats)
pre_annual = df.query("period < 0")[annual_feats + ['member_id']].groupby('member_id').first()

features_df = features_df.merge(pre_annual, left_index=True, right_index=True)
features_df.fillna(0, inplace=True)

# order features and targets by member id, make sure they line up perfectly
features_df.sort_index(inplace=True)
targets_df.sort_index(inplace=True)
# compare the indexes directly; summing their differences can cancel out to 0 on a mismatch
assert targets_df.index.equals(features_df.index)

# train test split; the seed is printed so the split can be reproduced
seed = random.randint(0, 1000)
print(f'seed: {seed}')
X_train, X_test, y_train, y_test = train_test_split(features_df, targets_df[target_col], test_size=test_set_pct, random_state=seed)

# alternatives tried: plain Ridge, GradientBoostingRegressor(max_depth=2), RandomForestRegressor
gb = BaggingRegressor(base_estimator=Ridge(alpha=1.0, normalize=True), n_estimators=1000)

gb.fit(X_train, y_train)
preds = gb.predict(X_test)
r2_score = gb.score(X_test, y_test)
error = np.abs(y_test - preds)
mean_hrs_error = error.mean()
median_hrs_error = error.median()

print(f'R^2 Score: {r2_score}')
print(f'Mean absolute $ error: {mean_hrs_error}')
print(f'Median absolute $ error: {median_hrs_error}')
pred_misses, rule_misses = get_miss_ided(X_test, y_test, preds)

In [None]:
# train 2018
# Repeat the split / fit / evaluate cycle 1000 times with plain Ridge models. Every
# fitted model is kept in `models`; `lifts` records, per split, how many more of the
# actual top members the model identified than the savings_ft ranking rule did.
models = []
lifts = []

# The feature set comes from the loaded model's metadata. (The former `cnt`-based
# selection referenced a name that is never defined and was overwritten immediately.)
feature_cols = meta['features']
mom_feats = [c for c in feature_cols if c in mom_feature_columns]
annual_feats = [c for c in feature_cols if c in annual_feature_columns]

# lvl = '2. m'
lvl = '1. h'
df = data.loc[data.lvl_tgt == lvl]

# one target value per member
targets_df = df[['member_id', target_col]].groupby('member_id').first()

features_df = features_annual(df, mom_feats)
pre_annual = df.query("period < 0")[annual_feats + ['member_id']].groupby('member_id').first()

features_df = features_df.merge(pre_annual, left_index=True, right_index=True)
features_df.fillna(0, inplace=True)

# order features and targets by member id, make sure they line up perfectly
features_df.sort_index(inplace=True)
targets_df.sort_index(inplace=True)
# compare the indexes directly; summing their differences can cancel out to 0 on a mismatch
assert targets_df.index.equals(features_df.index)

for _ in range(1000):
    # a fresh random split for every repetition
    seed = random.randint(0, 1000)
    X_train, X_test, y_train, y_test = train_test_split(features_df, targets_df[target_col], test_size=test_set_pct, random_state=seed)

    # alternatives tried: BaggingRegressor over Ridge, GradientBoostingRegressor, RandomForestRegressor
    gb = Ridge(alpha=1.0, normalize=True)

    gb.fit(X_train, y_train)
    models.append(gb)
    preds = gb.predict(X_test)
    pred_misses, rule_misses = get_miss_ided(X_test, y_test, preds, verbose=False)
    lifts.append(rule_misses - pred_misses)

In [None]:
# model_name = cb_utils.save_model(models, '2_h_savings_lr_1000_bag', {'name': '2_h_savings_lr_1000_bag', 'features': list(features_df.columns)})

In [None]:
# Load a previously saved model together with its metadata.
# NOTE(review): the result is bound to `model`, but the scoring cell below iterates over
# `models` — confirm which object is meant to be scored.
model_name = '20200802_172434_2_h_savings_lr_10_bag'
model, meta = cb_utils.load_model(model_name)

In [None]:
# Score the `mab2` population with every model in `models` and average the predictions.
feature_cols = meta['features']

mom_feats = [c for c in feature_cols if c in mom_feature_columns]
annual_feats = [c for c in feature_cols if c in annual_feature_columns]

lvl = '1. h'
# The row filter must be built from mab2 itself: a mask taken from `data` refers to a
# different frame's rows.
df = mab2.loc[mab2.lvl_tgt == lvl]

# one target value per member
targets_df = df[['member_id', target_col]].groupby('member_id').first()

features_df = features_annual(df, mom_feats)
pre_annual = df.query("period < 0")[annual_feats + ['member_id']].groupby('member_id').first()

features_df = features_df.merge(pre_annual, left_index=True, right_index=True)
features_df.fillna(0, inplace=True)

# order features and targets by member id, make sure they line up perfectly
features_df.sort_index(inplace=True)
targets_df.sort_index(inplace=True)
# compare the indexes directly; summing their differences can cancel out to 0 on a mismatch
assert targets_df.index.equals(features_df.index)

# NOTE(review): no split is made in this cell, so the seed drawn here is only printed.
seed = random.randint(0, 1000)
print(f'seed: {seed}')

# ensemble prediction: mean over the per-model predictions
scores = [m.predict(features_df) for m in models]
preds = np.mean(scores, axis=0)

error = np.abs(targets_df[target_col] - preds)
mean_hrs_error = error.mean()
median_hrs_error = error.median()

print(f'Mean absolute $ error: {mean_hrs_error}')
print(f'Median absolute $ error: {median_hrs_error}')
pred_misses, rule_misses = get_miss_ided(features_df, targets_df[target_col], preds)

In [None]:
# Attach the averaged predictions to the scored feature frame (index is member_id).
result = features_df.assign(pred=preds)

In [None]:
# Persist the scores under the run id set in the configuration cell instead of a literal,
# so changing the configuration changes where the scores are written.
cb_utils.save_scores(result, scoring_run_id)

In [None]:
# Deliberate stop: halts "Run All" here so the legacy cells below are not executed.
raise Exception("Stop here, Old code below")

In [None]:
# Legacy cell: train on the '1. h' level of `data` using the full candidate feature list.
df = data.loc[data.lvl_tgt == '1. h']

# one target value per member
targets_df = df[['member_id', target_col]].groupby('member_id').first()
targets_df.fillna(0, inplace=True)

df = df.fillna(0)
features_df = features_annual(df, feature_columns)
# NOTE(review): train_and_test is not defined anywhere in the visible notebook.
model = train_and_test(features_df, targets_df)

In [None]:
# Legacy cell: take the 500 members with the highest prediction and join their raw
# per-period rows with the prediction and the target.
preds = model.predict(features_df)
top_500_preds = features_df.assign(pred=preds).sort_values('pred', ascending=False).iloc[:500][['pred']]
id_pop = data.loc[(data.lvl_tgt == '1. h')]
# id_pop = data.loc[(data.lvl_tgt == lvl) & (data.period > 0)]
id_pop = id_pop.set_index('member_id')
# default (inner) merges keep only members that are in the top-500 selection
id_pop = id_pop.merge(top_500_preds, left_index=True, right_index=True) 
id_pop = id_pop.merge(targets_df, left_index=True, right_index=True) 
id_pop = id_pop.fillna(0)

In [None]:
# Preview the first rows of the selected population.
id_pop.head()
# features_df.assign(pred=preds)[['pred']].to_csv('../outputs/h_preds.csv')

In [None]:
# Columns to plot: everything ending in '_tc' plus 'tc' itself ...
tcs = [c for c in features_df.columns if c[-3:] == '_tc'] + ['tc']
# ... minus names of 8 or more characters that start with 'hcbs_' (the short 'hcbs_tc' is kept)
tcs = [c for c in tcs if len(c) < 8 or c[:5] != 'hcbs_']

# One line per selected column over `period` for the selected population
# (seaborn aggregates repeated x values per line).
g = sns.relplot(
    x="period",
    y="value",
    hue="variable",
    kind="line",
    data=id_pop.melt(id_vars=['period'], value_vars=tcs),
    height=12,
    aspect=2
)

In [None]:
# tune hyperparameters: didn't see much benifit, takes about 30 minutes to run

# lvl  = '1. h'
# df = data.loc[data.lvl_tgt == lvl]
# df = df.fillna(0)

# targets_df = df[['member_id', target_col]].groupby('member_id').first()
# feats_df = features_annual(df, feature_columns)

# feats_df.sort_index(inplace=True)
# targets_df.sort_index(inplace=True)
# assert sum(targets_df.index - feats_df.index) == 0

# # train test split
# target_col = 'savings_tgt'
# X_train, X_test, y_train, y_test = train_test_split(feats_df, targets_df[target_col], test_size=test_set_pct, random_state=seed)

# gb = GradientBoostingRegressor(random_state=seed)

# grid = {
#     'learning_rate': [0.001, 0.01, 0.1, 0.2],
#     'n_estimators': [10, 100, 500, 1000],
#     'max_depth': [2, 3, 4, 5, 10],
#     'min_samples_leaf': [1, 2, 3, 4, 5],
# }

# clf = GridSearchCV(gb, grid, n_jobs=os.cpu_count())

# search = clf.fit(X_train, y_train)

# cv_res = pd.DataFrame(search.cv_results_)
# cv_res.sort_values('rank_test_score')

In [None]:
# model_name = cb_utils.save_model(model, '1_xgb_cat_savings', {'name': '1_xgb_cat_savings', 'features': list(features_df.columns)})

In [None]:
# load model trained on mab 5
model_name = '20200731_153512_1_xgb_cat_savings'
model, meta = cb_utils.load_model(model_name)

In [None]:
# cb_utils.save_scores(final)

In [None]:
# test on a different population


In [None]:
# Legacy cell: score the MAB 6 population with the loaded model and plot predicted vs actual.
# NOTE(review): `mab6` is only loaded in the cell below, so this cell fails on a fresh
# kernel run top to bottom.
targets_df = mab6[['member_id', target_col]].groupby('member_id').first()
targets_df.fillna(0, inplace=True)

mab6 = mab6.fillna(0)
features_df = features_annual(mab6, feature_columns)
# sort both frames by member id so predictions and targets are compared row by row
features_df.sort_index(inplace=True)
targets_df.sort_index(inplace=True)


preds = model.predict(features_df)
r2_score = model.score(features_df, targets_df[target_col])

error = np.abs(targets_df[target_col] - preds)
mean_hrs_error = error.mean()
median_hrs_error = error.median()

print('Testing MAB 6 data on model trained with mab 5')
print(f'R^2 Score: {r2_score}')
print(f'Mean absolute $ error: {mean_hrs_error}')
print(f'Median absolute $ error: {median_hrs_error}')

# scatter of predictions (x) against actual target values (y)
fig, ax = plt.subplots(nrows=1, figsize=(20,10))
ax.scatter(preds, targets_df[target_col])
ax.set_xlabel('preds')
ax.set_ylabel('actual');
ax.set_title(f'Predicted vs actual savings')
plt.show()


In [None]:
# try training on early set
query = "SELECT * FROM cb.vw_mab_training_data WHERE mab_id = 6;"
mab6 = cb_utils.sql_query_to_df(query, use_cache=use_cache)

# The row filter must be built from mab6 itself: a mask taken from `data` refers to a
# different frame's rows.
df = mab6.loc[mab6.lvl_tgt == '1. h']

# one target value per member
targets_df = df[['member_id', target_col]].groupby('member_id').first()
targets_df.fillna(0, inplace=True)

df = df.fillna(0)
features_df = features_annual(df, feature_columns)
# NOTE(review): train_and_test is not defined anywhere in the visible notebook.
model = train_and_test(features_df, targets_df)