In [None]:
import os
import sys
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# Add the sibling source directory to the import path before importing the
# project helper module (presumably cb_utils lives in ../src — TODO confirm).
sys.path.append('../src')
import cb_utils

# Seaborn dark-grid styling, and show up to 500 columns when displaying frames.
sns.set(style="darkgrid")
pd.options.display.max_columns = 500

# Reload edited modules automatically before each cell executes.
%load_ext autoreload
%autoreload 2

In [None]:
# configuration
scoring_run_id = 1  # not referenced anywhere else in the visible notebook
use_cache = True  # forwarded to cb_utils.sql_query_to_df
seed = 0  # random_state for train_test_split and the regressors
test_set_pct = 0.2  # passed as test_size to train_test_split

In [None]:
# Load data
# Pull every row of the training view for MAB 5. `use_cache` is forwarded to the
# project helper (its caching semantics are not visible here).
query = "SELECT * FROM cb.vw_mab_training_data WHERE mab_id = 5;"
data = cb_utils.sql_query_to_df(query, use_cache=use_cache)
# Replace missing values with 0 in every column, mutating `data` in place.
data.fillna(0, inplace=True)

In [None]:
# Per-period columns. The feature builders below aggregate these over the
# pre-period rows (period < 0); 'p_mm' is also the denominator used by
# features_annual and features_semi_annual, so it must stay in this list.
mom_feature_columns = [
    'lob_1_days'
  , 'lob_2_days'
  , 'lob_3_days'
  , 'grp_1_days'
  , 'grp_2_days'
  , 'grp_3_days'
  , 'grp_5_days'
  , 'grp_6_days'
  , 'grp_7_days'
  , 'grp_8_days'
  , 'unaligned_days'
#   , 'is_unaligned'
  , 'tc'
  , 'hcbs_tc'
  , 'icf_tc'
  , 'ip_tc'
  , 'rx_tc'
  , 'ed_tc'
  , 'snf_tc'
  , 'out_tc'
  , 'pro_tc'
  , 'spfac_tc'
  , 'amb_tc'
  , 'hh_tc'
  , 'hosp_tc'
  , 'oth_tc'
  , 'p_mm'
  , 'mm'
  , 'hcbs_respite_tc'
  , 'hcbs_fam_care_stip_tc'
  , 'hcbs_com_trans_tc'
  , 'hcbs_educ_train_tc'
  , 'hcbs_com_liv_fam_tc'
  , 'hcbs_com_liv_tc'
  , 'hcbs_attend_care_tc'
  , 'hcbs_com_trans_waiv_tc'
  , 'hcbs_home_meal_tc'
  , 'hcbs_pers_care_tc'
  , 'hcbs_ther_behav_tc'
  , 'hcbs_unsk_respite_tc'
  , 'hcbs_waiv_svc_tc'
  , 'ddos'
  , 'hcbs_ddos'
  , 'icf_ddos'
  , 'ip_ddos'
  , 'rx_ddos'
  , 'ed_ddos'
  , 'snf_ddos'
  , 'out_ddos'
  , 'pro_ddos'
  , 'spfac_ddos'
  , 'amb_ddos'
  , 'hh_ddos'
  , 'hosp_ddos'
  , 'oth_ddos'
  , 'pcp_ddos'
  , 'pulmonar_ddos'
  , 'cancer_ddos'
  , 'ckd_ddos'
  , 'esrd_ddos'
  , 'hyperlipid_ddos'
  , 'diab_ddos'
  , 'alzh_ddos'
  , 'dementia_ddos'
  , 'stroke_ddos'
  , 'hypertension_ddos'
  , 'fall_ddos'
  , 'transplant_ddos'
  , 'liver_ddos'
  , 'hippfract_ddos'
  , 'depression_ddos'
  , 'psychosis_ddos'
  , 'drug_ddos'
  , 'alcohol_ddos'
  , 'paralysis_ddos'
]
# Columns carrying the `_ft` suffix. The modelling cells take one value per
# member for these via groupby('member_id').first() on the pre-period rows.
annual_feature_columns = [
#   , 'lvl_ft'
#   , 'is_unaligned_ft'
    'unaligned_mm_ft'
  , 'is_self_directed_ft'
  , 'is_cat0_ft'
  , 'is_cat1_ft'
  , 'is_cat2_ft'
  , 'is_cat3_ft'
  , 'is_cat4_ft'
  , 'is_lob1_ft'
  , 'is_lob2_ft'
  , 'is_lob3_ft'
  , 'is_grp1_ft'
  , 'is_grp2_ft'
  , 'is_grp3_ft'
  , 'is_grp45678_ft'
  , 'sav_pct_ft'
  , 'raf_sav_pct_ft'
  , 'ds_sav_pct_ft'
  , 'ip_sav_pct_ft'
  , 'snf_sav_pct_ft'
  , 'icf_sav_pct_ft'
  , 'ed_sav_pct_ft'
  , 'hh_sav_pct_ft'
  , 'pro_sav_pct_ft'
  , 'out_sav_pct_ft'
  , 'savings_ft'
  , 'raf_savings_ft'
  , 'ds_savings_ft'
  , 'ip_savings_ft'
  , 'snf_savings_ft'
  , 'icf_savings_ft'
  , 'ed_savings_ft'
  , 'hh_savings_ft'
  , 'pro_savings_ft'
  , 'out_savings_ft'
  , 'tc_ft'
  , 'hcbs_atd_pcs_tc_ft'
  , 'ip_tc_ft'
  , 'snf_tc_ft'
  , 'icf_tc_ft'
  , 'ed_tc_ft'
  , 'hh_tc_ft'
  , 'pro_tc_ft'
  , 'out_tc_ft'
  , 'savings_pmpm_ft' # start pmpms
  , 'raf_sav_pmpm_ft'
  , 'ds_sav_pmpm_ft'
  , 'ip_sav_pmpm_ft'
  , 'snf_sav_pmpm_ft'
  , 'icf_sav_pmpm_ft'
  , 'ed_sav_pmpm_ft'
  , 'hh_sav_pmpm_ft'
  , 'pro_sav_pmpm_ft'
  , 'out_sav_pmpm_ft'
  , 'tc_pmpm_ft'
  , 'hcbs_attd_pmpm_ft'
  , 'ip_pmpm_ft'
  , 'snf_pmpm_ft'
  , 'icf_pmpm_ft'
  , 'ed_pmpm_ft'
  , 'hh_pmpm_ft'
  , 'pro_pmpm_ft'
  , 'out_pmpm_ft'
  , 'mm_ft'
]
# Name of the column being predicted. get_miss_ided reads this as a global, so
# changing it here changes that function's behaviour too.
target_col = 'savings_tgt'
# target_col = 'savings_pmpm_tgt'

# Combined list passed to the feature builders by the cells that use every column.
feature_columns = mom_feature_columns + annual_feature_columns
len(feature_columns)

### Try aggregated features at the annual and semi-annual level

In [None]:
# fully broken out month over month features
def features_mom(df, cols):
    """Build one feature per (pre-period month, column) pair for each member.

    Rows with ``period < 0`` are pivoted so that each ``member_id`` becomes one
    row and each (column, period) combination becomes a feature named
    ``'{period}-{column}'``. Missing values are replaced by 0 both before and
    after the pivot.

    Args:
        df: long frame with ``member_id``, ``period`` and every name in ``cols``.
        cols: list of column names to spread across periods.

    Returns:
        DataFrame indexed by ``member_id`` with one column per period/column pair.
    """
    history = df.fillna(0).query("period < 0")
    wide = history.pivot(index='member_id', columns='period', values=cols)
    # pivot yields (column name, period) pairs; flatten them into plain strings
    wide.columns = [f'{month}-{col}' for col, month in wide.columns]
    return wide.fillna(0)
# label printed by the feature-space search cell
features_mom.name = 'MOM'

In [None]:
# agg semi yearly_features
def features_semi_annual(df, cols):
    """Aggregate pre-period rows into per-member-month rates for two half years.

    Rows with ``period < 0`` are split into an earlier half (``period < -6``,
    suffix ``_h1``) and a later half (``period >= -6``, suffix ``_h2``). Within
    each half every column in ``cols`` is summed per member and divided by that
    member's summed ``p_mm``.

    Args:
        df: long frame with ``member_id``, ``period`` and every name in ``cols``.
        cols: list of column names to aggregate; must contain ``'p_mm'``.

    Returns:
        DataFrame indexed by ``member_id`` with ``<col>_h2`` and ``<col>_h1``
        columns. Members lacking rows in either half are dropped by the inner
        merge. Rates that are undefined (zero ``p_mm``) are reported as 0.

    Raises:
        KeyError: if ``'p_mm'`` is not one of ``cols``.
    """
    pre = df.fillna(0).query("period < 0")

    def _per_member_month(rows):
        # Sum each column per member, then divide row-wise by the member's p_mm.
        sums = rows.groupby('member_id')[cols].sum()
        # DataFrame.div(axis=0) divides each row by its own denominator on every
        # pandas version; np.divide on two DataFrames label-aligns the columns
        # on pandas >= 2.0 and would blank out everything except 'p_mm'.
        rates = sums.div(sums['p_mm'], axis=0)
        # x / 0 yields +-inf, which fillna does not catch; treat it like 0 / 0
        return rates.replace([np.inf, -np.inf], np.nan)

    features_h1 = _per_member_month(pre.query('period < -6'))
    features_h2 = _per_member_month(pre.query('period >= -6'))
    res = features_h2.merge(features_h1, left_index=True, right_index=True, suffixes=('_h2', '_h1'))
    return res.fillna(0)
# label printed by the feature-space search cell
features_semi_annual.name = 'Semi Annual'

In [None]:
# agg yearly_features
def features_annual(df, cols):
    """Aggregate all pre-period rows into per-member-month rates.

    Every column in ``cols`` is summed per member over the rows with
    ``period < 0`` and divided by that member's summed ``p_mm``.

    Args:
        df: long frame with ``member_id``, ``period`` and every name in ``cols``.
        cols: list of column names to aggregate; must contain ``'p_mm'``.

    Returns:
        DataFrame indexed by ``member_id`` with one column per entry of
        ``cols``. Rates that are undefined (zero ``p_mm``) are reported as 0.

    Raises:
        KeyError: if ``'p_mm'`` is not one of ``cols``.
    """
    pre_sums = df.fillna(0).query("period < 0").groupby('member_id')[cols].sum()
    # DataFrame.div(axis=0) divides each row by its own denominator on every
    # pandas version; np.divide on two DataFrames label-aligns the columns on
    # pandas >= 2.0 and would blank out everything except 'p_mm'.
    res = pre_sums.div(pre_sums['p_mm'], axis=0)
    # x / 0 yields +-inf, which fillna does not catch; treat it like 0 / 0
    return res.replace([np.inf, -np.inf], np.nan).fillna(0)
# label printed by the feature-space search cell
features_annual.name = 'Annual'

In [None]:
def print_feature_importance(regr, cols, max_cols=20, min_importance=0.001):
    """Print the most important features of a fitted model, largest first.

    Args:
        regr: fitted estimator exposing ``feature_importances_``.
        cols: feature names, in the same order as ``regr.feature_importances_``.
        max_cols: maximum number of features to print.
        min_importance: features at or below this importance are not printed.
    """
    print('Feature Importance')
    ranked = sorted(zip(regr.feature_importances_, cols), reverse=True)
    notable = [(imp, feat) for imp, feat in ranked if imp > min_importance]
    # slice rather than count-and-break so exactly max_cols rows are shown
    for imp, feat in notable[:max_cols]:
        print('%0.3f: %s' % (imp, feat))

In [None]:
def get_miss_ided(df, y_test, preds, id_pop_size=500):
    """Print how many top-savings members are missed by the model and by a rule.

    Two percentages are printed:

    * ``Miss IDed``: share of the actual top-N test members (by the global
      ``target_col``) that are absent from the model's predicted top-N, where
      N is ``id_pop_size`` scaled down to the size of the test set.
    * ``Rul Miss IDed``: share of the actual top ``id_pop_size`` members of the
      whole population that are absent from the top ``id_pop_size`` ranked by
      ``savings_ft``.

    Args:
        df: long frame with ``member_id``, ``savings_ft`` and ``target_col``.
        y_test: actual target values for the test members, indexed by member.
        preds: model predictions aligned with ``y_test``.
        id_pop_size: number of members identified out of the full population.
    """
    test_df = pd.DataFrame(y_test).assign(pred=preds)
    # one row per member; `df` itself holds one row per member and period
    members = df.groupby('member_id').first()
    # Scale by the number of members, not by the number of member-period rows,
    # so the test-set cut-off is the same fraction as the population cut-off.
    # Keep at least one member so the percentage below is always defined.
    simulated_n_id = max(1, int((id_pop_size / members.shape[0]) * test_df.shape[0]))

    act_top = test_df.sort_values(target_col, ascending=False).iloc[:simulated_n_id]
    pred_top = test_df.sort_values('pred', ascending=False).iloc[:simulated_n_id]

    pre_top = members.sort_values('savings_ft', ascending=False).iloc[:id_pop_size]
    post_top = members.sort_values(target_col, ascending=False).iloc[:id_pop_size]

    print(f'Miss IDed: {act_top.index.difference(pred_top.index).shape[0] * 100.0 / simulated_n_id}%')
    print(f'Rul Miss IDed: {post_top.index.difference(pre_top.index).shape[0] * 100.0 / id_pop_size}%')

# Try training three separate models on the three targets

In [None]:
lvl = '1. h'

df = data.loc[data.lvl_tgt == lvl]

# One row of targets per member. The medical component is what remains of the
# total savings after removing the ds and raf components.
targets_df = df[['member_id', 'savings_tgt', 'raf_savings_tgt', 'ds_savings_tgt']].groupby('member_id').first()
targets_df = targets_df.assign(med_savings_tgt=targets_df.savings_tgt - targets_df.ds_savings_tgt - targets_df.raf_savings_tgt)
targets_df = targets_df[['med_savings_tgt', 'raf_savings_tgt', 'ds_savings_tgt', 'savings_tgt']].sort_index()

# Aggregated monthly columns joined to the first pre-period value of each
# annual (`_ft`) column.
pre_annual = df.query("period < 0")[annual_feature_columns + ['member_id']].groupby('member_id').first()
features_df = (
    features_annual(df, mom_feature_columns)
    .merge(pre_annual, left_index=True, right_index=True)
    .fillna(0)
    .sort_index()
)

# Features and targets must describe exactly the same members in the same
# order. Index.equals compares element-wise, so offsetting differences cannot
# cancel out and non-numeric ids are supported.
assert targets_df.index.equals(features_df.index), 'features and targets cover different members'

# train test split
# target_col = 'savings_tgt'
X_train, X_test, y_train, y_test = train_test_split(features_df, targets_df, test_size=test_set_pct, random_state=seed)

med_gb = GradientBoostingRegressor(random_state=seed)
raf_gb = GradientBoostingRegressor(random_state=seed)
ds_gb = GradientBoostingRegressor(random_state=seed)

# select each target by name so a column reorder cannot silently swap them
med_gb.fit(X_train, y_train['med_savings_tgt'])
raf_gb.fit(X_train, y_train['raf_savings_tgt'])
ds_gb.fit(X_train, y_train['ds_savings_tgt'])

In [None]:
verbose = True

# Predicted total savings is the sum of the three component models.
preds = med_gb.predict(X_test) + raf_gb.predict(X_test) + ds_gb.predict(X_test)

actual = y_test['savings_tgt']
error = (actual - preds).abs()
mean_hrs_error = error.mean()
median_hrs_error = error.median()

print(f'Mean absolute $ error: {mean_hrs_error}')
print(f'Median absolute $ error: {median_hrs_error}')

if verbose:
    fig, ax = plt.subplots(nrows=1, figsize=(20,10))
    ax.scatter(preds, actual)
    ax.set(xlabel='preds', ylabel='actual', title='Predicted vs actual savings')
    plt.show()

# Kitchen sink of features, ridge regression
Result: better than the previous approach; slightly better than random forest and untuned gradient boosting

In [None]:
lvl = '1. h'

df = data.loc[data.lvl_tgt == lvl]

# One target value per member.
targets_df = df[['member_id', target_col]].groupby('member_id').first().sort_index()

# Aggregated monthly columns joined to the first pre-period value of each
# annual (`_ft`) column.
pre_annual = df.query("period < 0")[annual_feature_columns + ['member_id']].groupby('member_id').first()
features_df = (
    features_annual(df, mom_feature_columns)
    .merge(pre_annual, left_index=True, right_index=True)
    .fillna(0)
    .sort_index()
)

# Features and targets must describe exactly the same members in the same
# order. Index.equals compares element-wise, so offsetting differences cannot
# cancel out and non-numeric ids are supported.
assert targets_df.index.equals(features_df.index), 'features and targets cover different members'

# train test split
X_train, X_test, y_train, y_test = train_test_split(features_df, targets_df[target_col], test_size=test_set_pct, random_state=seed)

from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Ridge(normalize=True) was removed in scikit-learn 1.2. The documented
# equivalent is to scale the features with StandardScaler(with_mean=False) and
# multiply alpha by the number of training samples.
ridge_alpha = 1.0
gb = make_pipeline(StandardScaler(with_mean=False), Ridge(alpha=ridge_alpha * len(X_train)))
# gb = GradientBoostingRegressor(random_state=seed)
# gb = RandomForestRegressor(random_state=seed)

gb.fit(X_train, y_train)
verbose = True
preds = gb.predict(X_test)
r2_score = gb.score(X_test, y_test)
error = np.abs(y_test - preds)
mean_hrs_error = error.mean()
median_hrs_error = error.median()


print(f'R^2 Score: {r2_score}')
print(f'Mean absolute $ error: {mean_hrs_error}')
print(f'Median absolute $ error: {median_hrs_error}')
get_miss_ided(df,y_test, preds)

# Express the coefficients in the original feature units (as coef_ was with
# normalize=True) and rank them by magnitude: a linear coefficient can be
# strongly negative, which a plain "> 0.001" filter would hide.
fitted_scaler = gb.named_steps['standardscaler']
fitted_ridge = gb.named_steps['ridge']
coefs = fitted_ridge.coef_ / fitted_scaler.scale_
print('Feature Importance')
ranked = sorted(zip(coefs, features_df.columns), key=lambda pair: abs(pair[0]), reverse=True)
for coef, feat in [(c, f) for c, f in ranked if abs(c) > 0.001][:20]:
    print('%0.3f: %s' % (coef, feat))

if verbose:
    fig, ax = plt.subplots(nrows=1, figsize=(20,10))
    ax.scatter(preds, y_test)
    ax.set_xlabel('preds')
    ax.set_ylabel('actual')
    ax.set_title('Predicted vs actual savings')
    plt.show()

# Kitchen sink of features, boosted tree
Result: better than previous approach, slightly worse than RF

In [None]:
lvl = '1. h'

df = data.loc[data.lvl_tgt == lvl]

# One target value per member.
targets_df = df[['member_id', target_col]].groupby('member_id').first().sort_index()

# Aggregated monthly columns joined to the first pre-period value of each
# annual (`_ft`) column.
pre_annual = df.query("period < 0")[annual_feature_columns + ['member_id']].groupby('member_id').first()
features_df = (
    features_annual(df, mom_feature_columns)
    .merge(pre_annual, left_index=True, right_index=True)
    .fillna(0)
    .sort_index()
)

# Features and targets must describe exactly the same members in the same
# order. Index.equals compares element-wise, so offsetting differences cannot
# cancel out and non-numeric ids are supported.
assert targets_df.index.equals(features_df.index), 'features and targets cover different members'

# train test split
X_train, X_test, y_train, y_test = train_test_split(features_df, targets_df[target_col], test_size=test_set_pct, random_state=seed)

gb = GradientBoostingRegressor(random_state=seed)
#     gb = RandomForestRegressor(random_state=seed)

gb.fit(X_train, y_train)
verbose = True
preds = gb.predict(X_test)
r2_score = gb.score(X_test, y_test)
error = np.abs(y_test - preds)
mean_hrs_error = error.mean()
median_hrs_error = error.median()


print(f'R^2 Score: {r2_score}')
print(f'Mean absolute $ error: {mean_hrs_error}')
print(f'Median absolute $ error: {median_hrs_error}')
get_miss_ided(df, y_test, preds)
print_feature_importance(gb, features_df.columns)

if verbose:
    fig, ax = plt.subplots(nrows=1, figsize=(20,10))
    ax.scatter(preds, y_test)
    ax.set_xlabel('preds')
    ax.set_ylabel('actual')
    ax.set_title('Predicted vs actual savings')
    plt.show()

# Kitchen sink of features, boosted tree, max depth = 1
Result: best r2 so far

In [None]:
lvl = '1. h'

df = data.loc[data.lvl_tgt == lvl]

# One target value per member.
targets_df = df[['member_id', target_col]].groupby('member_id').first().sort_index()

# Aggregated monthly columns joined to the first pre-period value of each
# annual (`_ft`) column.
pre_annual = df.query("period < 0")[annual_feature_columns + ['member_id']].groupby('member_id').first()
features_df = (
    features_annual(df, mom_feature_columns)
    .merge(pre_annual, left_index=True, right_index=True)
    .fillna(0)
    .sort_index()
)

# Features and targets must describe exactly the same members in the same
# order. Index.equals compares element-wise, so offsetting differences cannot
# cancel out and non-numeric ids are supported.
assert targets_df.index.equals(features_df.index), 'features and targets cover different members'

# train test split
X_train, X_test, y_train, y_test = train_test_split(features_df, targets_df[target_col], test_size=test_set_pct, random_state=seed)

# depth-1 trees with at least two samples per leaf
gb = GradientBoostingRegressor(random_state=seed, max_depth=1, min_samples_leaf=2)
#     gb = RandomForestRegressor(random_state=seed)

gb.fit(X_train, y_train)
verbose = True
preds = gb.predict(X_test)
r2_score = gb.score(X_test, y_test)
error = np.abs(y_test - preds)
mean_hrs_error = error.mean()
median_hrs_error = error.median()


print(f'R^2 Score: {r2_score}')
print(f'Mean absolute $ error: {mean_hrs_error}')
print(f'Median absolute $ error: {median_hrs_error}')
get_miss_ided(df, y_test, preds)
print_feature_importance(gb, features_df.columns)

if verbose:
    fig, ax = plt.subplots(nrows=1, figsize=(20,10))
    ax.scatter(preds, y_test)
    ax.set_xlabel('preds')
    ax.set_ylabel('actual')
    ax.set_title('Predicted vs actual savings')
    plt.show()

# Just the new annual features
Result: not quite as good as the kitchen sink

In [None]:
lvl = '1. h'

df = data.loc[data.lvl_tgt == lvl]

# One target value per member.
targets_df = df[['member_id', target_col]].groupby('member_id').first().sort_index()

# Only the annual (`_ft`) columns: first pre-period value per member.
# features_df = features_annual(df, mom_feature_columns)
features_df = (
    df.query("period < 0")[annual_feature_columns + ['member_id']]
    .groupby('member_id')
    .first()
    .fillna(0)
    .sort_index()
)

# Features and targets must describe exactly the same members in the same
# order. Index.equals compares element-wise, so offsetting differences cannot
# cancel out and non-numeric ids are supported.
assert targets_df.index.equals(features_df.index), 'features and targets cover different members'

# train test split
# `target_col` is deliberately not reassigned here: targets_df above was already
# built from the configured value, so overriding it at this point could only
# break the column lookup or silently change the setting for later cells.
X_train, X_test, y_train, y_test = train_test_split(features_df, targets_df[target_col], test_size=test_set_pct, random_state=seed)

gb = GradientBoostingRegressor(random_state=seed)
# gb = GradientBoostingRegressor(random_state=seed, max_depth=1)
#     gb = RandomForestRegressor(random_state=seed)

gb.fit(X_train, y_train)
verbose = True
preds = gb.predict(X_test)
r2_score = gb.score(X_test, y_test)
error = np.abs(y_test - preds)
mean_hrs_error = error.mean()
median_hrs_error = error.median()


print(f'R^2 Score: {r2_score}')
print(f'Mean absolute $ error: {mean_hrs_error}')
print(f'Median absolute $ error: {median_hrs_error}')
get_miss_ided(df, y_test, preds)
print_feature_importance(gb, features_df.columns)

if verbose:
    fig, ax = plt.subplots(nrows=1, figsize=(20,10))
    ax.scatter(preds, y_test)
    ax.set_xlabel('preds')
    ax.set_ylabel('actual')
    ax.set_title('Predicted vs actual savings')
    plt.show()

In [None]:
# Deliberate stop: raising here aborts a "Run All" so the cells below are only
# executed when run by hand.
raise Exception("Stop here when running all")

In [None]:
# Optional sweep over every target level and every feature builder; switched
# off by default via the flag below.
search_level_feature_space = False
if search_level_feature_space:
    levels = ('1. h', '2. m', '3. l')
    feature_funcs = (features_mom, features_semi_annual, features_annual)
    for lvl in levels:
        df = data.loc[data.lvl_tgt == lvl]

        # one target value per member, missing targets treated as 0
        targets_df = df[['member_id', target_col]].groupby('member_id').first()
        targets_df.fillna(0, inplace=True)

        for feat_func in feature_funcs: 
            print('Level: ', lvl)
            # `.name` is the label attached to each feature builder after its definition
            print('Features: ', feat_func.name)
            df = df.fillna(0)
            features_df = feat_func(df, feature_columns)
            # NOTE(review): train_and_test is not defined in the visible part of
            # this notebook — confirm where it comes from before enabling the sweep.
            model = train_and_test(features_df, targets_df)


In [None]:
# Train on the '1. h' level only, using annual aggregates of every feature column.
df = data.loc[data.lvl_tgt == '1. h']

# one target value per member, missing targets treated as 0
targets_df = df[['member_id', target_col]].groupby('member_id').first()
targets_df.fillna(0, inplace=True)

df = df.fillna(0)
features_df = features_annual(df, feature_columns)
# NOTE(review): train_and_test is not defined in the visible part of this
# notebook — confirm where it comes from; `model` below depends on it.
model = train_and_test(features_df, targets_df)

In [None]:
# Score every member, keep the 500 highest predictions, and attach them (plus
# the targets) to the raw '1. h' rows of those members.
preds = model.predict(features_df)
top_500_preds = (
    features_df
    .assign(pred=preds)
    .sort_values('pred', ascending=False)
    .iloc[:500][['pred']]
)
# id_pop = data.loc[(data.lvl_tgt == lvl) & (data.period > 0)]
id_pop = (
    data.loc[data.lvl_tgt == '1. h']
    .set_index('member_id')
    # inner joins on member_id: only the top-500 members survive
    .merge(top_500_preds, left_index=True, right_index=True)
    .merge(targets_df, left_index=True, right_index=True)
    .fillna(0)
)

In [None]:
# Preview the rows of the members with the 500 highest predictions.
id_pop.head()
# features_df.assign(pred=preds)[['pred']].to_csv('../outputs/h_preds.csv')

In [None]:
# Cost columns to plot: everything ending in '_tc' plus the overall 'tc',
# excluding the long 'hcbs_*' sub-category columns (the short 'hcbs_tc' stays).
tcs = [col for col in features_df.columns if col.endswith('_tc')]
tcs.append('tc')
tcs = [col for col in tcs if not (len(col) >= 8 and col.startswith('hcbs_'))]

# Long format: one row per (period, cost column) observation.
cost_by_period = id_pop.melt(id_vars=['period'], value_vars=tcs)
g = sns.relplot(
    data=cost_by_period,
    x="period",
    y="value",
    hue="variable",
    kind="line",
    height=12,
    aspect=2,
)

In [None]:
# tune hyperparameters: didn't see much benefit, takes about 30 minutes to run

# lvl  = '1. h'
# df = data.loc[data.lvl_tgt == lvl]
# df = df.fillna(0)

# targets_df = df[['member_id', target_col]].groupby('member_id').first()
# feats_df = features_annual(df, feature_columns)

# feats_df.sort_index(inplace=True)
# targets_df.sort_index(inplace=True)
# assert sum(targets_df.index - feats_df.index) == 0

# # train test split
# target_col = 'savings_tgt'
# X_train, X_test, y_train, y_test = train_test_split(feats_df, targets_df[target_col], test_size=test_set_pct, random_state=seed)

# gb = GradientBoostingRegressor(random_state=seed)

# grid = {
#     'learning_rate': [0.001, 0.01, 0.1, 0.2],
#     'n_estimators': [10, 100, 500, 1000],
#     'max_depth': [2, 3, 4, 5, 10],
#     'min_samples_leaf': [1, 2, 3, 4, 5],
# }

# clf = GridSearchCV(gb, grid, n_jobs=os.cpu_count())

# search = clf.fit(X_train, y_train)

# cv_res = pd.DataFrame(search.cv_results_)
# cv_res.sort_values('rank_test_score')

In [None]:
# model_name = cb_utils.save_model(model, '1_xgb_cat_savings', {'name': '1_xgb_cat_savings', 'features': list(features_df.columns)})

In [None]:
# load model trained on mab 5
# Replaces any in-memory `model` with the saved artifact and its metadata.
# NOTE(review): the saved name mentions "xgb" while this notebook only fits
# scikit-learn estimators — confirm which estimator the artifact holds and that
# its expected feature order matches the frames passed to predict().
model_name = '20200731_153512_1_xgb_cat_savings'
model, meta = cb_utils.load_model(model_name)

In [None]:
# cb_utils.save_scores(final)

In [None]:
# test on a different population
# Same training view as above, restricted to MAB 6 instead of MAB 5.
query = "SELECT * FROM cb.vw_mab_training_data WHERE mab_id = 6;"
mab6 = cb_utils.sql_query_to_df(query, use_cache=use_cache)

In [None]:
# Targets are taken before the frame-wide fillna so that groupby().first()
# still picks each member's first non-missing value.
targets_df = (
    mab6[['member_id', target_col]]
    .groupby('member_id')
    .first()
    .fillna(0)
)

mab6 = mab6.fillna(0)
features_df = features_annual(mab6, feature_columns).sort_index()
targets_df = targets_df.sort_index()
# NOTE(review): nothing here checks that features_df and targets_df cover the
# same members — confirm every MAB 6 member has pre-period rows.

actual = targets_df[target_col]
preds = model.predict(features_df)
r2_score = model.score(features_df, actual)

error = (actual - preds).abs()
mean_hrs_error = error.mean()
median_hrs_error = error.median()

print('Testing MAB 6 data on model trained with mab 5')
print(f'R^2 Score: {r2_score}')
print(f'Mean absolute $ error: {mean_hrs_error}')
print(f'Median absolute $ error: {median_hrs_error}')

fig, ax = plt.subplots(nrows=1, figsize=(20,10))
ax.scatter(preds, actual)
ax.set(xlabel='preds', ylabel='actual', title='Predicted vs actual savings')
plt.show()


In [None]:
# try training on early set 
query = "SELECT * FROM cb.vw_mab_training_data WHERE mab_id = 6;"
mab6 = cb_utils.sql_query_to_df(query, use_cache=use_cache)

# Filter MAB 6 on its own level column. A mask built from `data` (the MAB 5
# frame) is aligned on row labels, so it would select unrelated rows or fail
# outright when the two frames' indexes differ.
df = mab6.loc[mab6.lvl_tgt == '1. h']

# one target value per member, missing targets treated as 0
targets_df = df[['member_id', target_col]].groupby('member_id').first()
targets_df.fillna(0, inplace=True)

df = df.fillna(0)
features_df = features_annual(df, feature_columns)
# NOTE(review): train_and_test is not defined in the visible part of this
# notebook — confirm where it comes from.
model = train_and_test(features_df, targets_df)