In [None]:
import os
import sys
import time
import random
import collections
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, BaggingRegressor, StackingRegressor
from sklearn.linear_model import Ridge, Lasso, BayesianRidge, ElasticNet

# Put ../src on the import path so the project-local cb_utils module can be imported.
sys.path.append('../src')
import cb_utils

# Seaborn dark-grid theme for all plots; let pandas display up to 500 columns.
sns.set(style="darkgrid")
pd.options.display.max_columns = 500

%load_ext autoreload
%autoreload 2

In [None]:
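# from fastai.tabular import *
# NOTE(review): the fastai cells further down use FillMissing, Categorify, Normalize,
# TabularDataBunch, tabular_learner, to_np, root_mean_squared_error and r2_score, none of
# which are imported anywhere in the visible notebook. They presumably come from this star
# import, so those cells will raise NameError on a fresh kernel while it stays commented out.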

In [None]:
# Notebook-wide settings read by the cells below.
lvl, target_col = '2. m', 'savings_tgt'  # lvl_tgt value to keep; column to predict
use_cache = True                         # forwarded to cb_utils.sql_query_to_df
test_set_pct = 0.2                       # passed as test_size to train_test_split

# A new seed in [0, 100] is drawn on every run and echoed, so a given split can be
# reproduced by hard-coding the printed value here.
seed = random.randrange(0, 101)
print(f'Seed: {seed}')

In [None]:
# Load data: one frame per MAB snapshot from cb.vw_mab_training_data.
# mab_id 4 is the 2018 data and mab_id 3 the 2017 data; 4 is fetched first, then 3.
# Missing values are left as loaded here; later cells call fillna(0) where they need it.
frames = {}
for mab_id in (4, 3):
    query = f"SELECT * FROM cb.vw_mab_training_data WHERE mab_id = {mab_id};"
    frames[mab_id] = cb_utils.sql_query_to_df(query, use_cache=use_cache)

mab4, mab3 = frames[4], frames[3]

In [None]:
# Columns handed to cb_utils.features_annual in the cells below.
# Left out on purpose: 'lob_at_id', 'grp_at_id', 'is_unaligned'.
mom_feature_columns = [
    # days per lob / grp
    'lob_1_days', 'lob_2_days', 'lob_3_days',
    'grp_1_days', 'grp_2_days', 'grp_3_days', 'grp_5_days',
    'grp_6_days', 'grp_7_days', 'grp_8_days',
    'unaligned_days',
    # *_tc columns
    'tc', 'hcbs_tc', 'icf_tc', 'ip_tc', 'rx_tc', 'ed_tc', 'snf_tc',
    'out_tc', 'pro_tc', 'spfac_tc', 'amb_tc', 'hh_tc', 'hosp_tc', 'oth_tc',
    'p_mm', 'mm',
    # hcbs_*_tc columns
    'hcbs_respite_tc', 'hcbs_fam_care_stip_tc', 'hcbs_com_trans_tc',
    'hcbs_educ_train_tc', 'hcbs_com_liv_fam_tc', 'hcbs_com_liv_tc',
    'hcbs_attend_care_tc', 'hcbs_com_trans_waiv_tc', 'hcbs_home_meal_tc',
    'hcbs_pers_care_tc', 'hcbs_ther_behav_tc', 'hcbs_unsk_respite_tc',
    'hcbs_waiv_svc_tc',
    # *_ddos columns
    'ddos', 'hcbs_ddos', 'icf_ddos', 'ip_ddos', 'rx_ddos', 'ed_ddos', 'snf_ddos',
    'out_ddos', 'pro_ddos', 'spfac_ddos', 'amb_ddos', 'hh_ddos', 'hosp_ddos', 'oth_ddos',
    'pcp_ddos', 'chf_ddos', 'heart_ddos', 'copd_ddos', 'pulmonar_ddos',
    'cancer_ddos', 'ckd_ddos', 'esrd_ddos', 'hyperlipid_ddos', 'diab_ddos',
    'alzh_ddos', 'dementia_ddos', 'stroke_ddos', 'hypertension_ddos',
    'fall_ddos', 'transplant_ddos', 'liver_ddos', 'hippfract_ddos',
    'depression_ddos', 'psychosis_ddos', 'drug_ddos', 'alcohol_ddos',
    'paralysis_ddos',
]

# Columns taken from the first pre-period (period < 0) row per member in the cells below.
# Left out on purpose: 'lvl_ft', 'is_unaligned_ft'.
annual_feature_columns = [
    'unaligned_mm_ft',
    'is_self_directed_ft',
    # one-hot flags
    'is_cat0_ft', 'is_cat1_ft', 'is_cat2_ft', 'is_cat3_ft', 'is_cat4_ft',
    'is_lob1_ft', 'is_lob2_ft', 'is_lob3_ft',
    'is_grp1_ft', 'is_grp2_ft', 'is_grp3_ft', 'is_grp45678_ft',
    # *_sav_pct_ft columns
    'sav_pct_ft', 'raf_sav_pct_ft', 'ds_sav_pct_ft', 'ip_sav_pct_ft', 'snf_sav_pct_ft',
    'icf_sav_pct_ft', 'ed_sav_pct_ft', 'hh_sav_pct_ft', 'pro_sav_pct_ft', 'out_sav_pct_ft',
    # *_savings_ft columns
    'savings_ft', 'raf_savings_ft', 'ds_savings_ft', 'ip_savings_ft', 'snf_savings_ft',
    'icf_savings_ft', 'ed_savings_ft', 'hh_savings_ft', 'pro_savings_ft', 'out_savings_ft',
    # *_tc_ft columns
    'tc_ft', 'hcbs_atd_pcs_tc_ft', 'ip_tc_ft', 'snf_tc_ft', 'icf_tc_ft',
    'ed_tc_ft', 'hh_tc_ft', 'pro_tc_ft', 'out_tc_ft',
    # *_sav_pmpm_ft columns
    'savings_pmpm_ft', 'raf_sav_pmpm_ft', 'ds_sav_pmpm_ft', 'ip_sav_pmpm_ft', 'snf_sav_pmpm_ft',
    'icf_sav_pmpm_ft', 'ed_sav_pmpm_ft', 'hh_sav_pmpm_ft', 'pro_sav_pmpm_ft', 'out_sav_pmpm_ft',
    # *_pmpm_ft columns
    'tc_pmpm_ft', 'hcbs_attd_pmpm_ft', 'ip_pmpm_ft', 'snf_pmpm_ft', 'icf_pmpm_ft',
    'ed_pmpm_ft', 'hh_pmpm_ft', 'pro_pmpm_ft', 'out_pmpm_ft',
    'mm_ft', 'age', 'is_male',
]

# Same values as the configuration cell, restated so this cell also runs on its own.
target_col = 'savings_tgt'
lvl = '2. m'

feature_columns = mom_feature_columns + annual_feature_columns
print(f'N Potential Features: {len(feature_columns)}')

In [None]:
# Member-level modelling frame for the 2018 data (mab4), restricted to the configured level.
df = mab4.loc[mab4.lvl_tgt == lvl]

# One target value per member: the first row found for each member_id.
targets_df = df[['member_id', target_col]].groupby('member_id').first()

# cb_utils.features_annual output for the month-over-month columns, joined on the index
# to the first pre-period (period < 0) row of annual columns for each member.
features_df = cb_utils.features_annual(df, mom_feature_columns)
pre_annual = df.query("period < 0")[annual_feature_columns + ['member_id']].groupby('member_id').first()
features_df = features_df.merge(pre_annual, left_index=True, right_index=True)

# Order features and targets by member id and require the two indexes to match label for
# label. The earlier sum-of-differences check passed whenever mismatches cancelled out
# (e.g. +1 and -1) and only worked for numeric ids.
features_df = features_df.sort_index()
targets_df = targets_df.sort_index()
assert targets_df.index.equals(features_df.index), 'features_df and targets_df are not aligned on member_id'

X_train, X_test, y_train, y_test = train_test_split(features_df, targets_df[target_col], test_size=test_set_pct, random_state=seed)
df = features_df.merge(targets_df, left_index=True, right_index=True)

# Collapse the one-hot flags into single integer codes (defaults: cat=0, grp=1, lob=1).
# Assignments run in the listed order, so if several flags are set the last one wins.
df = df.assign(cat=0, grp=1, lob=1)
for label, flag_codes in (
    ('cat', (('is_cat1_ft', 1), ('is_cat2_ft', 2), ('is_cat3_ft', 3), ('is_cat4_ft', 4))),
    ('grp', (('is_grp2_ft', 2), ('is_grp3_ft', 3), ('is_grp45678_ft', 4))),
    ('lob', (('is_lob2_ft', 2), ('is_lob3_ft', 3))),
):
    for flag, code in flag_codes:
        df.loc[df[flag] == 1, label] = code

df.head()

In [None]:
# Preprocessing steps passed as `procs` to TabularDataBunch.from_df in a later cell.
# NOTE(review): FillMissing, Categorify and Normalize are not imported in the visible
# notebook — presumably they come from the commented-out `from fastai.tabular import *`.
procs = [FillMissing, Categorify, Normalize]

In [None]:
# Row positions in df whose index label also appears in X_test.index; these positions
# are passed as valid_idx when the data bunch is built.
valid_idx = [
    position
    for position, member in enumerate(df.index)
    if member in X_test.index
]

In [None]:
# Column roles for the tabular data bunch: dependent variable, categorical inputs,
# continuous inputs.
dep_var = target_col
cat_names = ['is_self_directed_ft', 'cat', 'grp', 'lob', 'is_male']
cont_names = [
    'age', 'tc', 'hcbs_tc', 'ip_tc', 'savings_pmpm_ft',
    'raf_savings_ft', 'ds_savings_ft', 'ip_savings_ft', 'p_mm', 'ddos',
]

In [None]:
# Build the data bunch from df, using the positions in valid_idx as the validation rows
# and one loader worker per CPU reported by os.cpu_count().
data = TabularDataBunch.from_df(
    'test_bunch_path',
    df,
    dep_var,
    valid_idx=valid_idx,
    procs=procs,
    cat_names=cat_names,
    cont_names=cont_names,
    num_workers=os.cpu_count(),
)

# Echo the categorical and continuous column names the training set ended up with.
for names in (data.train_ds.cat_names, data.train_ds.cont_names):
    print(names)

In [None]:
# Pull one batch from the training loader and print the first five entries of each part.
# NOTE: this rebinds `y`; a later cell sets `y` back to a column name before plotting.
first_batch = next(iter(data.train_dl))
(cat_x, cont_x), y = first_batch
for part in (cat_x, cont_x, y):
    print(to_np(part[:5]))

In [None]:
# learn = tabular_learner(data, layers=[20,10], emb_szs={'grp': 5, 'cat': 5}, metrics=accuracy)
# Build the learner on `data` with layers=[32, 16] and the two regression metrics, then
# run the learning-rate finder (its results are plotted in the next cell).
learn = tabular_learner(data, layers=[32,16], metrics=[root_mean_squared_error, r2_score])
learn.lr_find()

In [None]:
# Plot what the recorder captured during lr_find() above (presumably loss against
# learning rate — confirm for the installed fastai version).
learn.recorder.plot()

In [None]:
# learn.fit(100, 1e-2)
# One-cycle training with arguments 10 and 1e-1 (presumably the number of epochs and the
# maximum learning rate — confirm against the fastai API).
learn.fit_one_cycle(10, 1e-1)

In [None]:
# Plot the losses the recorder stored during the training run above.
learn.recorder.plot_losses()

In [None]:
# Plot the metrics the recorder stored during training (the learner was created with
# root_mean_squared_error and r2_score).
learn.recorder.plot_metrics()

# Doesn't make any sense that it does so poorly, investigate
First up, plot some correlations.

In [None]:
# Column names used as the y and x axes in the relplot cells below; `y` is also
# assigned to target_col in the next cell.
y = "savings_pmpm_tgt"
x = "savings_pmpm_ft"

In [None]:
# 2018 rows (mab4) at the configured level, with the target switched to the column named by `y`.
df = mab4.loc[mab4.lvl_tgt == lvl]
# df = mab3.loc[mab3.lvl_tgt == lvl]
target_col = y

# One target value per member: the first row found for each member_id.
targets_df = df[['member_id', target_col]].groupby('member_id').first()

# cb_utils.features_annual output for the month-over-month columns, joined on the index
# to the first pre-period (period < 0) row of annual columns for each member.
features_df = cb_utils.features_annual(df, mom_feature_columns)
pre_annual = df.query("period < 0")[annual_feature_columns + ['member_id']].groupby('member_id').first()
features_df = features_df.merge(pre_annual, left_index=True, right_index=True)

# Order features and targets by member id and require the two indexes to match label for
# label. The earlier sum-of-differences check passed whenever mismatches cancelled out
# (e.g. +1 and -1) and only worked for numeric ids.
features_df = features_df.sort_index()
targets_df = targets_df.sort_index()
assert targets_df.index.equals(features_df.index), 'features_df and targets_df are not aligned on member_id'

# X_train, X_test, y_train, y_test = train_test_split(features_df, targets_df[target_col], test_size=test_set_pct, random_state=seed)
# df = features_df.merge(targets_df, left_index=True, right_index=True)
# NOTE(review): with the merge above commented out, df is still the filtered mab4 rows
# rather than the member-level features/targets frame — confirm that is what the plots
# below are meant to show.

# Collapse the one-hot flags into single integer codes (defaults: cat=0, grp=1, lob=1).
# Assignments run in the listed order, so if several flags are set the last one wins.
df = df.assign(cat=0, grp=1, lob=1)
for label, flag_codes in (
    ('cat', (('is_cat1_ft', 1), ('is_cat2_ft', 2), ('is_cat3_ft', 3), ('is_cat4_ft', 4))),
    ('grp', (('is_grp2_ft', 2), ('is_grp3_ft', 3), ('is_grp45678_ft', 4))),
    ('lob', (('is_lob2_ft', 2), ('is_lob3_ft', 3))),
):
    for flag, code in flag_codes:
        df.loc[df[flag] == 1, label] = code

df = df.fillna(0)
# df =     df.sort_values('savings_ft', ascending=False).iloc[:850]

In [None]:
# (rows, columns) of the frame built in the previous cell.
df.shape

In [None]:

# Scatter of column `x` against column `y`, coloured by cat, with one panel per
# grp (rows) and lob (columns).
g = sns.relplot(x=x, y=y, hue="cat", col="lob", row="grp", data=df)

In [None]:
# NOTE(review): identical to the previous cell's plot call — likely a leftover duplicate.
g = sns.relplot(x=x, y=y, hue="cat", col="lob", row="grp", data=df)

In [None]:
# savings_ft against savings_tgt, coloured by age, with one panel per grp (rows) and cat (columns).
g = sns.relplot(x="savings_ft", y="savings_tgt", hue="age", col="cat", row="grp", data=df)

In [None]:
# Same axes and panels as the previous cell's plot, coloured by ddos instead of age.
g = sns.relplot(x="savings_ft", y="savings_tgt", hue="ddos", col="cat", row="grp", data=df)

In [None]:
# Member-level frame for the 2017 data (mab3), restricted to the configured level.
# df = mab4.loc[mab4.lvl_tgt == lvl]
df = mab3.loc[mab3.lvl_tgt == lvl]

# One target value per member: the first row found for each member_id.
targets_df = df[['member_id', target_col]].groupby('member_id').first()

# cb_utils.features_annual output for the month-over-month columns, joined on the index
# to the first pre-period (period < 0) row of annual columns for each member.
features_df = cb_utils.features_annual(df, mom_feature_columns)
pre_annual = df.query("period < 0")[annual_feature_columns + ['member_id']].groupby('member_id').first()
features_df = features_df.merge(pre_annual, left_index=True, right_index=True)

# Order features and targets by member id and require the two indexes to match label for
# label. The earlier sum-of-differences check passed whenever mismatches cancelled out
# (e.g. +1 and -1) and only worked for numeric ids.
features_df = features_df.sort_index()
targets_df = targets_df.sort_index()
assert targets_df.index.equals(features_df.index), 'features_df and targets_df are not aligned on member_id'

X_train, X_test, y_train, y_test = train_test_split(features_df, targets_df[target_col], test_size=test_set_pct, random_state=seed)
df = features_df.merge(targets_df, left_index=True, right_index=True)

# Collapse the one-hot flags into single integer codes (defaults: cat=0, grp=1, lob=1).
# Assignments run in the listed order, so if several flags are set the last one wins.
df = df.assign(cat=0, grp=1, lob=1)
for label, flag_codes in (
    ('cat', (('is_cat1_ft', 1), ('is_cat2_ft', 2), ('is_cat3_ft', 3), ('is_cat4_ft', 4))),
    ('grp', (('is_grp2_ft', 2), ('is_grp3_ft', 3), ('is_grp45678_ft', 4))),
    ('lob', (('is_lob2_ft', 2), ('is_lob3_ft', 3))),
):
    for flag, code in flag_codes:
        df.loc[df[flag] == 1, label] = code

df = df.fillna(0)

In [None]:
# Scatter of column `x` against column `y` for the frame built in the previous cell,
# coloured by cat, with one panel per grp (rows) and lob (columns).
g = sns.relplot(x=x, y=y, hue="cat", col="lob", row="grp", data=df)

# Check feature importance via permutation

In [None]:
## see how diff features affect perf
# One-hot flag columns that are cast to int before fitting.
hot_cat_names = [
    'is_self_directed_ft',
    'is_cat0_ft', 'is_cat1_ft', 'is_cat2_ft', 'is_cat3_ft', 'is_cat4_ft',
    'is_lob1_ft', 'is_lob2_ft', 'is_lob3_ft',
    'is_grp1_ft', 'is_grp2_ft', 'is_grp3_ft', 'is_grp45678_ft',
]
df = df.fillna(0)
df[hot_cat_names] = df[hot_cat_names].astype(int)
# NOTE(review): this preview is not the last expression of the cell, so nothing is shown
# for it; it only verifies that the selected columns exist.
df[cont_names + hot_cat_names].head()

# Use every column whose name does not contain 'tgt' as a predictor.
features = [column for column in df.columns if 'tgt' not in column]
model = Ridge(alpha=1.0)
model.fit(df[features], df[target_col])

# Score on the same rows the model was fitted on (in-sample).
model.score(df[features], df[target_col])

In [None]:
# Rebuild the 2018 (mab4) features/targets with missing values zero-filled up front,
# then split into train and test sets.
df = mab4.loc[mab4.lvl_tgt == lvl]
df = df.fillna(0)

# One target value per member: the first row found for each member_id.
targets_df = df[['member_id', target_col]].groupby('member_id').first()

# cb_utils.features_annual output for the month-over-month columns, joined on the index
# to the first pre-period (period < 0) row of annual columns for each member.
features_df = cb_utils.features_annual(df, mom_feature_columns)
pre_annual = df.query("period < 0")[annual_feature_columns + ['member_id']].groupby('member_id').first()
features_df = features_df.merge(pre_annual, left_index=True, right_index=True)

# Order features and targets by member id and require the two indexes to match label for
# label. The earlier sum-of-differences check passed whenever mismatches cancelled out
# (e.g. +1 and -1) and only worked for numeric ids.
features_df = features_df.sort_index()
targets_df = targets_df.sort_index()
assert targets_df.index.equals(features_df.index), 'features_df and targets_df are not aligned on member_id'

X_train, X_test, y_train, y_test = train_test_split(features_df, targets_df[target_col], test_size=test_set_pct, random_state=seed)

In [None]:
# model = Lasso(alpha=1.)
# Fit a BayesianRidge with default settings on the training split and score it on the
# held-out test split; the score is the cell's displayed output.
model = BayesianRidge()
model.fit(X_train, y_train)
model.score(X_test, y_test)

In [None]:
from sklearn.inspection import permutation_importance

# Permutation importance of the fitted model on the test split (30 shuffles per feature).
r = permutation_importance(model, X_test, y_test, n_repeats=30, random_state=0)

# Walk the features from highest to lowest mean importance and keep those whose mean
# stays above zero after subtracting two standard deviations.
feats = []
ranked = r.importances_mean.argsort()[::-1]
for i in ranked:
    mean_i, std_i = r.importances_mean[i], r.importances_std[i]
    if mean_i - 2 * std_i > 0:
        name = X_train.columns[i]
        print(f"{name:<8} {mean_i:.3f} +/- {std_i:.3f}")
        feats.append(name)

In [None]:
# Hand-written shortlist of feature names (presumably copied from the permutation
# importance output above — confirm before relying on it).
best_feats = [
    'savings_ft', 'ip_tc_ft', 'hh_tc_ft', 'hcbs_atd_pcs_tc_ft',
    'pro_tc_ft', 'snf_tc_ft', 'ds_savings_ft', 'rx_tc',
]

### Try on 2017

In [None]:
# Score the already-fitted model on the full 2017 data (mab3); no refit and no split.
df = mab3.loc[mab3.lvl_tgt == lvl]
df = df.fillna(0)

# One target value per member: the first row found for each member_id.
targets_df = df[['member_id', target_col]].groupby('member_id').first()

# cb_utils.features_annual output for the month-over-month columns, joined on the index
# to the first pre-period (period < 0) row of annual columns for each member.
features_df = cb_utils.features_annual(df, mom_feature_columns)
pre_annual = df.query("period < 0")[annual_feature_columns + ['member_id']].groupby('member_id').first()
features_df = features_df.merge(pre_annual, left_index=True, right_index=True)

# Order features and targets by member id and require the two indexes to match label for
# label. The earlier sum-of-differences check passed whenever mismatches cancelled out
# (e.g. +1 and -1) and only worked for numeric ids.
features_df = features_df.sort_index()
targets_df = targets_df.sort_index()
assert targets_df.index.equals(features_df.index), 'features_df and targets_df are not aligned on member_id'

model.score(features_df, targets_df[target_col])
# X_train, X_test, y_train, y_test = train_test_split(features_df, targets_df[target_col], test_size=test_set_pct, random_state=seed)
# X_train, X_test, y_train, y_test = train_test_split(features_df, targets_df[target_col], test_size=test_set_pct, random_state=seed)

In [None]:
from sklearn.inspection import permutation_importance

# Permutation importance of the already-fitted model on the 2017 features and targets.
r = permutation_importance(model, features_df, targets_df[target_col], n_repeats=30, random_state=0)
feats = []

# Names are read from features_df — the frame that was actually permuted — instead of
# X_train from an earlier cell, so labels cannot drift from their scores if the two
# frames ever differ in column order.
for i in r.importances_mean.argsort()[::-1]:
    # Keep features whose mean importance stays above zero after subtracting two standard deviations.
    if r.importances_mean[i] - 2 * r.importances_std[i] > 0:
        print(f"{features_df.columns[i]:<8} {r.importances_mean[i]:.3f} +/- {r.importances_std[i]:.3f}")
        feats.append(features_df.columns[i])

# Double check that rules are really winning

In [None]:
# Deliberate stop: raising here halts a "Run All" at this point, so any cells after
# this one only run when executed by hand.
raise Exception("STOP")