In [None]:
import os
import sys
import time
import random
import collections
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge
from sklearn import preprocessing

# Make ../src importable before importing the project helper module.
# NOTE(review): cb_utils presumably lives in ../src -- confirm.
sys.path.append('../src')
import cb_utils

# Plot styling, and a high column limit so wide feature frames display in full.
sns.set(style="darkgrid")
pd.options.display.max_columns = 500

# IPython autoreload in mode 2: re-import modules before each cell execution.
%load_ext autoreload
%autoreload 2

In [None]:
# configuration: values the cells below read by name
scoring_run_id = 1  # not referenced by any other visible cell
use_cache = True  # forwarded to cb_utils.sql_query_to_df
seed = random.randint(0, 100)  # drawn fresh on every run; printed below so a run can be repeated by hand
test_set_pct = 0.2  # passed as test_size to train_test_split
print(f'Seed: {seed}')

In [None]:
# Load data: every row of the training view for mab_id = 1.
# NOTE(review): cb_utils.sql_query_to_df is a project helper; what use_cache does is not visible here.
query = "SELECT * FROM cb.vw_mab_training_data WHERE mab_id = 1;"
data = cb_utils.sql_query_to_df(query, use_cache=use_cache)
# Missing values become 0 in place, so every later cell sees the filled frame.
data.fillna(0, inplace=True)

In [None]:
# Per-period columns. The cells below pass this list to features_annual, which
# sums each column over a member's pre-period rows (period < 0) and divides by
# the summed 'p_mm'.
# NOTE(review): "mom" presumably means month-over-month -- confirm.
mom_feature_columns = [
    'lob_1_days'
  , 'lob_2_days'
  , 'lob_3_days'
  , 'grp_1_days'
  , 'grp_2_days'
  , 'grp_3_days'
  , 'grp_5_days'
  , 'grp_6_days'
  , 'grp_7_days'
  , 'grp_8_days'
  , 'unaligned_days'
#   , 'is_unaligned'
  , 'tc'
  , 'hcbs_tc'
  , 'icf_tc'
  , 'ip_tc'
  , 'rx_tc'
  , 'ed_tc'
  , 'snf_tc'
  , 'out_tc'
  , 'pro_tc'
  , 'spfac_tc'
  , 'amb_tc'
  , 'hh_tc'
  , 'hosp_tc'
  , 'oth_tc'
  , 'p_mm'
  , 'mm'
  , 'hcbs_respite_tc'
  , 'hcbs_fam_care_stip_tc'
  , 'hcbs_com_trans_tc'
  , 'hcbs_educ_train_tc'
  , 'hcbs_com_liv_fam_tc'
  , 'hcbs_com_liv_tc'
  , 'hcbs_attend_care_tc'
  , 'hcbs_com_trans_waiv_tc'
  , 'hcbs_home_meal_tc'
  , 'hcbs_pers_care_tc'
  , 'hcbs_ther_behav_tc'
  , 'hcbs_unsk_respite_tc'
  , 'hcbs_waiv_svc_tc'
  , 'ddos'
  , 'hcbs_ddos'
  , 'icf_ddos'
  , 'ip_ddos'
  , 'rx_ddos'
  , 'ed_ddos'
  , 'snf_ddos'
  , 'out_ddos'
  , 'pro_ddos'
  , 'spfac_ddos'
  , 'amb_ddos'
  , 'hh_ddos'
  , 'hosp_ddos'
  , 'oth_ddos'
  , 'pcp_ddos'
  , 'pulmonar_ddos'
  , 'cancer_ddos'
  , 'ckd_ddos'
  , 'esrd_ddos'
  , 'hyperlipid_ddos'
  , 'diab_ddos'
  , 'alzh_ddos'
  , 'dementia_ddos'
  , 'stroke_ddos'
  , 'hypertension_ddos'
  , 'fall_ddos'
  , 'transplant_ddos'
  , 'liver_ddos'
  , 'hippfract_ddos'
  , 'depression_ddos'
  , 'psychosis_ddos'
  , 'drug_ddos'
  , 'alcohol_ddos'
  , 'paralysis_ddos'
]
# Per-member columns. The cells below read these with
# groupby('member_id').first() over the pre-period rows (period < 0), i.e. one
# value per member rather than a sum over periods.
annual_feature_columns = [
#   , 'lvl_ft'
#   , 'is_unaligned_ft'
    'unaligned_mm_ft'
  , 'is_self_directed_ft'
  , 'is_cat0_ft'
  , 'is_cat1_ft'
  , 'is_cat2_ft'
  , 'is_cat3_ft'
  , 'is_cat4_ft'
  , 'is_lob1_ft'
  , 'is_lob2_ft'
  , 'is_lob3_ft'
  , 'is_grp1_ft'
  , 'is_grp2_ft'
  , 'is_grp3_ft'
  , 'is_grp45678_ft'
  , 'sav_pct_ft'
  , 'raf_sav_pct_ft'
  , 'ds_sav_pct_ft'
  , 'ip_sav_pct_ft'
  , 'snf_sav_pct_ft'
  , 'icf_sav_pct_ft'
  , 'ed_sav_pct_ft'
  , 'hh_sav_pct_ft'
  , 'pro_sav_pct_ft'
  , 'out_sav_pct_ft'
  , 'savings_ft'
  , 'raf_savings_ft'
  , 'ds_savings_ft'
  , 'ip_savings_ft'
  , 'snf_savings_ft'
  , 'icf_savings_ft'
  , 'ed_savings_ft'
  , 'hh_savings_ft'
  , 'pro_savings_ft'
  , 'out_savings_ft'
  , 'tc_ft'
  , 'hcbs_atd_pcs_tc_ft'
  , 'ip_tc_ft'
  , 'snf_tc_ft'
  , 'icf_tc_ft'
  , 'ed_tc_ft'
  , 'hh_tc_ft'
  , 'pro_tc_ft'
  , 'out_tc_ft'
  , 'savings_pmpm_ft' # start pmpms
  , 'raf_sav_pmpm_ft'
  , 'ds_sav_pmpm_ft'
  , 'ip_sav_pmpm_ft'
  , 'snf_sav_pmpm_ft'
  , 'icf_sav_pmpm_ft'
  , 'ed_sav_pmpm_ft'
  , 'hh_sav_pmpm_ft'
  , 'pro_sav_pmpm_ft'
  , 'out_sav_pmpm_ft'
  , 'tc_pmpm_ft'
  , 'hcbs_attd_pmpm_ft'
  , 'ip_pmpm_ft'
  , 'snf_pmpm_ft'
  , 'icf_pmpm_ft'
  , 'ed_pmpm_ft'
  , 'hh_pmpm_ft'
  , 'pro_pmpm_ft'
  , 'out_pmpm_ft'
  , 'mm_ft'
  , 'age'
  , 'is_male'
]
# Column the regressors are trained to predict; the commented line is an
# alternative target kept for reference.
target_col = 'savings_tgt'
# target_col = 'savings_pmpm_tgt'

# Full candidate list: per-period columns first, then per-member columns.
feature_columns = mom_feature_columns + annual_feature_columns
# Last expression of the cell, so the notebook displays the feature count.
len(feature_columns)

### Try agg features at year and half year level

In [None]:
# fully broken out month over month features
def features_mom(df, cols):
    """Pivot the pre-period rows into one column per (period, feature) pair.

    Args:
        df: long-format frame with 'member_id', 'period' and the `cols` columns.
        cols: list of column names to spread across periods.

    Returns:
        DataFrame indexed by member_id whose columns are named
        '<period>-<feature>'; missing values are filled with 0.
    """
    history = df.fillna(0).query("period < 0")
    wide = history.pivot(index='member_id', columns='period', values=cols)
    # pivot yields (feature, period) column pairs; flatten them to plain strings.
    wide.columns = [f'{period}-{name}' for name, period in wide.columns]
    return wide.fillna(0)
features_mom.name = 'MOM'

In [None]:
# agg semi yearly_features
def features_semi_annual(df, cols):
    """Per-member-month averages of `cols` for each half of the pre-period.

    The pre-period (period < 0) is split into h1 (period < -6) and
    h2 (-6 <= period < 0). Within each half every column in `cols` is summed
    per member and divided by that member's summed 'p_mm'.

    Args:
        df: long-format frame with 'member_id', 'period', 'p_mm' and `cols`.
        cols: list of column names to aggregate. 'p_mm' does not have to be
            listed; the denominator is always read from `df`.

    Returns:
        DataFrame indexed by member_id with '<col>_h2' and '<col>_h1' columns.
        Only members with rows in both halves are kept (inner merge). NaN
        (e.g. 0/0) is replaced by 0; a non-zero total over zero 'p_mm' stays
        +/-inf, as before.
    """
    pre = df.fillna(0).query("period < 0")

    def _per_member_month(window):
        # One row per member: column totals divided by that member's p_mm total.
        grouped = window.groupby('member_id')
        totals = grouped[cols].sum()
        member_months = grouped['p_mm'].sum()
        # Explicit row-wise division. np.divide(frame, frame[['p_mm']]) only
        # worked through positional broadcasting; pandas versions that align
        # ufunc inputs on column labels turn every column except 'p_mm' into NaN.
        return totals.div(member_months, axis=0)

    features_h1 = _per_member_month(pre.query('period < -6'))
    features_h2 = _per_member_month(pre.query('period >= -6'))
    res = features_h2.merge(features_h1, left_index=True, right_index=True, suffixes=('_h2', '_h1'))
    return res.fillna(0)
features_semi_annual.name = 'Semi Annual'

In [None]:
# agg yearly_features
def features_annual(df, cols):
    """Per-member-month averages of `cols` over the whole pre-period.

    Every column in `cols` is summed per member over the rows with period < 0
    and divided by that member's summed 'p_mm'.

    Args:
        df: long-format frame with 'member_id', 'period', 'p_mm' and `cols`.
        cols: list of column names to aggregate. 'p_mm' does not have to be
            listed; the denominator is always read from `df`.

    Returns:
        DataFrame indexed by member_id with one column per entry of `cols`.
        NaN (e.g. 0/0) is replaced by 0; a non-zero total over zero 'p_mm'
        stays +/-inf, as before.
    """
    grouped = df.fillna(0).query("period < 0").groupby('member_id')
    totals = grouped[cols].sum()
    member_months = grouped['p_mm'].sum()
    # Explicit row-wise division. np.divide(frame, frame[['p_mm']]) only worked
    # through positional broadcasting; pandas versions that align ufunc inputs
    # on column labels turn every column except 'p_mm' into NaN, which the
    # fillna below would then silently convert to 0.
    res = totals.div(member_months, axis=0)
    return res.fillna(0)
features_annual.name = 'Annual'

In [None]:
def print_feature_importance(regr, cols, max_cols=20):
    """Print the most important features of a fitted tree-based regressor.

    Args:
        regr: fitted estimator exposing `feature_importances_`.
        cols: feature names, in the same order as the importances.
        max_cols: maximum number of features to print.

    Only importances above 0.001 are listed, largest first.
    """
    print('Feature Importance')
    ranked = sorted(zip(regr.feature_importances_, cols), reverse=True)
    shown = [(imp, feat) for imp, feat in ranked if imp > 0.001]
    # Slice instead of counting inside the loop: the previous counter only
    # stopped after max_cols + 1 rows had been printed.
    for imp, feat in shown[:max(max_cols, 0)]:
        print('%0.3f: %s' % (imp, feat))
            
def print_coef_importance(regr, cols, max_cols=20):
    """Print the largest-magnitude coefficients of a fitted linear model.

    Args:
        regr: fitted estimator exposing a 1-D `coef_`.
        cols: feature names, in the same order as the coefficients.
        max_cols: maximum number of coefficients to print.

    Coefficients are ranked by absolute value and printed with their sign, so
    strongly negative weights are listed too. Entries with magnitude of 0.001
    or less are skipped.
    """
    print('Feature Importance')
    ranked = sorted(zip(regr.coef_, cols), key=lambda pair: abs(pair[0]), reverse=True)
    shown = [(coef, feat) for coef, feat in ranked if abs(coef) > 0.001]
    # Slice instead of counting inside the loop: the previous counter only
    # stopped after max_cols + 1 rows had been printed.
    for coef, feat in shown[:max(max_cols, 0)]:
        print('%0.3f: %s' % (coef, feat))
            

In [None]:
def get_miss_ided(X_test, y_test, preds, verbose=True, id_pop_size=100):
    """Count how many of the truly top-ranked members each selection rule misses.

    Three top-`id_pop_size` groups are taken from the scored rows: ranked by
    the actual target, by the model prediction, and by the 'savings_ft'
    column (the rule baseline). A "miss" is a member in the actual top group
    that is absent from the other group.

    Args:
        X_test: DataFrame of features; must contain a 'savings_ft' column.
        y_test: actual target values, aligned with `X_test`.
        preds: model predictions, one per row of `X_test`.
        verbose: when True, print both miss rates as percentages.
        id_pop_size: size of each top group. The default of 100 is the value
            that used to be hard-coded (20% of a 500-member population,
            per the original comment).

    Returns:
        Tuple (pred_misses, rule_misses) of miss counts.

    Raises:
        ValueError: if `id_pop_size` is smaller than 1.
    """
    if id_pop_size < 1:
        raise ValueError('id_pop_size must be at least 1')
    test_df = X_test.assign(target=y_test, pred=preds)

    def _top_ids(rank_col):
        # Index labels of the id_pop_size highest rows for the given column.
        return test_df.sort_values(rank_col, ascending=False).iloc[:id_pop_size].index

    perf_id = _top_ids('target')
    pred_misses = perf_id.difference(_top_ids('pred')).shape[0]
    rule_misses = perf_id.difference(_top_ids('savings_ft')).shape[0]

    if verbose:
        print(f'Miss IDed: {pred_misses * 100.0 / id_pop_size}%')
        print(f'Rule Miss IDed: {rule_misses * 100.0 / id_pop_size}%')
    return pred_misses, rule_misses

# Run some feature selection
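Recursive feature elimination with cross-validation (`RFECV`): each estimator is refit repeatedly, dropping its least important feature at every step, and cross-validation scores each feature count.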

In [None]:
# Build the modelling table for one target level: one row per member.
lvl = '1. h'
df = data.loc[data.lvl_tgt == lvl]

# First target value found for each member.
targets_df = df[['member_id', target_col]].groupby('member_id').first()

# Per-member-month averages of the per-period columns over the pre-period ...
features_df = features_annual(df, mom_feature_columns)
# ... joined with the per-member columns read from the pre-period rows.
pre_annual = df.query("period < 0")[annual_feature_columns + ['member_id']].groupby('member_id').first()

features_df = features_df.merge(pre_annual, left_index=True, right_index=True)
features_df.fillna(0, inplace=True)

# order features and targets by member id, make sure they line up perfectly
features_df.sort_index(inplace=True)
targets_df.sort_index(inplace=True)
# Compare the indexes element by element. Summing the differences can pass
# when mismatches cancel out, and it only works for numeric ids.
assert targets_df.index.equals(features_df.index), 'features and targets cover different member_ids'

# train test split
X_train, X_test, y_train, y_test = train_test_split(features_df, targets_df[target_col], test_size=test_set_pct, random_state=seed)

# Fit the scaler on the training rows only, then apply it to both splits.
# Both become NumPy arrays from here on (index and column names are dropped).
transformer = preprocessing.RobustScaler().fit(X_train)
X_train = transformer.transform(X_train)
X_test = transformer.transform(X_test)



In [None]:
from sklearn.feature_selection import RFECV

# Rebuild the modelling table (unscaled) for the '1. h' target level.
lvl = '1. h'
df = data.loc[data.lvl_tgt == lvl]

targets_df = df[['member_id', target_col]].groupby('member_id').first()

features_df = features_annual(df, mom_feature_columns)
pre_annual = df.query("period < 0")[annual_feature_columns + ['member_id']].groupby('member_id').first()

features_df = features_df.merge(pre_annual, left_index=True, right_index=True)
features_df.fillna(0, inplace=True)

# order features and targets by member id, make sure they line up perfectly
features_df.sort_index(inplace=True)
targets_df.sort_index(inplace=True)
# Compare the indexes element by element. Summing the differences can pass
# when mismatches cancel out, and it only works for numeric ids.
assert targets_df.index.equals(features_df.index), 'features and targets cover different member_ids'

# train test split
X_train, X_test, y_train, y_test = train_test_split(features_df, targets_df[target_col], test_size=test_set_pct, random_state=seed)

# NOTE(review): Ridge(normalize=True) and RFECV.grid_scores_ are only available
# in older scikit-learn releases -- confirm the pinned version before upgrading.
estimators = [
    ('xgb', GradientBoostingRegressor(random_state=seed)),
    ('xgb max depth 1', GradientBoostingRegressor(random_state=seed, max_depth=1)),
    ('ridge', Ridge(alpha=1.0, normalize=True)),
    ('rf', RandomForestRegressor(random_state=seed)),
]
selectors = []
# Number of estimators that ranked each feature #1 (i.e. kept it).
cnt = collections.Counter()

for name, estimator in estimators:
    print(name)
    selector = RFECV(estimator, step=1, cv=5, n_jobs=os.cpu_count())
    selector = selector.fit(X_train, y_train)
    selected_features = features_df.columns[selector.support_]
    selectors.append((name, selector))
    
    for rank, feat in sorted([(b, a) for a, b in zip(features_df.columns, selector.ranking_)]):
        print(f'{rank}: {feat}')
        if rank == 1:
            cnt[feat] += 1
    
    plt.figure()
    plt.xlabel("Number of features selected")
    # These are regressors and no `scoring` is passed, so the score is the
    # estimators' default R^2, not a count of correct classifications.
    plt.ylabel("Cross validation score (R^2)")
    plt.plot(range(1, len(selector.grid_scores_) + 1), selector.grid_scores_)
    plt.show()

In [None]:
# Number of estimators (out of the 4 run through RFECV) that ranked each feature #1,
# most frequently selected first; 200 exceeds the candidate count, so every counted feature is shown.
cnt.most_common(200)

In [None]:
# Refit a ridge model on the features that at least `i` of the RFECV selectors
# ranked #1, from the strictest threshold (all 4) down to 2.
for i in range(4, 1, -1):
    feature_cols = [c for c, v in cnt.items() if v >= i]
    mom_feats = [c for c in feature_cols if c in mom_feature_columns]
    # features_annual divides by 'p_mm', so it must be requested -- but only
    # once: appending it unconditionally duplicated the column whenever the
    # selectors had already picked it.
    if 'p_mm' not in mom_feats:
        mom_feats.append('p_mm')
    annual_feats = [c for c in feature_cols if c in annual_feature_columns]
    print(i, ', '.join(feature_cols))

#     lvl = '2. m'
    lvl = '1. h'
    df = data.loc[data.lvl_tgt == lvl]

    targets_df = df[['member_id', target_col]].groupby('member_id').first()

    features_df = features_annual(df, mom_feats)
    pre_annual = df.query("period < 0")[annual_feats + ['member_id']].groupby('member_id').first()


    features_df = features_df.merge(pre_annual, left_index=True, right_index=True)
    features_df.fillna(0, inplace=True)


    # order features and targets by member id, make sure they line up perfectly
    features_df.sort_index(inplace=True)
    targets_df.sort_index(inplace=True)
    # Compare the indexes element by element. Summing the differences can pass
    # when mismatches cancel out, and it only works for numeric ids.
    assert targets_df.index.equals(features_df.index), 'features and targets cover different member_ids'

    # train test split
    # target_col = 'savings_tgt'
    # A fresh split seed per threshold (this rebinds the notebook-level `seed`);
    # printed so that a given split can be reproduced.
    seed = random.randint(0, 1000)
    print(f'seed: {seed}')
    X_train, X_test, y_train, y_test = train_test_split(features_df, targets_df[target_col], test_size=test_set_pct, random_state=seed)

    gb = Ridge(alpha=1.0, normalize=True)
    #     gb = GradientBoostingRegressor(random_state=seed, max_depth=2)
    #     gb = RandomForestRegressor(random_state=seed)

    gb.fit(X_train, y_train)
    verbose = True
    preds = gb.predict(X_test)
    r2_score = gb.score(X_test, y_test)
    error = np.abs(y_test - preds)
    mean_hrs_error = error.mean()
    median_hrs_error = error.median()


    print(f'R^2 Score: {r2_score}')
    print(f'Mean absolute $ error: {mean_hrs_error}')
    print(f'Median absolute $ error: {median_hrs_error}')
    # Requires 'savings_ft' among the selected features (used as the rule baseline).
    get_miss_ided(X_test, y_test, preds)
#     print_feature_importance(gb, features_df.columns) # tree reg
    print_coef_importance(gb, features_df.columns) # linear reg
    if verbose:
        fig, ax = plt.subplots(nrows=1, figsize=(20,10))
        ax.scatter(preds, y_test)
        ax.set_xlabel('preds')
        ax.set_ylabel('actual');
        ax.set_title(f'Predicted vs actual savings')
        ax.set_xlim(0, 10000)
        ax.set_ylim(0, 10000)
#         ax.set_xscale('log')
#         ax.set_yscale('log')
        plt.show()

In [None]:
# Summary statistics of the target, then a histogram with bin edges every 500 from 0 to 9,500.
display(df.savings_tgt.describe())
df.savings_tgt.hist(bins=list(range(0, 10000, 500)))

In [None]:
# train 2018
# bagging regressor, didn't work so hot
# missed_advantage = []
# models = []

from sklearn.ensemble import BaggingRegressor

# Features that at least 3 of the RFECV selectors ranked #1.
# for i in range(10):
feature_cols = [c for c, v in cnt.items() if v >= 3]
mom_feats = [c for c in feature_cols if c in mom_feature_columns]# + ['p_mm']
annual_feats = [c for c in feature_cols if c in annual_feature_columns]
# print(i, ', '.join(feature_cols))

#     lvl = '2. m'
lvl = '1. h'
df = data.loc[data.lvl_tgt == lvl]

targets_df = df[['member_id', target_col]].groupby('member_id').first()

features_df = features_annual(df, mom_feats)
pre_annual = df.query("period < 0")[annual_feats + ['member_id']].groupby('member_id').first()


features_df = features_df.merge(pre_annual, left_index=True, right_index=True)
features_df.fillna(0, inplace=True)


# order features and targets by member id, make sure they line up perfectly
features_df.sort_index(inplace=True)
targets_df.sort_index(inplace=True)
# Compare the indexes element by element. Summing the differences can pass
# when mismatches cancel out, and it only works for numeric ids.
assert targets_df.index.equals(features_df.index), 'features and targets cover different member_ids'

# train test split
# target_col = 'savings_tgt'
seed = random.randint(0, 1000)
print(f'seed: {seed}')
X_train, X_test, y_train, y_test = train_test_split(features_df, targets_df[target_col], test_size=test_set_pct, random_state=seed)

# gb = Ridge(alpha=1.0, normalize=True)
# Seed the bootstrap sampling as well; otherwise the printed seed reproduces
# the split but not the fitted ensemble.
gb = BaggingRegressor(base_estimator=Ridge(alpha=1.0, normalize=True), n_estimators=1000, random_state=seed)
#     gb = GradientBoostingRegressor(random_state=seed, max_depth=2)
#     gb = RandomForestRegressor(random_state=seed)

gb.fit(X_train, y_train)
verbose = True
preds = gb.predict(X_test)
r2_score = gb.score(X_test, y_test)
error = np.abs(y_test - preds)
mean_hrs_error = error.mean()
median_hrs_error = error.median()


print(f'R^2 Score: {r2_score}')
print(f'Mean absolute $ error: {mean_hrs_error}')
print(f'Median absolute $ error: {median_hrs_error}')
# Requires 'savings_ft' among the selected features (used as the rule baseline).
pred_misses, rule_misses = get_miss_ided(X_test, y_test, preds)

In [None]:
# train 2018 - manual avg
# Fit 1000 scaler+ridge models on different random splits; `models` collects
# the fitted models and `lifts` the per-split advantage over the rule baseline.
from sklearn.pipeline import make_pipeline

# missed_advantage = []
models = []
lifts = []

feature_cols = [c for c, v in cnt.items() if v >= 3]
mom_feats = [c for c in feature_cols if c in mom_feature_columns]# + ['p_mm']
annual_feats = [c for c in feature_cols if c in annual_feature_columns]
# print(i, ', '.join(feature_cols))

#     lvl = '2. m'
lvl = '1. h'
df = data.loc[data.lvl_tgt == lvl]

targets_df = df[['member_id', target_col]].groupby('member_id').first()

features_df = features_annual(df, mom_feats)
pre_annual = df.query("period < 0")[annual_feats + ['member_id']].groupby('member_id').first()


features_df = features_df.merge(pre_annual, left_index=True, right_index=True)
features_df.fillna(0, inplace=True)


# order features and targets by member id, make sure they line up perfectly
features_df.sort_index(inplace=True)
targets_df.sort_index(inplace=True)
# Compare the indexes element by element. Summing the differences can pass
# when mismatches cancel out, and it only works for numeric ids.
assert targets_df.index.equals(features_df.index), 'features and targets cover different member_ids'

for i in range(1000):

    # train test split
    # target_col = 'savings_tgt'
    seed = random.randint(0, 1000)
#     print(f'seed: {seed}')
    X_train, X_test, y_train, y_test = train_test_split(features_df, targets_df[target_col], test_size=test_set_pct, random_state=seed)

    # Keep the scaler and the regressor together in one pipeline:
    #  * X_test stays a DataFrame, which get_miss_ided needs (it calls
    #    .assign and ranks by the 'savings_ft' column); a bare
    #    transformer.transform() returns a NumPy array and broke that call;
    #  * each stored model carries the scaler it was fitted with, so calling
    #    predict() on raw feature frames applies the same scaling as training.
    # The scaler is still fitted on the training rows only.
    gb = make_pipeline(preprocessing.RobustScaler(), Ridge(alpha=1.0, normalize=True))
#     gb = BaggingRegressor(base_estimator=Ridge(alpha=1.0, normalize=True), n_estimators=1000)
    #     gb = GradientBoostingRegressor(random_state=seed, max_depth=2)
    #     gb = RandomForestRegressor(random_state=seed)

    gb.fit(X_train, y_train)
    models.append(gb)
    preds = gb.predict(X_test)
    pred_misses, rule_misses = get_miss_ided(X_test, y_test, preds, verbose=False)
    # Positive lift: the model missed fewer of the true top members than the rule.
    lifts.append(rule_misses - pred_misses)


#     verbose = True
#     r2_score = gb.score(X_test, y_test)
#     error = np.abs(y_test - preds)
#     mean_hrs_error = error.mean()
#     median_hrs_error = error.median()


#     print(f'R^2 Score: {r2_score}')
#     print(f'Mean absolute $ error: {mean_hrs_error}')
#     print(f'Median absolute $ error: {median_hrs_error}')

In [None]:
# model_name = cb_utils.save_model(models, '2_h_savings_lr_1000_bag', {'name': '2_h_savings_lr_1000_bag', 'features': list(features_df.columns)})

In [None]:
# Load a previously saved model and its metadata by name.
# NOTE(review): the scoring cell further down predicts with the in-memory `models` list,
# not with the `model` loaded here -- confirm which one is intended.
model_name = '20200802_172434_2_h_savings_lr_10_bag'
model, meta = cb_utils.load_model(model_name)

In [None]:
# Display the feature names stored in the loaded model's metadata.
meta['features']

In [None]:
# Load every row of the training view for mab_id = 2 (the population scored below).
# NOTE(review): unlike `data`, this frame is not passed through fillna(0) after loading.
query = "SELECT * FROM cb.vw_mab_training_data WHERE mab_id = 2;"
mab2 = cb_utils.sql_query_to_df(query, use_cache=use_cache)

In [None]:
# Score the mab_id = 2 population with the average prediction of `models`.
feature_cols = meta['features']

mom_feats = [c for c in feature_cols if c in mom_feature_columns] 
annual_feats = [c for c in feature_cols if c in annual_feature_columns]

lvl = '1. h'
# The row mask must come from mab2 itself. It was previously built from
# `data` (the mab_id = 1 frame), which selects rows by another table's index.
df = mab2.loc[mab2.lvl_tgt == lvl]

targets_df = df[['member_id', target_col]].groupby('member_id').first()

features_df = features_annual(df, mom_feats)
pre_annual = df.query("period < 0")[annual_feats + ['member_id']].groupby('member_id').first()


features_df = features_df.merge(pre_annual, left_index=True, right_index=True)
features_df.fillna(0, inplace=True)


# order features and targets by member id, make sure they line up perfectly
features_df.sort_index(inplace=True)
targets_df.sort_index(inplace=True)
# Compare the indexes element by element. Summing the differences can pass
# when mismatches cancel out, and it only works for numeric ids.
assert targets_df.index.equals(features_df.index), 'features and targets cover different member_ids'

# train test split
# target_col = 'savings_tgt'
seed = random.randint(0, 1000)
print(f'seed: {seed}')
# X_train, X_test, y_train, y_test = train_test_split(features_df, targets_df[target_col], test_size=test_set_pct, random_state=seed)

# gb = Ridge(alpha=1.0, normalize=True)
# gb = BaggingRegressor(base_estimator=Ridge(alpha=1.0, normalize=True), n_estimators=10, random_state=0)
#     gb = GradientBoostingRegressor(random_state=seed, max_depth=2)
#     gb = RandomForestRegressor(random_state=seed)
# NOTE(review): the feature list comes from the loaded model's metadata, but
# the predictions come from the in-memory `models` list -- confirm both
# describe the same feature set.
scores = [model.predict(features_df) for model in models]
# Ensemble prediction: mean over models for each member.
preds = np.mean(scores, axis=0)
# gb = model

# r2_score = gb.score(features_df, targets_df[target_col])
# preds = gb.predict(features_df)
error = np.abs(targets_df[target_col] - preds)
mean_hrs_error = error.mean()
median_hrs_error = error.median()


# print(f'R^2 Score: {r2_score}')
print(f'Mean absolute $ error: {mean_hrs_error}')
print(f'Median absolute $ error: {median_hrs_error}')
pred_misses, rule_misses = get_miss_ided(features_df, targets_df[target_col], preds)

In [None]:
# Scored output: the feature table with the averaged prediction added as a 'pred' column.
result = features_df.assign(pred=preds)

In [None]:
# cb_utils.save_scores(result, 1)

In [None]:
# Deliberate stop: raising here halts a "Run All" so cells after this point are not executed.
raise Exception("Stop here, Old code below")