In [None]:
import sys
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

sys.path.append('../src')
import cb_utils

sns.set(style="darkgrid")
pd.options.display.max_columns = 500

%load_ext autoreload
%autoreload 2

In [None]:
# configuration
# use_cache: forwarded to cb_utils.sql_query_to_df for both data loads.
use_cache = True
# seed: random_state for train_test_split and GradientBoostingRegressor.
seed = 0
# test_set_pct: passed as test_size to train_test_split; despite the name it
# is a fraction of the rows (0.2), not a percentage.
test_set_pct = 0.2

In [None]:
# Load data
# `data`: rows of cb.mab_cost_util for mab_id = 1. Later cells read its
# `member_id`, `period` and `p_mm` columns plus every name in feature_columns.
query = "SELECT * FROM cb.mab_cost_util WHERE mab_id = 1;"
data = cb_utils.sql_query_to_df(query, use_cache=use_cache)
# `cats`: later cells read its `member_id`, `lvl`, `savings_tc` and `cat_pre`
# columns (see build_targets).
cats = cb_utils.sql_query_to_df("select * from junk.uhc_mbrs_for_model_id_2018_12_31;", use_cache=use_cache)

In [None]:
# Names of the `data` columns used as model inputs. The feature builders
# either sum them per member and divide by the summed `p_mm` (annual and
# semi-annual) or pivot them by `period` (month over month).
# Entries that are commented out are excluded from the models.
feature_columns = [
    '_lob_1_days',
    '_lob_2_days',
    '_lob_3_days',
    '_grp_1_days',
    '_grp_2_days',
    '_grp_3_days',
    '_grp_5_days',
    '_grp_6_days',
    '_grp_7_days',
    '_grp_8_days',
    '_unaligned_days',
#     'is_unaligned',
    'tc',
    'hcbs_tc',
    'icf_tc',
    'ip_tc',
    'rx_tc',
    'ed_tc',
    'snf_tc',
    'out_tc',
    'pro_tc',
    'spfac_tc',
    'amb_tc',
    'hh_tc',
    'hosp_tc',
    'oth_tc',
    'p_mm',
    'mm',
    'hcbs_respite_tc',
    'hcbs_fam_care_stip_tc',
    'hcbs_com_trans_tc',
    'hcbs_educ_train_tc',
    'hcbs_com_liv_fam_tc',
    'hcbs_com_liv_tc',
    'hcbs_attend_care_tc',
    'hcbs_com_trans_waiv_tc',
    'hcbs_home_meal_tc',
    'hcbs_pers_care_tc',
#     'hcbs_ther_behav_tc',
    'hcbs_unsk_respite_tc',
    'hcbs_waiv_svc_tc',
    'ddos',
    'hcbs_ddos',
    'icf_ddos',
    'ip_ddos',
    'rx_ddos',
    'ed_ddos',
    'snf_ddos',
    'out_ddos',
    'pro_ddos',
    'spfac_ddos',
    'amb_ddos',
    'hh_ddos',
    'hosp_ddos',
    'oth_ddos',
    'pcp_ddos',
    'pulmonar_ddos',
    'cancer_ddos',
    'ckd_ddos',
    'esrd_ddos',
    'hyperlipid_ddos',
    'diab_ddos',
    'alzh_ddos',
    'dementia_ddos',
    'stroke_ddos',
    'hypertension_ddos',
    'fall_ddos',
    'transplant_ddos',
    'liver_ddos',
    'hippfract_ddos',
    'depression_ddos',
    'psychosis_ddos',
    'drug_ddos',
    'alcohol_ddos',
    'paralysis_ddos'
]
# Last expression of the cell: shows how many feature columns are active.
len(feature_columns)

### Try aggregated features at the yearly and half-yearly level

In [None]:
# fully broken out month over month features
def features_mom(data, feature_columns):
    """Spread each feature across the pre-period months, one row per member.

    Only rows with ``period < 0`` are kept. The result is indexed by
    ``member_id`` and has one column per (feature, period) pair.
    """
    print('building month over month features')
    is_pre_period = data['period'] < 0
    return data.loc[is_pre_period].pivot(
        index='member_id',
        columns='period',
        values=feature_columns,
    )
features_mom.name = 'MOM'

In [None]:
# agg semi yearly_features
def features_semi_annual(data, feature_columns):
    """Build per-``p_mm`` feature rates for each half of the pre-period.

    Rows with ``period < 0`` are split into ``h1`` (``period < -6``) and
    ``h2`` (``-6 <= period < 0``). Within each half every column is summed per
    ``member_id`` and each feature total is divided by that member's summed
    ``p_mm``.

    Args:
        data: DataFrame with ``member_id``, ``period``, ``p_mm`` and every
            column named in ``feature_columns``.
        feature_columns: names of the columns to convert into rates.

    Returns:
        DataFrame indexed by ``member_id`` holding the ``h2`` rates (suffix
        ``_h2``) followed by the ``h1`` rates (suffix ``_h1``). Members absent
        from either half are dropped by the inner merge.
    """
    print('building semi annual features')
    pre = data.query("period < 0")
    h1 = pre.query('period < -6').groupby('member_id').sum()
    h2 = pre.query('period >= -6').groupby('member_id').sum()

    # Divide row-wise by the p_mm Series. np.divide(frame, frame[['p_mm']])
    # only worked through positional broadcasting: recent pandas (2.0+) aligns
    # the two frames on column labels first, which turns every feature except
    # p_mm into NaN. .div(..., axis=0) gives the intended result on any version.
    features_h1 = h1[feature_columns].div(h1['p_mm'], axis=0)
    features_h2 = h2[feature_columns].div(h2['p_mm'], axis=0)
    return features_h2.merge(features_h1, left_index=True, right_index=True, suffixes=('_h2', '_h1'))
features_semi_annual.name = 'Semi Annual'

In [None]:
# agg yearly_features
def features_annual(data, feature_columns):
    """Build per-``p_mm`` feature rates over the whole pre-period.

    Every column of the rows with ``period < 0`` is summed per ``member_id``
    and each feature total is divided by that member's summed ``p_mm``.

    Args:
        data: DataFrame with ``member_id``, ``period``, ``p_mm`` and every
            column named in ``feature_columns``.
        feature_columns: names of the columns to convert into rates.

    Returns:
        DataFrame indexed by ``member_id`` with one column per requested
        feature, in the order given.
    """
    print('building annual features')
    pre_sums = data.query("period < 0").groupby('member_id').sum()
    # Divide row-wise by the p_mm Series. np.divide(frame, frame[['p_mm']])
    # only worked through positional broadcasting: recent pandas (2.0+) aligns
    # the two frames on column labels first, which turns every feature except
    # p_mm into NaN. .div(..., axis=0) gives the intended result on any version.
    return pre_sums[feature_columns].div(pre_sums['p_mm'], axis=0)
features_annual.name = 'Annual'

In [None]:
def build_targets(cats, level):
    """Select the target rows for one level, indexed by ``member_id``.

    Keeps the rows of ``cats`` whose ``lvl`` equals ``level`` and returns
    their ``savings_tc`` and ``cat_pre`` columns.
    """
    print(f'building {level} targets')
    wanted_columns = ['member_id', 'savings_tc', 'cat_pre']
    at_level = cats['lvl'] == level
    return cats.loc[at_level, wanted_columns].set_index('member_id')

In [None]:
def build_train_test_set(data, cats, feature_func, targets, feature_columns=feature_columns):
    """Join features to targets and split the result into train and test sets.

    Args:
        data: frame handed to ``feature_func``.
        cats: not used by this function; kept so existing calls keep working.
        feature_func: callable ``(data, feature_columns) -> DataFrame`` whose
            result is indexed by ``member_id``.
        targets: frame indexed by ``member_id`` that contains ``savings_tc``;
            any other column it carries ends up among the features.
        feature_columns: column names forwarded to ``feature_func``.

    Returns:
        ``(X_train, X_test, y_train, y_test, x_cols)`` where ``x_cols`` is the
        list of feature column labels. The split uses the notebook-level
        ``test_set_pct`` and ``seed``.
    """
    features = feature_func(data, feature_columns)
    if isinstance(features.columns, pd.MultiIndex):
        # Pivoted frames carry (feature, period) column tuples. Merging those
        # with the single-level targets frame is deprecated / raises
        # MergeError in newer pandas, so flatten to "<feature>_<period>".
        features = features.copy(deep=False)
        features.columns = ['_'.join(str(part) for part in col) for col in features.columns]

    # Inner join on member_id; missing feature values are treated as zero.
    feature_targets = features.merge(targets, left_index=True, right_index=True).fillna(0)

    x_cols = [c for c in feature_targets.columns if c != 'savings_tc']
    X = feature_targets[x_cols]
    y = feature_targets.savings_tc

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_set_pct, random_state=seed)
    return X_train, X_test, y_train, y_test, x_cols

In [None]:
# train and test some tree models
def train_and_evaluate(regr, X_train=None, X_test=None, y_train=None, y_test=None, verbose=True, plot_result=True, plot_title=""):
    """Fit ``regr``, evaluate it on the test split and optionally report/plot.

    Args:
        regr: estimator exposing ``fit``, ``predict`` and ``score``.
        X_train, y_train: data the estimator is fitted on.
        X_test, y_test: data used for predictions and the reported metrics.
        verbose: print the score, the mean/median absolute error and, when the
            estimator exposes ``feature_importances_``, every importance
            above 0.001.
        plot_result: draw a predicted-vs-actual scatter and a histogram of the
            absolute errors.
        plot_title: prefix for the scatter plot title.

    Returns:
        The predictions for ``X_test``.
    """
    regr.fit(X_train, y_train)
    preds = regr.predict(X_test)
    # Plain arrays keep the metrics working whether y_test is a Series or an
    # ndarray (ndarrays have no .median()).
    error = np.abs(np.asarray(y_test) - np.asarray(preds))
    mean_abs_error = np.mean(error)
    median_abs_error = np.median(error)
    r2_score = regr.score(X_test, y_test)

    if verbose:
        print(f'R^2 Score: {r2_score}')
        print(f'Mean absolute $ error: {mean_abs_error}')
        print(f'Median absolute $ error: {median_abs_error}')
        importances = getattr(regr, 'feature_importances_', None)
        if importances is not None:
            print('Feature Importance')
            # Label importances with the columns the model was actually fitted
            # on. The notebook-level feature_columns list does not match the
            # semi-annual / month-over-month matrices or the extra target
            # columns, so zipping against it mislabels the importances.
            names = getattr(X_train, 'columns', None)
            if names is None:
                names = range(len(importances))
            ranked = sorted(zip(importances, names), key=lambda pair: pair[0], reverse=True)
            for imp, feat in ranked:
                if imp > 0.001:
                    print(f'{imp:0.3f}: {feat}')

    if plot_result:
        fig, axes = plt.subplots(nrows=2, figsize=(20,20))
        ax = axes[0]
        ax.scatter(preds, y_test)
        ax.set_xlabel('preds')
        ax.set_ylabel('actual')
        ax.set_title(f'{plot_title}: Predicted vs actual savings')

        ax = axes[1]
        ax.hist(error)
        ax.set_title('Histogram of absolute error in TC savings')
        plt.show()

    return preds
    

In [None]:
grid_search = True
if grid_search:
    # Every feature builder crossed with every target level. To try a single
    # level only, narrow the level tuple, e.g. ('2. m',).
    combos = [
        (f, lv)
        for f in (features_annual, features_semi_annual, features_mom)
        for lv in ('1. h', '2. m', '3. l')
    ]
    for f, lv in combos:
        X_train, X_test, y_train, y_test, _ = build_train_test_set(
            data, cats, f, build_targets(cats, lv)
        )
        gb = GradientBoostingRegressor(random_state=seed)
        gb_preds = train_and_evaluate(
            gb,
            X_train=X_train,
            X_test=X_test,
            y_train=y_train,
            y_test=y_test,
            verbose=True,
            plot_result=True,
            plot_title=f'Level: {lv} - Features: {f.name}',
        )

In [None]:
# rf = RandomForestRegressor(random_state=seed)
# rf_preds = train_and_evaluate(rf, verbose=False, plot_result=False)

In [None]:
# gb = GradientBoostingRegressor(random_state=seed)
# gb_preds = train_and_evaluate(gb, verbose=False, plot_result=False)

In [None]:
# Fit the model that gets saved: semi-annual features against the '3. l'
# targets, with reporting and plots switched off.
X_train, X_test, y_train, y_test, x_cols = build_train_test_set(
    data, cats, features_semi_annual, build_targets(cats, '3. l')
)
gb = GradientBoostingRegressor(random_state=seed)
gb_preds = train_and_evaluate(
    gb,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    verbose=False,
    plot_result=False,
)

In [None]:
# build_train_test_set returns x_cols as a plain list of column labels, so
# copy it with list(); a list has no `.values` attribute.
model_name = cb_utils.save_model(gb, 'test_saving_xgb', {'name': 'test', 'features': list(x_cols)})

In [None]:
# Load the model back by the name save_model returned. `meta` is indexed with
# 'features' further down — presumably the metadata dict passed to save_model
# (confirm in cb_utils).
model, meta = cb_utils.load_model(model_name)

In [None]:
# Rebuild the semi-annual features for every member and join the '3. l'
# targets, then keep exactly the columns recorded with the saved model.
features = features_semi_annual(data, feature_columns).merge(
    build_targets(cats, '3. l'),  # for now targets has a feature (pre cat)
    left_index=True,
    right_index=True,
)
X = features[meta['features']].fillna(0)

In [None]:
# Score every row of X and keep only the prediction, highest first.
preds = model.predict(X)
results = (
    X.assign(pred=preds)
    .sort_values('pred', ascending=False)
    [['pred']]
)

In [None]:
# Sum every column of the post-period rows (period > 0) per member and attach
# those totals to the ranked predictions. Rows with period == 0 fall in
# neither the pre (period < 0) nor the post window.
post_mem_grps = data.query('period > 0').groupby('member_id').sum()
final = results.merge(post_mem_grps, left_index=True, right_index=True)
final.head()

In [None]:
# Summary statistics over the first 500 rows of `final`; these are the 500
# highest predictions only if the merge kept the ordering of `results`
# (TODO confirm). The result is stored but not displayed.
top_500 = final.iloc[:500].describe()

In [None]:
# Hand the scored frame to cb_utils.save_scores (destination is defined in
# cb_utils, not visible here).
cb_utils.save_scores(final)