In [None]:
import sys
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# Add ../src to the import path before importing the project-local helper
# module (presumably where cb_utils lives -- confirm against the repo layout).
sys.path.append('../src')
import cb_utils

# Plot theme, and allow up to 500 columns when a DataFrame is displayed.
sns.set(style="darkgrid")
pd.options.display.max_columns = 500

# IPython autoreload in mode 2: modules are re-imported before code is run,
# so edits to cb_utils are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# configuration
# Forwarded as use_cache= to cb_utils.get_table when the tables are loaded.
cache = True
# random_state for train_test_split and for both regressors.
seed = 0
# test_size passed to train_test_split (share of rows held out for testing).
test_set_pct = 0.2

In [None]:
# Load data
# Fetch the auth, claims and visit-feature views through cb_utils, forwarding
# the notebook-level `cache` flag as use_cache, then merge them into one frame.
source_views = ('vw_ds_auth_mm', 'vw_ds_claims_mm', 'vw_ds_visit_features_mm')
auths, claims, visits = [
    cb_utils.get_table(view, use_cache=cache) for view in source_views
]
all_df = cb_utils.merge_member_month_dfs(claims, auths, visits)

In [None]:
def limit_df_to_12m_members(d, n_months=12):
    """Keep only the rows of members that have exactly `n_months` rows.

    Parameters
    ----------
    d : pandas.DataFrame
        Frame with a 'member_id' column.
    n_months : int, default 12
        Number of rows a member must have in `d` to be kept.

    Returns
    -------
    pandas.DataFrame
        The rows of `d`, in their original order and with their original
        index, whose member_id occurs exactly `n_months` times.
    """
    # Broadcast each member's row count back onto its rows; this replaces a
    # per-group Python lambda with one vectorised group-size computation.
    rows_per_member = d.groupby('member_id')['member_id'].transform('size')
    return d[rows_per_member == n_months]
# Replace all_df with the subset of members that have exactly 12 rows.
all_df = limit_df_to_12m_members(all_df) 

In [None]:
# Build training and test features and targets
# Feature columns are chosen by name from the claims table: every column
# whose name contains '_dx' or '_pmpm'.
dxs = [c for c in claims.columns if '_dx' in c]
pmpms = [c for c in claims.columns if '_pmpm' in c]

features = dxs + pmpms
target = 'attd_pcs_visit_hrs'

# One row per member: the mean of every feature and of the target over that
# member's rows. Means that come out as NaN are replaced with 0.
model_cols = features + [target]
mem_years = (
    all_df[model_cols + ['member_id']]
    .groupby('member_id', as_index=False)[model_cols]
    .mean()
    .fillna(0)
)

X = mem_years[features]
# Select through `target` instead of repeating the column name, so changing
# `target` above cannot leave y pointing at the old column.
y = mem_years[target]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_set_pct, random_state=seed
)

In [None]:
# train and test some tree models
def train_and_evaluate(regr):
    """Fit `regr` on the training split, then print and plot its test error.

    Uses the notebook-level X_train, X_test, y_train and y_test. Prints the
    estimator's `score` on the test split (labelled R^2), the mean and median
    absolute error, and the feature importances from highest to lowest. Then
    draws a predicted-vs-actual scatter and a histogram of the absolute error.

    Parameters
    ----------
    regr : estimator
        Must provide fit, predict, score and, once fitted,
        feature_importances_. It is fitted in place.

    Returns
    -------
    None
    """
    regr.fit(X_train, y_train)
    preds = regr.predict(X_test)
    error = np.abs(y_test - preds)
    mean_hrs_error = error.mean()
    median_hrs_error = error.median()
    r2 = regr.score(X_test, y_test)

    print(f'R^2 Score: {r2}')
    print(f'Mean absolute hrs error: {mean_hrs_error}')
    print(f'Median absolute hrs error: {median_hrs_error}')
    print('Feature Importance')
    # Take the labels from the frame the model was fitted on rather than the
    # global `features` list, so the names cannot drift out of step with the
    # importances if `features` is redefined after the split was made.
    ranked = sorted(zip(regr.feature_importances_, X_train.columns), reverse=True)
    for imp, feat in ranked:
        print(f'{imp:0.3f}: {feat}')

    fig, axes = plt.subplots(nrows=2, figsize=(20, 20))

    ax = axes[0]
    ax.scatter(preds, y_test)
    ax.set_xlabel('preds')
    ax.set_ylabel('actual')
    ax.set_title('Predicted vs actual monthly pcs attd hours')

    ax = axes[1]
    ax.hist(error)
    ax.set_xlabel('absolute error (hrs)')
    ax.set_ylabel('number of test rows')
    ax.set_title('Histogram of absolute error in hrs')
    

In [None]:
# Random forest with default hyperparameters; random_state is fixed to `seed`.
rf = RandomForestRegressor(random_state=seed)
train_and_evaluate(rf)

In [None]:
# Gradient boosting with default hyperparameters; random_state is fixed to `seed`.
gb = GradientBoostingRegressor(random_state=seed)
train_and_evaluate(gb)