In [None]:
import os
import sys
import time
import random
import collections
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, BaggingRegressor, StackingRegressor
from sklearn.linear_model import Ridge, Lasso, BayesianRidge, ElasticNet
from sklearn.neighbors import KNeighborsRegressor

# Put the project-local ../src directory on the import path before importing
# the helper module that lives there.
sys.path.append('../src')
import cb_utils

# Plot theme and wide-frame display limit for the whole notebook.
sns.set(style="darkgrid")
pd.options.display.max_columns = 500

# Reload edited modules automatically before each cell execution.
%load_ext autoreload
%autoreload 2

In [None]:
# configuration
use_cache = True      # forwarded to cb_utils.sql_query_to_df
test_set_pct = 0.2    # hold-out fraction intended for the train/test splits

# A new seed is drawn on every run and printed. To replay a run, replace the
# draw below with the printed value. Both global RNGs are seeded with it so
# that later `random.*` draws and any scikit-learn call left at
# random_state=None (which falls back to numpy's global RNG) follow from it.
seed = random.randint(0, 100)
random.seed(seed)
np.random.seed(seed)
print(f'Seed: {seed}')

In [None]:
# Load data
# 2019
# Rows come back ordered by member_id, then bom.
query = "SELECT * FROM cb.vwm_elig_claims_visits_auths_mm vec ORDER BY member_id, bom"
# Missing values are replaced by 0. `fillna` returns a new frame, so the object
# handed back by the loader (which may be a cached one) is not mutated in place.
visits = cb_utils.sql_query_to_df(query, use_cache=use_cache).fillna(0)

In [None]:
# (rows, columns) of the loaded frame
visits.shape

In [None]:
# Number of distinct members in the loaded frame
visits['member_id'].nunique()

In [None]:
# Peek at the first rows of the loaded frame
visits.head()

In [None]:
# Per-`bom` means of the hour columns and of has_facility_ddos; `cols` and
# `mom_appropriate_hours` are reused by the following plotting cells.
cols = ['bom', 'attd_pcs_appropriate_hrs', 'visit_hrs', 'has_facility_ddos']
mom_appropriate_hours = (
    visits[cols]
    .groupby('bom', as_index=False)
    .mean()
)
sns.relplot(
    data=mom_appropriate_hours,
    x='bom',
    y='attd_pcs_appropriate_hrs',
    hue='has_facility_ddos',
    size='has_facility_ddos',
    height=4,
    aspect=3,
)

In [None]:
# Mean visit hours per `bom`
sns.relplot(
    data=mom_appropriate_hours,
    x='bom',
    y='visit_hrs',
    height=4,
    aspect=3,
)

In [None]:
# Recompute the per-`bom` means, then reshape to long form so both hour
# series can share one plot, coloured by series name.
mom_appropriate_hours = (
    visits[cols]
    .groupby('bom', as_index=False)
    .mean()
)
melted = mom_appropriate_hours.melt(
    id_vars=['bom'],
    value_vars=['visit_hrs', 'attd_pcs_appropriate_hrs'],
    var_name='hrs_type',
    value_name='hrs_val',
)
g = sns.relplot(
    data=melted,
    x='bom',
    y='hrs_val',
    hue='hrs_type',
    height=5,
    aspect=3,
)

In [None]:
# Member-level columns; the cohort-prep cells aggregate these once per member.
dem_feats = [
    'age',
    'is_male',
    'lob',
    'ggroup',
    'is_aligned',
    # 'is_unaligned',  (deliberately left out)
    'has_facility_ddos',
]
# Utilisation-percentage columns (not referenced by the modelling cells in this notebook).
yearly_feats = ['auth_attd_pcs_util_pct', 'auth_resp_util_pct']
# Authorisation / visit / missed-visit columns, grouped below by name pattern.
# Order is unchanged from the original list.
sum_feats = [
    'attd_pcs_hrs',

    # auth_* columns
    'auths_n',
    'auth_attd_n',
    'auth_meal_n',
    'auth_pc_n',
    'auth_resp_n',
    'auth_units',
    'auth_hrs',
    'auth_attd_pcs_hrs',
    'auth_attd_hrs',
    'auth_meal_hrs',
    'auth_pcs_hrs',
    'auth_resp_hrs',

    # *_visit_hrs columns
    'visit_hrs',
    'attd_pcs_visit_hrs',
    'attd_visit_hrs',
    'pcs_visit_hrs',
    'resp_visit_hrs',
    'night_visit_hrs',
    'attd_pcs_night_visit_hrs',
    'attd_night_visit_hrs',
    'pcs_night_visit_hrs',
    'resp_night_visit_hrs',

    # *_visit_n columns
    'visit_n',
    'attd_pcs_visit_n',
    'attd_visit_n',
    'pcs_visit_n',
    'resp_visit_n',
    'meal_visit_n',
    'night_visit_n',
    'attd_pcs_night_visit_n',
    'attd_night_visit_n',
    'pcs_night_visit_n',
    'resp_night_visit_n',
    'meal_night_visit_n',

    # *avg_*visit_hrs columns
    'avg_visit_hrs',
    'avg_night_visit_hrs',
    'attd_avg_night_visit_hrs',
    'pcs_avg_night_visit_hrs',
    'resp_avg_night_visit_hrs',

    # *_cg_visit_nd columns
    'cg_visit_nd',
    'attd_pcs_cg_visit_nd',
    'attd_cg_visit_nd',
    'pcs_cg_visit_nd',
    'meal_cg_visit_nd',
    'resp_cg_visit_nd',

    # *_missed_n / *_missed_hrs columns
    'missed_n',
    'attd_pcs_missed_n',
    'attd_missed_n',
    'pcs_missed_n',
    'meal_missed_n',
    'missed_hrs',
    'attd_pcs_missed_hrs',
    'attd_missed_hrs',
    'pcs_missed_hrs',
    'meal_missed_hrs',

    # *memb_init_missed_* columns
    'memb_init_missed_n',
    'attd_pcs_memb_init_missed_n',
    'attd_memb_init_missed_n',
    'pcs_memb_init_missed_n',
    'meal_memb_init_missed_n',
    'memb_init_missed_hrs',
    'attd_pcs_memb_init_missed_hrs',
    'attd_memb_init_missed_hrs',
    'pcs_memb_init_missed_hrs',

    # *prov_init_missed_* columns
    'prov_init_missed_n',
    'attd_pcs_prov_init_missed_n',
    'attd_prov_init_missed_n',
    'pcs_prov_init_missed_n',
    'prov_init_missed_hrs',
    'attd_pcs_prov_init_missed_hrs',
    'attd_prov_init_missed_hrs',
    'pcs_prov_init_missed_hrs',

    # *night_missed_* columns
    'night_missed_hrs',
    'attd_pcs_night_missed_hrs',
    'attd_night_missed_hrs',
    'pcs_night_missed_hrs',
    'night_missed_n',
    'attd_pcs_night_missed_n',
    'attd_night_missed_n',
    'pcs_night_missed_n',
    'meal_night_missed_n',

    'attd_pcs_appropriate_hrs',
]

# Claims-derived columns used as model features, grouped below by suffix.
# Order is unchanged from the original list.
claims_feats = [
    # *_tc columns
    'tc',
    'hcbs_tc',
    'icf_tc',
    'ip_tc',
    'rx_tc',
    'ed_tc',
    'snf_tc',
    'out_tc',
    'pro_tc',
    'spfac_tc',
    'amb_tc',
    'hh_tc',
    'hosp_tc',
    'oth_tc',
    'hcbs_respite_tc',
    'hcbs_fam_care_stip_tc',
    'hcbs_com_trans_tc',
    'hcbs_educ_train_tc',
    'hcbs_com_liv_fam_tc',
    'hcbs_com_liv_tc',
    'hcbs_attend_care_tc',
    'hcbs_com_trans_waiv_tc',
    'hcbs_home_meal_tc',
    'hcbs_pers_care_tc',
    'hcbs_ther_behav_tc',
    'hcbs_unsk_respite_tc',
    'hcbs_waiv_svc_tc',

    # *_ddos columns
    'ddos',
    'hcbs_ddos',
    'icf_ddos',
    'ip_ddos',
    'rx_ddos',
    'ed_ddos',
    'snf_ddos',
    'out_ddos',
    'pro_ddos',
    'spfac_ddos',
    'amb_ddos',
    'hh_ddos',
    'hosp_ddos',
    'oth_ddos',
    'pcp_ddos',
    'pulmonar_ddos',
    'copd_ddos',
    'chf_ddos',
    'heart_ddos',
    'cancer_ddos',
    'ckd_ddos',
    'esrd_ddos',
    'hyperlipid_ddos',
    'diab_ddos',
    'alzh_ddos',
    'dementia_ddos',
    'stroke_ddos',
    'hypertension_ddos',
    'fall_ddos',
    'transplant_ddos',
    'liver_ddos',
    'hippfract_ddos',
    'depression_ddos',
    'psychosis_ddos',
    'drug_ddos',
    'alcohol_ddos',
    'paralysis_ddos',
]


In [None]:
def most_frequent(x):
    """Return the most frequent non-null value in ``x``.

    Used as a groupby aggregator, where ``x`` is the Series of one group.

    Parameters
    ----------
    x : pandas.Series or any 1-D sequence accepted by ``pd.Series``

    Returns
    -------
    scalar
        The mode of ``x``. When several values tie, the first entry of the
        sorted mode Series is returned. ``NaN`` is returned when ``x`` has no
        non-null values (empty or all-NaN), where the previous implementation
        raised ``KeyError``.
    """
    modes = pd.Series(x).mode()
    if modes.empty:
        return np.nan
    # Positional access: does not depend on the labels of the mode Series.
    return modes.iloc[0]

In [None]:
## good dates 2017-07-01 to  2020-03-01
# Study window: rows with bom before `id_date` form the "pre" period (features),
# rows on or after it form the "post" period (target).
start_date = pd.to_datetime('2018-01-01')
id_date = pd.to_datetime('2019-01-01')
end_date = pd.to_datetime('2019-12-31')
mco_id = 2
# Rows a member must have inside the window to be kept.
# NOTE(review): presumably one row per month over the 24-month window -- confirm.
required_rows = 24

in_window = (start_date <= visits.bom) & (end_date >= visits.bom) & (visits.mco_id == mco_id)
good_visits = visits.loc[in_window]
# Printed explicitly: a bare expression in the middle of a cell displays nothing.
print(f'members in window: {good_visits.member_id.nunique()}')

# Vectorised equivalent of groupby('member_id').filter(lambda g: len(g) == required_rows).
rows_per_member = good_visits.groupby('member_id')['member_id'].transform('size')
visits_all_pre_post = good_visits.loc[rows_per_member == required_rows]
print(f'members with {required_rows} rows: {visits_all_pre_post.member_id.nunique()}')

pre = visits_all_pre_post.loc[visits_all_pre_post.bom < id_date]
post = visits_all_pre_post.loc[visits_all_pre_post.bom >= id_date]

# One row per member from the pre period. Aggregations are named by string:
# passing the builtins max/min to agg is deprecated in pandas >= 2.1.
annual_feats = pre[dem_feats + ['member_id']].groupby('member_id').agg({
    'ggroup': most_frequent,
    'lob': most_frequent,
    'age': 'max',
    'is_male': 'max',
    'is_aligned': 'min',
    'has_facility_ddos': 'mean',
})
# Per-member mean of every claims column over the pre-period rows.
mom_feats = pre[claims_feats + ['member_id']].groupby('member_id').mean()
pre_year = annual_feats.merge(mom_feats, left_index=True, right_index=True)
assert annual_feats.shape[0] == mom_feats.shape[0] == pre_year.shape[0]

pre_year = pre_year.sort_index()

# Target: per-member totals over the post-period rows.
target = post.groupby('member_id')[['attd_pcs_appropriate_hrs', 'visit_hrs']].sum()
target = target.sort_index()

# Every member with a target must also have features.
assert len(target.index.difference(pre_year.index)) == 0
df = pre_year.merge(target, left_index=True, right_index=True)

In [None]:
# First rows of the merged feature + target frame
df.head()

In [None]:
# Members per ggroup value
df['ggroup'].value_counts()

In [None]:
# g = sns.relplot(x='attd_pcs_appropriate_hrs_x', y='attd_pcs_appropriate_hrs_y', row='ggroup', col='lob', hue='auth_attd_pcs_util_pct', data=df)
# Keep only members whose ggroup lies strictly between 0 and 4.
df = df.loc[df.ggroup.gt(0) & df.ggroup.lt(4)]

# Train lasso model
The lasso seems to do consistently worse than xgboost.

In [None]:
# Feature matrix columns: claims columns followed by the member-level columns.
features = claims_feats + dem_feats
# model = Lasso(alpha=1.0, random_state=seed,max_iter=10000, normalize=True)
# NOTE(review): `normalize=` was deprecated in scikit-learn 1.0 and removed in
# 1.2; on newer versions the features must be scaled explicitly instead.
model = Lasso(alpha=.85, random_state=seed, max_iter=10000, normalize=True)

# The split uses the notebook-level `seed` and `test_set_pct` from the
# configuration cell, so it can be replayed from the printed seed. Previously
# the split was unseeded, used the default hold-out size, and the cell
# re-drew `seed` without printing it.
X_train, X_test, y_train, y_test = train_test_split(
    df[features],
    df.attd_pcs_appropriate_hrs,
    test_size=test_set_pct,
    random_state=seed,
)
model.fit(X_train, y_train)

In [None]:
# Print every feature the fitted model kept (non-zero coefficient), largest coefficient first.
ranked = sorted(zip(model.coef_, features), reverse=True)
for coef, feat in ranked:
    if coef == 0:
        continue
    print(feat, coef)

In [None]:
# Score of the fitted model on the held-out split (R^2 for scikit-learn regressors)
model.score(X_test, y_test)

# Try some other model types

In [None]:
# Random forest on the same split; `fit` returns the estimator itself.
rf = RandomForestRegressor(random_state=seed).fit(X_train, y_train)
rf.score(X_test, y_test)

In [None]:
# Gradient boosting on the same split; `fit` returns the estimator itself.
gb = GradientBoostingRegressor(random_state=seed).fit(X_train, y_train)
gb.score(X_test, y_test)

# Tree models seemed to do better; check whether the result is consistent across random splits

In [None]:
# Fit gradient boosting on many random splits. Collects the held-out score of
# each run, the fitted models (reused later for ensemble predictions) and, per
# feature, the number of runs in which it received non-zero importance.
cnt = collections.Counter()
scores = []
gb_models = []
n_runs = 100

for _ in range(n_runs):
    # One seed per run drives both the split and the model, so any single run
    # can be reproduced from its seed. The unused `alpha` draw left over from
    # the lasso experiment has been removed.
    run_seed = random.randint(0, 1000)
    model = GradientBoostingRegressor(random_state=run_seed)

    X_train, X_test, y_train, y_test = train_test_split(
        df[features],
        df.attd_pcs_appropriate_hrs,
        test_size=test_set_pct,
        random_state=run_seed,
    )
    model.fit(X_train, y_train)
    scores.append(model.score(X_test, y_test))
    gb_models.append(model)

    # Counting does not depend on order, so no sort is needed.
    cnt.update(
        feat
        for importance, feat in zip(model.feature_importances_, features)
        if importance != 0
    )

np.mean(scores)

In [None]:
# Hard-coded feature counter. This assignment REPLACES the `cnt` built by the
# repeated gradient-boosting cell, so downstream cells use these fixed values.
# NOTE(review): presumably pasted from the output of an earlier 100-run loop so
# the loop need not be re-run -- confirm it still matches the current data.
# Entry order matters: Counter.most_common breaks ties by insertion order.
cnt = collections.Counter({'hcbs_tc': 100,
         'hcbs_attend_care_tc': 100,
         'hcbs_pers_care_tc': 100,
         'hcbs_ddos': 100,
         'hcbs_unsk_respite_tc': 100,
         'tc': 100,
         'out_ddos': 100,
         'ed_tc': 100,
         'ddos': 100,
         'snf_tc': 98,
         'stroke_ddos': 100,
         'age': 100,
         'hcbs_com_liv_tc': 100,
         'dementia_ddos': 100,
         'icf_ddos': 91,
         'hh_tc': 100,
         'pro_tc': 99,
         'paralysis_ddos': 100,
         'copd_ddos': 98,
         'chf_ddos': 100,
         'has_facility_ddos': 98,
         'hh_ddos': 99,
         'hypertension_ddos': 100,
         'amb_tc': 100,
         'is_male': 78,
         'out_tc': 100,
         'diab_ddos': 100,
         'psychosis_ddos': 82,
         'ed_ddos': 91,
         'alzh_ddos': 100,
         'hcbs_home_meal_tc': 100,
         'rx_tc': 100,
         'pro_ddos': 98,
         'fall_ddos': 98,
         'depression_ddos': 96,
         'ip_tc': 96,
         'ckd_ddos': 98,
         'rx_ddos': 98,
         'esrd_ddos': 71,
         'icf_tc': 64,
         'pulmonar_ddos': 96,
         'is_aligned': 81,
         'lob': 66,
         'hyperlipid_ddos': 95,
         'cancer_ddos': 97,
         'snf_ddos': 61,
         'liver_ddos': 80,
         'heart_ddos': 100,
         'pcp_ddos': 96,
         'ip_ddos': 74,
         'amb_ddos': 96,
         'ggroup': 69,
         'drug_ddos': 45,
         'hosp_ddos': 13,
         'transplant_ddos': 49,
         'hosp_tc': 12,
         'hcbs_com_trans_waiv_tc': 5,
         'hcbs_com_liv_fam_tc': 53,
         'alcohol_ddos': 9,
         'hippfract_ddos': 15,
         'oth_ddos': 2,
         'oth_tc': 1})
# Show the 50 features with the highest counts
cnt.most_common(50)

# Train 1000 lassos, find the best features
Turns out gradient boosting (gb) is better.

In [None]:
# cnt = collections.Counter()
# scores = []

# for i in range(1000):
#     seed = random.randint(0, 1000)
#     alpha = max(random.random(), 0.5)
#     model = Lasso(alpha=alpha, random_state=seed,max_iter=10000, normalize=True)

#     X_train, X_test, y_train, y_test = train_test_split(df[features], df.attd_pcs_appropriate_hrs)
#     model.fit(X_train, y_train)
#     scores.append(model.score(X_test, y_test))

#     for c, f in sorted(zip(model.coef_, features), reverse=True):
#         if c != 0:
#             cnt[f] += 1


# Look at pred vs actual for gb models

In [None]:
## good dates 2017-07-01 to  2020-03-01
# Same cohort construction as the MCO 2 training cell, repeated for MCO 1.
# TODO: factor both copies into one helper taking `mco_id`.
# NOTE(review): the training frame was additionally restricted to
# 0 < ggroup < 4; that filter is not applied here -- confirm this is intended.
start_date = pd.to_datetime('2018-01-01')
id_date = pd.to_datetime('2019-01-01')
end_date = pd.to_datetime('2019-12-31')
# model was trained on mco 2, lets test on mco 1
mco_id = 1
# Rows a member must have inside the window to be kept.
# NOTE(review): presumably one row per month over the 24-month window -- confirm.
required_rows = 24

in_window = (start_date <= visits.bom) & (end_date >= visits.bom) & (visits.mco_id == mco_id)
good_visits = visits.loc[in_window]

# Vectorised equivalent of groupby('member_id').filter(lambda g: len(g) == required_rows).
rows_per_member = good_visits.groupby('member_id')['member_id'].transform('size')
visits_all_pre_post = good_visits.loc[rows_per_member == required_rows]

pre = visits_all_pre_post.loc[visits_all_pre_post.bom < id_date]
post = visits_all_pre_post.loc[visits_all_pre_post.bom >= id_date]

# One row per member from the pre period. Aggregations are named by string:
# passing the builtins max/min to agg is deprecated in pandas >= 2.1.
annual_feats = pre[dem_feats + ['member_id']].groupby('member_id').agg({
    'ggroup': most_frequent,
    'lob': most_frequent,
    'age': 'max',
    'is_male': 'max',
    'is_aligned': 'min',
    'has_facility_ddos': 'mean',
})
# Per-member mean of every claims column over the pre-period rows.
mom_feats = pre[claims_feats + ['member_id']].groupby('member_id').mean()
pre_year = annual_feats.merge(mom_feats, left_index=True, right_index=True)
assert annual_feats.shape[0] == mom_feats.shape[0] == pre_year.shape[0]

pre_year = pre_year.sort_index()

# Target: per-member totals over the post-period rows.
target = post.groupby('member_id')[['attd_pcs_appropriate_hrs', 'visit_hrs']].sum()
target = target.sort_index()

# Every member with a target must also have features.
assert len(target.index.difference(pre_year.index)) == 0
# Overwrites `df`: from here on it holds the MCO 1 frame.
df = pre_year.merge(target, left_index=True, right_index=True)

In [None]:
# Average the predictions of all fitted gradient-boosting models.
# Columns are selected with `features` (the list the models were fitted on)
# instead of `X_train.columns`: `X_train` is reassigned by other cells, so
# relying on it makes this cell depend on execution order.
preds = np.mean(
    [gb_model.predict(df[features]) for gb_model in gb_models],
    axis=0,
)

In [None]:
# Copy of `df` with the ensemble prediction attached as column `pred`
result = df.copy()
result['pred'] = preds

In [None]:
# Scatter of ensemble prediction against the observed target
g = sns.relplot(
    data=result,
    x='attd_pcs_appropriate_hrs',
    y='pred',
    height=10,
)
g.set(
    title='Predicted vs Actual Appropriate Hrs',
    xlabel='Actual',
    ylabel='Predicted',
)

# Try KNN with the best features
look at similar members

In [None]:
# Features that received non-zero importance in more than 80 runs, taken from
# the top 50 entries of `cnt`.
best_feats = [f for f, n in cnt.most_common(50) if n > 80]
# NOTE(review): features are used unscaled, so distances are presumably
# dominated by the largest-magnitude columns -- confirm this is acceptable.
knn = KNeighborsRegressor(n_neighbors=100, weights='distance')
# `df` is whichever cohort frame was built last (the MCO 1 frame when the
# notebook is run top to bottom). The split is now seeded and uses the
# configured hold-out fraction so the neighbour inspection below is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df[best_feats],
    df.attd_pcs_appropriate_hrs,
    test_size=test_set_pct,
    random_state=seed,
)

knn.fit(X_train, y_train)
knn.score(X_test, y_test)

In [None]:
# First rows of the held-out feature matrix
X_test.head()

In [None]:
# Draw a 1-based position into X_test (`i` is reused later to fetch the matching
# y_test row) and keep that member as a one-row DataFrame.
i = random.randrange(1, X_test.shape[0] + 1)
member = X_test.iloc[[i - 1]]

In [None]:
# Distances to, and positions of, the 5 nearest training members
dist, indexes = knn.kneighbors(member, n_neighbors=5)

In [None]:
# Neighbour positions for the single queried member (first and only row of the result)
indexes[0]

In [None]:
# Feature values of the sampled member
member.head()

In [None]:
# kneighbors returns positions into the data passed to knn.fit (X_train), hence iloc
X_train.iloc[indexes[0]]

In [None]:
# Target values of those same neighbours
y_train.iloc[indexes[0]]

In [None]:
# Actual target of the sampled member (same positional slice used to build `member`)
y_test.iloc[i-1:i]