In [None]:
import os
import sys
import time
import random
import collections
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, BaggingRegressor, StackingRegressor
from sklearn.linear_model import Ridge, Lasso, BayesianRidge, ElasticNet

sys.path.append('../src')
import cb_utils

# Notebook-wide display defaults: let pandas render up to 500 columns and use
# seaborn's "darkgrid" style for every figure.
pd.set_option("display.max_columns", 500)
sns.set(style="darkgrid")

%load_ext autoreload
%autoreload 2

In [None]:
# Configuration
#   use_cache     -- forwarded to cb_utils.sql_query_to_df when the data is loaded.
#   seed          -- drawn at random for this run and printed below; to replay a
#                    run, replace the randint call with the printed value.
#   test_set_pct  -- fraction of rows intended for the held-out test split.
use_cache = True
seed = random.randint(0, 100)
test_set_pct = 0.2

# Apply the seed to both global RNGs so the printed value really determines the
# run: later cells draw from `random`, and scikit-learn falls back to NumPy's
# global RNG whenever no random_state is passed.
random.seed(seed)
np.random.seed(seed)
print(f'Seed: {seed}')

In [None]:
# Load every row of the cb.vwm_elig_claims_visits_auths_mm view, ordered by
# member_id and then bom, and replace all missing values with 0.
# NOTE(review): the original cell was tagged "2019", but the query itself
# applies no year filter.
query = "SELECT * FROM cb.vwm_elig_claims_visits_auths_mm vec ORDER BY member_id, bom"
visits = cb_utils.sql_query_to_df(query, use_cache=use_cache).fillna(0)

In [None]:
# (rows, columns) of the loaded frame.
visits.shape

In [None]:
# Number of distinct member_id values in the loaded frame.
visits.member_id.nunique()

In [None]:
# Summary statistics using pandas' describe() defaults.
visits.describe()

In [None]:
# Preview the first rows of the loaded frame.
visits.head()

In [None]:
# Average the selected columns for each `bom` value, then plot the mean of
# `attd_pcs_appropriate_hrs` over time. Marker colour and marker size both
# encode the per-`bom` mean of `has_facility_ddos`.
cols = ['bom', 'attd_pcs_appropriate_hrs', 'visit_hrs', 'has_facility_ddos']
mom_appropriate_hours = visits[cols].groupby('bom', as_index=False).mean()
sns.relplot(
    data=mom_appropriate_hours,
    x='bom',
    y='attd_pcs_appropriate_hrs',
    hue='has_facility_ddos',
    size='has_facility_ddos',
    height=4,
    aspect=3,
)

In [None]:
# Same per-`bom` means as above, this time plotting the mean of `visit_hrs`.
sns.relplot(
    data=mom_appropriate_hours,
    x='bom',
    y='visit_hrs',
    height=4,
    aspect=3,
)

In [None]:
# Show both hour measures on one chart: recompute the per-`bom` means, reshape
# them to long form (one row per bom / measure pair) and colour by measure.
mom_appropriate_hours = visits[cols].groupby('bom', as_index=False).mean()
melted = pd.melt(
    mom_appropriate_hours[['bom', 'visit_hrs', 'attd_pcs_appropriate_hrs']],
    id_vars=['bom'],
    var_name='hrs_type',
    value_name='hrs_val',
)
g = sns.relplot(
    data=melted,
    x='bom',
    y='hrs_val',
    hue='hrs_type',
    height=5,
    aspect=3,
)

In [None]:
# Display the per-`bom` mean table used by the plots above.
mom_appropriate_hours

In [None]:
# Study window. Per the original note the data are good from 2017-07-01 to
# 2020-03-01; the window chosen here lies inside that range.
#   start_date / end_date -- inclusive bounds on `bom`
#   id_date               -- boundary used further down to split pre from post
#   mco_id                -- only rows with this mco_id are kept
start_date = pd.to_datetime('2018-01-01')
id_date = pd.to_datetime('2019-01-01')
end_date = pd.to_datetime('2019-12-31')
mco_id = 2
good_visits = visits.loc[
    visits.bom.between(start_date, end_date)  # inclusive at both ends
    & visits.mco_id.eq(mco_id)
]

In [None]:
# Number of distinct members left after the date-window and mco_id filter.
good_visits.member_id.nunique()

In [None]:
# Keep only members with one row for every month of the study window.
# The expected row count is derived from start_date / end_date (24 for
# 2018-01-01 .. 2019-12-31), so it stays correct when the window is changed.
# A vectorized mask replaces the per-group Python lambda; the rows kept and
# their order are the same.
# NOTE(review): this counts rows, not distinct months -- it assumes the view
# holds at most one row per member per `bom`; confirm.
n_window_months = (end_date.year - start_date.year) * 12 + (end_date.month - start_date.month) + 1
rows_per_member = good_visits.member_id.map(good_visits.member_id.value_counts())
visits_all_pre_post = good_visits.loc[rows_per_member == n_window_months]
visits_all_pre_post.member_id.nunique()

In [None]:
# Split the complete-history rows at id_date: rows with bom strictly before it
# form `pre`, rows with bom on or after it form `post`. Rows stay at the
# member-month grain; no aggregation happens here.
pre = visits_all_pre_post[visits_all_pre_post.bom.lt(id_date)]
post = visits_all_pre_post[visits_all_pre_post.bom.ge(id_date)]

In [None]:
# Preview the first rows of the post-period slice.
post.head()

In [None]:
# Two lists of column names, grouped below by naming pattern.
# NOTE(review): neither list is referenced by any other cell visible in this
# notebook; the names suggest per-year vs. summed aggregation -- confirm the
# intended use.
yearly_feats = [
    'lob',
    'ggroup',
    'is_aligned',
    'is_unaligned',
    'has_facility_ddos',
    'auth_attd_pcs_util_pct',
    'auth_resp_util_pct',
]

sum_feats = [
    'attd_pcs_hrs',
    # auth* counts, units and hours
    'auths_n',
    'auth_attd_n',
    'auth_meal_n',
    'auth_pc_n',
    'auth_resp_n',
    'auth_units',
    'auth_hrs',
    'auth_attd_pcs_hrs',
    'auth_attd_hrs',
    'auth_meal_hrs',
    'auth_pcs_hrs',
    'auth_resp_hrs',
    # *visit_hrs
    'visit_hrs',
    'attd_pcs_visit_hrs',
    'attd_visit_hrs',
    'pcs_visit_hrs',
    'resp_visit_hrs',
    'night_visit_hrs',
    'attd_pcs_night_visit_hrs',
    'attd_night_visit_hrs',
    'pcs_night_visit_hrs',
    'resp_night_visit_hrs',
    # *visit_n
    'visit_n',
    'attd_pcs_visit_n',
    'attd_visit_n',
    'pcs_visit_n',
    'resp_visit_n',
    'meal_visit_n',
    'night_visit_n',
    'attd_pcs_night_visit_n',
    'attd_night_visit_n',
    'pcs_night_visit_n',
    'resp_night_visit_n',
    'meal_night_visit_n',
    # *avg_*visit_hrs
    'avg_visit_hrs',
    'avg_night_visit_hrs',
    'attd_avg_night_visit_hrs',
    'pcs_avg_night_visit_hrs',
    'resp_avg_night_visit_hrs',
    # *cg_visit_nd
    'cg_visit_nd',
    'attd_pcs_cg_visit_nd',
    'attd_cg_visit_nd',
    'pcs_cg_visit_nd',
    'meal_cg_visit_nd',
    'resp_cg_visit_nd',
    # *missed_n / *missed_hrs
    'missed_n',
    'attd_pcs_missed_n',
    'attd_missed_n',
    'pcs_missed_n',
    'meal_missed_n',
    'missed_hrs',
    'attd_pcs_missed_hrs',
    'attd_missed_hrs',
    'pcs_missed_hrs',
    'meal_missed_hrs',
    # *memb_init_missed_*
    'memb_init_missed_n',
    'attd_pcs_memb_init_missed_n',
    'attd_memb_init_missed_n',
    'pcs_memb_init_missed_n',
    'meal_memb_init_missed_n',
    'memb_init_missed_hrs',
    'attd_pcs_memb_init_missed_hrs',
    'attd_memb_init_missed_hrs',
    'pcs_memb_init_missed_hrs',
    # *prov_init_missed_*
    'prov_init_missed_n',
    'attd_pcs_prov_init_missed_n',
    'attd_prov_init_missed_n',
    'pcs_prov_init_missed_n',
    'prov_init_missed_hrs',
    'attd_pcs_prov_init_missed_hrs',
    'attd_prov_init_missed_hrs',
    'pcs_prov_init_missed_hrs',
    # *night_missed_*
    'night_missed_hrs',
    'attd_pcs_night_missed_hrs',
    'attd_night_missed_hrs',
    'pcs_night_missed_hrs',
    'night_missed_n',
    'attd_pcs_night_missed_n',
    'attd_night_missed_n',
    'pcs_night_missed_n',
    'meal_night_missed_n',
    'attd_pcs_appropriate_hrs',
    # *_tc
    'tc',
    'hcbs_tc',
    'icf_tc',
    'ip_tc',
    'rx_tc',
    'ed_tc',
    'snf_tc',
    'out_tc',
    'pro_tc',
    'spfac_tc',
    'amb_tc',
    'hh_tc',
    'hosp_tc',
    'oth_tc',
    'hcbs_respite_tc',
    'hcbs_fam_care_stip_tc',
    'hcbs_com_trans_tc',
    'hcbs_educ_train_tc',
    'hcbs_com_liv_fam_tc',
    'hcbs_com_liv_tc',
    'hcbs_attend_care_tc',
    'hcbs_com_trans_waiv_tc',
    'hcbs_home_meal_tc',
    'hcbs_pers_care_tc',
    'hcbs_ther_behav_tc',
    'hcbs_unsk_respite_tc',
    'hcbs_waiv_svc_tc',
    # *_ddos
    'ddos',
    'hcbs_ddos',
    'icf_ddos',
    'ip_ddos',
    'rx_ddos',
    'ed_ddos',
    'snf_ddos',
    'out_ddos',
    'pro_ddos',
    'spfac_ddos',
    'amb_ddos',
    'hh_ddos',
    'hosp_ddos',
    'oth_ddos',
    'pcp_ddos',
    'pulmonar_ddos',
    'copd_ddos',
    'chf_ddos',
    'heart_ddos',
    'cancer_ddos',
    'ckd_ddos',
    'esrd_ddos',
    'hyperlipid_ddos',
    'diab_ddos',
    'alzh_ddos',
    'dementia_ddos',
    'stroke_ddos',
    'hypertension_ddos',
    'fall_ddos',
    'transplant_ddos',
    'liver_ddos',
    'hippfract_ddos',
    'depression_ddos',
    'psychosis_ddos',
    'drug_ddos',
    'alcohol_ddos',
    'paralysis_ddos',
]


In [None]:
# Collapse the pre-period rows to one row per member by averaging every column,
# cast `ggroup` and `lob` back to integers, and order by member_id.
# NOTE(review): astype(int) truncates the averaged value; this is only
# meaningful if ggroup / lob are constant per member over the pre period --
# confirm.
pre_year = (
    pre.groupby('member_id')
    .mean()
    .assign(
        ggroup=lambda frame: frame.ggroup.astype(int),
        lob=lambda frame: frame.lob.astype(int),
    )
    .sort_index()
)
pre_year

In [None]:
# Post-period outcomes: per-member totals of the two hour measures, ordered by
# member_id.
target = (
    post.groupby('member_id')[['attd_pcs_appropriate_hrs', 'visit_hrs']]
    .sum()
    .sort_index()
)
target

In [None]:
# Join pre-period averages with post-period totals on member_id (the index of
# both frames). Columns present in both get the suffixes spelled out below:
# `_x` = pre-period mean, `_y` = post-period sum.
df = pd.merge(
    pre_year,
    target,
    how='inner',
    left_index=True,
    right_index=True,
    suffixes=('_x', '_y'),
)

In [None]:
# Preview the merged modelling frame.
df.head()

In [None]:
# Number of members per ggroup value in the merged frame.
df.ggroup.value_counts()

In [None]:
# Keep only members whose ggroup is strictly between 0 and 4.
# Note: this rebinds `df`, so earlier displays of `df` show the unfiltered frame.
df = df[df.ggroup.gt(0) & df.ggroup.lt(4)]

In [None]:
# Pre-period mean vs. post-period total of visit_hrs, one panel per
# ggroup (rows) x lob (columns), coloured by auth_attd_pcs_util_pct.
g = sns.relplot(
    data=df,
    x='visit_hrs_x',
    y='visit_hrs_y',
    row='ggroup',
    col='lob',
    hue='auth_attd_pcs_util_pct',
)

In [None]:
# Pre-period mean vs. post-period total of attd_pcs_appropriate_hrs, one panel
# per ggroup (rows) x lob (columns), coloured by auth_attd_pcs_util_pct.
g = sns.relplot(
    data=df,
    x='attd_pcs_appropriate_hrs_x',
    y='attd_pcs_appropriate_hrs_y',
    row='ggroup',
    col='lob',
    hue='auth_attd_pcs_util_pct',
)

# Train lasso model

In [None]:
# Fit a single Lasso on one train/test split, predicting the post-period total
# of attd_pcs_appropriate_hrs from the pre-period columns.
seed = random.randint(0, 1000)
print(f'Seed: {seed}')  # printed so this particular split can be replayed

# Both `_y` columns come from the post-period `target` frame. Leaving
# 'visit_hrs_y' among the predictors would leak post-period information into
# the model, so every outcome column is excluded, not only the one predicted.
target_cols = ['attd_pcs_appropriate_hrs_y', 'visit_hrs_y']
features = [c for c in df if c not in target_cols]

# NOTE(review): `normalize` was deprecated in scikit-learn 1.0 and removed in
# 1.2. It is kept so results stay comparable on the pinned version; on newer
# releases scale the inputs explicitly instead -- confirm the installed version.
model = Lasso(alpha=.85, random_state=seed, max_iter=10000, normalize=True)

# Use the configured hold-out fraction and tie the split to the seed; without
# these the split falls back to a 25% test set and an unseeded shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    df[features],
    df.attd_pcs_appropriate_hrs_y,
    test_size=test_set_pct,
    random_state=seed,
)
model.fit(X_train, y_train)

In [None]:
# List the features the Lasso kept (non-zero coefficient), largest coefficient
# first.
kept = [(coef, name) for coef, name in zip(model.coef_, features) if coef != 0]
for coef, name in sorted(kept, reverse=True):
    print(name, coef)

In [None]:
# Score of the fitted Lasso on the held-out split (R^2 for scikit-learn regressors).
model.score(X_test, y_test)

# Try some other model types

In [None]:
# Comparison model: random forest with default hyper-parameters, fitted and
# scored on the current X_train / X_test split.
rf = RandomForestRegressor(random_state=seed).fit(X_train, y_train)
rf.score(X_test, y_test)

In [None]:
# Comparison model: gradient boosting with default hyper-parameters, fitted and
# scored on the current X_train / X_test split.
gb = GradientBoostingRegressor(random_state=seed).fit(X_train, y_train)
gb.score(X_test, y_test)

# Train 1000 Lasso models and find the most frequently selected features

In [None]:
# Accumulators for the repeated-Lasso experiment: `cnt` tallies how often each
# feature is selected, `scores` collects one test score per fit.
cnt, scores = collections.Counter(), []

In [None]:
# Fit 1000 Lasso models, each on a fresh random split with a random alpha, and
# record (a) the test score and (b) which features ended with a non-zero
# coefficient. Uses `features` as defined by the single-Lasso cell.

# Start from empty accumulators so re-running this cell never adds a second
# batch of 1000 fits to the previous counts and scores.
cnt = collections.Counter()
scores = []

for i in range(1000):
    seed = random.randint(0, 1000)
    # NOTE(review): max(random.random(), 0.5) yields exactly 0.5 for about half
    # of the draws; if a uniform draw on [0.5, 1) was intended,
    # random.uniform(0.5, 1.0) would do that -- confirm.
    alpha = max(random.random(), 0.5)
    model = Lasso(alpha=alpha, random_state=seed, max_iter=10000, normalize=True)

    # Use the configured hold-out fraction. The split seed is drawn from
    # `random` over a wide range, so all randomness in the loop comes from one
    # RNG while repeated splits across iterations stay unlikely.
    X_train, X_test, y_train, y_test = train_test_split(
        df[features],
        df.attd_pcs_appropriate_hrs_y,
        test_size=test_set_pct,
        random_state=random.randrange(2**32),
    )
    model.fit(X_train, y_train)
    scores.append(model.score(X_test, y_test))

    # Order is irrelevant for counting, so no sort is needed.
    cnt.update(f for c, f in zip(model.coef_, features) if c != 0)


In [None]:
# Average test score across all fits recorded in `scores`.
np.mean(scores)

In [None]:
# The 50 features most often kept with a non-zero coefficient, with their counts.
cnt.most_common(50)

In [None]:
# Hard-coded selection counts kept for comparison with `cnt`.
# NOTE(review): presumably pasted from an earlier run of the 1000-fit loop (the
# largest count is 1000) -- confirm provenance.
old = collections.Counter({
    'icf_ddos': 61,
    'resp_cg_visit_nd': 283,
    'auth_attd_pcs_util_pct': 960,
    'avg_visit_hrs': 889,
    'attd_pcs_cg_visit_nd': 950,
    'attd_pcs_appropriate_hrs_x': 1000,
    'attd_pcs_visit_n': 914,
    'auth_units': 935,
    'night_missed_n': 167,
    'visit_hrs': 765,
    'attd_pcs_hrs': 461,
    'resp_visit_hrs': 213,
    'missed_n': 277,
    'auth_resp_hrs': 360,
    'missed_hrs': 150,
    'auths_n': 8,
    'attd_cg_visit_nd': 95,
    'attd_pcs_visit_hrs': 41,
    'resp_visit_n': 1,
    'auth_attd_pcs_hrs': 70,
    'attd_night_visit_hrs': 6,
    'dementia_ddos': 5,
    'hh_tc': 9,
    'night_missed_hrs': 16,
    'hh_ddos': 12,
    'resp_night_visit_n': 5,
    'auth_hrs': 2,
    'auth_pc_n': 2,
    'auth_resp_n': 2,
    'attd_memb_init_missed_n': 1,
    'attd_memb_init_missed_hrs': 1,
})
old.most_common(50)

# Try KNN with the best features

In [None]:
# From the 20 most frequently selected features, keep those selected in more
# than 100 of the fits.
best_feats = [name for name, hits in cnt.most_common(20) if hits > 100]

In [None]:
from sklearn.neighbors import KNeighborsRegressor

In [None]:
# Distance-weighted KNN regression restricted to the most frequently selected
# features.
# Split first, using the configured hold-out fraction and the current seed
# (the same `seed` the tree models use as random_state).
X_train, X_test, y_train, y_test = train_test_split(
    df[best_feats],
    df.attd_pcs_appropriate_hrs_y,
    test_size=test_set_pct,
    random_state=seed,
)

# Cap the neighbourhood at the number of training rows: asking for more
# neighbours than there are samples raises an error at prediction time.
# NOTE(review): the features are used unscaled, so columns with large numeric
# ranges dominate the distance -- confirm this is intended.
knn = KNeighborsRegressor(n_neighbors=min(100, len(X_train)), weights='distance')

knn.fit(X_train, y_train)
knn.score(X_test, y_test)