In [None]:
# %conda update -n base -c defaults conda
# %conda install pymysql
%load_ext autoreload
%autoreload 2

Collecting package metadata (current_repodata.json): done
Solving environment: | 
The environment is inconsistent, please check the package plan carefully
The following packages are causing the inconsistency:

  - defaults/linux-64::jpeg==9b=h024ee3a_2
  - defaults/linux-64::libtiff==4.1.0=h2733197_1
  - defaults/linux-64::lcms2==2.11=h396b838_0
  - defaults/linux-64::libwebp==1.0.1=h8e7db2f_0
  - defaults/linux-64::openjpeg==2.3.0=h05c96fa_1
  - defaults/linux-64::cairo==1.14.12=h8948797_3
  - defaults/linux-64::qt==5.9.7=h5867ecd_1
  - defaults/noarch::black==19.10b0=py_0
  - defaults/linux-64::harfbuzz==2.4.0=hca77d97_1
  - defaults/linux-64::mkl-service==2.3.0=py37he8ac12f_0
  - defaults/linux-64::pyqt==5.9.2=py37h05f1152_2
  - defaults/linux-64::numpy-base==1.19.2=py37hfa32c7d_0
  - defaults/linux-64::pango==1.45.3=hd140c19_0
  - defaults/noarch::flask==1.1.2=pyhd3eb1b0_0
  - defaults/linux-64::pyopenssl==19.1.0=py37_0
  - defaults/linux-64::secretstorage==3.3.1=py37h06a4308_0
  -

In [3]:
import os
import io
import sys
import time
import random
import warnings
import collections
from dateutil.relativedelta import relativedelta
import boto3
import sagemaker
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, BaggingRegressor, StackingRegressor, HistGradientBoostingRegressor 
from sklearn.linear_model import Ridge, Lasso, BayesianRidge, ElasticNet
from sklearn.preprocessing import OneHotEncoder

sys.path.append('../src')
import cb_utils

sns.set(style="darkgrid")
pd.options.display.max_columns = 500

  """)


In [3]:
# configuration
use_cache = True
seed = random.randint(0, 100)

print(f'Seed: {seed}')

Seed: 91


In [None]:
role = sagemaker.get_execution_role()
bucket_name = 'cb-analytics-us-east-2-prd'
prefix = 'sagemaker/'
file_name = 'ip_features_all.parquet'
# my_region = boto3.session.Session().region_name # set the region of the instance

obj = boto3.Session().resource('s3').Bucket(bucket_name).Object(os.path.join(prefix, file_name)).get()
ip_features_all = pd.read_parquet(io.BytesIO(obj['Body'].read()))
ip_features_all.head()

In [None]:
ip_features_all.shape

### Mark good member periods

eligible members on day of id and first day of post

iterate from first month to total_months - pre + post_period, create pre/post for member if elg

In [None]:
pre_months = 12
post_months = 6
pre_post_months = pre_months + post_months

In [None]:
months = sorted(ip_features_all.eom.unique())
n_months = len(months)
last_valid_pre_start = n_months - pre_post_months # 42
months[:3], months[-3:], n_months, last_valid_pre_start 

In [None]:
# create bool column flags to easily query what batches this can be in
# pres = {f'pre_{i}': False for i, _ in enumerate(months) if i < last_valid_pre_start}
# posts = {f'post_{i}': False for i, _ in enumerate(months) if i < last_valid_pre_start}
flags = {f'{prefix}_{i}': False for prefix in ['pre', 'post', 'pre_post_elg'] for i in range(n_months) if i < last_valid_pre_start}
ip_features_all = ip_features_all.assign(**flags);

In [None]:
ifa = ip_features_all

In [None]:
# assign bool flags for each potential period
periods = []
for i in tqdm(range(last_valid_pre_start)):
    # Build date anchor points relative to start month
    pre_start = months[i]
    pre_end = months[i+11]
    # id_date = pre_end + relativedelta(days=1)
    
    post_start = None
    post_end = None
    if i + 17 < n_months:
        post_start = months[i+12]
        post_end = months[i+17]
        
    periods.append([i, pre_start, pre_end, post_start, post_end])

    # Determine elg members
    pre_elg = ifa.loc[(ifa.eom == pre_end) & (ifa.is_cb_eligible)].member_id.unique()
    post_elg = ifa.loc[(ifa.eom == post_start) & (ifa.is_cb_eligible)].member_id.unique()

    pre_post_elg_mems = np.intersect1d(pre_elg, post_elg)

    # Flag elg members for period i
    ifa.loc[(ifa.eom >= pre_start) & (ifa.eom <= pre_end) & (ifa.member_id.isin(pre_elg)), f'pre_{i}'] = True 
    ifa.loc[(ifa.eom >= post_start) & (ifa.eom <= post_end) & (ifa.member_id.isin(post_elg)), f'post_{i}'] = True 

    ifa.loc[(ifa.eom >= pre_start) & (ifa.eom <= post_end) & (ifa.member_id.isin(pre_post_elg_mems)), f'pre_post_elg_{i}'] = True 

In [None]:
periods_df = pd.DataFrame(periods, columns=['period', 'pre_start', 'pre_end', 'post_start', 'post_end'])

In [None]:
months_df = pd.DataFrame(months, columns=['eom'])

In [None]:
ifa.is_cb_eligible = ifa.is_cb_eligible.fillna(False)
ifa.is_unaligned = ifa.is_unaligned.fillna(False)
ifa = ifa.fillna(0)

In [None]:
ifa = ifa.assign(is_male=np.where(ifa.gender=='m',1,0))

In [None]:
# assign state
ifa = ifa.assign(state=ifa.mco_name.str.split(' ').apply(lambda x: x[1]).replace({'Centene': 'IA'}))

In [None]:
ifa.to_parquet('./data/member_periods_v4.parquet')

In [None]:
# ifa.to_csv('./data/member_periods.csv')

### Build features + targets

In [None]:
member_periods = pd.read_parquet('./data/member_periods_v4.parquet')

In [4]:
target_cols = ['ip_tc', 'er_tc', 'snf_tc', 'amb_tc']
##
tc_feats = [c for c in ifa.columns if '_tc' in c]
ddos_cols = ['ip_ddos', 'er_ddos', 'out_ddos', 'snf_ddos', 'icf_ddos', 'hh_ddos', 'amb_ddos', 'hsp_ddos', 'pro_ddos', 'spc_fac_ddos', 'dme_ddos', 'cls_ddos', 'hha_ddos']
top_level_feats = ['age', 'is_male', 'state', 'ggroup', 'line_of_business_id']

NameError: name 'ifa' is not defined

In [5]:
tc_dx_feats = [
 # 'rx_tc',
 # 'other_tc',
 # 'ip_tc',
 # 'er_tc',
 # 'out_tc',
 # 'snf_tc',
 # 'icf_tc',
 # 'hh_tc',
 # 'amb_tc',
 # 'hsp_tc',
 # 'pro_tc',
 # 'spc_fac_tc',
 # 'dme_tc',
 # 'cls_tc',
 # 'hha_tc',
 'hcbs_attdpcs_tc',
 'hcbs_other_tc',
 'hcbs_support_house_tc',
 'hcbs_adult_day_tc',
 'hcbs_pers_tc',
 'hcbs_assist_tech_tc',
 'oxygen_tc',
 'hosp_bed_tc',
 'chf_tc',
 'heart_tc',
 'copd_tc',
 'pulmonar_tc',
 'cancer_tc',
 'ckd_tc',
 'esrd_tc',
 'lipidy_tc',
 'diab_tc',
 'alzh_tc',
 'demented_tc',
 'stroke_tc',
 'hyper_tc',
 'fall_tc',
 'trans_tc',
 'liver_tc',
 'hippy_tc',
 'depressed_tc',
 'psycho_tc',
 'druggy_tc',
 'boozy_tc',
 'paralyzed_tc',
 'mono_tc',
 'mono_dom_tc',
 'hemi_tc',
 'hemi_dom_tc',
 'para_tc',
 'quad_tc',
 'tbi_tc',
 'obese_tc',
 'pressure_ulcer_tc',
 'hemophilia_tc']

In [None]:
# def build_member_features(mdf): 
mcos = member_periods.mco_name.unique().tolist()
mco_cols = [f'is_{m.lower().replace(" ", "_")}' for m in mcos]
n_mcos = len(mcos)
def encode_mco(mco_str):
    one_hot = np.zeros(n_mcos, dtype=int)
    one_hot[mcos.index(mco_str)] = 1 
    return one_hot

In [None]:
lobs = member_periods.line_of_business_id.unique().tolist()
lob_cols = [f'is_lob_{l}' for l in lobs]
n_lobs = len(lobs)
def encode_lob(lob):
    one_hot = np.zeros(n_lobs, dtype=int)
    one_hot[lobs.index(lob)] = 1 
    return one_hot

In [None]:
groups = member_periods.ggroup.unique().tolist()
group_cols = [f'is_group_{l}' for l in groups]
n_groups = len(groups)
def encode_group(group):
    one_hot = np.zeros(n_groups, dtype=int)
    one_hot[groups.index(group)] = 1 
    return one_hot

In [None]:
states = member_periods.state.unique().tolist()
state_cols = [f'is_state_{l}' for l in states]
n_states = len(states)
def encode_state(state):
    one_hot = np.zeros(n_states, dtype=int)
    one_hot[states.index(state)] = 1 
    return one_hot

In [None]:
wide_ddos_cols = [f'{c}_{i}' for i in range(pre_months) for c in ddos_cols]
wide_tc_dx_cols = [f'{c}_{i}' for i in range(pre_months) for c in tc_dx_feats]

In [None]:
def build_member_features(mdf, months_range):
# mdf = member_periods.loc[(member_periods.pre_0) & (member_periods.pre_full_0) & (member_periods.member_id == 102)].sort_values('eom')
    # print(months_range)

    if len(mdf) == 0:
        return mdf
        
    demographic_data = mdf[top_level_feats + ['member_id']].iloc[-1]
    
    mdf = months_range.merge(mdf, on='eom', how='left')
    mdf = mdf.sort_values('eom')[ddos_cols + tc_dx_feats]
    mdf = mdf.fillna(0)
    
    ddos_data = mdf.to_numpy().reshape([1, -1])

    state_data = encode_state(demographic_data.state)
    lob_data = encode_lob(demographic_data.line_of_business_id)
    group_data = encode_group(demographic_data.ggroup)
    data = np.concatenate((ddos_data[0], state_data, lob_data, group_data, np.array([demographic_data.is_male, demographic_data.age, demographic_data.member_id])), axis=0, dtype=float)
    cols = wide_ddos_cols + wide_tc_dx_cols + state_cols + lob_cols + group_cols + ['is_male', 'age', 'member_id']

    return pd.DataFrame([data], columns=cols)

In [None]:
# mdf = member_periods.loc[(member_periods.pre_0) & (member_periods.pre_full_0 == False) & (member_periods.member_id == 26)].sort_values('eom')

In [None]:
def build_member_targets(mdf):
    if len(mdf) == 0:
        return pd.DataFrame([], columns=['member_id', 'target'])
    tc = mdf[target_cols].sum().sum()
#     pmpm = tc / mdf.cpmm.sum()
    return pd.DataFrame([[mdf.iloc[0].member_id, tc]], columns=['member_id', 'target'])

In [None]:
def build_targets(post_df):
    return post_df.groupby('member_id', as_index=False).apply(build_member_targets)

In [None]:
def build_features(pre_df, months_range):
    return pre_df.groupby('member_id', as_index=False).apply(lambda x: build_member_features(x, months_range))

In [None]:
# build features and targets for each period
period_dfs = []
for i in tqdm(range(last_valid_pre_start)):
    elg = member_periods.loc[member_periods[f'pre_post_elg_{i}']] 
    pre = elg.loc[elg[f'pre_{i}']] 

    post = elg.loc[elg[f'post_{i}']] 
    x = build_features(pre, months_df.loc[i:i+11])
    # if i < 42:
    y = build_targets(post)
    final = x.merge(y, how='left', left_on='member_id', right_on='member_id').assign(period=i)
    # else:
        # final = x.assign(period=i)
    period_dfs.append(final)

In [None]:
master_df = pd.concat(period_dfs)
master_df.shape

In [None]:
# master_df.to_parquet('./data/master_df.parquet')
# master_df.to_parquet('./data/master_ddos_df.parquet')
master_df.to_parquet('./data/master_wide_df_v4.parquet')

### Train/Val/Test split
Avoid any leakage by doing the splits at the member level

In [None]:
role = sagemaker.get_execution_role()
bucket_name = 'cb-analytics-us-east-2-prd'
prefix = 'sagemaker/'
file_name = 'master_wide_df_v4.parquet'
# my_region = boto3.session.Session().region_name # set the region of the instance

obj = boto3.Session().resource('s3').Bucket(bucket_name).Object(os.path.join(prefix, file_name)).get()
master_df = pd.read_parquet(io.BytesIO(obj['Body'].read()))
master_df.head()

In [None]:
# master_df = pd.read_parquet('./data/master_df.parquet')
# master_df = pd.read_parquet('./data/master_ddos_df.parquet')
master_df = pd.read_parquet('./data/master_wide_df_v4.parquet')
# master_df = master_df.loc[master_df.period > 24]
# make dtype str for these categorical features
# master_df.ggroup = master_df.ggroup.astype(str)
# master_df.line_of_business_id = master_df.line_of_business_id.astype(str)

In [1]:
master_df.head()

NameError: name 'master_df' is not defined

### One hot enc

In [None]:
# cat_feats = ['gender', 'mco_name', 'ggroup', 'line_of_business_id']
# one_hots = pd.get_dummies(master_df[cat_feats])
# master_df = pd.concat([master_df, one_hots], axis=1).drop(columns=cat_feats)

In [None]:
member_ids = master_df.member_id.unique()
n_members = len(member_ids)
n_members

In [None]:
train_n = int(n_members * .7)
val_n = int(n_members * .15)
test_n = n_members - train_n - val_n
train_n, val_n, test_n

In [None]:
np.random.shuffle(member_ids)

In [None]:
train_mems, val_mems, test_mems = np.split(member_ids, [train_n, train_n + val_n])
assert train_mems.shape[0] == train_n
assert val_mems.shape[0] == val_n
assert test_mems.shape[0] == test_n

In [None]:
training_df = master_df.loc[master_df.member_id.isin(train_mems)]
val_df = master_df.loc[master_df.member_id.isin(val_mems)]
test_df = master_df.loc[master_df.member_id.isin(test_mems)]

### Normalize/encode features if needed
not needed for trees, most linear models will do it for you if you pass the param

### Training

In [None]:
x_cols = [c for c in training_df.columns if c not in ['member_id', 'target', 'period']]
# x_cols = [c for c in training_df.columns if c not in ['member_id', 'target', 'period'] + cat_feats]
x = training_df[x_cols]
y = training_df.target

In [None]:
ridge = Ridge(alpha=1, normalize=True)

In [None]:
ridge.fit(x, y)

In [None]:
ridge.score(x,y)

In [None]:
val_x = val_df[x_cols]
val_y = val_df.target
ridge.score(val_x,val_y)

In [None]:
histr = HistGradientBoostingRegressor()
histr.fit(x, y)

In [None]:
histr.score(x, y)

In [None]:
histr.score(val_x, val_y)

In [None]:
val_df.head()

In [None]:
train_preds = histr.predict(x)
val_preds = histr.predict(val_x)

In [None]:
train_w_preds = training_df.assign(pred=train_preds, sample='train').merge(periods_df, on='period')
val_w_preds = val_df.assign(pred=val_preds, sample='validation').merge(periods_df, on='period')

In [None]:
out_cols = ['member_id', 'sample', 'target', 'pred', 'period', 'pre_start', 'pre_end', 'post_start', 'post_end']

In [None]:
pd.concat([train_w_preds[out_cols], val_w_preds[out_cols]]).to_csv('hgbr_12_mom_ddos.csv', index=False)

### Feature importance

In [None]:
sorted(zip(ridge.coef_, x.columns))

In [None]:
from sklearn.inspection import permutation_importance

In [None]:
result = permutation_importance(histr, val_x, val_y, n_repeats=10,random_state=0, n_jobs=-1)

In [None]:
sorted(zip(result.importances_mean, val_x.columns))