In [1]:
import os
import sys
import time
import random
import warnings
import collections
from dateutil.relativedelta import relativedelta
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, BaggingRegressor, StackingRegressor, HistGradientBoostingRegressor 
from sklearn.linear_model import Ridge, Lasso, BayesianRidge, ElasticNet
from sklearn.preprocessing import OneHotEncoder

# Make the project's local helper module importable from the notebook directory.
sys.path.append('../src')
import cb_utils

# Plot style, and a high column limit so wide feature frames display in full.
sns.set(style="darkgrid")
pd.options.display.max_columns = 500

%load_ext autoreload
%autoreload 2



In [2]:
# configuration
use_cache = True

# Draw a fresh seed for this run and print it so the run can be reproduced.
seed = random.randint(0, 100)
# Apply the seed: without this the printed value has no effect on the
# stochastic steps further down (shuffling, model fitting).
random.seed(seed)
np.random.seed(seed)

print(f'Seed: {seed}')

Seed: 79


In [3]:
# Pull the full member-month feature table through the project's SQL helper
# (the helper is asked to use its cache when `use_cache` is set).
query = "select * from junk.ip_features_all;"
ip_features_all = cb_utils.sql_query_to_df(query, use_cache=use_cache)

Pulled query from cache


In [4]:
# Quick size check of the raw feature table: (rows, columns).
ip_features_all.shape

(1444147, 140)

In [5]:
# ip_features_all.to_parquet('./data/ip_features_all.parquet')

# ip_features_all.to_csv('./data/ip_features_all.csv', index=False)

In [5]:
# Member key table, fetched through the same SQL helper and cache setting.
member_key_sql = "select * from junk.ip_member_key;"
member_key = cb_utils.sql_query_to_df(member_key_sql, use_cache=use_cache)

Pulled query from cache


### Mark good member periods

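A member is eligible for a period if they are CB-eligible both in the last month of the pre window (the identification date) and in the first month of the post window.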

Iterate over start months from the first month up to `n_months - (pre_months + post_months)`; for each start month, flag the pre/post rows of every eligible member.

In [7]:
# Window lengths in months: features are built from the pre window and targets
# from the post window that immediately follows it.
pre_months = 12
post_months = 6
pre_post_months = pre_months + post_months

In [8]:
# Distinct `eom` values present in the data, in ascending order.
months = sorted(ip_features_all.eom.unique())
n_months = len(months)
# Upper bound (exclusive, as used by `range` below) on the pre-window start index.
# NOTE(review): index `n_months - pre_post_months` itself would still fit a full
# pre+post window, so one possible period is left out — confirm this is intended.
last_valid_pre_start = n_months - pre_post_months # 42
months[:3], months[-3:], n_months, last_valid_pre_start 

([datetime.date(2017, 1, 31),
  datetime.date(2017, 2, 28),
  datetime.date(2017, 3, 31)],
 [datetime.date(2021, 10, 31),
  datetime.date(2021, 11, 30),
  datetime.date(2021, 12, 31)],
 60,
 42)

In [9]:
# create bool column flags to easily query what batches this can be in
# pres = {f'pre_{i}': False for i, _ in enumerate(months) if i < last_valid_pre_start}
# posts = {f'post_{i}': False for i, _ in enumerate(months) if i < last_valid_pre_start}
flags = {f'{prefix}_{i}': False for prefix in ['pre', 'post', 'pre_full', 'post_full'] for i in range(n_months) if i < last_valid_pre_start}
ip_features_all = ip_features_all.assign(**flags)

  ip_features_all = ip_features_all.assign(**flags)
  ip_features_all = ip_features_all.assign(**flags)
  ip_features_all = ip_features_all.assign(**flags)
  ip_features_all = ip_features_all.assign(**flags)
  ip_features_all = ip_features_all.assign(**flags)
  ip_features_all = ip_features_all.assign(**flags)
  ip_features_all = ip_features_all.assign(**flags)
  ip_features_all = ip_features_all.assign(**flags)
  ip_features_all = ip_features_all.assign(**flags)
  ip_features_all = ip_features_all.assign(**flags)
  ip_features_all = ip_features_all.assign(**flags)
  ip_features_all = ip_features_all.assign(**flags)
  ip_features_all = ip_features_all.assign(**flags)
  ip_features_all = ip_features_all.assign(**flags)
  ip_features_all = ip_features_all.assign(**flags)
  ip_features_all = ip_features_all.assign(**flags)
  ip_features_all = ip_features_all.assign(**flags)
  ip_features_all = ip_features_all.assign(**flags)
  ip_features_all = ip_features_all.assign(**flags)
  ip_feature

In [10]:
# Short alias for the same frame (not a copy): in-place writes through `ifa`
# also modify `ip_features_all`.
ifa = ip_features_all

In [None]:
# assign bool flags for each potential period
# Eligibility as a clean boolean mask; missing values count as not eligible.
is_elg = ifa.is_cb_eligible.fillna(False).astype(bool)

periods = []
for i in tqdm(range(last_valid_pre_start)):
    # Build date anchor points relative to start month, derived from the
    # configured window lengths rather than hard-coded offsets.
    pre_start = months[i]
    pre_end = months[i + pre_months - 1]
    post_start = months[i + pre_months]
    post_end = months[i + pre_post_months - 1]
    periods.append([i, pre_start, pre_end, post_start, post_end])

    # Row masks for the two windows, computed once and reused below.
    in_pre = (ifa.eom >= pre_start) & (ifa.eom <= pre_end)
    in_post = (ifa.eom >= post_start) & (ifa.eom <= post_end)

    # Determine elg members: eligible in the last pre month AND the first post month.
    pre_elg = ifa.loc[(ifa.eom == pre_end) & is_elg, 'member_id'].unique()
    post_elg = ifa.loc[(ifa.eom == post_start) & is_elg, 'member_id'].unique()
    elg_mems = np.intersect1d(pre_elg, post_elg)

    # Members with exactly `pre_months` / `post_months` rows inside the window.
    pre_counts = ifa.loc[in_pre, 'member_id'].value_counts()
    post_counts = ifa.loc[in_post, 'member_id'].value_counts()
    full_pre = pre_counts.index[pre_counts == pre_months]
    full_post = post_counts.index[post_counts == post_months]

    # Flag elg members for period i
    ifa.loc[in_pre & ifa.member_id.isin(elg_mems), f'pre_{i}'] = True
    ifa.loc[in_post & ifa.member_id.isin(elg_mems), f'post_{i}'] = True

    # Flag members whose window is fully populated for period i
    ifa.loc[in_pre & ifa.member_id.isin(full_pre), f'pre_full_{i}'] = True
    ifa.loc[in_post & ifa.member_id.isin(full_post), f'post_full_{i}'] = True

100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 42/42 [01:14<00:00,  1.77s/it]


In [23]:
# Lookup table of the window boundaries for each period index (joined onto predictions later).
periods_df = pd.DataFrame(periods, columns=['period', 'pre_start', 'pre_end', 'post_start', 'post_end'])

In [24]:
# Missing boolean flags are treated as False; every other missing value becomes 0.
for flag_col in ['is_cb_eligible', 'is_unaligned']:
    ifa[flag_col] = ifa[flag_col].fillna(False)
ifa = ifa.fillna(0)

In [25]:
# Integer indicator: 1 where gender is 'm', 0 for anything else.
ifa = ifa.assign(is_male=(ifa.gender == 'm').astype(int))

In [26]:
# Persist the flagged, cleaned frame so the feature-building section can start from disk.
ifa.to_parquet('./data/member_periods.parquet')

### Build features + targets

In [27]:
# Reload the flagged member-month frame written in the previous section.
member_periods = pd.read_parquet('./data/member_periods.parquet')

In [28]:
# Columns whose post-window values are summed into the prediction target.
target_cols = ['ip_tc', 'er_tc', 'snf_tc', 'amb_tc']
##
# Every column whose name contains '_tc'.
# NOTE(review): read from `ifa` rather than `member_periods`, so this cell needs
# the earlier section to have run in the same kernel — confirm that is intended.
tc_feats = [c for c in ifa.columns if '_tc' in c]
# `*_ddos` columns used as the numeric features (summed, or laid out per month).
ddos_cols = ['ip_ddos', 'er_ddos', 'out_ddos', 'snf_ddos', 'icf_ddos', 'hh_ddos', 'amb_ddos', 'hsp_ddos', 'pro_ddos', 'spc_fac_ddos', 'dme_ddos', 'cls_ddos', 'hha_ddos']
# Member-level attributes read from the member's last row in the window.
top_level_feats = ['age', 'is_male', 'mco_name', 'ggroup', 'line_of_business_id']

In [29]:
# Vocabulary of MCO names; its order fixes the one-hot positions and column names.
mcos = member_periods.mco_name.unique().tolist()
mco_cols = ['is_' + m.lower().replace(" ", "_") for m in mcos]
n_mcos = len(mcos)


def encode_mco(mco_str):
    """Return a length-`n_mcos` integer one-hot vector for `mco_str`.

    The hot position is the index of `mco_str` in `mcos`; a value that is not
    in `mcos` raises ValueError.
    """
    position = mcos.index(mco_str)
    encoded = np.zeros(n_mcos, dtype=int)
    encoded[position] = 1
    return encoded

In [30]:
# Vocabulary of line-of-business ids; its order fixes the one-hot positions.
lobs = member_periods.line_of_business_id.unique().tolist()
lob_cols = ['is_lob_{}'.format(l) for l in lobs]
n_lobs = len(lobs)


def encode_lob(lob):
    """Return a length-`n_lobs` integer one-hot vector for `lob`.

    The hot position is the index of `lob` in `lobs`; a value that is not in
    `lobs` raises ValueError.
    """
    position = lobs.index(lob)
    encoded = np.zeros(n_lobs, dtype=int)
    encoded[position] = 1
    return encoded

In [31]:
# Vocabulary of `ggroup` values; its order fixes the one-hot positions.
groups = member_periods.ggroup.unique().tolist()
group_cols = ['is_group_{}'.format(l) for l in groups]
n_groups = len(groups)


def encode_group(group):
    """Return a length-`n_groups` integer one-hot vector for `group`.

    The hot position is the index of `group` in `groups`; a value that is not
    in `groups` raises ValueError.
    """
    position = groups.index(group)
    encoded = np.zeros(n_groups, dtype=int)
    encoded[position] = 1
    return encoded

In [32]:
def get_one_all_cats_and_new_cols(df, col):
    """Collect the distinct values of `df[col]` and matching indicator column names.

    Returns a tuple `(values, cols, n)`: the distinct values in order of first
    appearance, one `is_<col>_<value>` name per value, and the number of names.
    """
    distinct = df[col].unique().tolist()
    indicator_names = []
    for value in distinct:
        indicator_names.append(f'is_{col}_{value}')
    return distinct, indicator_names, len(indicator_names)
    
    
def get_one_hot_and_labels(value, all_values, n_values):
    """Return a length-`n_values` integer one-hot vector for `value`.

    The hot position is the index of `value` in `all_values`; a value that is
    not in `all_values` raises ValueError.
    """
    position = all_values.index(value)
    encoded = np.zeros(n_values, dtype=int)
    encoded[position] = 1
    return encoded

In [45]:
def build_member_yearly_features(mdf):
    """Collapse one member's rows into a single feature row.

    The row holds the column-wise sums of `ddos_cols`, the one-hot encodings of
    the member's mco / line of business / group, and `is_male`, `age` and
    `member_id`. Member-level attributes come from the row with the latest
    `eom`. Returns a one-row DataFrame of floats.
    """
    latest = mdf.sort_values('eom')[['member_id'] + top_level_feats].iloc[-1]

    ddos_totals = mdf[ddos_cols].sum().to_numpy()

    mco_vec = encode_mco(latest.mco_name)
    lob_vec = encode_lob(latest.line_of_business_id)
    group_vec = encode_group(latest.ggroup)
    scalars = np.array([latest.is_male, latest.age, latest.member_id])

    feature_names = ddos_cols + mco_cols + lob_cols + group_cols + ['is_male', 'age', 'member_id']
    feature_values = np.concatenate(
        (ddos_totals, mco_vec, lob_vec, group_vec, scalars), axis=0, dtype=float
    )
    return pd.DataFrame([feature_values], columns=feature_names)

In [46]:
# Column names for the month-by-month layout: every `ddos_cols` entry suffixed
# with month 0, then every entry suffixed with month 1, and so on.
wide_ddos_cols = [f'{c}_{i}' for i in range(pre_months) for c in ddos_cols]

In [47]:
def build_member_features(mdf):
    """Build one member's month-by-month ("wide") feature row.

    The row holds each month's `ddos_cols` values in `eom` order (named by
    `wide_ddos_cols`), the one-hot encodings of the member's mco / line of
    business / group, and `is_male`, `age` and `member_id`. Returns a one-row
    DataFrame of floats.

    Raises ValueError when `mdf` does not contain exactly `pre_months` rows,
    since the wide layout needs one row per month.
    """
    # Sort first so that "last row" means the most recent month, matching
    # build_member_yearly_features.
    mdf = mdf.sort_values('eom')
    demographic_data = mdf[top_level_feats + ['member_id']].iloc[-1]

    # Row-major flattening gives month 0's columns, then month 1's, ... which is
    # the order of `wide_ddos_cols`.
    ddos_data = mdf[ddos_cols].to_numpy().reshape(-1)
    if ddos_data.size != len(wide_ddos_cols):
        raise ValueError(
            f'expected {pre_months} monthly rows for member '
            f'{demographic_data.member_id}, got {len(mdf)}'
        )

    mco_data = encode_mco(demographic_data.mco_name)
    lob_data = encode_lob(demographic_data.line_of_business_id)
    group_data = encode_group(demographic_data.ggroup)
    data = np.concatenate((ddos_data, mco_data, lob_data, group_data, np.array([demographic_data.is_male, demographic_data.age, demographic_data.member_id])), axis=0, dtype=float)
    cols = wide_ddos_cols + mco_cols + lob_cols + group_cols + ['is_male', 'age', 'member_id']

    return pd.DataFrame([data], columns=cols)

In [48]:
def build_member_targets(mdf):
    """Return a one-row frame `(member_id, target)` for one member.

    `target` is the sum of all `target_cols` values over the rows of `mdf`;
    `member_id` is read from the first row.
    """
    total_cost = mdf[target_cols].sum().sum()
    first_row = mdf.iloc[0]
    record = [first_row.member_id, total_cost]
    return pd.DataFrame([record], columns=['member_id', 'target'])

In [49]:
def build_targets(post_df):
    """Apply `build_member_targets` to each member's rows of `post_df`."""
    per_member = post_df.groupby('member_id', as_index=False)
    return per_member.apply(build_member_targets)

In [50]:
def build_features(pre_df):
    """Apply `build_member_yearly_features` to each member's rows of `pre_df`.

    `build_member_features` is the month-by-month alternative to the yearly
    builder used here.
    """
    per_member = pre_df.groupby('member_id', as_index=False)
    return per_member.apply(build_member_yearly_features)

In [52]:
# build features and targets for each period
# Set `full_pre_only` to keep only members whose pre window is fully populated.
full_pre_only = False
period_dfs = []
for i in tqdm(range(30, last_valid_pre_start)):
    pre_mask = member_periods[f'pre_{i}']
    if full_pre_only:
        pre_mask = pre_mask & member_periods[f'pre_full_{i}']
    pre = member_periods.loc[pre_mask]
    post = member_periods.loc[member_periods[f'post_{i}']]

    # One row per member: pre-window features joined to the post-window target.
    period_dfs.append(
        build_features(pre)
        .merge(build_targets(post), on='member_id')
        .assign(period=i)
    )

100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [09:02<00:00, 45.17s/it]


In [54]:
# Stack the per-period frames; each frame keeps its own index, so index values repeat.
master_df = pd.concat(period_dfs)
master_df.shape

(343218, 50)

In [55]:
# Persist the assembled dataset; the commented lines are alternative output
# names kept from other feature layouts.
# master_df.to_parquet('./data/master_df.parquet')
# master_df.to_parquet('./data/master_ddos_df.parquet')
master_df.to_parquet('./data/master_ddos_simple.parquet')

### Train/Val/Test split
Avoid any leakage by doing the splits at the member level

In [56]:
# Reload the dataset written above; switch the active line to pick another layout.
# master_df = pd.read_parquet('./data/master_df.parquet')
# master_df = pd.read_parquet('./data/master_ddos_df.parquet')
master_df = pd.read_parquet('./data/master_ddos_simple.parquet')
# master_df = master_df.loc[master_df.period > 24]
# make dtype str for these categorical features
# master_df.ggroup = master_df.ggroup.astype(str)
# master_df.line_of_business_id = master_df.line_of_business_id.astype(str)

In [57]:
# Peek at the first rows to check the feature layout.
master_df.head()

Unnamed: 0,ip_ddos,er_ddos,out_ddos,snf_ddos,icf_ddos,hh_ddos,amb_ddos,hsp_ddos,pro_ddos,spc_fac_ddos,dme_ddos,cls_ddos,hha_ddos,is_anthem_tn,is_uhc_tn,is_uhc_fl,is_uhc_tx,is_uhc_ks,is_anthem_ia,is_ia_centene_itc,is_lob_1,is_lob_3,is_lob_2,is_lob_8,is_group_0.0,is_group_3.0,is_group_2.0,is_group_1.0,is_group_-1.0,is_group_5.0,is_group_6.0,is_group_4.0,is_group_8.0,is_group_14.0,is_group_16.0,is_group_7.0,is_group_11.0,is_group_15.0,is_group_20.0,is_group_12.0,is_group_13.0,is_group_9.0,is_group_21.0,is_group_18.0,is_group_10.0,is_male,age,member_id,target,period
0,1.0,4.0,8.0,0.0,0.0,7.0,0.0,0.0,19.0,0.0,9.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,69.0,1.0,0.0,30
1,0.0,0.0,0.0,0.0,0.0,23.0,0.0,0.0,2.0,0.0,3.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,91.0,2.0,419.98,30
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,61.0,3.0,0.0,30
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,8.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,42.0,6.0,0.0,30
4,0.0,0.0,4.0,0.0,0.0,30.0,0.0,0.0,9.0,0.0,6.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,56.0,8.0,0.0,30


### One-hot encoding

In [58]:
# cat_feats = ['gender', 'mco_name', 'ggroup', 'line_of_business_id']
# one_hots = pd.get_dummies(master_df[cat_feats])
# master_df = pd.concat([master_df, one_hots], axis=1).drop(columns=cat_feats)

In [59]:
# Distinct members in the dataset; the split below is done on these ids, not on rows.
member_ids = master_df.member_id.unique()
n_members = len(member_ids)
n_members

33544

In [60]:
# 70% / 15% of members for train / validation; test takes whatever remains.
train_frac, val_frac = .7, .15
train_n = int(n_members * train_frac)
val_n = int(n_members * val_frac)
test_n = n_members - (train_n + val_n)
train_n, val_n, test_n

(23480, 5031, 5033)

In [61]:
# Shuffle member ids reproducibly. Sorting first makes the result independent of
# the current order (so re-running this cell gives the same split), and the
# generator is seeded from the configured `seed`.
member_ids = np.random.default_rng(seed).permutation(np.sort(member_ids))

In [62]:
# Cut the shuffled ids into three consecutive slices and verify their sizes.
val_start = train_n
test_start = train_n + val_n
train_mems = member_ids[:val_start]
val_mems = member_ids[val_start:test_start]
test_mems = member_ids[test_start:]
for members, expected_n in ((train_mems, train_n), (val_mems, val_n), (test_mems, test_n)):
    assert members.shape[0] == expected_n

In [63]:
# Select each split's rows by member id, so all of a member's periods stay together.
training_df, val_df, test_df = (
    master_df.loc[master_df.member_id.isin(members)]
    for members in (train_mems, val_mems, test_mems)
)

### Save features for aws sagemaker

In [64]:
# Target first, then every feature column; ids and bookkeeping columns are left out.
non_feature_cols = ['member_id', 'target', 'period']
export_cols = ['target'] + [c for c in training_df.columns if c not in non_feature_cols]

# Written without header or index; the printed list records the column order.
for split_df, out_path in (
    (training_df, 'data/train.csv'),
    (val_df, 'data/val.csv'),
    (test_df, 'data/test.csv'),
):
    split_df[export_cols].to_csv(out_path, header=False, index=False)
print(export_cols)

['target', 'ip_ddos', 'er_ddos', 'out_ddos', 'snf_ddos', 'icf_ddos', 'hh_ddos', 'amb_ddos', 'hsp_ddos', 'pro_ddos', 'spc_fac_ddos', 'dme_ddos', 'cls_ddos', 'hha_ddos', 'is_anthem_tn', 'is_uhc_tn', 'is_uhc_fl', 'is_uhc_tx', 'is_uhc_ks', 'is_anthem_ia', 'is_ia_centene_itc', 'is_lob_1', 'is_lob_3', 'is_lob_2', 'is_lob_8', 'is_group_0.0', 'is_group_3.0', 'is_group_2.0', 'is_group_1.0', 'is_group_-1.0', 'is_group_5.0', 'is_group_6.0', 'is_group_4.0', 'is_group_8.0', 'is_group_14.0', 'is_group_16.0', 'is_group_7.0', 'is_group_11.0', 'is_group_15.0', 'is_group_20.0', 'is_group_12.0', 'is_group_13.0', 'is_group_9.0', 'is_group_21.0', 'is_group_18.0', 'is_group_10.0', 'is_male', 'age']


### Training

In [56]:
# Feature matrix and target for training; ids and bookkeeping columns are excluded.
x_cols = [c for c in training_df.columns if c not in ['member_id', 'target', 'period']]
# x_cols = [c for c in training_df.columns if c not in ['member_id', 'target', 'period'] + cat_feats]
x = training_df[x_cols]
y = training_df.target

In [57]:
# Linear baseline.
# NOTE(review): the warning printed when fitting below says `normalize` is
# deprecated and recommends a StandardScaler pipeline instead; later cells read
# `ridge.coef_`, so any such change has to be made together with them.
ridge = Ridge(alpha=1, normalize=True)

In [58]:
# Fit the ridge baseline on the training split.
ridge.fit(x, y)

If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), Ridge())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)

Set parameter alpha to: original_alpha * n_samples. 


Ridge(alpha=1, normalize=True)

In [59]:
# Score of the ridge baseline on the data it was trained on.
ridge.score(x,y)

0.12626577172672981

In [60]:
# Validation features/target with the same columns as training, then the ridge score on them.
val_x = val_df[x_cols]
val_y = val_df.target
ridge.score(val_x,val_y)

0.12219154151718581

In [61]:
# Gradient-boosted model with default hyperparameters, fit on the training split.
histr = HistGradientBoostingRegressor()
histr.fit(x, y)

HistGradientBoostingRegressor()

In [62]:
# Score of the boosted model on the training split.
histr.score(x, y)

0.23895925809879914

In [63]:
# Score of the boosted model on the validation split (compare with the training score above).
histr.score(val_x, val_y)

0.13840631409141568

In [64]:
# Peek at the validation rows.
val_df.head()

Unnamed: 0,ip_ddos_0,er_ddos_0,out_ddos_0,snf_ddos_0,icf_ddos_0,hh_ddos_0,amb_ddos_0,hsp_ddos_0,pro_ddos_0,spc_fac_ddos_0,dme_ddos_0,cls_ddos_0,hha_ddos_0,ip_ddos_1,er_ddos_1,out_ddos_1,snf_ddos_1,icf_ddos_1,hh_ddos_1,amb_ddos_1,hsp_ddos_1,pro_ddos_1,spc_fac_ddos_1,dme_ddos_1,cls_ddos_1,hha_ddos_1,ip_ddos_2,er_ddos_2,out_ddos_2,snf_ddos_2,icf_ddos_2,hh_ddos_2,amb_ddos_2,hsp_ddos_2,pro_ddos_2,spc_fac_ddos_2,dme_ddos_2,cls_ddos_2,hha_ddos_2,ip_ddos_3,er_ddos_3,out_ddos_3,snf_ddos_3,icf_ddos_3,hh_ddos_3,amb_ddos_3,hsp_ddos_3,pro_ddos_3,spc_fac_ddos_3,dme_ddos_3,cls_ddos_3,hha_ddos_3,ip_ddos_4,er_ddos_4,out_ddos_4,snf_ddos_4,icf_ddos_4,hh_ddos_4,amb_ddos_4,hsp_ddos_4,pro_ddos_4,spc_fac_ddos_4,dme_ddos_4,cls_ddos_4,hha_ddos_4,ip_ddos_5,er_ddos_5,out_ddos_5,snf_ddos_5,icf_ddos_5,hh_ddos_5,amb_ddos_5,hsp_ddos_5,pro_ddos_5,spc_fac_ddos_5,dme_ddos_5,cls_ddos_5,hha_ddos_5,ip_ddos_6,er_ddos_6,out_ddos_6,snf_ddos_6,icf_ddos_6,hh_ddos_6,amb_ddos_6,hsp_ddos_6,pro_ddos_6,spc_fac_ddos_6,dme_ddos_6,cls_ddos_6,hha_ddos_6,ip_ddos_7,er_ddos_7,out_ddos_7,snf_ddos_7,icf_ddos_7,hh_ddos_7,amb_ddos_7,hsp_ddos_7,pro_ddos_7,spc_fac_ddos_7,dme_ddos_7,cls_ddos_7,hha_ddos_7,ip_ddos_8,er_ddos_8,out_ddos_8,snf_ddos_8,icf_ddos_8,hh_ddos_8,amb_ddos_8,hsp_ddos_8,pro_ddos_8,spc_fac_ddos_8,dme_ddos_8,cls_ddos_8,hha_ddos_8,ip_ddos_9,er_ddos_9,out_ddos_9,snf_ddos_9,icf_ddos_9,hh_ddos_9,amb_ddos_9,hsp_ddos_9,pro_ddos_9,spc_fac_ddos_9,dme_ddos_9,cls_ddos_9,hha_ddos_9,ip_ddos_10,er_ddos_10,out_ddos_10,snf_ddos_10,icf_ddos_10,hh_ddos_10,amb_ddos_10,hsp_ddos_10,pro_ddos_10,spc_fac_ddos_10,dme_ddos_10,cls_ddos_10,hha_ddos_10,ip_ddos_11,er_ddos_11,out_ddos_11,snf_ddos_11,icf_ddos_11,hh_ddos_11,amb_ddos_11,hsp_ddos_11,pro_ddos_11,spc_fac_ddos_11,dme_ddos_11,cls_ddos_11,hha_ddos_11,is_anthem_tn,is_uhc_tn,is_uhc_fl,is_uhc_tx,is_uhc_ks,is_anthem_ia,is_ia_centene_itc,is_lob_1,is_lob_3,is_lob_2,is_lob_8,is_group_0.0,is_group_3.0,is_group_2.0,is_group_1.0,is_group_-1.0,is_group_5.0,is_group_6.0,is_group_4.0,is_group_8
.0,is_group_14.0,is_group_16.0,is_group_7.0,is_group_11.0,is_group_15.0,is_group_20.0,is_group_12.0,is_group_13.0,is_group_9.0,is_group_21.0,is_group_18.0,is_group_10.0,is_male,age,member_id,target,period
21,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,63.0,2262.0,0.0,0
23,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64.0,2455.0,0.0,0
32,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,69.0,3211.0,1831.47,0
35,0.0,0.0,4.0,8.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,2.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,14.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,1.0,0.0,0.0,2.0,0.0,3.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,53.0,3346.0,6976.66,0
47,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,30.0,4496.0,0.0,0


In [247]:
# Boosted-model predictions for the training and validation splits.
train_preds = histr.predict(x)
val_preds = histr.predict(val_x)

In [253]:
# Attach predictions and a split label, then join the window dates for each row's period.
# Requires `periods_df` from the period-flagging section to exist in this kernel.
train_w_preds = training_df.assign(pred=train_preds, sample='train').merge(periods_df, on='period')
val_w_preds = val_df.assign(pred=val_preds, sample='validation').merge(periods_df, on='period')

In [105]:
# Columns written to the prediction export.
out_cols = ['member_id', 'sample', 'target', 'pred', 'period', 'pre_start', 'pre_end', 'post_start', 'post_end']

In [260]:
# Write train and validation predictions to a single CSV in the working directory.
pd.concat([train_w_preds[out_cols], val_w_preds[out_cols]]).to_csv('hgbr_12_mom_ddos.csv', index=False)

## Contemporaneous

In [74]:
# Members to score, each with a start/end date parsed as datetimes.
cmems = pd.read_csv('./data/contemp_members.csv', parse_dates=['start_date', 'end_date'])
cmems.head()

Unnamed: 0,member_id,start_date,end_date
0,249680,2020-12-01,2021-11-30
1,23399,2020-12-01,2021-11-30
2,303825,2020-12-01,2021-11-30
3,275421,2020-12-01,2021-11-30
4,270539,2020-12-01,2021-11-30


In [84]:
# Take the window from the first listed member and use it to filter all rows.
# NOTE(review): assumes every row of `cmems` shares the same start/end dates — confirm.
row = cmems.iloc[0]
start_date = row.start_date
end_date = row.end_date
c_feats = ifa.loc[(ifa.eom >= start_date) & (ifa.eom <= end_date)]

In [89]:
# One row per month present in the window, in ascending order; used below to
# give every member a row for every month.
months_df = pd.DataFrame(sorted(c_feats.eom.unique()), columns=['eom'])
months_df.head()

Unnamed: 0,eom
0,2020-12-31
1,2021-01-31
2,2021-02-28
3,2021-03-31
4,2021-04-30


In [96]:
# mdf.head(15)

# ifa.is_cb_eligible = ifa.is_cb_eligible.fillna(False)
# ifa.is_unaligned = ifa.is_unaligned.fillna(False)
# ifa = ifa.fillna(0)

In [100]:
# The output layout is the same for every member, so build it once up front
# (this also keeps `cols` defined when `cmems` has no rows).
cols = wide_ddos_cols + mco_cols + lob_cols + group_cols + ['is_male', 'age', 'member_id']

rows = []
for i, row in tqdm(cmems.iterrows(), total=len(cmems)):
    # Sort before reading the last row so the member-level attributes come from
    # the member's most recent month.
    mdf = c_feats.loc[c_feats.member_id == row.member_id].sort_values('eom')
    demographic_data = mdf[top_level_feats + ['member_id']].iloc[-1]

    # Left-join onto the full month list so months without data become NaN rows
    # instead of shifting the month-by-month layout.
    mdf = months_df.merge(mdf, on='eom', how='left')
    mdf = mdf.sort_values('eom')[ddos_cols]
    ddos_data = mdf.to_numpy().reshape(-1)

    mco_data = encode_mco(demographic_data.mco_name)
    lob_data = encode_lob(demographic_data.line_of_business_id)
    group_data = encode_group(demographic_data.ggroup)
    data = np.concatenate((ddos_data, mco_data, lob_data, group_data, np.array([demographic_data.is_male, demographic_data.age, demographic_data.member_id])), axis=0, dtype=float)
    rows.append(data)
cmem_feats = pd.DataFrame(rows, columns=cols)

37808it [00:57, 658.15it/s]


In [102]:
# Months with no data were left as NaN by the left join above; treat them as zero.
cmem_feats = cmem_feats.fillna(0)

In [108]:
# Score the contemporaneous members and write (member_id, pred) to the working directory.
# NOTE(review): `cmem_feats` uses the month-by-month (`wide_ddos_cols`) layout, so
# `x_cols` and `histr` must come from a run on that same layout — confirm before use.
cmem_feats.assign(pred=histr.predict(cmem_feats[x_cols]))[['member_id', 'pred']].to_csv('contemp_members.csv')

### Feature importance

In [265]:
# Ridge coefficients paired with their feature names, ascending by coefficient.
sorted(zip(ridge.coef_, x.columns))

[(-1206.73737335276, 'is_group_10.0'),
 (-622.2951453761325, 'is_group_14.0'),
 (-538.7597790626265, 'is_group_8.0'),
 (-492.4243302796052, 'is_anthem_ia'),
 (-477.35819993993107, 'is_group_4.0'),
 (-449.954474419673, 'is_group_15.0'),
 (-439.48188671013736, 'is_group_16.0'),
 (-397.58699249570964, 'is_lob_1'),
 (-383.3429534663939, 'is_group_6.0'),
 (-354.76974111363745, 'is_ia_centene_itc'),
 (-290.6350778996661, 'icf_ddos_5'),
 (-283.5381734703448, 'is_anthem_tn'),
 (-275.34683247051527, 'is_uhc_tn'),
 (-274.4416291662979, 'icf_ddos_6'),
 (-254.14885978740375, 'is_group_5.0'),
 (-251.4193910793106, 'icf_ddos_4'),
 (-179.66746386800338, 'icf_ddos_7'),
 (-174.99909622456724, 'icf_ddos_3'),
 (-135.35349019041035, 'is_group_13.0'),
 (-116.27026282142678, 'icf_ddos_9'),
 (-107.80372019014911, 'icf_ddos_8'),
 (-106.03947822501195, 'icf_ddos_10'),
 (-102.33363139831587, 'is_group_7.0'),
 (-76.75060063216331, 'is_group_20.0'),
 (-60.22870101775429, 'hsp_ddos_11'),
 (-58.69207133287029, 'is_

In [266]:
from sklearn.inspection import permutation_importance

In [273]:
# Permutation importance of the boosted model on the validation split
# (10 shuffles per feature, fixed random_state, all available cores).
result = permutation_importance(histr, val_x, val_y, n_repeats=10,random_state=0, n_jobs=-1)

In [272]:
# Mean permutation importance paired with feature names, ascending.
sorted(zip(result.importances_mean, val_x.columns))

[(-0.0007723196063331716, 'ip_ddos_1'),
 (-0.0007613080344119228, 'out_ddos_7'),
 (-0.0006213780445273765, 'pro_ddos_6'),
 (-0.0006040160463351762, 'dme_ddos_4'),
 (-0.0005426325574968028, 'out_ddos_4'),
 (-0.0005344473363957825, 'pro_ddos_4'),
 (-0.00042550598246204885, 'hh_ddos_0'),
 (-0.00041766603550545954, 'pro_ddos_8'),
 (-0.00039609842375647463, 'amb_ddos_0'),
 (-0.0003715464358559517, 'hh_ddos_10'),
 (-0.00036159888644871074, 'out_ddos_0'),
 (-0.00035674841632420405, 'amb_ddos_3'),
 (-0.0003384314455971538, 'dme_ddos_5'),
 (-0.0003093824656464017, 'er_ddos_0'),
 (-0.00029041345635669733, 'dme_ddos_6'),
 (-0.0002771600599830837, 'dme_ddos_10'),
 (-0.00027710314100239943, 'dme_ddos_0'),
 (-0.00022345737934712507, 'dme_ddos_7'),
 (-0.00021884647303249682, 'hh_ddos_6'),
 (-0.00021387650280220162, 'er_ddos_5'),
 (-0.00016166853624497435, 'out_ddos_1'),
 (-0.00015999504317887325, 'pro_ddos_5'),
 (-0.000155104495229752, 'out_ddos_9'),
 (-0.00014995055466715534, 'er_ddos_1'),
 (-0.0001

In [65]:
# Column order of the exported CSVs (target first).
export_cols

['target',
 'ip_ddos',
 'er_ddos',
 'out_ddos',
 'snf_ddos',
 'icf_ddos',
 'hh_ddos',
 'amb_ddos',
 'hsp_ddos',
 'pro_ddos',
 'spc_fac_ddos',
 'dme_ddos',
 'cls_ddos',
 'hha_ddos',
 'is_anthem_tn',
 'is_uhc_tn',
 'is_uhc_fl',
 'is_uhc_tx',
 'is_uhc_ks',
 'is_anthem_ia',
 'is_ia_centene_itc',
 'is_lob_1',
 'is_lob_3',
 'is_lob_2',
 'is_lob_8',
 'is_group_0.0',
 'is_group_3.0',
 'is_group_2.0',
 'is_group_1.0',
 'is_group_-1.0',
 'is_group_5.0',
 'is_group_6.0',
 'is_group_4.0',
 'is_group_8.0',
 'is_group_14.0',
 'is_group_16.0',
 'is_group_7.0',
 'is_group_11.0',
 'is_group_15.0',
 'is_group_20.0',
 'is_group_12.0',
 'is_group_13.0',
 'is_group_9.0',
 'is_group_21.0',
 'is_group_18.0',
 'is_group_10.0',
 'is_male',
 'age']