In [1]:
import os
import sys
import time
import random
import warnings
import collections
from dateutil.relativedelta import relativedelta
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
from sklearn.experimental import enable_hist_gradient_boosting, enable_halving_search_cv  
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, HalvingRandomSearchCV 
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, BaggingRegressor, StackingRegressor, HistGradientBoostingRegressor 

# from sklearn.linear_model import Ridge, Lasso, BayesianRidge, ElasticNet
from sklearn.preprocessing import OneHotEncoder

sys.path.append('../../src')
import cb_utils
import cb_model_utils

sns.set(style="darkgrid")
pd.options.display.max_columns = 500

%load_ext autoreload
%autoreload 2



In [2]:
# configuration
use_cache = False  # forwarded to cb_utils.sql_query_to_df when pulling data

# Pinned seed: drawing a fresh random.randint() on every run made results
# impossible to reproduce. 44 is the value recorded in the last saved run.
seed = 44
random.seed(seed)
np.random.seed(seed)

print(f'Seed: {seed}')

Seed: 44


In [3]:
# Pull every row of junk.ip_features_all_new; `use_cache` (config cell) is forwarded to the helper.
query = "select * from junk.ip_features_all_new;"
ip_features_all = cb_utils.sql_query_to_df(query, use_cache=use_cache)

Pulling query from db


In [4]:
ip_features_all.shape

(2629215, 141)

In [5]:
# member_key = cb_utils.sql_query_to_df(f"select * from junk.ip_member_key;", use_cache=use_cache)

In [6]:
# Window lengths (months) handed to cb_model_utils.build_member_periods in the next code cell.
pre_months = 12
post_months = 12

In [7]:
# pre_post_months = pre_months + post_months

In [8]:
periods_df, months_df = cb_model_utils.build_member_periods(ip_features_all, pre_months=pre_months, post_months=post_months)

In [9]:
periods_df.head()

Unnamed: 0,is_cb_eligible,mco_id,mco_name,mco_state,eom,member_id,line_of_business_id,ggroup,is_unaligned,age,gender,cwmm,days_in_month,cpmm,rx_tc,other_tc,ip_tc,er_tc,out_tc,snf_tc,icf_tc,hh_tc,amb_tc,hsp_tc,pro_tc,spc_fac_tc,dme_tc,cls_tc,hha_tc,hcbs_attdpcs_tc,hcbs_other_tc,hcbs_support_house_tc,hcbs_adult_day_tc,ip_ddos_span,er_ddos_span,out_ddos_span,snf_ddos_span,icf_ddos_span,hh_ddos_span,amb_ddos_span,hsp_ddos_span,pro_ddos_span,spc_fac_ddos_span,dme_ddos_span,cls_ddos_span,hha_ddos_span,hcbs_attdpcs_ddos_span,hcbs_other_ddos_span,hcbs_support_house_ddos_span,hcbs_adult_day_ddos_span,other_ddos_span,ip_ddos,er_ddos,out_ddos,snf_ddos,icf_ddos,hh_ddos,amb_ddos,hsp_ddos,pro_ddos,spc_fac_ddos,dme_ddos,cls_ddos,hha_ddos,hcbs_attdpcs_ddos,hcbs_other_ddos,hcbs_support_house_ddos,hcbs_adult_day_ddos,other_ddos,hcbs_pers_ddos,hcbs_assist_tech_ddos,oxygen_ddos,hosp_bed_ddos,chf_ddos,heart_ddos,copd_ddos,pulmonar_ddos,cancer_ddos,ckd_ddos,esrd_ddos,lipidy_ddos,diab_ddos,alzh_ddos,demented_ddos,stroke_ddos,hyper_ddos,fall_ddos,trans_ddos,liver_ddos,hippy_ddos,depressed_ddos,psycho_ddos,druggy_ddos,boozy_ddos,paralyzed_ddos,mono_ddos,mono_dom_ddos,hemi_ddos,hemi_dom_ddos,para_ddos,quad_ddos,tbi_ddos,obese_ddos,pressure_ulcer_ddos,hemophilia_ddos,hcbs_pers_tc,hcbs_assist_tech_tc,oxygen_tc,hosp_bed_tc,chf_tc,heart_tc,copd_tc,pulmonar_tc,cancer_tc,ckd_tc,esrd_tc,lipidy_tc,diab_tc,alzh_tc,demented_tc,stroke_tc,hyper_tc,fall_tc,trans_tc,liver_tc,hippy_tc,depressed_tc,psycho_tc,druggy_tc,boozy_tc,paralyzed_tc,mono_tc,mono_dom_tc,hemi_tc,hemi_dom_tc,para_tc,quad_tc,tbi_tc,obese_tc,pressure_ulcer_tc,hemophilia_tc,pre_0,pre_1,pre_2,pre_3,pre_4,pre_5,pre_6,pre_7,pre_8,pre_9,pre_10,pre_11,pre_12,pre_13,pre_14,pre_15,pre_16,pre_17,pre_18,pre_19,pre_20,pre_21,pre_22,pre_23,pre_24,pre_25,pre_26,pre_27,pre_28,pre_29,pre_30,pre_31,pre_32,pre_33,pre_34,pre_35,pre_36,pre_37,pre_38,pre_39,pre_40,pre_41,pre_42,post_0,post_1,post_2,post_3,post_4,post_5,post_6,post_7,post_8,post_9,post_10,pos
t_11,post_12,post_13,post_14,post_15,post_16,post_17,post_18,post_19,post_20,post_21,post_22,post_23,post_24,post_25,post_26,post_27,post_28,post_29,post_30,post_31,post_32,post_33,post_34,post_35,post_36,post_37,post_38,post_39,post_40,post_41,post_42,pre_post_elg_0,pre_post_elg_1,pre_post_elg_2,pre_post_elg_3,pre_post_elg_4,pre_post_elg_5,pre_post_elg_6,pre_post_elg_7,pre_post_elg_8,pre_post_elg_9,pre_post_elg_10,pre_post_elg_11,pre_post_elg_12,pre_post_elg_13,pre_post_elg_14,pre_post_elg_15,pre_post_elg_16,pre_post_elg_17,pre_post_elg_18,pre_post_elg_19,pre_post_elg_20,pre_post_elg_21,pre_post_elg_22,pre_post_elg_23,pre_post_elg_24,pre_post_elg_25,pre_post_elg_26,pre_post_elg_27,pre_post_elg_28,pre_post_elg_29,pre_post_elg_30,pre_post_elg_31,pre_post_elg_32,pre_post_elg_33,pre_post_elg_34,pre_post_elg_35,pre_post_elg_36,pre_post_elg_37,pre_post_elg_38,pre_post_elg_39,pre_post_elg_40,pre_post_elg_41,pre_post_elg_42,is_male,state
0,False,2,UHC TN,tn,2017-02-28,1,1,2,True,65,f,1,28,0.96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,590.24,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,8.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0,TN
1,False,2,UHC TN,tn,2017-03-31,1,1,2,True,66,f,1,31,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,70.8,0,0.0,0.0,0.0,0.0,1594.64,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1.0,0,0.0,0.0,0.0,0.0,21.0,0.0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,21,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,150.16,150.16,0.0,0.0,0.0,150.16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0,TN
2,False,2,UHC TN,tn,2017-04-30,1,1,2,True,66,f,1,30,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,203.2,0,0.0,1800.36,186.76,0.0,1460.72,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,2.0,0,0.0,37.0,4.0,0.0,19.0,0.0,0,0,0,0,0,0,0,0,0,0,1,0,0,27,4,0,19,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,349.24,349.24,0.0,0.0,0.0,349.24,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False,True,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0,TN
3,False,2,UHC TN,tn,2017-05-31,1,1,2,True,66,f,1,31,1.0,2349.42,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,123.95,0,203.2,0,0.0,2000.4,40.02,0.0,1391.28,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,13.0,0,2.0,0,0.0,53.0,1.0,0.0,19.0,0.0,0,0,1,0,0,0,0,0,2,0,1,0,0,30,1,0,19,0,0,0,0,0,0,0,0,0,0,0,0,2,2,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,606.83,606.83,0.0,0.0,0.0,606.83,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False,True,True,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,True,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0,TN
4,False,2,UHC TN,tn,2017-06-30,1,1,2,True,66,f,1,30,1.0,6014.18,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0.0,1967.06,586.96,0.0,1344.16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0.0,45.0,10.0,0.0,18.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,30,10,0,18,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False,True,True,True,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,True,True,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0,TN


In [10]:
months_df.head()

Unnamed: 0,eom
0,2017-01-31
1,2017-02-28
2,2017-03-31
3,2017-04-30
4,2017-05-31


In [12]:
months_df.to_csv('../data/months_df.csv', index=False)

In [13]:
periods_df.to_parquet('../data/member_periods_v11.parquet')

In [None]:
# # fill na's
# periods_df.is_cb_eligible = periods_df.is_cb_eligible.fillna(False)
# periods_df.is_unaligned = periods_df.is_unaligned.fillna(False)
# periods_df = periods_df.fillna(0)

# # add gender, state
# periods_df = periods_df.assign(is_male=np.where(periods_df.gender=='m',1,0))

# # assign state
# periods_df = periods_df.assign(state=periods_df.mco_name.str.split(' ').apply(lambda x: x[1]).replace({'Centene': 'IA'}))

### Build features + targets
Built separately by a multithreaded script.

### Train/Val/Test split
Avoid any leakage by doing the splits at the member level

In [None]:
# master_df = pd.read_parquet('./data/master_df.parquet')
# master_df = pd.read_parquet('./data/master_ddos_df.parquet')
master_df = pd.read_parquet('./master_wide_df_v10.parquet')
# master_df = master_df.loc[master_df.period > 24]
# make dtype str for these categorical features
# master_df.ggroup = master_df.ggroup.astype(str)
# master_df.line_of_business_id = master_df.line_of_business_id.astype(str)

In [None]:
master_df.head()

### Build Yearly DDOS service_types + Demographics

In [None]:
# Feature set for this section (see heading): yearly service-type DDOS + demographics.
d = cb_model_utils.build_yearly_stddos_dem(master_df)
# Keep the three splits in memory instead of writing them out.
train, val, test = cb_model_utils.train_val_test_split(d, return_wo_saving=True)

train.head()

In [None]:
x_train, y_train = cb_model_utils.get_xy(train)

In [None]:
x_train.head()

In [None]:
# Baseline model with default hyper-parameters. random_state ties the estimator's
# internal randomness to the notebook seed so reruns give comparable numbers.
histr = HistGradientBoostingRegressor(random_state=seed)
histr.fit(x_train, y_train)


In [None]:
perf = cb_model_utils.get_model_performance(histr, train, val)

In [None]:
perf

In [None]:
# HalvingRandomSearchCV (and its experimental enabler) are already imported in the
# first cell, so the duplicate in-cell imports were dropped.

# Candidate values sampled by the halving random search.
param_grid = {
    'l2_regularization': [.5, 1, 2],
    'learning_rate': [0.01, 0.1, .2, .5, .7, 1],
    'min_samples_leaf': [20, 50, 100, 200],
    'max_iter': [100, 200, 500],
    'max_depth': [3, 5, 10, 20, 50],
}


# Seed both the estimator and the search so candidate sampling and resource
# subsampling are repeatable for a given `seed`.
base_estimator = HistGradientBoostingRegressor(random_state=seed)
sh = HalvingRandomSearchCV(base_estimator,
                         param_grid,
                         cv=5,
                         factor=2,
                         n_candidates='exhaust',
                         min_resources=5000,
                         scoring='neg_mean_squared_error',
                         random_state=seed,
                         n_jobs=-1
                        ).fit(x_train, y_train)
sh.n_resources_

In [None]:
sh.best_params_

In [None]:
perf = cb_model_utils.get_model_performance(sh.best_estimator_, train, val)

In [None]:
perf

In [None]:
perf

### Build MoM DDOS service_types + Demographics

In [None]:
# Feature set for this section (see heading): MoM service-type DDOS + demographics.
d = cb_model_utils.build_mom_stddos_dem(master_df)
# Keep the three splits in memory instead of writing them out.
train, val, test = cb_model_utils.train_val_test_split(d, return_wo_saving=True)

train.head()

In [None]:
list(train.columns)

In [None]:
x_train, y_train = cb_model_utils.get_xy(train)

In [None]:
x_train.head()

In [None]:
# Baseline model with default hyper-parameters. random_state ties the estimator's
# internal randomness to the notebook seed so reruns give comparable numbers.
histr = HistGradientBoostingRegressor(random_state=seed)
histr.fit(x_train, y_train)

In [None]:
perf = cb_model_utils.get_model_performance(histr, train, val)

In [None]:
perf

In [None]:

# Candidate values sampled by the halving random search.
param_grid = {
    'l2_regularization': [.5, 1, 2],
    'learning_rate': [0.01, 0.1, .2, .5, .7, 1],
    'min_samples_leaf': [20, 50, 100, 200],
    'max_iter': [100, 200, 500],
    'max_depth': [3, 5, 10, 20, 50],
}


# Seed both the estimator and the search so candidate sampling and resource
# subsampling are repeatable for a given `seed`.
base_estimator = HistGradientBoostingRegressor(random_state=seed)
sh = HalvingRandomSearchCV(base_estimator,
                         param_grid,
                         cv=5,
                         factor=2,
                         n_candidates='exhaust',
                         min_resources=5000,
                         scoring='neg_mean_squared_error',
                         random_state=seed,
                         n_jobs=-1
                        ).fit(x_train, y_train)
sh.n_resources_

In [None]:
sh.best_params_

In [None]:
perf = cb_model_utils.get_model_performance(sh.best_estimator_, train, val)

In [None]:
perf

### Yearly st DDOS + dx ddos  + Dem

In [None]:
# Feature set for this section (see heading): yearly st DDOS + dx DDOS + demographics.
d = cb_model_utils.build_yearly_stdxddos_dem(master_df)
# Keep the three splits in memory instead of writing them out.
train, val, test = cb_model_utils.train_val_test_split(d, return_wo_saving=True)

train.head()

In [None]:
x_train, y_train = cb_model_utils.get_xy(train)

In [None]:
x_train.head()

In [None]:
# Baseline model with default hyper-parameters. random_state ties the estimator's
# internal randomness to the notebook seed so reruns give comparable numbers.
histr = HistGradientBoostingRegressor(random_state=seed)
histr.fit(x_train, y_train)

In [None]:
perf = cb_model_utils.get_model_performance(histr, train, val)
perf

In [None]:

# Candidate values sampled by the halving random search.
param_grid = {
    'l2_regularization': [.5, 1, 2],
    'learning_rate': [0.01, 0.1, .2, .5, .7, 1],
    'min_samples_leaf': [20, 50, 100, 200],
    'max_iter': [100, 200, 500],
    'max_depth': [3, 5, 10, 20, 50],
}


# Seed both the estimator and the search so candidate sampling and resource
# subsampling are repeatable for a given `seed`.
base_estimator = HistGradientBoostingRegressor(random_state=seed)
sh = HalvingRandomSearchCV(base_estimator,
                         param_grid,
                         cv=5,
                         factor=2,
                         n_candidates='exhaust',
                         min_resources=5000,
                         scoring='neg_mean_squared_error',
                         random_state=seed,
                         n_jobs=-1
                        ).fit(x_train, y_train)
sh.best_params_

In [None]:
perf = cb_model_utils.get_model_performance(sh.best_estimator_, train, val)
perf

### MOM st DDOS + dx ddos  + Dem

In [None]:
# Feature set for this section (see heading): MoM st DDOS + dx DDOS + demographics.
d = cb_model_utils.build_mom_stdxddos_dem(master_df)
# Keep the three splits in memory instead of writing them out.
train, val, test = cb_model_utils.train_val_test_split(d, return_wo_saving=True)

train.head()

In [None]:
x_train, y_train = cb_model_utils.get_xy(train)

In [None]:
x_train.head()

In [None]:
# Baseline model with default hyper-parameters. random_state ties the estimator's
# internal randomness to the notebook seed so reruns give comparable numbers.
histr = HistGradientBoostingRegressor(random_state=seed)
histr.fit(x_train, y_train)

In [None]:
perf = cb_model_utils.get_model_performance(histr, train, val)
perf

In [None]:

# Candidate values sampled by the halving random search.
param_grid = {
    'l2_regularization': [.5, 1, 2],
    'learning_rate': [0.01, 0.1, .2, .5, .7, 1],
    'min_samples_leaf': [20, 50, 100, 200],
    'max_iter': [100, 200, 500],
    'max_depth': [3, 5, 10, 20, 50],
}


# Seed both the estimator and the search so candidate sampling and resource
# subsampling are repeatable for a given `seed`.
base_estimator = HistGradientBoostingRegressor(random_state=seed)
sh = HalvingRandomSearchCV(base_estimator,
                         param_grid,
                         cv=5,
                         factor=2,
                         n_candidates='exhaust',
                         min_resources=5000,
                         scoring='neg_mean_squared_error',
                         random_state=seed,
                         n_jobs=-1
                        ).fit(x_train, y_train)
sh.best_params_

In [None]:
perf = cb_model_utils.get_model_performance(sh.best_estimator_, train, val)
perf

### MOM st DDOS + Dem

In [None]:
# NOTE(review): this section is titled "MOM st DDOS + Dem", yet the builder called
# below is build_mom_stdxddos_dem -- the same one the preceding
# "MOM st DDOS + dx ddos + Dem" section calls with the same master_df. An earlier
# section uses build_mom_stddos_dem for the st-DDOS-only features.
# TODO confirm which builder was intended here.
d = cb_model_utils.build_mom_stdxddos_dem(master_df)
# Earlier call form, kept for reference:
# train_val_test_split(d, 'yearly_st_ddos_dem', return_wo_saving=True)
train, val, test =  cb_model_utils.train_val_test_split(d, return_wo_saving=True)

train.head()

In [None]:
x_train, y_train = cb_model_utils.get_xy(train)

In [None]:
x_train.head()

In [None]:
# Baseline model with default hyper-parameters. random_state ties the estimator's
# internal randomness to the notebook seed so reruns give comparable numbers.
histr = HistGradientBoostingRegressor(random_state=seed)
histr.fit(x_train, y_train)

In [None]:
perf = cb_model_utils.get_model_performance(histr, train, val)
perf

In [None]:

# Candidate values sampled by the halving random search.
param_grid = {
    'l2_regularization': [.5, 1, 2],
    'learning_rate': [0.01, 0.1, .2, .5, .7, 1],
    'min_samples_leaf': [20, 50, 100, 200],
    'max_iter': [100, 200, 500],
    'max_depth': [3, 5, 10, 20, 50],
}


# Seed both the estimator and the search so candidate sampling and resource
# subsampling are repeatable for a given `seed`.
base_estimator = HistGradientBoostingRegressor(random_state=seed)
sh = HalvingRandomSearchCV(base_estimator,
                         param_grid,
                         cv=5,
                         factor=2,
                         n_candidates='exhaust',
                         min_resources=5000,
                         scoring='neg_mean_squared_error',
                         random_state=seed,
                         n_jobs=-1
                        ).fit(x_train, y_train)
sh.best_params_

In [None]:
perf = cb_model_utils.get_model_performance(sh.best_estimator_, train, val)
perf

# Build and deploy
### Save data

In [None]:
d = cb_model_utils.build_mom_stddos_dem(master_df)
# sm target always first: `target` leads, `member_id` is left out, the remaining
# columns keep their original order.
cols = ['target', *(c for c in d.columns if c not in ('target', 'member_id'))]
# Written without header or index.
d[cols].to_csv('data/mom_stddos_dem_12m_target.csv', index=False, header=False)

### deploy

## OLD

In [None]:
# NOTE(review): `model`, `x`, `val_x` and `val_y` are first assigned in cells further
# down this "OLD" section, so this cell fails on a fresh top-to-bottom run. The same
# code appears again after the model is fitted -- confirm whether this copy can go.
from sklearn.inspection import permutation_importance
r = permutation_importance(model, val_x, val_y, n_repeats=10, random_state=0)

# Walk features from highest to lowest mean importance and print only those whose
# mean importance exceeds two standard deviations.
for i in r.importances_mean.argsort()[::-1]:
    if r.importances_mean[i] - 2 * r.importances_std[i] > 0:
        print(f"{x.columns[i]:<8}"
              f"{r.importances_mean[i]:.3f}"
              f" +/- {r.importances_std[i]:.3f}")

In [None]:
MOM DDOS service_types and dx + Dem
5...8 same but for TC

Maybe a combo TC + DDOS
Span?

### Normalize/encode features if needed
Not needed for tree-based models; most linear models will normalize the features for you if you pass the relevant parameter (e.g. `normalize=True`, as in the `Ridge` cell below).

In [None]:
def mae(model, x, y):
    """Return the mean absolute error of ``model.predict(x)`` measured against ``y``."""
    abs_errors = np.abs(model.predict(x) - y)
    return abs_errors.mean()

### Tune mom ddos st + dem

In [None]:
# The builders and the splitter live in cb_model_utils (as used in the sections
# above); the bare names are not defined anywhere in this notebook.
d = cb_model_utils.build_mom_stddos_dem(master_df)
# Alternative feature sets tried previously:
# d = build_mom_sttc_dem(master_df)
# d = build_yearly_sttc_dem(master_df)
# d = build_yearly_stddos_dem(master_df)
train, val, test = cb_model_utils.train_val_test_split(d, return_wo_saving=True)
train.head()

In [None]:
# Features are every column except the identifier, the label and the period index.
x_cols = [c for c in train.columns if c not in ('member_id', 'target', 'period')]

# Training and validation design matrices / labels share the same column list.
x, y = train[x_cols], train['target']
val_x, val_y = val[x_cols], val['target']

Things to tune
- l2_regularization=1
- learning_rate
- min_samples_leaf
- max_iter=100,
- max_depth=40,

- tune('learning_rate', np.arange(0.01, 0.1, .01))
- min_samples_leaf_results = tune('min_samples_leaf', [50, 60, 70, 100])

In [None]:
# determine number of iterations

In [None]:
def tune(param, values):
    """Sweep one HistGradientBoostingRegressor hyper-parameter and compare train vs. validation.

    For every candidate value a fresh model is fitted on the notebook-level
    ``x``/``y`` and evaluated on both ``x``/``y`` and ``val_x``/``val_y``.
    Two line plots (score and MAE, train vs. validation) are displayed.

    Parameters
    ----------
    param : str
        Name of the HistGradientBoostingRegressor keyword argument to vary.
    values : iterable
        Candidate values for ``param``.

    Returns
    -------
    pandas.DataFrame
        One row per candidate with columns ``param``, 'Train Score',
        'Val Score', 'Train MAE' and 'Val MAE'.
    """
    # `values` is walked twice (training loop, then the results frame); materialise
    # it so a generator/iterator argument is not exhausted after the first pass.
    values = list(values)

    val_scores = []
    train_scores = []
    val_mae = []
    train_mae = []
    for v in tqdm(values):
        # max_iter defaults to 100, but `param` may be 'max_iter' itself. Merging into
        # one dict lets the swept value win instead of raising
        # "got multiple values for keyword argument 'max_iter'".
        kw = {'max_iter': 100, param: v}
        histr = HistGradientBoostingRegressor(**kw)
        histr.fit(x, y)
        val_scores.append(histr.score(val_x, val_y))
        train_scores.append(histr.score(x, y))
        train_mae.append(np.abs(histr.predict(x) - y).mean())
        val_mae.append(np.abs(histr.predict(val_x) - val_y).mean())

    results = pd.DataFrame(zip(values, train_scores, val_scores, train_mae, val_mae), columns=[param, 'Train Score', 'Val Score', 'Train MAE', 'Val MAE'])
    # Long format so seaborn can draw train/validation as two hue levels.
    scores_tall = results.melt(id_vars=[param], value_vars=['Train Score', 'Val Score'], value_name='score')
    mae_tall = results.melt(id_vars=[param], value_vars=['Train MAE', 'Val MAE'], value_name='mae')

    display(sns.relplot(data=scores_tall, x=param, y='score', hue='variable', kind='line'))
    display(sns.relplot(data=mae_tall, x=param, y='mae', hue='variable', kind='line'))
    return results

In [None]:
max_depth = tune('max_depth', [3, 5, 10, 20, 40, 75])

In [None]:
l2 = tune('l2_regularization', np.arange(1., 3, .25))

In [None]:
min_samples_leaf_results = tune('min_samples_leaf', [25, 50, 75, 100, 125])

In [None]:
# Model refit with hand-picked hyper-parameter values.
model = HistGradientBoostingRegressor(
    l2_regularization=1.5,
    max_depth=5,
    min_samples_leaf=100,
    max_iter=100,
)
model.fit(x, y)
# (train score, val score, train MAE, val MAE) -- MAE via the `mae` helper defined above.
model.score(x, y), model.score(val_x, val_y), mae(model, x, y), mae(model, val_x, val_y)

In [None]:
# mom tc
model.fit(x,y)
model.score(x,y), model.score(val_x, val_y)

In [None]:
# mom tc
model.fit(x,y)
model.score(x,y), model.score(val_x, val_y)

In [None]:
# mom ddos
model.fit(x,y)
model.score(x,y), model.score(val_x, val_y)

In [None]:
# yearly tc
model.fit(x,y)
model.score(x,y), model.score(val_x, val_y)

In [None]:
# yearly ddos
model.fit(x,y)
model.score(x,y), model.score(val_x, val_y)

In [None]:
from sklearn.inspection import permutation_importance
r = permutation_importance(model, val_x, val_y, n_repeats=10, random_state=0)

for i in r.importances_mean.argsort()[::-1]:
    if r.importances_mean[i] - 2 * r.importances_std[i] > 0:
        print(f"{x.columns[i]:<8}"
              f"{r.importances_mean[i]:.3f}"
              f" +/- {r.importances_std[i]:.3f}")

In [None]:
for i in r.importances_mean.argsort()[::-1]:
    if r.importances_mean[i] - 2 * r.importances_std[i] > 0:
        print(f"{x.columns[i]:<8}"
              f"{r.importances_mean[i]:.3f}"
              f" +/- {r.importances_std[i]:.3f}")

In [None]:
# The Ridge import in the first cell is commented out, so bring it into scope here;
# otherwise this cell raises NameError.
from sklearn.linear_model import Ridge

ridge = Ridge(alpha=1, normalize=True)

In [None]:
ridge.fit(x, y)

In [None]:
ridge.score(x,y)

In [None]:
# `val` is the validation frame returned by train_val_test_split; the former name
# `val_df` is never assigned in this notebook.
val_x = val[x_cols]
val_y = val.target
ridge.score(val_x,val_y)

In [None]:
histr = HistGradientBoostingRegressor()
histr.fit(x, y)

In [None]:
histr.score(x, y)

In [None]:
histr.score(val_x, val_y)

In [None]:
# `val_df` is never assigned in this notebook; the validation frame is `val`.
val.head()

In [None]:
train_preds = histr.predict(x)
val_preds = histr.predict(val_x)

In [None]:
# `train` / `val` are the frames returned by train_val_test_split; the former names
# (`training_df`, `val_df`) are never assigned in this notebook.
# NOTE(review): the periods_df preview near the top of the notebook shows no `period`
# column -- confirm which periods frame this merge is meant to use.
train_w_preds = train.assign(pred=train_preds, sample='train').merge(periods_df, on='period')
val_w_preds = val.assign(pred=val_preds, sample='validation').merge(periods_df, on='period')

In [None]:
out_cols = ['member_id', 'sample', 'target', 'pred', 'period', 'pre_start', 'pre_end', 'post_start', 'post_end']

In [None]:
pd.concat([train_w_preds[out_cols], val_w_preds[out_cols]]).to_csv('hgbr_12_mom_ddos.csv', index=False)

### Feature importance

In [None]:
sorted(zip(ridge.coef_, x.columns))

In [None]:
from sklearn.inspection import permutation_importance

In [None]:
result = permutation_importance(histr, val_x, val_y, n_repeats=10,random_state=0, n_jobs=-1)

In [None]:
sorted(zip(result.importances_mean, val_x.columns))

### Write datasets for sagemaker

In [None]:
# Target first, then every feature column; identifiers are not exported.
# `training_df` is never assigned in this notebook -- the training frame is `train`.
export_cols = ['target'] + [c for c in train.columns if c not in ['member_id', 'target', 'period']]
train[export_cols].head()

In [None]:
# `training_df` is never assigned in this notebook; the training frame is `train`.
train[export_cols].to_csv('./data/train_df.csv', index=False, header=False)

In [None]:
# `val_df` is never assigned in this notebook; the validation frame is `val`.
val[export_cols].to_csv('./data/val_df.csv', index=False, header=False)

In [None]:
# `test_df` is never assigned in this notebook; the held-out frame is `test`.
test[export_cols].to_csv('./data/test_df.csv', index=False, header=False)