# Script to retrain model
Based on the 12-month risk model, using service-type DDOS and demographic features.

In [3]:
# Note: this needs an r5.24xlarge instance to run
import os
import sys
import time
import random
import warnings
import collections
from dateutil.relativedelta import relativedelta
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
from sklearn.experimental import enable_hist_gradient_boosting, enable_halving_search_cv  
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, HalvingRandomSearchCV 
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, BaggingRegressor, StackingRegressor, HistGradientBoostingRegressor 

# from sklearn.linear_model import Ridge, Lasso, BayesianRidge, ElasticNet
from sklearn.preprocessing import OneHotEncoder

sys.path.append('../../src')
%pip install pymysql
import cb_utils
import cb_model_utils

sns.set(style="darkgrid")
pd.options.display.max_columns = 500

%load_ext autoreload
%autoreload 2

Matplotlib is building the font cache; this may take a moment.


Collecting pymysql
  Obtaining dependency information for pymysql from https://files.pythonhosted.org/packages/e5/30/20467e39523d0cfc2b6227902d3687a16364307260c75e6a1cb4422b0c62/PyMySQL-1.1.0-py3-none-any.whl.metadata
  Downloading PyMySQL-1.1.0-py3-none-any.whl.metadata (4.4 kB)
Downloading PyMySQL-1.1.0-py3-none-any.whl (44 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymysql
Successfully installed pymysql-1.1.0
Note: you may need to restart the kernel to use updated packages.


In [2]:
# configuration
# use_cache is forwarded to cb_utils.sql_query_to_df when the raw data is pulled.
use_cache = False

# Fixed seed so Restart & Run All reproduces the same results. The value is the
# one printed by the recorded run of this cell; edit it here when a different
# seed is wanted instead of drawing a new one on every run.
seed = 25
# Apply the seed to the global generators so code relying on them is repeatable.
random.seed(seed)
np.random.seed(seed)

print(f'Seed: {seed}')

Seed: 25


### Pull data
The data pull and the member-period build only need to be run the first time. On subsequent runs you can skip ahead to the cell that reads the saved file directly.

In [3]:
# Pull every column of the feature table; use_cache comes from the configuration cell.
# Plain string literal: the query has no placeholders, so no f-string is needed.
query = "select * from junk.ip_features_all_new_new;"
ip_features_all = cb_utils.sql_query_to_df(query, use_cache=use_cache)

Pulling query from db


In [4]:
# Window lengths, in months, handed to cb_model_utils.build_member_periods.
# NOTE(review): "pre"/"post" presumably mean the feature and target windows — confirm.
pre_months, post_months = 12, 12

### Build member periods
This chops a member into multiple training samples.
The result is a set of columns appended to the original features df that indicate which periods the member is valid for and whether each one is pre or post.

The data needs to be saved and uploaded to S3.
It is pulled from S3 by the multithreaded feature-generation script.

In [12]:
# Build the member periods; the helper returns two frames, which are saved below.
periods_df, months_df = cb_model_utils.build_member_periods(
    ip_features_all,
    pre_months=pre_months,
    post_months=post_months,
)

In [13]:
# Persist the months frame as CSV, without the index column.
months_df.to_csv('../data/months_df.csv', index=False)

In [17]:
# Remove the gender column before saving. errors='ignore' keeps this cell
# idempotent: re-running it after the column is already gone no longer raises
# KeyError.
periods_df = periods_df.drop(columns=['gender'], errors='ignore')

In [18]:
# Persist the member periods frame as parquet.
periods_df.to_parquet('../data/member_periods_v12.parquet')

### Build features + targets
Built separately with multithreaded script

See `src/gen_aster_features_df.py`

I usually run this on a BIG instance (e.g. m5 8x or 12x); it usually takes about an hour.

The features and targets generated include everything. It's possible to limit them to a certain subset later.

In [6]:
# Read every feature part file and stack them into one wide frame.
data_dir = '../../data'
# The combined frame is written into this same directory by a later cell, so it
# must be skipped here: otherwise a re-run would read it back in as an extra
# part and duplicate every row. Non-parquet files are skipped as well.
combined_name = 'final_wide_df_v12_all.parquet'
part_files = sorted(  # os.listdir order is arbitrary; sort for a stable row order
    f for f in os.listdir(data_dir)
    if f.endswith('.parquet') and f != combined_name
)
dfs = [pd.read_parquet(os.path.join(data_dir, f)) for f in tqdm(part_files)]
# ignore_index=True: the result gets a fresh 0..n-1 index.
df = pd.concat(dfs, axis=0, ignore_index=True)

100%|██████████| 42/42 [00:11<00:00,  3.77it/s]


In [7]:
# Save the combined frame.
# NOTE(review): this writes into the same directory the part files are listed from.
df.to_parquet('../../data/final_wide_df_v12_all.parquet')

In [8]:
# Plain name binding: master_df refers to the same object as df (no copy is made).
master_df = df

### Read in the final wide df (features + targets)

In [4]:
# Load the combined frame directly from disk; this rebinds master_df.
# NOTE(review): this reads from './' while the save cell writes to '../../data/' — confirm the file is expected here.
master_df = pd.read_parquet('./final_wide_df_v12_all.parquet')

In [9]:
# Preview the first rows of the combined frame.
master_df.head()

Unnamed: 0,ip_ddos_span_0,er_ddos_span_0,out_ddos_span_0,snf_ddos_span_0,icf_ddos_span_0,hh_ddos_span_0,amb_ddos_span_0,hsp_ddos_span_0,pro_ddos_span_0,spc_fac_ddos_span_0,dme_ddos_span_0,cls_ddos_span_0,hha_ddos_span_0,hcbs_attdpcs_ddos_span_0,hcbs_other_ddos_span_0,hcbs_support_house_ddos_span_0,hcbs_adult_day_ddos_span_0,other_ddos_span_0,ip_ddos_0,er_ddos_0,out_ddos_0,snf_ddos_0,icf_ddos_0,hh_ddos_0,amb_ddos_0,hsp_ddos_0,pro_ddos_0,spc_fac_ddos_0,dme_ddos_0,cls_ddos_0,hha_ddos_0,hcbs_attdpcs_ddos_0,hcbs_other_ddos_0,hcbs_support_house_ddos_0,hcbs_adult_day_ddos_0,other_ddos_0,hcbs_pers_ddos_0,hcbs_assist_tech_ddos_0,oxygen_ddos_0,hosp_bed_ddos_0,chf_ddos_0,heart_ddos_0,copd_ddos_0,pulmonar_ddos_0,cancer_ddos_0,ckd_ddos_0,esrd_ddos_0,lipidy_ddos_0,diab_ddos_0,alzh_ddos_0,demented_ddos_0,stroke_ddos_0,hyper_ddos_0,fall_ddos_0,trans_ddos_0,liver_ddos_0,hippy_ddos_0,depressed_ddos_0,psycho_ddos_0,druggy_ddos_0,boozy_ddos_0,paralyzed_ddos_0,mono_ddos_0,mono_dom_ddos_0,hemi_ddos_0,hemi_dom_ddos_0,para_ddos_0,quad_ddos_0,tbi_ddos_0,obese_ddos_0,pressure_ulcer_ddos_0,hemophilia_ddos_0,rx_tc_0,other_tc_0,ip_tc_0,er_tc_0,out_tc_0,snf_tc_0,icf_tc_0,hh_tc_0,amb_tc_0,hsp_tc_0,pro_tc_0,spc_fac_tc_0,dme_tc_0,cls_tc_0,hha_tc_0,hcbs_attdpcs_tc_0,hcbs_other_tc_0,hcbs_support_house_tc_0,hcbs_adult_day_tc_0,hcbs_pers_tc_0,hcbs_assist_tech_tc_0,oxygen_tc_0,hosp_bed_tc_0,chf_tc_0,heart_tc_0,copd_tc_0,pulmonar_tc_0,cancer_tc_0,ckd_tc_0,esrd_tc_0,lipidy_tc_0,diab_tc_0,alzh_tc_0,demented_tc_0,stroke_tc_0,hyper_tc_0,fall_tc_0,trans_tc_0,liver_tc_0,hippy_tc_0,depressed_tc_0,psycho_tc_0,druggy_tc_0,boozy_tc_0,paralyzed_tc_0,mono_tc_0,mono_dom_tc_0,hemi_tc_0,hemi_dom_tc_0,para_tc_0,quad_tc_0,tbi_tc_0,obese_tc_0,pressure_ulcer_tc_0,hemophilia_tc_0,ip_ddos_span_1,er_ddos_span_1,out_ddos_span_1,snf_ddos_span_1,icf_ddos_span_1,hh_ddos_span_1,amb_ddos_span_1,hsp_ddos_span_1,pro_ddos_span_1,spc_fac_ddos_span_1,dme_ddos_span_1,cls_ddos_span_1,hha_ddos_span_1,hcbs_attdpcs_ddos_span_1,hcbs_other_ddo
s_span_1,hcbs_support_house_ddos_span_1,hcbs_adult_day_ddos_span_1,other_ddos_span_1,ip_ddos_1,er_ddos_1,out_ddos_1,snf_ddos_1,icf_ddos_1,hh_ddos_1,amb_ddos_1,hsp_ddos_1,pro_ddos_1,spc_fac_ddos_1,dme_ddos_1,cls_ddos_1,hha_ddos_1,hcbs_attdpcs_ddos_1,hcbs_other_ddos_1,hcbs_support_house_ddos_1,hcbs_adult_day_ddos_1,other_ddos_1,hcbs_pers_ddos_1,hcbs_assist_tech_ddos_1,oxygen_ddos_1,hosp_bed_ddos_1,chf_ddos_1,heart_ddos_1,copd_ddos_1,pulmonar_ddos_1,cancer_ddos_1,ckd_ddos_1,esrd_ddos_1,lipidy_ddos_1,diab_ddos_1,alzh_ddos_1,demented_ddos_1,stroke_ddos_1,hyper_ddos_1,fall_ddos_1,trans_ddos_1,liver_ddos_1,hippy_ddos_1,depressed_ddos_1,psycho_ddos_1,druggy_ddos_1,boozy_ddos_1,paralyzed_ddos_1,mono_ddos_1,mono_dom_ddos_1,hemi_ddos_1,hemi_dom_ddos_1,para_ddos_1,quad_ddos_1,tbi_ddos_1,obese_ddos_1,pressure_ulcer_ddos_1,hemophilia_ddos_1,rx_tc_1,other_tc_1,ip_tc_1,er_tc_1,out_tc_1,snf_tc_1,icf_tc_1,hh_tc_1,amb_tc_1,hsp_tc_1,pro_tc_1,spc_fac_tc_1,dme_tc_1,cls_tc_1,hha_tc_1,hcbs_attdpcs_tc_1,hcbs_other_tc_1,hcbs_support_house_tc_1,hcbs_adult_day_tc_1,hcbs_pers_tc_1,hcbs_assist_tech_tc_1,oxygen_tc_1,hosp_bed_tc_1,chf_tc_1,heart_tc_1,copd_tc_1,pulmonar_tc_1,cancer_tc_1,ckd_tc_1,esrd_tc_1,lipidy_tc_1,diab_tc_1,alzh_tc_1,demented_tc_1,stroke_tc_1,hyper_tc_1,fall_tc_1,trans_tc_1,liver_tc_1,hippy_tc_1,depressed_tc_1,psycho_tc_1,druggy_tc_1,boozy_tc_1,paralyzed_tc_1,mono_tc_1,mono_dom_tc_1,hemi_tc_1,hemi_dom_tc_1,para_tc_1,quad_tc_1,...,oxygen_ddos_10,hosp_bed_ddos_10,chf_ddos_10,heart_ddos_10,copd_ddos_10,pulmonar_ddos_10,cancer_ddos_10,ckd_ddos_10,esrd_ddos_10,lipidy_ddos_10,diab_ddos_10,alzh_ddos_10,demented_ddos_10,stroke_ddos_10,hyper_ddos_10,fall_ddos_10,trans_ddos_10,liver_ddos_10,hippy_ddos_10,depressed_ddos_10,psycho_ddos_10,druggy_ddos_10,boozy_ddos_10,paralyzed_ddos_10,mono_ddos_10,mono_dom_ddos_10,hemi_ddos_10,hemi_dom_ddos_10,para_ddos_10,quad_ddos_10,tbi_ddos_10,obese_ddos_10,pressure_ulcer_ddos_10,hemophilia_ddos_10,rx_tc_10,other_tc_10,ip_tc_10,er_tc_10,out_tc_10,snf_tc
_10,icf_tc_10,hh_tc_10,amb_tc_10,hsp_tc_10,pro_tc_10,spc_fac_tc_10,dme_tc_10,cls_tc_10,hha_tc_10,hcbs_attdpcs_tc_10,hcbs_other_tc_10,hcbs_support_house_tc_10,hcbs_adult_day_tc_10,hcbs_pers_tc_10,hcbs_assist_tech_tc_10,oxygen_tc_10,hosp_bed_tc_10,chf_tc_10,heart_tc_10,copd_tc_10,pulmonar_tc_10,cancer_tc_10,ckd_tc_10,esrd_tc_10,lipidy_tc_10,diab_tc_10,alzh_tc_10,demented_tc_10,stroke_tc_10,hyper_tc_10,fall_tc_10,trans_tc_10,liver_tc_10,hippy_tc_10,depressed_tc_10,psycho_tc_10,druggy_tc_10,boozy_tc_10,paralyzed_tc_10,mono_tc_10,mono_dom_tc_10,hemi_tc_10,hemi_dom_tc_10,para_tc_10,quad_tc_10,tbi_tc_10,obese_tc_10,pressure_ulcer_tc_10,hemophilia_tc_10,ip_ddos_span_11,er_ddos_span_11,out_ddos_span_11,snf_ddos_span_11,icf_ddos_span_11,hh_ddos_span_11,amb_ddos_span_11,hsp_ddos_span_11,pro_ddos_span_11,spc_fac_ddos_span_11,dme_ddos_span_11,cls_ddos_span_11,hha_ddos_span_11,hcbs_attdpcs_ddos_span_11,hcbs_other_ddos_span_11,hcbs_support_house_ddos_span_11,hcbs_adult_day_ddos_span_11,other_ddos_span_11,ip_ddos_11,er_ddos_11,out_ddos_11,snf_ddos_11,icf_ddos_11,hh_ddos_11,amb_ddos_11,hsp_ddos_11,pro_ddos_11,spc_fac_ddos_11,dme_ddos_11,cls_ddos_11,hha_ddos_11,hcbs_attdpcs_ddos_11,hcbs_other_ddos_11,hcbs_support_house_ddos_11,hcbs_adult_day_ddos_11,other_ddos_11,hcbs_pers_ddos_11,hcbs_assist_tech_ddos_11,oxygen_ddos_11,hosp_bed_ddos_11,chf_ddos_11,heart_ddos_11,copd_ddos_11,pulmonar_ddos_11,cancer_ddos_11,ckd_ddos_11,esrd_ddos_11,lipidy_ddos_11,diab_ddos_11,alzh_ddos_11,demented_ddos_11,stroke_ddos_11,hyper_ddos_11,fall_ddos_11,trans_ddos_11,liver_ddos_11,hippy_ddos_11,depressed_ddos_11,psycho_ddos_11,druggy_ddos_11,boozy_ddos_11,paralyzed_ddos_11,mono_ddos_11,mono_dom_ddos_11,hemi_ddos_11,hemi_dom_ddos_11,para_ddos_11,quad_ddos_11,tbi_ddos_11,obese_ddos_11,pressure_ulcer_ddos_11,hemophilia_ddos_11,rx_tc_11,other_tc_11,ip_tc_11,er_tc_11,out_tc_11,snf_tc_11,icf_tc_11,hh_tc_11,amb_tc_11,hsp_tc_11,pro_tc_11,spc_fac_tc_11,dme_tc_11,cls_tc_11,hha_tc_11,hcbs_attdpcs_tc_11,hcbs_other_tc_11
,hcbs_support_house_tc_11,hcbs_adult_day_tc_11,hcbs_pers_tc_11,hcbs_assist_tech_tc_11,oxygen_tc_11,hosp_bed_tc_11,chf_tc_11,heart_tc_11,copd_tc_11,pulmonar_tc_11,cancer_tc_11,ckd_tc_11,esrd_tc_11,lipidy_tc_11,diab_tc_11,alzh_tc_11,demented_tc_11,stroke_tc_11,hyper_tc_11,fall_tc_11,trans_tc_11,liver_tc_11,hippy_tc_11,depressed_tc_11,psycho_tc_11,druggy_tc_11,boozy_tc_11,paralyzed_tc_11,mono_tc_11,mono_dom_tc_11,hemi_tc_11,hemi_dom_tc_11,para_tc_11,quad_tc_11,tbi_tc_11,obese_tc_11,pressure_ulcer_tc_11,hemophilia_tc_11,is_state_az,is_state_dc,is_state_fl,is_state_ia,is_state_ks,is_state_ma,is_state_mn,is_state_oh,is_state_tn,is_state_tx,is_state_va,is_lob_1,is_lob_3,is_lob_2,is_lob_8,is_group_0,is_group_3,is_group_2,is_group_1,is_group_-1,is_group_5,is_group_8,is_group_4,is_group_6,is_group_7,is_group_10,is_group_9,is_group_11,is_male,is_female,age,member_id,target,period
0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,2.0,0.0,0.0,49.0,0.0,0.0,46.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,1.0,0.0,0.0,31.0,0.0,0.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1108.29,0.0,0.0,0.0,323.77,0.0,0.0,0.0,0.0,0.0,156.37,0.0,0.0,0.0,0.0,1964.58,0.0,0.0,1190.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,206.24,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,48.0,0.0,0.0,50.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,29.0,0.0,0.0,21.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,69.59,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1927.59,0.0,0.0,1370.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,44.9,0.0,0.0,0.0,0.0,3040.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,180.37,0.0,180.37,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,17.0,0.0,3.0,0.0,0.0,46.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,2.0,0.0,0.0,28.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3580.82,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,58.58,0.0,167.84,0.0,0.0,2977.46,386.65,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,385.0,385.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,
0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,70.0,1.0,0.0,42
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,68.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,31.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6071.31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,2.0,0.0,0.0,0.0,0.0,64.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,31.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,253.86,0.0,84.76,0.0,0.0,0.0,0.0,5805.05,276.76,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,523.74,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.26,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5936.39,407.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,124.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,30.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,82.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6051.66,182.14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,
0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,92.0,2.0,359.65,42
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,62.0,3.0,0
.0,42
3,0.0,0.0,0.0,0.0,0.0,16.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,19.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,19.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,74.67,0.0,0.0,0.0,0.0,0.0,0.0,401.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2164.48,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5204.52,0.0,0.0,0.0,116.29,0.0,0.0,434.85,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2278.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2518.54,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-591.47,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,109.85,0.0,0.0,0.0,0.0,2192.96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,237.98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,21.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,21.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4124.52,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2392.32,105.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0
.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,57.0,8.0,55559.21,42
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1651.74,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,74.79,0.0,0.0,692.55,0.0,0.0,0.0,0.0,0.0,14.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,2.0,0.0,2.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,151.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,104.54,0.0,72.73,0.0,74.79,0.0,0.0,805.41,0.0,0.0,0.0,0.0,0.0,14.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,245.26,0.0,0.0,0.0,245.26,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,641.29,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.23,0.0,0.0,0.0,0.0,0.0,635.06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,1.0,0.0,1.0,0.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,123.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,108.02,0.0,67.58,0.0,6.23,0.0,0.0,738.72,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,263.91,0.0,0.0,0.0,0.0,0.0,175.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,71.0,14.0,722.42,42


In [10]:
# (rows, columns) of the combined frame.
master_df.shape

(7561100, 1558)

In [11]:
# May need to update the demographic features in cb_model_utils.py if training on a new state, group, etc...
# Column families are matched by substring; the remaining columns by exact name.
_demographic_families = ('is_group', 'is_state', 'is_lob')
_demographic_exact = ('is_male', 'is_female', 'age')
demographic_cols = [
    col
    for col in master_df.columns
    if col in _demographic_exact or any(tag in col for tag in _demographic_families)
]
demographic_cols

['is_state_az',
 'is_state_dc',
 'is_state_fl',
 'is_state_ia',
 'is_state_ks',
 'is_state_ma',
 'is_state_mn',
 'is_state_oh',
 'is_state_tn',
 'is_state_tx',
 'is_state_va',
 'is_lob_1',
 'is_lob_3',
 'is_lob_2',
 'is_lob_8',
 'is_group_0',
 'is_group_3',
 'is_group_2',
 'is_group_1',
 'is_group_-1',
 'is_group_5',
 'is_group_8',
 'is_group_4',
 'is_group_6',
 'is_group_7',
 'is_group_10',
 'is_group_9',
 'is_group_11',
 'is_male',
 'is_female',
 'age']

### Build MoM DDOS service_types + Demographics

In [12]:
# Build the MoM service-type DDOS + demographics frame from the combined frame.
d = cb_model_utils.build_mom_stddos_dem(master_df)
# Alternative call kept for reference:
# train_val_test_split(d, 'yearly_st_ddos_dem', return_wo_saving=True)
# NOTE(review): return_wo_saving=True presumably returns the splits without writing them to disk — confirm in cb_model_utils.
train, val, test = cb_model_utils.train_val_test_split(
    d,
    return_wo_saving=True,
)

In [13]:
# Unpack the training split into the two objects passed to fit() below.
# NOTE(review): presumably features (x) and target (y) — confirm in cb_model_utils.get_xy.
x_train, y_train = cb_model_utils.get_xy(train)

In [14]:
# Preview the first rows of the training inputs.
x_train.head()

Unnamed: 0,ip_ddos_0,ip_ddos_1,ip_ddos_2,ip_ddos_3,ip_ddos_4,ip_ddos_5,ip_ddos_6,ip_ddos_7,ip_ddos_8,ip_ddos_9,ip_ddos_10,ip_ddos_11,er_ddos_0,er_ddos_1,er_ddos_2,er_ddos_3,er_ddos_4,er_ddos_5,er_ddos_6,er_ddos_7,er_ddos_8,er_ddos_9,er_ddos_10,er_ddos_11,out_ddos_0,out_ddos_1,out_ddos_2,out_ddos_3,out_ddos_4,out_ddos_5,out_ddos_6,out_ddos_7,out_ddos_8,out_ddos_9,out_ddos_10,out_ddos_11,snf_ddos_0,snf_ddos_1,snf_ddos_2,snf_ddos_3,snf_ddos_4,snf_ddos_5,snf_ddos_6,snf_ddos_7,snf_ddos_8,snf_ddos_9,snf_ddos_10,snf_ddos_11,icf_ddos_0,icf_ddos_1,icf_ddos_2,icf_ddos_3,icf_ddos_4,icf_ddos_5,icf_ddos_6,icf_ddos_7,icf_ddos_8,icf_ddos_9,icf_ddos_10,icf_ddos_11,hh_ddos_0,hh_ddos_1,hh_ddos_2,hh_ddos_3,hh_ddos_4,hh_ddos_5,hh_ddos_6,hh_ddos_7,hh_ddos_8,hh_ddos_9,hh_ddos_10,hh_ddos_11,amb_ddos_0,amb_ddos_1,amb_ddos_2,amb_ddos_3,amb_ddos_4,amb_ddos_5,amb_ddos_6,amb_ddos_7,amb_ddos_8,amb_ddos_9,amb_ddos_10,amb_ddos_11,hsp_ddos_0,hsp_ddos_1,hsp_ddos_2,hsp_ddos_3,hsp_ddos_4,hsp_ddos_5,hsp_ddos_6,hsp_ddos_7,hsp_ddos_8,hsp_ddos_9,hsp_ddos_10,hsp_ddos_11,pro_ddos_0,pro_ddos_1,pro_ddos_2,pro_ddos_3,pro_ddos_4,pro_ddos_5,pro_ddos_6,pro_ddos_7,pro_ddos_8,pro_ddos_9,pro_ddos_10,pro_ddos_11,spc_fac_ddos_0,spc_fac_ddos_1,spc_fac_ddos_2,spc_fac_ddos_3,spc_fac_ddos_4,spc_fac_ddos_5,spc_fac_ddos_6,spc_fac_ddos_7,spc_fac_ddos_8,spc_fac_ddos_9,spc_fac_ddos_10,spc_fac_ddos_11,dme_ddos_0,dme_ddos_1,dme_ddos_2,dme_ddos_3,dme_ddos_4,dme_ddos_5,dme_ddos_6,dme_ddos_7,dme_ddos_8,dme_ddos_9,dme_ddos_10,dme_ddos_11,cls_ddos_0,cls_ddos_1,cls_ddos_2,cls_ddos_3,cls_ddos_4,cls_ddos_5,cls_ddos_6,cls_ddos_7,cls_ddos_8,cls_ddos_9,cls_ddos_10,cls_ddos_11,hha_ddos_0,hha_ddos_1,hha_ddos_2,hha_ddos_3,hha_ddos_4,hha_ddos_5,hha_ddos_6,hha_ddos_7,hha_ddos_8,hha_ddos_9,hha_ddos_10,hha_ddos_11,is_state_az,is_state_dc,is_state_fl,is_state_ia,is_state_ks,is_state_ma,is_state_mn,is_state_oh,is_state_tn,is_state_tx,is_state_va,is_lob_1,is_lob_3,is_lob_2,is_lob_8,is_group_0,is_group_3,is_group_2,is_group_1,is_group_-1,is_group_5,
is_group_8,is_group_4,is_group_6,is_group_7,is_group_10,is_group_9,is_group_11,is_male,is_female,age
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,1.0,1.0,4.0,1.0,4.0,1.0,0.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,70.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,92.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,62.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,4.0,6.0,1.0,6.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,2.0,1.0,1.0,0.0,3.0,5.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,57.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,3.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,4.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,10.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,1.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,71.0


In [15]:
# Fit the histogram gradient-boosting regressor on the training split.
# random_state ties the estimator's internal randomness (binning subsample and,
# when early stopping is active, its train/validation split) to the seed from
# the configuration cell, so a rerun with the same seed gives the same model.
histr = HistGradientBoostingRegressor(random_state=seed)
histr.fit(x_train, y_train)

In [16]:
# Evaluate the fitted model against the train and validation splits.
perf = cb_model_utils.get_model_performance(histr, train, val)

In [17]:
# Display the performance metrics computed above.
perf

{'train_score': 0.21885827645176925,
 'val_score': 0.23575165030435197,
 'train_mae': 8700.691405180401,
 'val_mae': 8653.753860099796}

In [18]:
# NOTE(review): repeated display of perf; the numbers recorded below differ from
# the previous cell's output, so they presumably come from a different run — confirm which is current.
perf

{'train_score': 0.1961394840447247,
 'val_score': 0.14712943546550983,
 'train_mae': 10336.051238312228,
 'val_mae': 10619.036998224748}

In [12]:
# NOTE(review): third display of perf; its execution count (12) is out of sequence with the
# surrounding cells, so this output is presumably left over from an earlier session — confirm or remove.
perf

{'train_score': 0.22651338263305354,
 'val_score': 0.13478742311953518,
 'train_mae': 9468.802067391181,
 'val_mae': 9637.370276625466}

#### Note: Many other feature combinations can be used; see src/cb_model_utils.py for the available options

# Build and deploy
### Save data
The target must be the first column; save without a header or index.

In [18]:
# Rebuild the MoM service-type DDOS + demographics frame from the combined frame.
d = cb_model_utils.build_mom_stddos_dem(master_df)
# Every column except the target and member_id, in their existing order.
feature_cols = [c for c in d.columns if c not in ('target', 'member_id')]
cols = ['target'] + feature_cols  # sm target always first
# Written without header or index. Plain string literal: the filename has no
# placeholders, so no f-string is needed.
d[cols].to_csv('mom_stddos_dem_12m_target_v3.csv', header=False, index=False)
cols

['target',
 'ip_ddos_0',
 'ip_ddos_1',
 'ip_ddos_2',
 'ip_ddos_3',
 'ip_ddos_4',
 'ip_ddos_5',
 'ip_ddos_6',
 'ip_ddos_7',
 'ip_ddos_8',
 'ip_ddos_9',
 'ip_ddos_10',
 'ip_ddos_11',
 'er_ddos_0',
 'er_ddos_1',
 'er_ddos_2',
 'er_ddos_3',
 'er_ddos_4',
 'er_ddos_5',
 'er_ddos_6',
 'er_ddos_7',
 'er_ddos_8',
 'er_ddos_9',
 'er_ddos_10',
 'er_ddos_11',
 'out_ddos_0',
 'out_ddos_1',
 'out_ddos_2',
 'out_ddos_3',
 'out_ddos_4',
 'out_ddos_5',
 'out_ddos_6',
 'out_ddos_7',
 'out_ddos_8',
 'out_ddos_9',
 'out_ddos_10',
 'out_ddos_11',
 'snf_ddos_0',
 'snf_ddos_1',
 'snf_ddos_2',
 'snf_ddos_3',
 'snf_ddos_4',
 'snf_ddos_5',
 'snf_ddos_6',
 'snf_ddos_7',
 'snf_ddos_8',
 'snf_ddos_9',
 'snf_ddos_10',
 'snf_ddos_11',
 'icf_ddos_0',
 'icf_ddos_1',
 'icf_ddos_2',
 'icf_ddos_3',
 'icf_ddos_4',
 'icf_ddos_5',
 'icf_ddos_6',
 'icf_ddos_7',
 'icf_ddos_8',
 'icf_ddos_9',
 'icf_ddos_10',
 'icf_ddos_11',
 'hh_ddos_0',
 'hh_ddos_1',
 'hh_ddos_2',
 'hh_ddos_3',
 'hh_ddos_4',
 'hh_ddos_5',
 'hh_ddos_6',
 'hh_

In [19]:
# Bucket and key under which the exported training CSV is uploaded.
training_data_bucket = 'cb-analytics-us-east-2-prd'
file_suffix = 'mom_stddos_dem_12m_target_v3'
training_data_path = f'sagemaker/data/{file_suffix}/train.csv'

In [20]:
# Upload the local CSV export to the training bucket under training_data_path.
cb_utils.upload_file_to_s3(f'{file_suffix}.csv', training_data_bucket, training_data_path)

### Training

In [21]:
import sagemaker
from sagemaker.sklearn.estimator import SKLearn

In [22]:
# Region of the current SageMaker session.
region = sagemaker.Session().boto_region_name
print(f"AWS Region: {region}")

# Execution role handed to the estimator below.
role = sagemaker.get_execution_role()
print(f"RoleArn: {role}")

AWS Region: us-east-2
RoleArn: arn:aws:iam::257056996471:role/cb-sagemaker


In [23]:
# Components of the model output location, listed in URI order: bucket / prefix / model name.
model_bucket = 'cb-analytics-exports-us-east-2-prd'
prefix = 'sagemaker'
model_name = 'hgbr_model_v3_mom_stddos_dem_12m_target'

In [24]:
# s3://<bucket>/<prefix>/<model name>
s3_model_output_location = f's3://{model_bucket}/{prefix}/{model_name}'
s3_model_output_location

's3://cb-analytics-exports-us-east-2-prd/sagemaker/hgbr_model_v3_mom_stddos_dem_12m_target'

In [25]:
# Configure the SageMaker scikit-learn training job that runs train_deploy.py.
# output_path: the model output location built in the previous cell was never
# handed to the estimator, so artifacts went to the SDK's default location
# instead of the intended bucket/prefix.
# NOTE(review): confirm the execution role can write to that bucket.
sklearn_estimator = SKLearn(entry_point='train_deploy.py',
                            instance_type='ml.m5.24xlarge',
                            role=role,
                            framework_version='0.23-1',
                            output_path=s3_model_output_location)

# Train on the CSV uploaded earlier; training_data_path is reused so the key
# cannot drift from the one the upload cell wrote to.
sklearn_estimator.fit({
    'train': f's3://{training_data_bucket}/{training_data_path}'
})

NOTEBOOK_METADATA_FILE detected but failed to get valid domain and user from it.


Using provided s3_resource


INFO:sagemaker:Creating training-job with name: sagemaker-scikit-learn-2023-09-07-00-47-39-811


2023-09-07 00:47:40 Starting - Starting the training job...
2023-09-07 00:47:54 Starting - Preparing the instances for training......
2023-09-07 00:48:51 Downloading - Downloading input data........................
2023-09-07 00:53:02 Training - Training image download completed. Training in progress.[34m2023-09-07 00:53:03,495 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2023-09-07 00:53:03,498 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2023-09-07 00:53:03,531 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2023-09-07 00:53:03,684 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2023-09-07 00:53:03,696 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2023-09-07 00:53:03,708 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m

### Attempt serverless deploy

In [26]:
from time import gmtime, strftime

# A UTC timestamp suffix distinguishes the model/endpoint names across runs.
# NOTE(review): the name says "xgboost" but the estimator above is SKLearn — confirm the naming is intended.
deploy_stamp = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
model_name = f"xgboost-serverless-data-v12-model-v3-{deploy_stamp}"
print(f"Model name: {model_name}")

Model name: xgboost-serverless-data-v12-model-v3-2023-09-07-01-12-42


In [27]:
# Serverless endpoint sizing: memory per invocation and the concurrency cap.
serverless_conf = sagemaker.serverless.serverless_inference_config.ServerlessInferenceConfig(
    memory_size_in_mb=6144,
    max_concurrency=100,
)

In [28]:
# Deploy the trained estimator behind a serverless endpoint named after the model.
endpoint_name = f'ep-{model_name}'
predictor = sklearn_estimator.deploy(
    serverless_inference_config=serverless_conf,
    endpoint_name=endpoint_name,
    model_name=model_name,
)


INFO:sagemaker:Creating model with name: xgboost-serverless-data-v12-model-v3-2023-09-07-01-12-42
INFO:sagemaker:Creating endpoint-config with name ep-xgboost-serverless-data-v12-model-v3-2023-09-07-01-12-42
INFO:sagemaker:Creating endpoint with name ep-xgboost-serverless-data-v12-model-v3-2023-09-07-01-12-42


----!

In [29]:
# Display the name of the deployed endpoint.
predictor.endpoint_name

'ep-xgboost-serverless-data-v12-model-v3-2023-09-07-01-12-42'