In [1]:
import os
import sys
import time
import random
import warnings
import collections
from dateutil.relativedelta import relativedelta
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, BaggingRegressor, StackingRegressor, HistGradientBoostingRegressor 
from sklearn.linear_model import Ridge, Lasso, BayesianRidge, ElasticNet
from sklearn.preprocessing import OneHotEncoder

sys.path.append('../src')
import cb_utils

sns.set(style="darkgrid")
pd.options.display.max_columns = 500

%load_ext autoreload
%autoreload 2



In [2]:
# configuration
use_cache = True
seed = random.randint(0, 100)

print(f'Seed: {seed}')

Seed: 57


In [3]:
query = f"select * from junk.ip_features_all;"
features = cb_utils.sql_query_to_df(query, use_cache=use_cache)

Pulled query from cache


In [4]:
features.shape

(1444147, 140)

In [5]:
member_key = cb_utils.sql_query_to_df(f"select * from junk.ip_member_key;", use_cache=use_cache)

Pulled query from cache


### Mark good member periods

eligible members on day of id and first day of post

iterate from first month to total_months - pre + post_period, create pre/post for member if elg

In [6]:
pre_months = 12
post_months = 6
pre_post_months = pre_months + post_months

In [7]:
months = sorted(features.eom.unique())
n_months = len(months)
last_valid_pre_start = n_months - pre_post_months
months[:3], months[-3:], n_months, last_valid_pre_start 

([datetime.date(2017, 1, 31),
  datetime.date(2017, 2, 28),
  datetime.date(2017, 3, 31)],
 [datetime.date(2021, 10, 31),
  datetime.date(2021, 11, 30),
  datetime.date(2021, 12, 31)],
 60,
 42)

In [8]:
# create bool column flags to easily query what batches this can be in
pres = {f'pre_{i}': False for i, _ in enumerate(months) if i < last_valid_pre_start}
posts = {f'post_{i}': False for i, _ in enumerate(months) if i < last_valid_pre_start}
features = features.assign(**pres, **posts)

In [9]:
# assign bool flags for each potential period
for i in tqdm(range(last_valid_pre_start)):
    # Build date anchor points relative to start month
    pre_start = months[i]
    pre_end = pre_start + relativedelta(months=pre_months - 1)
    # id_date = pre_end + relativedelta(days=1)
    post_start = pre_end + relativedelta(months=1)
    post_end = pre_end + relativedelta(months=post_months)
    
    # Determine elg members
    pre_elg = features.loc[(features.eom == pre_end) & (features.is_cb_eligible)].member_id.unique()
    post_elg = features.loc[(features.eom == post_start) & (features.is_cb_eligible)].member_id.unique()
    elg_mems = np.intersect1d(pre_elg, post_elg)

    # Flag elg members for period i
    features.loc[(features.eom >= pre_start) & (features.eom <= pre_end) & (features.member_id.isin(elg_mems)), f'pre_{i}'] = True 
    features.loc[(features.eom >= post_start) & (features.eom <= post_end) & (features.member_id.isin(elg_mems)), f'post_{i}'] = True 
    

100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 42/42 [00:14<00:00,  2.82it/s]


### Build features + targets

In [11]:
# TODO fillna at top level
ddos_cols = ['ip_ddos', 'er_ddos', 'out_ddos', 'snf_ddos', 'icf_ddos', 'hh_ddos', 'amb_ddos', 'hsp_ddos', 'pro_ddos', 'spc_fac_ddos', 'dme_ddos', 'cls_ddos', 'hha_ddos']
target_cols = ['ip_tc', 'er_tc', 'snf_tc', 'amb_tc']
# feature cols
top_level_feats = ['age', 'gender', 'mco_name', 'ggroup', 'line_of_business_id']
tc_feats = [c for c in features.columns if '_tc' in c]
features[target_cols+ddos_cols] = features[target_cols+ddos_cols].fillna(0)

In [12]:
def build_member_targets(mdf):
    tc = mdf[target_cols].sum().sum()
#     pmpm = tc / mdf.cpmm.sum()
    return pd.DataFrame([[mdf.iloc[0].member_id, tc]], columns=['member_id', 'target'])

In [13]:
# targets.loc[targets.tc == 0].shape
# note lots of 0 posts...
# 13447 vs 75017
def build_targets(post_df):
    return post_df.groupby('member_id', as_index=False).apply(build_member_targets)

In [14]:
def build_member_features(mdf):
    demographic_data = mdf.sort_values('eom')[['member_id'] + top_level_feats].iloc[-1]

    ddos_data = mdf[ddos_cols].sum()
#     tcs = mdf[tc_feats].fillna(0).sum()
#     mm = mdf.cpmm.sum()
#     pmpm_data = tcs / mm
    return pd.concat([demographic_data, ddos_data])

In [15]:
def build_features(pre_df):
    return pre_df.groupby('member_id', as_index=False).apply(build_member_features)

In [16]:
# build features and targets for each period
period_dfs = []
for i in tqdm(range(20, last_valid_pre_start)):
    pre = features.loc[features[f'pre_{i}']] 
    post = features.loc[features[f'post_{i}']] 
   
    x = build_features(pre)
    y = build_targets(post)
    final = x.merge(y, on='member_id').assign(period=i)
    period_dfs.append(final)

  0%|                                                                                                           | 0/22 [00:00<?, ?it/s]


IndexError: single positional indexer is out-of-bounds

In [26]:
master_df = pd.concat(period_dfs)
master_df.shape

(148544, 21)

In [27]:
# cache feature/target gen
master_df = master_df.assign(is_male=np.where(master_df.gender=='m', 1, 0)).drop(columns=['gender'])
# master_df.to_parquet('./data/master_df.parquet')
master_df.to_parquet('./data/master_ddos_df.parquet')

### Train/Val/Test split
Avoid any leakage by doing the splits at the member level

In [28]:
# master_df = pd.read_parquet('./data/master_df.parquet')
master_df = pd.read_parquet('./data/master_ddos_df.parquet')
# master_df = master_df.loc[master_df.period > 24]
# make dtype str for these categorical features
# master_df.ggroup = master_df.ggroup.astype(str)
# master_df.line_of_business_id = master_df.line_of_business_id.astype(str)

In [47]:
# master_df.head()

### One hot enc

In [36]:
cat_feats = ['gender', 'mco_name', 'ggroup', 'line_of_business_id']
# one_hots = pd.get_dummies(master_df[cat_feats])
# master_df = pd.concat([master_df, one_hots], axis=1).drop(columns=cat_feats)

In [30]:
member_ids = master_df.member_id.unique()
n_members = len(member_ids)
n_members

37899

In [31]:
train_n = int(n_members * .7)
val_n = int(n_members * .15)
test_n = n_members - train_n - val_n
train_n, val_n, test_n

(26529, 5684, 5686)

In [32]:
np.random.shuffle(member_ids)

In [33]:
train_mems, val_mems, test_mems = np.split(member_ids, [train_n, train_n + val_n])
assert train_mems.shape[0] == train_n
assert val_mems.shape[0] == val_n
assert test_mems.shape[0] == test_n

In [34]:
training_df = master_df.loc[master_df.member_id.isin(train_mems)]
val_df = master_df.loc[master_df.member_id.isin(val_mems)]
test_df = master_df.loc[master_df.member_id.isin(test_mems)]

### Normalize/encode features if needed
not needed for trees, most linear models will do it for you if you pass the param

### Training

In [37]:
x_cols = [c for c in training_df.columns if c not in ['member_id', 'target', 'period'] + cat_feats]
x = training_df[x_cols]
y = training_df.target

In [39]:
ridge = Ridge(alpha=.5, normalize=True)

In [40]:
ridge.fit(x, y)

Ridge(alpha=0.5, normalize=True)

In [41]:
ridge.score(x,y)

0.1153788000631163

In [42]:
val_x = val_df[x_cols]
val_y = val_df.target
ridge.score(val_x,val_y)

0.10943956931836418

In [43]:
sorted(zip(ridge.coef_, x.columns))

[(-88.79724694401762, 'icf_ddos'),
 (-5.742603355205591, 'hha_ddos'),
 (-3.6443472354973068, 'hsp_ddos'),
 (0.0, 'cls_ddos'),
 (0.0, 'spc_fac_ddos'),
 (6.799065134516623, 'hh_ddos'),
 (22.673302446752682, 'age'),
 (23.537797184252497, 'pro_ddos'),
 (43.434253978861385, 'out_ddos'),
 (55.94437176958225, 'dme_ddos'),
 (130.47858120757368, 'amb_ddos'),
 (179.01391362199442, 'snf_ddos'),
 (249.1090035530895, 'er_ddos'),
 (294.2218378276579, 'is_male'),
 (1486.0289988420132, 'ip_ddos')]

In [44]:
histr = HistGradientBoostingRegressor()
histr.fit(x, y)

HistGradientBoostingRegressor()

In [45]:
histr.score(x, y)

0.18128692193429896

In [46]:
histr.score(val_x, val_y)

0.11850827987935209