In [1]:
import os
import sys
import time
import random
import warnings
import collections
from dateutil.relativedelta import relativedelta
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, BaggingRegressor, StackingRegressor, HistGradientBoostingRegressor 
from sklearn.linear_model import Ridge, Lasso, BayesianRidge, ElasticNet
from sklearn.preprocessing import OneHotEncoder

# Put ../src on the import path before importing the project-local helper
# module (presumably cb_utils lives there -- confirm against the repo layout).
sys.path.append('../src')
import cb_utils

# Notebook-wide display settings: seaborn dark-grid style, and let pandas
# render up to 500 columns before truncating.
sns.set(style="darkgrid")
pd.options.display.max_columns = 500

# autoreload mode 2: modules are re-imported before each cell executes, so
# edits to local modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# configuration
use_cache = True  # forwarded to cb_utils.sql_query_to_df when loading data

# A fresh seed is drawn on every run and printed so the run can be replayed.
# It is also applied to the global `random` and NumPy RNGs here; without that
# the printed value would not control anything. To replay a logged run,
# replace the randint call with the printed value.
seed = random.randint(0, 100)
random.seed(seed)
np.random.seed(seed)

print(f'Seed: {seed}')

Seed: 3


In [3]:
# Load every row of junk.ip_member_periods into a DataFrame via the project
# helper. NOTE(review): caching behaviour of use_cache is defined in cb_utils.
query = "SELECT * FROM junk.ip_member_periods"
member_periods = cb_utils.sql_query_to_df(
    query,
    use_cache=use_cache,
)

Pulling query from db


In [49]:
# Add a 0/1 indicator column: 1 where gender == 'm', 0 for every other value.
member_periods = member_periods.assign(
    is_male=lambda frame: np.where(frame.gender == 'm', 1, 0)
)
# (preview, disabled) member_periods.head()

In [20]:
# Split the member periods four ways: vrs 'A' / 'B' crossed with the is_pre flag.
pre_va = member_periods[member_periods["vrs"].eq("A") & member_periods["is_pre"]]
pre_vb = member_periods[member_periods["vrs"].eq("B") & member_periods["is_pre"]]
post_va = member_periods[member_periods["vrs"].eq("A") & member_periods["is_pre"].eq(False)]
post_vb = member_periods[member_periods["vrs"].eq("B") & member_periods["is_pre"].eq(False)]

In [41]:
# Default model inputs, all read from the "pre" period rows.
feature_cols = [
    'age',
    'is_male',
    'impactable_tc',
    'ip_ddos',
    'er_ddos',
    'nf_ddos',
    'amb_ddos',
    'pro_ddos',
    'out_ddos'
]

def build_xy(pre, post, feature_columns=None, target_col='impactable_tc'):
    """Pair pre-period features with a post-period target, per member.

    Parameters
    ----------
    pre : pandas.DataFrame
        Rows supplying the features; must contain ``member_id`` and every
        feature column.
    post : pandas.DataFrame
        Rows supplying the target; must contain ``member_id`` and
        ``target_col``.
    feature_columns : sequence of str, optional
        Columns of ``pre`` to use as features. Defaults to the module-level
        ``feature_cols`` list (looked up at call time).
    target_col : str, default 'impactable_tc'
        Column of ``post`` to predict.

    Returns
    -------
    (x, y) : (pandas.DataFrame, pandas.Series)
        ``x`` holds the feature columns and ``y`` the target, row-aligned.
        Only members present in both frames are kept (inner merge on
        ``member_id``); a member appearing several times on either side
        yields one row per matching pair.
    """
    if feature_columns is None:
        feature_columns = feature_cols
    feature_columns = list(feature_columns)

    # Copy the target under a fixed name so it cannot collide with a feature
    # column of the same name (e.g. impactable_tc) during the merge.
    targets = post.assign(target=post[target_col])[['member_id', 'target']]
    features = pre[['member_id'] + feature_columns]
    feature_targets = features.merge(targets, on='member_id')

    x = feature_targets[feature_columns]
    y = feature_targets.target
    return x, y

In [42]:
# Training set: pre-period features and post-period target for the vrs 'A' rows.
x, y = build_xy(pre=pre_va, post=post_va)

In [43]:
# Ridge baseline, fitted and then scored on the same x, y (in-sample score).
# NOTE(review): `normalize` was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell needs the older release the notebook was written against.
ridge = Ridge(alpha=0.5, normalize=True).fit(x, y)
ridge.score(x, y)

0.13032879413342502

In [44]:
# Histogram gradient-boosted trees with default settings, fitted and then
# scored on the same x, y (in-sample score).
bt = HistGradientBoostingRegressor().fit(x, y)
bt.score(x, y)

0.2350317573140548

In [37]:
# Boosted-tree predictions for whatever `x` currently holds; when the cells run
# top to bottom that is the training feature frame from build_xy(pre_va, post_va).
preds = bt.predict(x)

In [38]:
# `feature_targets` only exists as a local inside build_xy, so referencing it
# here raised NameError on a fresh kernel. Rebuild the merged training frame
# explicitly (same merge as build_xy(pre_va, post_va)) so member_id and target
# sit next to each prediction.
feature_targets = pre_va[['member_id'] + feature_cols].merge(
    post_va.assign(target=post_va.impactable_tc)[['member_id', 'target']],
    on='member_id',
)
# preds must have been produced from the training x; fail loudly otherwise.
assert len(feature_targets) == len(preds), 'preds do not line up with the training frame'
feature_target_scores = feature_targets.assign(pred=preds)

In [40]:
# feature_target_scores[['member_id', 'target', 'pred']].to_csv('preds.csv',index=False)

In [47]:
# Validation: score both already-fitted models on the vrs 'B' rows.
# NOTE: this rebinds x and y, so they no longer hold the training data below.
x, y = build_xy(pre=pre_vb, post=post_vb)
(ridge.score(x, y), bt.score(x, y))

(0.11946043447844856, 0.12041944906182545)

In [50]:
# Ridge coefficients paired with their feature names, ascending by coefficient.
sorted((coef, name) for coef, name in zip(ridge.coef_, x.columns))

[(-45.509134030657435, 'is_male'),
 (0.06671202121643414, 'impactable_tc'),
 (16.116114243861965, 'pro_ddos'),
 (19.59898308717066, 'age'),
 (21.29854537924598, 'nf_ddos'),
 (44.27998474713496, 'out_ddos'),
 (68.45334242101012, 'amb_ddos'),
 (178.0896712899947, 'er_ddos'),
 (1099.9963265726187, 'ip_ddos')]