# Libraries / Import Statements

In [1]:
# We're pitching this to banks companies

import pandas as pd
import numpy as np

import re

import datetime as dt

import time
from time import sleep, time
from timeit import timeit

import random

import seaborn as sns
import matplotlib
from matplotlib import pyplot as plt

import sklearn as sk

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestRegressor

from sklearn.linear_model import LinearRegression

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.preprocessing import StandardScaler

import xgboost as xgb

import sys
import math

import pickle

import imblearn



pd.set_option("display.max_rows", None, "display.max_columns", None)

# General-use Functions

In [2]:
#Reads csv file and returns X and y arrays/dataframes
def read_dataset(as_numpy_array=True):
    read_data_=pd.read_csv("creditcard.csv")
    X=read_data_.drop(['Class'],axis=1)
    y=read_data_['Class']
    del read_data_
    if as_numpy_array:
        return(np.array(X),np.array(y))
    else:
        return(X,y)
    
def read_dataset_reg(as_numpy_array=True):
    read_data_=pd.read_csv("creditcard.csv")
    scaler = StandardScaler()
    y=read_data_['Class']
    read_data_.drop(['Class'],axis=1,inplace=True)
    X = scaler.fit_transform(read_data_)

    del read_data_
    if as_numpy_array:
        return(np.array(X),np.array(y))
    else:
        return(X,y)
    


In [3]:
#Scoring Models from Class. Eval. Pair
def accuracy(actuals, preds):
    truth_matrix=actuals==preds
    success=0
    for item in truth_matrix:
        if item:
            success+=1
    return(success/len(actuals))


def precision(actuals, preds):
    true_positive=0
    false_positive=0
    for i in range(len(actuals)):
        if not actuals[i] and preds[i]:
            false_positive+=1
        if actuals[i] and preds[i]:
            true_positive+=1
            
    if true_positive+false_positive==0:
        return('error: divide by 0')
    return(true_positive/(true_positive+false_positive))
    
    
def recall(actuals, preds):
    true_positive=0
    false_negative=0
    for i in range(len(actuals)):
        if actuals[i] and not preds[i]:
            false_negative+=1
        if actuals[i] and preds[i]:
            true_positive+=1
            
    if true_positive+false_negative==0:
        return('error: divide by 0')
    return(true_positive/(true_positive+false_negative))


def f1(actuals, preds):
    if precision(actuals,preds)=='error: divide by 0':
        return('precision error: divide by 0')
    if recall(actuals,preds)=='error: divide by 0':
        return('recall error: divide by 0')
    
    mult=precision(actuals,preds)*recall(actuals,preds)
    add=precision(actuals,preds)+recall(actuals,preds)
    return(2*mult/add)

def f1v2(precision, recall):
    mult=precision*recall
    add=precision+recall
    return(2*mult/add)
    
def score(actuals, preds):
    
    return({"accuracy":accuracy(actuals, preds), "precision":precision(actuals, preds), "recall":recall(actuals, preds), "f1":f1(actuals, preds)})

In [4]:
#Currency value in Euros, but the model can be used for any currency/location

TotalFraudCost=958*1e6
TotalFraudCards=11.29*1e6

AvgFraudLoss=TotalFraudCost/TotalFraudCards
AvgFraudLoss

#Cost of investigation
CpBase=6

In [5]:
# For the sake of simplicity, we assume that customers do not churn due to fraud,
# but we'll include the variables in the cost function for ease of re-fitting models.

#Placeholder variables:
#Chance of churn becuase of False Neg P(Cn)
PCn=0.00
#Chance of churn becuase of False Pos = P(Cp)
PCp=0.00
#Cost of replacing customer CR
CR=0

Cn=CR*PCn
Cp=CR*PCp


#Calculates costs per transaction
def costs(actuals, preds, f_pos_cost=CpBase+Cp,f_neg_cost=AvgFraudLoss+Cn):
    pred_positive=0
    false_negative=0
    
    for i in range(len(actuals)):
        #We have to pay for investigations regardless of whether transaction is really fraudulent
        if preds[i]:
            pred_positive+=1
        #We lose money only if fraud not intercepted
        if actuals[i] and not preds[i]:
            false_negative+=1
    
    total_cost=f_pos_cost*pred_positive+f_neg_cost*false_negative
    return(total_cost/len(actuals))


In [6]:
#Baseline costs for naive models
y=read_dataset()[1]
all_legit_costs=costs(y,np.zeros(len(y)))
all_fraud_costs=costs(y,np.ones(len(y)))


In [7]:
print(f'Cost per Transaction if all marked legitimate: €{all_legit_costs}')
print(f'Cost per Transaction if all marked fraudulent: €{all_fraud_costs}')

Cost per Transaction if all marked legitimate: €0.14658381170363086
Cost per Transaction if all marked fraudulent: €6.0


# Prepare Dataset for out-of-box models

In [8]:
#Load data, split it 60/40 Train/Test

Xy=read_dataset()
X_train, X_test, y_train, y_test=train_test_split(Xy[0],Xy[1],test_size=0.2,random_state=hash("Server-Clearing Market Gardener")%(2**32))

In [9]:
train_fraud_count=(y_train==1).sum()
test_fraud_count=(y_test==1).sum()

# So I think the metric we're focusing on most is Cost, not any traditional scoring method.

# Model 1: Logistic Regression

### Baseline: Out-of-box Logistic Regression

In [204]:
%%time

throw("Throw-wall to prevent accidental loss of progress")

logreg_oob=LogisticRegression(max_iter=1000)

print('Fitting...',end='')
logreg_oob.fit(X_train, y_train)

print('Scoring Recall...',end='')
oob_recall=cross_val_score(logreg_oob, X_train, y_train, cv=5, scoring="recall")
print('Scoring Precision...',end='')
oob_precision=cross_val_score(logreg_oob, X_train, y_train, cv=5, scoring="precision")
print('Predicting Values...')
logreg_oob_preds=logreg_oob.predict(X_test)

print(f'precision={oob_precision.mean()}\nrecall={oob_recall.mean()}\nf1={f1v2(oob_precision.mean(), oob_recall.mean())}')

print('Predicting Costs...')
logreg_oob_costs=costs(y_test,logreg_oob_preds)
print(f'Cost of baseline model: €{logreg_oob_costs}')

NameError: name 'throw' is not defined

In [205]:
costs(y_test,logreg_oob_preds)
#Baseline LogReg cost: €0.05034773854644103

NameError: name 'logreg_oob_preds' is not defined

### Ok, not terrible, not great. That's an okay baseline

#### Ok after trying an out-of-box baseline I think KNN's going to take way too long to create a proper model let's just stick with logistic regression for now

# Fine-Tune Logistic Regression Model

### Attempt 1: Undersample Negatives

In [None]:
%%time

RERUN_US_LOGREG=True

if RERUN_US_LOGREG:
    
    us_recall={}
    us_precision={}
    us_f1={}

    us_cost={}
    for neg_pos_ratio in range(95,116,1):

        

        print(neg_pos_ratio, end='_')
        
        us_precision_temp=0
        us_recall_temp=0
        us_f1_temp=0
        logreg_us_costs=0
        
        for attempts in range(0,10):
            X_tr, X_val, y_tr, y_val=train_test_split(X_train,y_train,test_size=0.25,random_state=(hash("Kris Get The Banana Potassium")*attempts)%(2**32))
            
            n_pos = np.sum(y_tr == 1)
            n_neg = np.sum(y_tr == 0)
            sampling_ratio = {1 : n_pos, 0 : n_pos*neg_pos_ratio}
            
            RUS=imblearn.under_sampling.RandomUnderSampler(sampling_strategy = sampling_ratio, random_state=(hash("haha sands uednertal")*attempts)%(2**32))
            X_tr_rs, y_tr_rs = RUS.fit_resample(X_tr, y_tr)

            logreg_us=LogisticRegression(max_iter=1000)
            logreg_us.fit(X_tr_rs, y_tr_rs)
            
            #Minimizing cost is my priority - model scoring is secondary
            
            #print('p_',end='')
            #us_precision_temp+=cross_val_score(logreg_us, X_tr_rs, y_tr_rs, cv=5, scoring="precision")    
            #print('r_',end='')
            #us_recall_temp+=cross_val_score(logreg_us, X_tr_rs, y_tr_rs, cv=5, scoring="recall")
            #print('f_',end='')
            #us_f1_temp+=cross_val_score(logreg_us, X_tr_rs, y_tr_rs, cv=5, scoring="f1")
            
            logreg_us_preds=logreg_us.predict(X_val)
            logreg_us_costs+=costs(y_val,logreg_us_preds)
        
        us_recall[neg_pos_ratio]=us_recall_temp/(attempts+1)
        us_precision[neg_pos_ratio]=us_precision_temp/(attempts+1)
        us_f1[neg_pos_ratio]=us_f1_temp/(attempts+1)

        us_cost[neg_pos_ratio]=logreg_us_costs/(attempts+1)
    
    

In [None]:
#Iterating by 10s: Most efficient sampling ratio is ~1:110
# Best undersampling is 1-107 with €0.0421/transaction with validation data (€0.0345 w/ test data)

In [None]:

#Increasing data ratio above 40-1 doesn't seem to have much effect on f1, recall, prec...
#okay precision gets affected way too much by random chance to make much of a prediction
#let's just say ~30-40 is the optimal data ratio for scoring

#print('Recall')
#for item in us_recall:
#    print (item, us_recall[item].mean())

# print('Precision')
# for item in us_precision:
#     print (item, us_precision[item].mean())

# print('F1')
# for item in us_f1:
#     print (item, us_f1[item].mean())

print('Cost')
for item in us_cost:
    print (item, us_cost[item])

In [221]:
n_pos = np.sum(y_train == 1)
n_neg = np.sum(y_train == 0)
sampling_ratio = {1 : n_pos, 0 : n_pos*107}

RUS=imblearn.under_sampling.RandomUnderSampler(sampling_strategy = sampling_ratio, random_state=(hash("haha sands uednertal"))%(2**32))

X_tr_rs, y_tr_rs = RUS.fit_resample(X_train, y_train)

logreg_us=LogisticRegression(max_iter=1000)
logreg_us.fit(X_tr_rs, y_tr_rs)
logreg_us_preds=logreg_us.predict(X_test)
costs(y_test,logreg_us_preds)


0.05391408360161417

In [222]:
score(y_test,logreg_us_preds)

{'accuracy': 0.9986833327481479,
 'precision': 0.6307692307692307,
 'recall': 0.7522935779816514,
 'f1': 0.686192468619247}

### Attempt 2: Adjust Class Weights

In [None]:
%%time
#Fine-tuning weighting hyperparameter

RERUN_CW_LOGREG=True

if RERUN_CW_LOGREG:

    w_recall={}
    w_precision={}
    w_f1={}

    w_cost={}
    
    #Splitting data train-validation-test 60-20-20
    X_tr, X_test, y_tr, y_test=train_test_split(Xy[0],Xy[1],test_size=0.2,random_state=hash("HEY EVERY! IT'S ME!")%(2**32-1))
    for neg_pos_ratio in range(70,121,5):
        logreg_w_costs=0
        print(neg_pos_ratio, end=',')
        for attempt in range(0,10):
            print(attempt, end='_')
            X_train, X_val, y_train, y_val=train_test_split(X_tr,y_tr,test_size=0.25,random_state=hash("SPAMT    SPAMTON G. SPAMTON")*attempt%(2**32-1))
            n_pos = np.sum(y_train == 1)
            n_neg = np.sum(y_train == 0)
            pos_ratio=n_pos/(n_pos+n_neg)
            
            temp_ratio=pos_ratio*neg_pos_ratio
            class_weightings={0:temp_ratio,1:1-temp_ratio}

            logreg_w=LogisticRegression(max_iter=1000, class_weight=class_weightings)
            logreg_w.fit(X_train, y_train)

            #print('p_',end='')
            #w_precision_temp=cross_val_score(logreg_w, X_tr_rs, y_tr_rs, cv=5, scoring="precision")
            #print('r_',end='')
            #w_recall_temp=cross_val_score(logreg_w, X_tr_rs, y_tr_rs, cv=5, scoring="recall")
            #print('f_',end='')
            #w_f1_temp=cross_val_score(logreg_w, X_tr_rs, y_tr_rs, cv=5, scoring="f1")

            #w_recall[neg_pos_ratio]=w_recall_temp
            #w_precision[neg_pos_ratio]=w_precision_temp
            #w_f1[neg_pos_ratio]=w_f1_temp
            
            logreg_w_preds=logreg_w.predict(X_val)
            logreg_w_costs+=costs(y_val,logreg_w_preds)

        w_cost[neg_pos_ratio]=logreg_w_costs/(attempt+1)


In [209]:
#print('Recall')
#for item in w_recall:
#    print (item, w_recall[item].mean())
    
#print('Precision')
#for item in w_precision:
#    print (item, w_precision[item].mean())

#print('F1')
#for item in w_f1:
#    print (item, w_f1[item].mean())

cost_min=99
cost_idx=0

print('Cost')
for item in w_cost:
    print (item, w_cost[item])
    if cost_min>w_cost[item]:
        cost_idx=item
        cost_min=w_cost[item]

Cost


NameError: name 'w_cost' is not defined

In [210]:
#Best weighting is 90 at €0.0464/transaction with validation data (€0.0266 w/test data)

In [213]:
X_tr, X_test, y_tr, y_test=train_test_split(Xy[0],Xy[1],test_size=0.2,random_state=hash("HEY EVERY! IT'S ME!")%(2**32-1))
X_train, X_val, y_train, y_val=train_test_split(X_tr,y_tr,test_size=0.25,random_state=hash("SPAMT    SPAMTON G. SPAMTON")%(2**32-1))
n_pos = np.sum(y_train == 1)
n_neg = np.sum(y_train == 0)
pos_ratio=n_pos/(n_pos+n_neg)

temp_ratio=pos_ratio*90
class_weightings={0:temp_ratio,1:1-temp_ratio}
logreg_w=LogisticRegression(max_iter=1000, class_weight=class_weightings)
logreg_w.fit(X_train, y_train)
logreg_w_preds=logreg_w.predict(X_test)
costs(y_test,logreg_w_preds)

0.055644387293578686

### Attempt 3: Adjust Thresholds

In [None]:
#%%time


RERUN_TH_LOGREG=True


if RERUN_TH_LOGREG:
    th_recall={}
    th_precision={}
    th_f1={}

    th_cost={}

    #Don't think I can use validation here because there's no custom threshold for logreg
    #THAT'S NOT VERY [[BIG SHOT]] OF YOU, [[SciKit]].
    #but I'll randomize the train/holdout data and average it
    #a lot
    
    for i in range(0,51):
        th_cost[i]=[]
        th_recall[i]=[]
        th_precision[i]=[]
        th_f1[i]=[]
        
    for attempt in range(0,10):
        print(attempt,end='_')
        X_train, X_test, y_train, y_test=train_test_split(Xy[0],Xy[1],test_size=0.2,random_state=hash("Chicken Dance")%(2**32-1))
        X_train, X_val, y_train, y_val=train_test_split(X_train,y_train,test_size=0.25,random_state=hash("Take On Me")*attempt%(2**32-1))
        
        logreg_th=LogisticRegression(max_iter=1000)
        logreg_th.fit(X_train, y_train)
        
        logreg_th_probs=logreg_th.predict_proba(X_val)
        
        for thresh in range(1,51):
            logreg_th_preds = np.where(logreg_th_probs[:,1] > (thresh*0.01), 1, 0)
            logreg_th_costs=costs(y_val,logreg_th_preds)
            
            scores=score(y_val,logreg_th_preds)
            
            th_recall[thresh].append(scores['recall'])
            th_precision[thresh].append(scores['precision'])
            th_f1[thresh].append(scores['f1'])
            
            th_cost[thresh].append(logreg_th_costs)
    print('done')
    

In [214]:
for thresh in th_cost:
    print(thresh,sum(th_cost[thresh])/10)
    
#Best threshold for cost: 20-30% (23% best) €0.0658/transaction (€0.0561 with validation data @ 21%)

NameError: name 'th_cost' is not defined

In [215]:
logreg_th=LogisticRegression(max_iter=1000)
logreg_th.fit(X_train, y_train)
logreg_th_probs=logreg_th.predict_proba(X_test)
logreg_th_preds = np.where(logreg_th_probs[:,1] > 0.23, 1, 0)
logreg_th_costs=costs(y_test,logreg_th_preds)
logreg_th_costs

0.0659366745172741

# Model Type 2: Random Forest

### Baseline RF Model

In [171]:
X_train, X_test, y_train, y_test = train_test_split(Xy[0], Xy[1], test_size=0.2, random_state=hash("I CAN DO ANYTHING")%(2**32))
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=hash("CHAOS, CHAOS")%(2**32))

rf_oob = RandomForestRegressor(n_jobs=-1, max_features=3)
rf_oob.fit(X_train,y_train)

rf_oob_probs=rf_oob.predict(X_val)

thresh=50
rf_oob_preds=np.where(rf_oob_probs > (thresh*0.01), True, False)

In [172]:
costs(y_val,rf_oob_preds)
# €0.0489
# Ok that's actually really good for a baseline

0.04035646159332551

### Let's tune max_features

In [173]:
%%time
# Testing max_features

X_train, X_test, y_train, y_test = train_test_split(Xy[0], Xy[1], test_size=0.2, random_state=hash("I CAN DO ANYTHING")%(2**32))
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=hash("CHAOS, CHAOS")%(2**32))

probs={}

for f in range(1,11):
    print(f,end=',')
    rf_f = RandomForestRegressor(max_features=f)
    rf_f.fit(X_train,y_train)

    rf_f_probs=rf_f.predict(X_val)
    
    probs[f]=rf_f_probs
    
    
print('done')    
    

1,2,3,4,5,6,7,8,9,10,done
CPU times: user 15min 39s, sys: 2 s, total: 15min 41s
Wall time: 15min 43s


In [174]:
for item in probs:
    thresh=50
    rf_f_preds=np.where(probs[item] > (thresh*0.01), True, False)
    print(item,costs(y_val,rf_f_preds))
    
#Balancing time and cost: Let's stick with 3 features because after that it really doesn't seem to matter that much

1 0.049836061515564786
2 0.04174078540511632
3 0.04025112821317734
4 0.04035646159332551
5 0.037482480589595746
6 0.04035646159332551
7 0.03886680440138655
8 0.039077471161682885
9 0.037587813969743915
10 0.038972137781534716


In [257]:
rf_f = RandomForestRegressor(max_features=3)
rf_f.fit(X_train,y_train)



RandomForestRegressor(max_features=3)

In [259]:
probs_temp=rf_f.predict(X_val)
thresh_tuning=[]
for thresh in range(0,26):
    rf_f_preds=np.where(probs_temp > (thresh*0.01), True, False)
    thresh_tuning.append(costs(y_val,rf_f_preds))

In [197]:
#Best threshold is 0.18, with cost of €0.0385 with validation data (€0.0348 for test data)

In [258]:
probs_temp=rf_f.predict(X_test)
thresh_tuning1=[]
for thresh in range(0,26):
    rf_f_preds=np.where(probs_temp > (thresh*0.01), True, False)
    thresh_tuning1.append(costs(y_test,rf_f_preds))

In [220]:
score(y_test,rf_f_preds)

{'accuracy': 0.9997191109862715,
 'precision': 0.918918918918919,
 'recall': 0.9357798165137615,
 'f1': 0.9272727272727272}

In [262]:
thresh_tuning[18]


0.03054587166077713

In [263]:
thresh_tuning1[18]

0.03808450113075555

# Model Type 3: Gradient Boosted Trees

### Almost-out-of-box model

In [12]:
#Splitting data train-validation-test 60-20-20

X_train, X_test, y_train, y_test = train_test_split(Xy[0], Xy[1], test_size=0.2, random_state=hash("I Am Going To Touch The Cheese")%(2**32))
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=hash("Something About TerminalMontage")%(2**32))

#Modified code from XGB notebook
gb_oob = xgb.XGBRegressor(random_state=hash("nope.avi")%(2**32))
eval_set = [(X_train, y_train), (X_val, y_val)]

gb_oob_fit = gb_oob.fit(X_train,y_train,eval_set=eval_set,verbose=False)

gb_oob_proba=gb_oob.predict(X_val)
gb_oob_preds=np.where(gb_oob_proba > 0.5, True, False)

print("Validation Data Cost",costs(y_val,gb_oob_preds))

gb_oob_proba=gb_oob.predict(X_test)
gb_oob_preds=np.where(gb_oob_proba > 0.5, True, False)
print("Test Data Cost",costs(y_test,gb_oob_preds))

Validation Data Cost 0.03639916704838485
Test Data Cost 0.04491577667942599


In [47]:
# Validation Data Cost 0.03639916704838485
# Test Data Cost 0.04491577667942599

### Not a bad baseline; let's improve on that

### Attempt 1: Undersampling

In [13]:
gb_us_costs={}

for pos_neg_ratio in range(0,150,10):
    print(pos_neg_ratio,end=',')
    n_pos = np.sum(y_train == 1)
    sampling_ratio = {1 : n_pos, 0 : n_pos*pos_neg_ratio}

    RUS=imblearn.under_sampling.RandomUnderSampler(sampling_strategy = sampling_ratio, random_state=(hash("haha python go b+r*10"))%(2**32))
    X_tr_rs, y_tr_rs = RUS.fit_resample(X_train, y_train)

    gb_us = xgb.XGBRegressor(random_state=hash("nope.avi")%(2**32))
    eval_set = [(X_tr_rs, y_tr_rs), (X_val, y_val)]

    gb_us_fit = gb_us.fit(X_tr_rs,y_tr_rs,eval_set=eval_set, verbose=False)
    gb_us_proba=gb_us_fit.predict(X_val)
    gb_us_preds=np.where(gb_us_proba > 0.5, True, False)

    gb_us_costs[pos_neg_ratio]=costs(y_val,gb_us_preds)
print('done')

0,10,20,30,40,50,60,70,80,90,100,110,120,130,140,done


In [20]:
n_pos = np.sum(y_train == 1)
sampling_ratio = {1 : n_pos, 0 : n_pos*80}

RUS=imblearn.under_sampling.RandomUnderSampler(sampling_strategy = sampling_ratio, random_state=(hash("haha python go b+r*10"))%(2**32))
X_tr_rs, y_tr_rs = RUS.fit_resample(X_train, y_train)

gb_us = xgb.XGBRegressor(random_state=hash("nope.avi")%(2**32))
eval_set = [(X_tr_rs, y_tr_rs), (X_val, y_val)]

gb_us_fit = gb_us.fit(X_tr_rs,y_tr_rs,eval_set=eval_set, verbose=False)
gb_us_proba=gb_us_fit.predict(X_test)
gb_us_preds=np.where(gb_us_proba > 0.5, True, False)

costs(y_test,gb_us_preds)

    
#Above ratio 1:100, there isn't much difference so let's just use 1:80 for training it's just faster



0.04627012075148758

In [15]:
%%time
sampling_ratio = {1 : n_pos, 0 : n_pos*80}

RUS=imblearn.under_sampling.RandomUnderSampler(sampling_strategy = sampling_ratio, random_state=(hash("haha python go b+r*10"))%(2**32))
X_tr_rs, y_tr_rs = RUS.fit_resample(X_train, y_train)

gb_us = xgb.XGBRegressor(random_state=hash("nope.avi")%(2**32))
eval_set = [(X_tr_rs, y_tr_rs), (X_test, y_test)]

gb_us_fit = gb_us.fit(X_tr_rs,y_tr_rs,eval_set=eval_set, verbose=False)
gb_us_proba=gb_us_fit.predict(X_test)
gb_us_preds=np.where(gb_us_proba > 0.5, True, False)

costs(y_test,gb_us_preds)

CPU times: user 22.3 s, sys: 1.44 s, total: 23.7 s
Wall time: 3.34 s


0.04627012075148758

In [None]:
#Best ratio 1:80 @ €0.0463/transaction in test data; €0.0381 for validation
#Probably a bit much overfitting; we'll work on that next

### Attempt 2: General re-tuning

##### Step 1 - Learning rate

In [170]:
n_pos = np.sum(y_train == 1)
sampling_ratio = {1 : n_pos, 0 : n_pos*80}
RUS=imblearn.under_sampling.RandomUnderSampler(sampling_strategy = sampling_ratio, random_state=(hash("haha python go b+r*10"))%(2**32))
X_tr_rs, y_tr_rs = RUS.fit_resample(X_train, y_train)

cost_t={}
cost_tt={}

for lr in range(1,31):
    print(lr,end='_')
    gb_t = xgb.XGBRegressor(
        #max_depth=7,
        learning_rate=lr*0.01,
        #subsample=0.8,
        #min_child_weight=12,
        #colsample_bytree=.7,
        random_state=hash("nope.avi")%(2**32))

    eval_set = [(X_tr_rs, y_tr_rs), (X_val, y_val)]

    gb_t_fit = gb_t.fit(X_tr_rs,y_tr_rs,eval_set=eval_set, verbose=False)
    gb_t_proba=gb_t.predict(X_val)
    gb_t_preds=np.where(gb_t_proba > 0.5, True, False)
    
    gb_tt_proba=gb_t.predict(X_test)
    gb_tt_preds=np.where(gb_tt_proba > 0.5, True, False)
    
    cost_t[lr]=costs(y_val,gb_t_preds)
    cost_tt[lr]=costs(y_test,gb_tt_preds)
print('done')

1_2_3_4_

KeyboardInterrupt: 

In [None]:
#for i in cost_t:
#    print(i,cost_t[i])
    
#0.1 appears to be the best learning_rate in general

##### Step 2 - max_depth

In [None]:
n_pos = np.sum(y_train == 1)
sampling_ratio = {1 : n_pos, 0 : n_pos*80}
RUS=imblearn.under_sampling.RandomUnderSampler(sampling_strategy = sampling_ratio, random_state=(hash("haha python go b+r*10"))%(2**32))
X_tr_rs, y_tr_rs = RUS.fit_resample(X_train, y_train)

cost_t={}
cost_tt={}

for md in range(1,11):
    print(md,end='_')
    gb_t = xgb.XGBRegressor(
        max_depth=md,
        learning_rate=0.1,
        #subsample=0.1,
        #min_child_weight=12,
        #colsample_bytree=.7,
        random_state=hash("nope.avi")%(2**32))


    eval_set = [(X_tr_rs, y_tr_rs), (X_val, y_val)]

    gb_t_fit = gb_t.fit(X_tr_rs,y_tr_rs,eval_set=eval_set, verbose=False)
    gb_t_proba=gb_t.predict(X_val)
    gb_t_preds=np.where(gb_t_proba > 0.5, True, False)
    
    gb_tt_proba=gb_t.predict(X_test)
    gb_tt_preds=np.where(gb_tt_proba > 0.5, True, False)
    
    cost_t[md]=costs(y_val,gb_t_preds)
    cost_tt[md]=costs(y_test,gb_tt_preds)
print('done')

In [131]:
for i in cost_t:
    print(i,cost_t[i])
#5 is best max_depth

1 0.042989796097029735
2 0.03745250084986654
3 0.03638417717852024
4 0.036594843938816575
5 0.0347891866064331
6 0.036700177318964744
7 0.03712151083955742
8 0.03701617745940925
9 0.03976983521312625
10 0.040206158603583536
11 0.0386111680314964
12 0.04273415972713959
13 0.0399954918432872
14 0.040191168733718925
15 0.040206158603583536
16 0.04062749212417621
17 0.042418159586695085
18 0.040296502113867094
19 0.04157549254550973
20 0.041364825785213395
21 0.041259492405065226
22 0.0416808259256579
23 0.04115415902491706
24 0.04157549254550973
25 0.041364825785213395
26 0.041259492405065226
27 0.041259492405065226
28 0.041259492405065226
29 0.041259492405065226
30 0.041259492405065226
31 0.041259492405065226
32 0.041259492405065226
33 0.041259492405065226
34 0.041259492405065226
35 0.041259492405065226
36 0.041259492405065226
37 0.041259492405065226
38 0.041259492405065226
39 0.041259492405065226
40 0.041259492405065226
41 0.041259492405065226
42 0.041259492405065226
43 0.04125949240506

##### Step 3 - min_child_weight

In [126]:
n_pos = np.sum(y_train == 1)
sampling_ratio = {1 : n_pos, 0 : n_pos*80}
RUS=imblearn.under_sampling.RandomUnderSampler(sampling_strategy = sampling_ratio, random_state=(hash("haha python go b+r*10"))%(2**32))
X_tr_rs, y_tr_rs = RUS.fit_resample(X_train, y_train)

cost_t={}
cost_tt={}

for mcw in range(2,52,2):
    print(mcw,end='_')
    gb_t = xgb.XGBRegressor(
        max_depth=5,
        learning_rate=0.1,
        #subsample=0.1,
        min_child_weight=mcw,
        #colsample_bytree=.7,
        random_state=hash("nope.avi")%(2**32))
    
    eval_set = [(X_tr_rs, y_tr_rs), (X_val, y_val)]

    gb_t_fit = gb_t.fit(X_tr_rs,y_tr_rs,eval_set=eval_set, verbose=False)
    gb_t_proba=gb_t.predict(X_val)
    gb_t_preds=np.where(gb_t_proba > 0.5, True, False)
    
    gb_tt_proba=gb_t.predict(X_test)
    gb_tt_preds=np.where(gb_tt_proba > 0.5, True, False)
    
    cost_t[mcw]=costs(y_val,gb_t_preds)
    cost_tt[mcw]=costs(y_test,gb_tt_preds)
print('done')

2_4_6_8_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_done


In [128]:
for i in cost_t:
    print(i,cost_t[i])
#>20 is best min_child_weight. Doesn't seem to increase past that

2 0.0347891866064331
4 0.033615529554938635
6 0.033510196174790466
8 0.03329952941449413
10 0.0334048627946423
12 0.03489451998658127
14 0.0361735104182239
16 0.03606817703807573
18 0.0361735104182239
20 0.033615529554938635
22 0.033720862935086804
24 0.033720862935086804
26 0.033615529554938635
28 0.0334048627946423
30 0.033615529554938635
32 0.033615529554938635
34 0.033510196174790466
36 0.033510196174790466
38 0.033615529554938635
40 0.033720862935086804
42 0.03329952941449413
44 0.033615529554938635
46 0.033510196174790466
48 0.033720862935086804
50 0.033720862935086804


##### Step 4 -colsample

In [143]:
n_pos = np.sum(y_train == 1)
sampling_ratio = {1 : n_pos, 0 : n_pos*80}
RUS=imblearn.under_sampling.RandomUnderSampler(sampling_strategy = sampling_ratio, random_state=(hash("haha python go b+r*10"))%(2**32))
X_tr_rs, y_tr_rs = RUS.fit_resample(X_train, y_train)

cost_t={}
cost_tt={}

for ct in range(2,102,2):
    print(ct,end='_')
    gb_t = xgb.XGBRegressor(
        max_depth=5,
        learning_rate=0.1,
        #subsample=0.1,
        min_child_weight=20,
        colsample_bytree=ct*0.01,
        random_state=hash("nope.avi")%(2**32))
    
    eval_set = [(X_tr_rs, y_tr_rs), (X_val, y_val)]

    gb_t_fit = gb_t.fit(X_tr_rs,y_tr_rs,eval_set=eval_set, verbose=False)
    gb_t_proba=gb_t.predict(X_val)
    gb_t_preds=np.where(gb_t_proba > 0.5, True, False)
    
    gb_tt_proba=gb_t.predict(X_test)
    gb_tt_preds=np.where(gb_tt_proba > 0.5, True, False)
    
    cost_t[ct]=costs(y_val,gb_t_preds)
    cost_tt[ct]=costs(y_test,gb_tt_preds)
print('done')

2_4_6_8_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62_64_66_68_70_72_74_76_78_80_82_84_86_88_90_92_94_96_98_100_done


In [144]:
for i in cost_t:
    print(i,cost_t[i])
#>.30 is best colsample. Doesn't seem to increase past that

2 0.06249065271211374
4 0.06249065271211374
6 0.06249065271211374
8 0.046510767251513124
10 0.03841549114106466
12 0.03841549114106466
14 0.03692583394912569
16 0.03692583394912569
18 0.03426251970569225
20 0.03734716746971837
22 0.03734716746971837
24 0.03457851984613676
26 0.03457851984613676
28 0.03457851984613676
30 0.03287819589390145
32 0.03287819589390145
34 0.03436785308584042
36 0.03436785308584042
38 0.03319419603434596
40 0.03329952941449413
42 0.03329952941449413
44 0.0334048627946423
46 0.0334048627946423
48 0.0334048627946423
50 0.033615529554938635
52 0.033615529554938635
54 0.033510196174790466
56 0.033510196174790466
58 0.033510196174790466
60 0.033510196174790466
62 0.033510196174790466
64 0.033510196174790466
66 0.033510196174790466
68 0.0334048627946423
70 0.0334048627946423
72 0.0334048627946423
74 0.033510196174790466
76 0.033510196174790466
78 0.033510196174790466
80 0.033510196174790466
82 0.033510196174790466
84 0.033510196174790466
86 0.033510196174790466
88 0

##### Step 5: Subsampling

In [151]:
n_pos = np.sum(y_train == 1)
sampling_ratio = {1 : n_pos, 0 : n_pos*80}
RUS=imblearn.under_sampling.RandomUnderSampler(sampling_strategy = sampling_ratio, random_state=(hash("haha python go b+r*10"))%(2**32))
X_tr_rs, y_tr_rs = RUS.fit_resample(X_train, y_train)

cost_t={}
cost_tt={}

for ss in range(5,105,5):
    print(ss,end='_')
    gb_t = xgb.XGBRegressor(
        max_depth=5,
        learning_rate=0.1,
        subsample=ss*0.01,
        min_child_weight=20,
        colsample_bytree=0.3,
        random_state=hash("nope.avi")%(2**32))

    
    eval_set = [(X_tr_rs, y_tr_rs), (X_val, y_val)]

    gb_t_fit = gb_t.fit(X_tr_rs,y_tr_rs,eval_set=eval_set, verbose=False)
    gb_t_proba=gb_t.predict(X_val)
    gb_t_preds=np.where(gb_t_proba > 0.5, True, False)
    
    gb_tt_proba=gb_t.predict(X_test)
    gb_tt_preds=np.where(gb_tt_proba > 0.5, True, False)
    
    cost_t[ss]=costs(y_val,gb_t_preds)
    cost_tt[ss]=costs(y_test,gb_tt_preds)
print('done')

5_10_15_20_25_30_35_40_45_50_55_60_65_70_75_80_85_90_95_100_done


In [152]:
for i in cost_t:
    print(i,cost_t[i])
    
for i in cost_tt:
    print(i,cost_tt[i])
#~.40 appears to be best subsample.

5 0.047684424303007594
10 0.0438474530080797
15 0.0438474530080797
20 0.041289472144794434
25 0.03606817703807573
30 0.0347891866064331
35 0.03319419603434596
40 0.03319419603434596
45 0.03319419603434596
50 0.03606817703807573
55 0.03468385322628493
60 0.03468385322628493
65 0.03606817703807573
70 0.03596284365792756
75 0.03596284365792756
80 0.03308886265419779
85 0.03447318646598859
90 0.0334048627946423
95 0.0334048627946423
100 0.03287819589390145
5 0.039905148333003626
10 0.04150013890509077
15 0.042884462716881566
20 0.04171080566538711
25 0.039258158182250015
30 0.03957415832269452
35 0.03957415832269452
40 0.039363491562398184
45 0.03978482508299086
50 0.039363491562398184
55 0.03978482508299086
60 0.03957415832269452
65 0.040958482134485316
70 0.03967949170284269
75 0.03808450113075555
80 0.03946882494254635
85 0.039363491562398184
90 0.04074781537418898
95 0.03967949170284269
100 0.039258158182250015


In [253]:
n_pos = np.sum(y_train == 1)
sampling_ratio = {1 : n_pos, 0 : n_pos*80}
RUS=imblearn.under_sampling.RandomUnderSampler(sampling_strategy = sampling_ratio, random_state=(hash("haha python go b+r*10"))%(2**32))
X_tr_rs, y_tr_rs = RUS.fit_resample(X_train, y_train)

cost_t={}
cost_tt={}

for thresh in range(28,52,2):
    print(thresh,end='_')
    gb_t = xgb.XGBRegressor(
        max_depth=5,
        learning_rate=0.1,
        subsample=0.45,
        min_child_weight=20,
        colsample_bytree=.25,
        random_state=hash("nope.avi")%(2**32))

    n_pos = np.sum(y_train == 1)
    n_neg = np.sum(y_train == 0)
    sampling_ratio = {1 : n_pos, 0 : n_pos*80}
    
    eval_set = [(X_tr_rs, y_tr_rs), (X_val, y_val)]

    gb_t_fit = gb_t.fit(X_tr_rs,y_tr_rs,eval_set=eval_set, verbose=False)
    gb_t_proba=gb_t.predict(X_val)
    gb_t_preds=np.where(gb_t_proba > thresh*0.01, True, False)
    
    gb_tt_proba=gb_t.predict(X_test)
    gb_tt_preds=np.where(gb_tt_proba > thresh*0.01, True, False)
    
    cost_t[thresh]=costs(y_val,gb_t_preds)
    cost_tt[thresh]=costs(y_test,gb_tt_preds)
print('done')

28_30_32_34_36_38_40_42_44_46_48_50_done


In [254]:
for i in cost_t:
    print(i,cost_t[i]-cost_tt[i])

#cost_tt
#>0.38 thresh seems to have the best val/test cost of 0.0327/0.0363

28 -0.004875315226544975
30 -0.004664648466248637
32 -0.006259639038335783
34 -0.006364972418483952
36 -0.006470305798632121
38 -0.00657563917878029
40 -0.00657563917878029
42 -0.006470305798632121
44 -0.00657563917878029
46 -0.009238953422213723
48 -0.009028286661917392
50 -0.010412610473708186


In [255]:
cost_tt

{28: 0.0381598547711745,
 30: 0.03773852125058182,
 32: 0.038806844921928124,
 34: 0.038701511541779955,
 36: 0.03849084478148362,
 38: 0.03828017802118728,
 40: 0.03828017802118728,
 42: 0.03934850169253357,
 44: 0.039137834932237235,
 46: 0.04180114917567067,
 48: 0.04297480622716514,
 50: 0.044148463278659594}

In [256]:
gb_t = xgb.XGBRegressor(
    max_depth=5,
    learning_rate=0.1,
    subsample=0.45,
    min_child_weight=20,
    colsample_bytree=.25,
    random_state=hash("nope.avi")%(2**32))

n_pos = np.sum(y_train == 1)
n_neg = np.sum(y_train == 0)
sampling_ratio = {1 : n_pos, 0 : n_pos*80}

eval_set = [(X_tr_rs, y_tr_rs), (X_val, y_val)]

gb_t_fit = gb_t.fit(X_tr_rs,y_tr_rs,eval_set=eval_set, verbose=False)
gb_t_proba=gb_t.predict(X_val)
gb_t_preds=np.where(gb_t_proba > 0.30, True, False)

gb_tt_proba=gb_t.predict(X_test)
gb_tt_preds=np.where(gb_tt_proba > 0.30, True, False)

cost_t=costs(y_val,gb_t_preds)
cost_tt=costs(y_test,gb_tt_preds)

score(y_test,gb_tt_preds)

{'accuracy': 0.9990344440153085,
 'precision': 0.7045454545454546,
 'recall': 0.8532110091743119,
 'f1': 0.7717842323651453}

In [230]:
cost_tt

0.03817484464103911