In [10]:
import pandas as pd
import numpy as np
from sklearn import metrics, preprocessing, model_selection
import xgboost as xgb
import warnings

In [70]:
def make_submission_prediction(model, X_train, y_train, scaler=None, csv=False, binarize=True):
    """Predict the rows of ``./test.csv`` and optionally write a submission file.

    Parameters
    ----------
    model : fitted estimator exposing ``predict`` that returns a score per row.
    X_train : training features, passed to ``model.predict`` as-is (no scaling
        is applied to them here) to pick the decision threshold.
    y_train : training labels aligned with ``X_train``.
    scaler : fitted transformer applied to the test features. ``None`` (default)
        means "use the notebook-level ``QuantileScaler``", looked up at call time.
    csv : when true, also write the result to ``prediction.csv``.
    binarize : when true (default), convert scores to 0/1 labels with a threshold
        chosen on the training data. Pass ``False`` to keep the raw model scores,
        which is what rank-based metrics such as ROC AUC need.

    Returns
    -------
    pandas.DataFrame with columns ``ID_code`` and ``target``.
    """
    if scaler is None:
        # Resolved at call time rather than in the signature: a default of
        # `scaler=QuantileScaler` is evaluated when the `def` runs, which raises
        # NameError if this cell is executed before the scaler cell and would
        # otherwise pin whatever object existed at definition time.
        scaler = QuantileScaler

    test_frame = pd.read_csv('./test.csv')
    id_code = np.array(test_frame.ID_code)
    # `columns=` instead of a positional axis, which pandas 2.0 no longer accepts.
    X_test = np.array(scaler.transform(test_frame.drop(columns=['ID_code'])))

    pred = model.predict(X_test)
    if binarize:
        # Youden's J: the threshold that maximises TPR - FPR on the training data.
        fpr, tpr, thresholds = metrics.roc_curve(y_train, model.predict(X_train))
        optimal_threshold = thresholds[np.argmax(tpr - fpr)]
        # roc_curve defines each operating point as `score >= threshold`, so the
        # comparison must be inclusive to reproduce the selected point.
        pred = (pred >= optimal_threshold).astype(int)

    pred = pd.DataFrame({'ID_code': id_code, 'target': pred})
    if csv:
        pred.to_csv('prediction.csv', index=False)
    return pred

In [74]:
def validate_prediction(model, X_train, y_train, X_val, y_val, scaler=None, binarize=True):
    """Score the model on a validation split and print the resulting ROC AUC.

    Parameters
    ----------
    model : fitted estimator exposing ``predict`` that returns a score per row.
    X_train : training features, passed to ``model.predict`` as-is (no scaling
        is applied to them here) to pick the decision threshold.
    y_train : training labels aligned with ``X_train``.
    X_val : validation features; they are transformed with ``scaler`` inside
        this function before prediction.
    y_val : validation labels aligned with ``X_val``.
    scaler : fitted transformer applied to ``X_val``. ``None`` (default) means
        "use the notebook-level ``QuantileScaler``", looked up at call time.
    binarize : when true (default), scores are converted to 0/1 labels with a
        threshold chosen on the training data and the AUC is computed on those
        labels. Pass ``False`` to compute the AUC on the raw scores instead.

    Returns
    -------
    numpy array of predictions for ``X_val`` (0/1 labels, or raw scores when
    ``binarize`` is false).
    """
    if scaler is None:
        # Resolved at call time rather than in the signature: a default of
        # `scaler=QuantileScaler` is evaluated when the `def` runs, which raises
        # NameError if this cell is executed before the scaler cell and would
        # otherwise pin whatever object existed at definition time.
        scaler = QuantileScaler

    X_val_scaled = np.array(scaler.transform(X_val))
    pred = model.predict(X_val_scaled)

    if binarize:
        # Youden's J: the threshold that maximises TPR - FPR on the training data.
        fpr, tpr, thresholds = metrics.roc_curve(y_train, model.predict(X_train))
        optimal_threshold = thresholds[np.argmax(tpr - fpr)]
        # roc_curve defines each operating point as `score >= threshold`, so the
        # comparison must be inclusive to reproduce the selected point.
        pred = (pred >= optimal_threshold).astype(int)

    print(metrics.roc_auc_score(y_val, pred))
    return pred

In [2]:
# Load the training set straight into a DataFrame and preview the first rows.
data = pd.read_csv('./train.csv')
data.head()

Unnamed: 0,ID_code,target,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
0,train_0,0,8.9255,-6.7863,11.9081,5.093,11.4607,-9.2834,5.1187,18.6266,...,4.4354,3.9642,3.1364,1.691,18.5227,-2.3978,7.8784,8.5635,12.7803,-1.0914
1,train_1,0,11.5006,-4.1473,13.8588,5.389,12.3622,7.0433,5.6208,16.5338,...,7.6421,7.7214,2.5837,10.9516,15.4305,2.0339,8.1267,8.7889,18.356,1.9518
2,train_2,0,8.6093,-2.7457,12.0805,7.8928,10.5825,-9.0837,6.9427,14.6155,...,2.9057,9.7905,1.6704,1.6858,21.6042,3.1417,-6.5213,8.2675,14.7222,0.3965
3,train_3,0,11.0604,-2.1518,8.9522,7.1957,12.5846,-1.8361,5.8428,14.925,...,4.4666,4.7433,0.7178,1.4214,23.0347,-1.2706,-2.9275,10.2922,17.9697,-8.9996
4,train_4,0,9.8369,-1.4834,12.8746,6.6375,12.2772,2.4486,5.9405,19.2514,...,-1.4905,9.5214,-0.1508,9.1942,13.2876,-1.5121,3.9267,9.5031,17.9974,-8.8104


In [3]:
# Per-column summary statistics (count, mean, std, min, quartiles, max) of the raw training frame.
data.describe()

Unnamed: 0,target,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
count,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,...,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0
mean,0.10049,10.679914,-1.627622,10.715192,6.796529,11.078333,-5.065317,5.408949,16.54585,0.284162,...,3.23444,7.438408,1.927839,3.331774,17.993784,-0.142088,2.303335,8.908158,15.87072,-3.326537
std,0.300653,3.040051,4.050044,2.640894,2.043319,1.62315,7.863267,0.866607,3.418076,3.332634,...,4.559922,3.023272,1.478423,3.99203,3.135162,1.429372,5.454369,0.921625,3.010945,10.438015
min,0.0,0.4084,-15.0434,2.1171,-0.0402,5.0748,-32.5626,2.3473,5.3497,-10.5055,...,-14.0933,-2.6917,-3.8145,-11.7834,8.6944,-5.261,-14.2096,5.9606,6.2993,-38.8528
25%,0.0,8.45385,-4.740025,8.722475,5.254075,9.883175,-11.20035,4.7677,13.9438,-2.3178,...,-0.058825,5.1574,0.889775,0.5846,15.6298,-1.1707,-1.946925,8.2528,13.8297,-11.208475
50%,0.0,10.52475,-1.60805,10.58,6.825,11.10825,-4.83315,5.3851,16.4568,0.3937,...,3.2036,7.34775,1.9013,3.39635,17.95795,-0.1727,2.4089,8.8882,15.93405,-2.81955
75%,0.0,12.7582,1.358625,12.5167,8.3241,12.261125,0.9248,6.003,19.1029,2.9379,...,6.4062,9.512525,2.9495,6.2058,20.396525,0.8296,6.556725,9.5933,18.064725,4.8368
max,1.0,20.315,10.3768,19.353,13.1883,16.6714,17.2516,8.4477,27.6918,10.1513,...,18.4409,16.7165,8.4024,18.2818,27.9288,4.2729,18.3215,12.0004,26.0791,28.5007


In [4]:
# (rows, columns) of the raw training frame.
data.shape

(200000, 202)

In [5]:
# Feature matrix: every column except the label and the row identifier.
# `columns=` replaces the positional axis argument, which pandas 2.0 no longer accepts.
X = data.drop(columns=['target', 'ID_code'])
# Label column to predict.
y = data.target

In [27]:
# Hold out a validation split. random_state pins the shuffle so the split, and
# every score computed from it, is reproducible across kernel restarts.
X_train, X_val, y_train, y_val = model_selection.train_test_split(X, y, random_state=42)

In [28]:
# Quantile-transform the features using quantiles estimated on the training split only.
# random_state pins any row subsampling the transformer performs while estimating
# the quantiles, so the fitted scaler is reproducible.
QuantileScaler = preprocessing.QuantileTransformer(n_quantiles=10000, random_state=42)
QuantileScaler.fit(X_train)
# NOTE: X_train is overwritten with its scaled version, so this cell is not
# idempotent - re-run the split cell first before re-running this one.
# X_val is left unscaled here and is transformed where it is used.
X_train = QuantileScaler.transform(X_train)

In [29]:
# Sanity check: per-column summary statistics of the training features after the quantile transform.
pd.DataFrame(X_train).describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
count,150000.0,150000.0,150000.0,150000.0,150000.0,150000.0,150000.0,150000.0,150000.0,150000.0,...,150000.0,150000.0,150000.0,150000.0,150000.0,150000.0,150000.0,150000.0,150000.0,150000.0
mean,0.500813,0.499618,0.500049,0.499779,0.500574,0.498742,0.50013,0.499588,0.499777,0.499924,...,0.499773,0.500476,0.499705,0.500049,0.500627,0.499716,0.500121,0.499445,0.500494,0.499798
std,0.288927,0.288966,0.288618,0.288782,0.289329,0.288719,0.288702,0.288352,0.288272,0.288303,...,0.288655,0.288576,0.288717,0.28854,0.28837,0.288639,0.288536,0.289122,0.288998,0.288676
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.250657,0.249735,0.250544,0.249738,0.249594,0.248821,0.249375,0.250287,0.250444,0.24985,...,0.249491,0.250698,0.249255,0.250738,0.251101,0.249895,0.250394,0.248592,0.25002,0.249653
50%,0.50092,0.498755,0.499923,0.499481,0.502093,0.497803,0.500693,0.499017,0.499445,0.499736,...,0.499773,0.50075,0.5001,0.50001,0.501046,0.499025,0.500619,0.499062,0.500964,0.499172
75%,0.752021,0.750285,0.74941,0.749725,0.751637,0.74829,0.750297,0.749054,0.749792,0.749405,...,0.749374,0.749789,0.749079,0.7497,0.750193,0.750099,0.749902,0.749963,0.751385,0.749908
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [30]:
# for testing only: keep every `freq`-th row so experiments run quickly
freq = 10
warnings.warn('You are working with data reduced {} times'.format(freq))
# NOTE: not idempotent - every re-run of this cell shrinks the data by another
# factor of `freq`; re-run the split and scaling cells first to start over.
X_train = X_train[::freq]
X_val = X_val[::freq]
y_train = y_train[::freq]
y_val = y_val[::freq]
print(X_train.shape)

(15000, 200)


  This is separate from the ipykernel package so we can avoid doing imports until


In [35]:
# params = {'objective':'binary:logistic', 'eval_metric': 'auc', 'n_jobs': 12, 'tree_method': 'hist', 'verbosity':1, 
#           'max_depth': 16, 'eta': 0.01, 'subsample': 0.5, 'min_obs_node': 1, 
#          }
# booster = xgb.XGBRegressor(**params)

In [37]:
# %timeit booster.fit(X_train, y_train)

9.7 s ± 708 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [38]:
# Hyper-parameters for the GPU histogram booster.
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'n_jobs': 12,
    'tree_method': 'gpu_hist',
    'verbosity': 1,
    'max_depth': 16,
    'eta': 0.01,
    'subsample': 0.5,
    # Was 'min_obs_node', which is not an XGBoost parameter and is therefore
    # ignored (with an "unused parameter" warning). `min_child_weight` is the
    # XGBoost control for the minimum leaf size; 1 is also its default, so the
    # fitted model is unchanged.
    'min_child_weight': 1,
}
# A regressor with a logistic objective: the rest of the notebook relies on
# predict() returning a continuous score that can be ranked and thresholded.
booster_gpu = xgb.XGBRegressor(**params)

In [39]:
# Time the GPU fit. %timeit executes the statement several times, so the model is refit on each run;
# booster_gpu is used by the cells below in whatever state the final run leaves it.
%timeit booster_gpu.fit(X_train, y_train)

6.61 s ± 48.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [42]:
# ROC AUC on the held-out split, computed from the raw model scores.
# X_val is scaled here because only X_train was transformed in the scaling cell.
metrics.roc_auc_score(y_true=y_val, y_score=booster_gpu.predict(QuantileScaler.transform(X_val)))

0.8362615367508062

In [71]:
# Predict ./test.csv with the fitted booster and write prediction.csv (csv=True), then preview the result.
pred = make_submission_prediction(booster_gpu, X_train, y_train, csv=True)
pred.head() 

Unnamed: 0,ID_code,target
0,test_0,0
1,test_1,0
2,test_2,0
3,test_3,0
4,test_4,0


In [None]:
# This simple model, without fine-tuning and with the downsampled data, reached 0.50863 in submission validation and 0.63943
# without downsampling.

In [75]:
# Print the ROC AUC of the thresholded predictions on the validation split and display the predictions.
validate_prediction(booster_gpu, X_train, y_train, X_val, y_val)

0.5


array([1, 1, 1, ..., 1, 1, 1])