In [1]:
import pandas as pd
import numpy as np
from sklearn import metrics, preprocessing, model_selection
import xgboost as xgb
import logging

In [2]:
logging.basicConfig(filename='./log.log', level=logging.INFO,
                            format='%(asctime)s %(levelname)s %(name)s %(message)s')
logger = logging.getLogger(__name__)

In [3]:
def make_submission_prediction(model, X_train, y_train, scaler, csv=False):
    data = pd.read_csv('./test.csv')
    id_code = np.array(data.ID_code)
    X_test = np.array(scaler.transform(data.drop(['ID_code'], 1)))
    fpr, tpr, thresholds = metrics.roc_curve(y_train, model.predict(X_train))
    optimal_idx = np.argmax(tpr - fpr)
    optimal_threshold = thresholds[optimal_idx]
    pred = model.predict(X_test)
    pred = (pred > optimal_threshold).astype(int)
    pred = pd.DataFrame({'ID_code': id_code, 'target': pred})
    if csv:
        pred.to_csv('prediction.csv', index=False)
    return pred

In [4]:
def validate_prediction(model, X_train, y_train, X_val, y_val, scaler):
    X_test = np.array(scaler.transform(X_val))
    fpr, tpr, thresholds = metrics.roc_curve(y_train, model.predict(X_train))
    optimal_idx = np.argmax(tpr - fpr)
    optimal_threshold = thresholds[optimal_idx]
    pred = model.predict(X_test)
    pred = (pred > optimal_threshold).astype(int)
    print(metrics.roc_auc_score(y_val, pred))
    return pred

In [5]:
data = './train.csv'
data = pd.read_csv(data)
data.head()

Unnamed: 0,ID_code,target,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
0,train_0,0,8.9255,-6.7863,11.9081,5.093,11.4607,-9.2834,5.1187,18.6266,...,4.4354,3.9642,3.1364,1.691,18.5227,-2.3978,7.8784,8.5635,12.7803,-1.0914
1,train_1,0,11.5006,-4.1473,13.8588,5.389,12.3622,7.0433,5.6208,16.5338,...,7.6421,7.7214,2.5837,10.9516,15.4305,2.0339,8.1267,8.7889,18.356,1.9518
2,train_2,0,8.6093,-2.7457,12.0805,7.8928,10.5825,-9.0837,6.9427,14.6155,...,2.9057,9.7905,1.6704,1.6858,21.6042,3.1417,-6.5213,8.2675,14.7222,0.3965
3,train_3,0,11.0604,-2.1518,8.9522,7.1957,12.5846,-1.8361,5.8428,14.925,...,4.4666,4.7433,0.7178,1.4214,23.0347,-1.2706,-2.9275,10.2922,17.9697,-8.9996
4,train_4,0,9.8369,-1.4834,12.8746,6.6375,12.2772,2.4486,5.9405,19.2514,...,-1.4905,9.5214,-0.1508,9.1942,13.2876,-1.5121,3.9267,9.5031,17.9974,-8.8104


In [6]:
X = data.drop(['target', 'ID_code'], 1)
y = data.target
X_train, X_val, y_train, y_val = model_selection.train_test_split(X, y, test_size=0.1)

In [7]:
prop = (y == 0).sum().astype(float)/(y == 1).sum()
prop

8.951238929246692

In [8]:
QuantileScaler = preprocessing.QuantileTransformer(n_quantiles=10000)
QuantileScaler.fit(X_train)
X_train = QuantileScaler.transform(X_train)

## Based data prepartion finished

In [None]:
# params = {'objective':'binary:logistic', 'eval_metric': 'auc', 'n_jobs': 12, 'tree_method': 'gpu_hist', 'verbosity':1, 
#           'max_depth': 16, 'eta': 0.01, 'subsample': 0.5, 'min_obs_node': 1, 
#           'booster': 'gbtree', 'scale_pos_weight' : 1/prop,
#          }

In [None]:
# booster_gpu = xgb.XGBRegressor(**params)

In [None]:
# booster_gpu.fit(X_train, np.array(y_train))

In [None]:
# validate_prediction(booster_gpu, X_train, y_train, X_val, y_val, QuantileScaler)

In [None]:
# make_submission_prediction(booster_gpu, X_train, y_train, QuantileScaler, True)

### Model above without any fine-tunning reached 0.63943 in submission validation. Next we will do some EDA and model fine-tunning

In [None]:
# import seaborn as sns
# corr = X.corr()
# ax = sns.heatmap(
#     corr, 
#     vmin=-1, vmax=1, center=0,
#     cmap=sns.diverging_palette(20, 220, n=200),
#     square=True
# )
# ax.set_xticklabels(
#     ax.get_xticklabels(),
#     rotation=45,
#     horizontalalignment='right'
# )

In [9]:
# standardize the validation data before calculating
def auc(estimator, X, y):
    prediction = estimator.predict(X)
    fpr, tpr, thresholds = metrics.roc_curve(y, prediction)
    optimal_idx = np.argmax(tpr - fpr)
    optimal_threshold = thresholds[optimal_idx]
    pred = (prediction > optimal_threshold).astype(int)
    return metrics.roc_auc_score(y, pred)

In [10]:
params = {'objective':'binary:logistic', 'eval_metric': 'auc', 'n_jobs': 12, 'tree_method': 'gpu_hist', 
          'verbosity':1, 'booster': 'gbtree', 'scale_pos_weight' : 1/prop,
          'eta': 0.01, 'subsample': 0.5, 'min_obs_node': 1, 
         }
booster = xgb.XGBRegressor(**params)

In [11]:
grid = {'max_bin': [100, 250, 1000],
        'grow_policy': ['lossguide', 'depthwise'],
        'max_leaves': [0, 10, 100],
        'alpha': np.exp(np.linspace(-10, 10, 10)),
        'lambda': np.exp(np.linspace(-10, 10, 10)),
        'colsample_bytree': [0.2, 0.5, 0.8, 1],
        'colsample_bylevel': [0.2, 0.5, 0.8, 1],
        'colsample_bynode': [0.2, 0.5, 0.8, 1],
        'subsample': [0.2, 0.5, 0.8, 1],
        'max_delta_step': [0, 1, 10],
        'min_child_weight': [1, 5, 10, 100],
        'max_depth': [3, 5, 16, 50, 100], 
        'gamma': np.exp(np.linspace(-10, 10, 10)),
        'eta': np.exp(np.linspace(-5, 0, 10)),
        'n_estimators': [100, 500, 1000, 5000],
        'learning_rate': [0.01, 0.1]}

In [12]:
rs = model_selection.RandomizedSearchCV(cv=3, n_jobs=1, verbose=5, scoring=auc,
                                       estimator=booster, param_distributions=grid, n_iter=10)

In [None]:
%time rs.fit(X_train, np.array(y_train))

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] subsample=0.5, n_estimators=1000, min_child_weight=5, max_leaves=0, max_depth=5, max_delta_step=0, max_bin=100, learning_rate=0.1, lambda=3.0377317775174815, grow_policy=lossguide, gamma=0.32919298780790573, eta=0.573753420737433, colsample_bytree=1, colsample_bynode=0.2, colsample_bylevel=0.8, alpha=0.32919298780790573 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  subsample=0.5, n_estimators=1000, min_child_weight=5, max_leaves=0, max_depth=5, max_delta_step=0, max_bin=100, learning_rate=0.1, lambda=3.0377317775174815, grow_policy=lossguide, gamma=0.32919298780790573, eta=0.573753420737433, colsample_bytree=1, colsample_bynode=0.2, colsample_bylevel=0.8, alpha=0.32919298780790573, score=0.813, total=  10.3s
[CV] subsample=0.5, n_estimators=1000, min_child_weight=5, max_leaves=0, max_depth=5, max_delta_step=0, max_bin=100, learning_rate=0.1, lambda=3.0377317775174815, grow_policy=lossguide, gamma=0.32919298780790573, eta=0.573753420737433, colsample_bytree=1, colsample_bynode=0.2, colsample_bylevel=0.8, alpha=0.32919298780790573 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   10.3s remaining:    0.0s


[CV]  subsample=0.5, n_estimators=1000, min_child_weight=5, max_leaves=0, max_depth=5, max_delta_step=0, max_bin=100, learning_rate=0.1, lambda=3.0377317775174815, grow_policy=lossguide, gamma=0.32919298780790573, eta=0.573753420737433, colsample_bytree=1, colsample_bynode=0.2, colsample_bylevel=0.8, alpha=0.32919298780790573, score=0.809, total=  10.1s
[CV] subsample=0.5, n_estimators=1000, min_child_weight=5, max_leaves=0, max_depth=5, max_delta_step=0, max_bin=100, learning_rate=0.1, lambda=3.0377317775174815, grow_policy=lossguide, gamma=0.32919298780790573, eta=0.573753420737433, colsample_bytree=1, colsample_bynode=0.2, colsample_bylevel=0.8, alpha=0.32919298780790573 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   20.4s remaining:    0.0s
