In [1]:
import sklearn as skl
import xgboost as xgb
import pandas as pd
import numpy as np
import logging

In [2]:
def prepare_data(path = './train.csv'):
    data = pd.read_csv(path)
    X = data.drop(['target', 'ID_code'], 1)
    y = data.target
    prop = (y == 0).sum().astype(float)/(y == 1).sum()
    X_train, X_val, y_train, y_val = skl.model_selection.train_test_split(X, y, 
                                            test_size=0.1, random_state=1, shuffle=False)
    return X_train, X_val, y_train, y_val, prop

In [3]:
def fit_and_transform(X_train, transformer):
    Transformer = transformer
    Transformer.fit(X_train)
    X_train = Transformer.transform(X_train)
    return X_train, Transformer

In [4]:
def validate_prediction(model, X_train, y_train, X_val, y_val, scaler):
    X_test = np.array(scaler.transform(X_val))
    fpr, tpr, thresholds = skl.metrics.roc_curve(y_train, model.predict(X_train))
    optimal_idx = np.argmax(tpr - fpr)
    optimal_threshold = thresholds[optimal_idx]
    pred = model.predict(X_test)
    pred = (pred > optimal_threshold).astype(int)
    print(skl.metrics.roc_auc_score(y_val, pred))
    return skl.metrics.roc_auc_score(y_val, pred)

In [5]:
def auc(estimator, X, y):
    prediction = estimator.predict(X)
    fpr, tpr, thresholds = skl.metrics.roc_curve(y, prediction)
    optimal_idx = np.argmax(tpr - fpr)
    optimal_threshold = thresholds[optimal_idx]
    pred = (prediction > optimal_threshold).astype(int)
    return skl.metrics.roc_auc_score(y, pred)

In [6]:
def fit_model(transformer, params, grid, n_iter=100, cv=3):
    X_train, X_val, y_train, y_val, prop = prepare_data()
    X_train, Transformer = fit_and_transform(X_train, transformer)
    booster = xgb.XGBRegressor(**params, scale_pos_weight=1/prop)
    rs = skl.model_selection.RandomizedSearchCV(cv=cv, n_jobs=1, verbose=100, scoring=auc,
                                       estimator=booster, param_distributions=grid, n_iter=n_iter, random_state=1)
    rs.fit(X_train, np.array(y_train))
    result = validate_prediction(rs.best_estimator_, X_train, y_train, X_val, y_val, transformer)
    return result, rs.best_params_, rs.best_estimator_

In [7]:
transformers = [skl.preprocessing.MaxAbsScaler(), skl.preprocessing.MinMaxScaler(), skl.preprocessing.Normalizer(),
               skl.preprocessing.StandardScaler(), skl.preprocessing.RobustScaler(), 
                skl.preprocessing.QuantileTransformer(), skl.preprocessing.FunctionTransformer()]

In [8]:
params = {'objective':'binary:logistic', 'eval_metric': 'auc', 'n_jobs': 12, 'tree_method': 'hist', 
          'verbosity':1, 'booster': 'gbtree', 'n_estimators': 100, 
         }

In [9]:
grid = {'alpha': np.exp(np.linspace(-10, 10, 10)),
        'lambda': np.exp(np.linspace(-10, 10, 10)),
        'colsample_bytree': [0.2, 0.5, 0.8, 1],
        'colsample_bylevel': [0.2, 0.5, 0.8, 1],
        'colsample_bynode': [0.2, 0.5, 0.8, 1],
        'subsample': [0.2, 0.5, 0.8, 1],
        'max_delta_step': [0, 1, 10],
        'min_child_weight': [1, 5, 10, 100],
        'max_depth': [3, 5, 16, 50, 100], 
        'gamma': np.exp(np.linspace(-10, 10, 10)),
        'eta': np.exp(np.linspace(-5, 0, 10))
       }

In [10]:
results = []
models = []
parameters = []
for transformer in transformers:
    print(transformer)
    res, bp, bm = fit_model(transformer, params, grid, n_iter=1, cv=2)
    results.append(res)
    models.append(bm)
    parameters.append(bp)

MaxAbsScaler(copy=True)
Fitting 2 folds for each of 1 candidates, totalling 2 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[CV] subsample=0.2, min_child_weight=100, max_depth=5, max_delta_step=0, lambda=258.67063051550025, gamma=0.0004189421234483841, eta=0.06217652402211632, colsample_bytree=1, colsample_bynode=0.8, colsample_bylevel=0.2, alpha=22026.465794806718 
[CV]  subsample=0.2, min_child_weight=100, max_depth=5, max_delta_step=0, lambda=258.67063051550025, gamma=0.0004189421234483841, eta=0.06217652402211632, colsample_bytree=1, colsample_bynode=0.8, colsample_bylevel=0.2, alpha=22026.465794806718, score=0.676, total=   2.2s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.2s remaining:    0.0s
[CV] subsample=0.2, min_child_weight=100, max_depth=5, max_delta_step=0, lambda=258.67063051550025, gamma=0.0004189421234483841, eta=0.06217652402211632, colsample_bytree=1, colsample_bynode=0.8, colsample_bylevel=0.2, alpha=22026.4657948

[CV]  subsample=0.2, min_child_weight=100, max_depth=5, max_delta_step=0, lambda=258.67063051550025, gamma=0.0004189421234483841, eta=0.06217652402211632, colsample_bytree=1, colsample_bynode=0.8, colsample_bylevel=0.2, alpha=22026.465794806718, score=0.683, total=   1.9s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    3.9s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    4.0s finished
0.699454965514081
FunctionTransformer(accept_sparse=False, check_inverse=True, func=None,
                    inv_kw_args=None, inverse_func=None, kw_args=None,
                    pass_y='deprecated', validate=None)




Fitting 2 folds for each of 1 candidates, totalling 2 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[CV] subsample=0.2, min_child_weight=100, max_depth=5, max_delta_step=0, lambda=258.67063051550025, gamma=0.0004189421234483841, eta=0.06217652402211632, colsample_bytree=1, colsample_bynode=0.8, colsample_bylevel=0.2, alpha=22026.465794806718 
[CV]  subsample=0.2, min_child_weight=100, max_depth=5, max_delta_step=0, lambda=258.67063051550025, gamma=0.0004189421234483841, eta=0.06217652402211632, colsample_bytree=1, colsample_bynode=0.8, colsample_bylevel=0.2, alpha=22026.465794806718, score=0.676, total=   2.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.0s remaining:    0.0s
[CV] subsample=0.2, min_child_weight=100, max_depth=5, max_delta_step=0, lambda=258.67063051550025, gamma=0.0004189421234483841, eta=0.06217652402211632, colsample_bytree=1, colsample_bynode=0.8, colsample_bylevel=0.2, alpha=22026.465794806718 
[CV]  subsample=0



0.7094404126285768


In [13]:
pd.Series(results)

0    0.709440
1    0.709440
2    0.700972
3    0.709440
4    0.709440
5    0.699455
6    0.709440
dtype: float64

In [23]:
tmp = pd.DataFrame(parameters)
tmp['results'] = results
tmp['transformer'] = transformers
tmp = tmp.astype(str)
tmp

Unnamed: 0,subsample,min_child_weight,max_depth,max_delta_step,lambda,gamma,eta,colsample_bytree,colsample_bynode,colsample_bylevel,alpha,results,transformer
0,0.2,100,5,0,258.67063051550025,0.0004189421234483,0.0621765240221163,1,0.8,0.2,22026.46579480672,0.7094404126285768,MaxAbsScaler(copy=True)
1,0.2,100,5,0,258.67063051550025,0.0004189421234483,0.0621765240221163,1,0.8,0.2,22026.46579480672,0.7094404126285768,"MinMaxScaler(copy=True, feature_range=(0, 1))"
2,0.2,100,5,0,258.67063051550025,0.0004189421234483,0.0621765240221163,1,0.8,0.2,22026.46579480672,0.7009719743750487,"Normalizer(copy=True, norm='l2')"
3,0.2,100,5,0,258.67063051550025,0.0004189421234483,0.0621765240221163,1,0.8,0.2,22026.46579480672,0.7094404126285768,"StandardScaler(copy=True, with_mean=True, with..."
4,0.2,100,5,0,258.67063051550025,0.0004189421234483,0.0621765240221163,1,0.8,0.2,22026.46579480672,0.7094404126285768,"RobustScaler(copy=True, quantile_range=(25.0, ..."
5,0.2,100,5,0,258.67063051550025,0.0004189421234483,0.0621765240221163,1,0.8,0.2,22026.46579480672,0.699454965514081,"QuantileTransformer(copy=True, ignore_implicit..."
6,0.2,100,5,0,258.67063051550025,0.0004189421234483,0.0621765240221163,1,0.8,0.2,22026.46579480672,0.7094404126285768,"FunctionTransformer(accept_sparse=False, check..."
