In [1]:
from operator import itemgetter
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import roc_auc_score
import pandas as pd

In [5]:
# Load the click dataset from the working directory.
train = pd.read_csv('click_train.csv')

# Feature frame: every column of `train` except the 'click' label column.
X_train = train.loc[:, ~train.columns.isin(['click'])]

# Label vector: the raw values of the 'click' column.
y_target = train['click'].values

In [3]:
def run_default_test(train, test, features, target, random_state = 0):
    """Train an XGBoost binary classifier on a random hold-out split of `train`.

    `train` is split 80/20 into a fitting part and a validation part. A
    gradient-boosted tree model is fitted on the first part while log-loss on
    both parts is reported each round; training stops early when the
    validation log-loss has not improved for 20 consecutive rounds.

    Parameters
    ----------
    train : DataFrame-like
        Data containing both the `features` columns and the `target` column.
    test : any
        Currently unused by this function; kept so existing callers that pass
        it positionally keep working.
    features : list of str
        Column names used as model inputs.
    target : str
        Name of the label column in `train`.
    random_state : int, default 0
        Seed used for both the train/validation split and the XGBoost run.

    Returns
    -------
    The booster object produced by `xgb.train`. Previously the model was
    discarded and the function returned None.
    NOTE(review): with early stopping, the XGBoost documentation describes
    `best_iteration` on the returned booster -- confirm how the installed
    version uses it before predicting.
    """
    eta = 0.1
    max_depth = 5
    subsample = 0.8
    colsample_bytree = 0.8
    print('XGBoost params. ETA: {}, MAX_DEPTH: {}, SUBSAMPLE: {}, COLSAMPLE_BY_TREE: {}'.format(eta, max_depth, subsample, colsample_bytree))
    params = {
        "objective": "binary:logistic",
        "booster" : "gbtree",
        "eval_metric": "logloss",
        "eta": eta,
        "max_depth": max_depth,
        "subsample": subsample,
        "colsample_bytree": colsample_bytree,
        # NOTE(review): newer XGBoost releases document `verbosity` as the
        # replacement for `silent` -- confirm against the installed version.
        "silent": 1,
        "seed": random_state
    }
    num_boost_round = 260
    early_stopping_rounds = 20
    test_size = 0.2

    # Both halves of the split still contain the target column; the model only
    # sees the columns listed in `features`.
    X_train, X_valid = train_test_split(train, test_size = test_size, random_state = random_state)
    y_train = X_train[target]
    y_valid = X_valid[target]
    dtrain = xgb.DMatrix(X_train[features], y_train)
    dvalid = xgb.DMatrix(X_valid[features], y_valid)

    # The validation set is listed last in the watchlist, which is the entry
    # the early-stopping check is expected to monitor (per XGBoost docs).
    watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
    gbm = xgb.train(params, dtrain, num_boost_round, evals = watchlist, early_stopping_rounds = early_stopping_rounds, verbose_eval = True)

    # Hand the fitted model back so the caller can predict with or inspect it.
    return gbm


In [4]:
# Model input columns, assembled group by group (order is significant and
# matches the original flat list).
features = (
    # Leading identifier and banner/device context columns.
    ['C1', 'banner_pos', 'device_type', 'device_conn_type']
    # Columns C14 through C21.
    + ['C{}'.format(number) for number in range(14, 22)]
    + ['hour_of_day']
    # Columns carrying an `_int` suffix (presumably integer-encoded
    # categoricals -- verify against the preprocessing step).
    + [
        'site_id_int',
        'site_domain_int',
        'site_category_int',
        'app_id_int',
        'app_domain_int',
        'app_category_int',
        'device_id_int',
        'device_ip_int',
        'device_model_int',
        'day_of_week_int',
    ]
)

In [None]:
# Run the baseline XGBoost experiment on the loaded data.
# NOTE(review): `test` receives the label array `y_target`, but
# run_default_test never reads its `test` argument -- confirm what was intended.
run_default_test(
    train=train,
    test=y_target,
    features=features,
    target='click',
)