In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from xgboost import XGBClassifier

In [2]:
# Precomputed training feature matrix, read from a path relative to the
# working directory. Fail fast if the cached array has an unexpected shape.
features_train = np.load("_train.npy")
assert (
    features_train.shape == (404290, 25)
), "Something wrong with the train features..."

In [3]:
# Precomputed test feature matrix, read from a path relative to the working
# directory. Fail fast if the cached array has an unexpected shape.
features_test = np.load("_test.npy")
assert (
    features_test.shape == (2345796, 25)
), "Something wrong with the test features..."

In [4]:
# Raw CSVs: later cells read the `is_duplicate` column from `train` and the
# `test_id` column from `test`.
print("Loading raw datasets...")
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("../data/train.csv", "../data/test.csv")
)
print("Loaded.")

Loading raw datasets...
Loaded.


In [5]:
# Labels come from the raw training CSV; the feature matrices come from the
# precomputed arrays. Only the test features are wrapped in a DataFrame here.
y_train = train['is_duplicate']
X_test = pd.DataFrame(features_test)
X_train = features_train

In [6]:
# Up/down sampling: rebalance the training set by keeping only the first 80%
# of the positive rows and including every negative row twice.
#
# The rows are taken from `features_train` / `train` rather than from
# `X_train` / `y_train`, because this cell overwrites those two names; reading
# them back would resample already-resampled data whenever the cell is re-run.
POS_KEEP_FRACTION = 0.8

labels = np.asarray(train['is_duplicate'])
pos_train = pd.DataFrame(features_train[labels == 1])
neg_train = pd.DataFrame(features_train[labels == 0])

# Slice once so the feature rows and the label vector cannot drift apart.
pos_kept = pos_train.iloc[:int(POS_KEEP_FRACTION * len(pos_train))]

# Row order is: negatives, kept positives, negatives again. `y_train` is
# built in exactly the same order.
X_train = pd.concat((neg_train, pos_kept, neg_train))
y_train = np.concatenate((
    np.zeros(len(neg_train), dtype=int),
    np.ones(len(pos_kept), dtype=int),
    np.zeros(len(neg_train), dtype=int),
))
assert len(X_train) == len(y_train), "features and labels are misaligned"

# Positive rate after resampling.
print(np.mean(y_train))
del pos_train, neg_train, pos_kept, labels

0.189701079013


In [7]:
def select_model(params, X_train, y_train, cv=5, n_jobs=5):
    """Grid-search an XGBoost classifier and return the fitted search object.

    Parameters
    ----------
    params : dict or list of dict
        Parameter grid passed straight to ``GridSearchCV``; every value is a
        list of candidate settings for the ``XGBClassifier`` argument named
        by its key.
    X_train, y_train
        Training features and labels passed to ``GridSearchCV.fit``.
    cv : int, optional
        Number of cross-validation folds. Defaults to 5, which is only meant
        for demo runs; 10 is the intended value for a real search.
    n_jobs : int, optional
        Number of parallel jobs used by the search. Defaults to 5.

    Returns
    -------
    GridSearchCV
        The fitted search. Candidates are scored with ``neg_log_loss`` and,
        because ``refit=True``, the best one is retrained on all of
        ``X_train`` and is available as ``best_estimator_``.
    """
    # `sklearn.grid_search` was deprecated in scikit-learn 0.18 and removed in
    # 0.20; `sklearn.model_selection` is its replacement.
    from sklearn.model_selection import GridSearchCV

    search = GridSearchCV(
        estimator=xgb.XGBClassifier(),
        param_grid=params,
        n_jobs=n_jobs,
        cv=cv,
        scoring='neg_log_loss',
        verbose=2,
        refit=True)
    search.fit(X_train, y_train)
    return search

In [8]:
# Brute-force scan over all parameter combinations. Some rules of thumb:
# - max_depth is usually 6, 7 or 8
# - learning_rate is around 0.05, but small changes can make a big difference
# - tuning min_child_weight, subsample and colsample_bytree is the main
#   lever for fighting overfitting
# - n_estimators is the number of boosting rounds
# - finally, ensembling XGBoost models trained with multiple seeds may
#   reduce variance
# This advice comes from a Kaggler's kernel; I could not find the link again.

# Just for demo: a single candidate grid. A real search should list many
# parameter combinations. Each value is a one-element list because the grid
# search expects a list of candidates per parameter.
params = [
    dict(
        base_score=[0.2],
        colsample_bytree=[0.75],
        learning_rate=[0.0225],
        max_depth=[8],
        n_estimators=[300],
        objective=['binary:logistic'],
        seed=[911],
        subsample=[0.7],
    )
]
#     , {
#     'objective': ['binary:logistic'],
#     'learning_rate': [0.02],
#     'max_depth': [8, 9],
#     'subsample': [0.8],
#     'colsample_bytree': [0.75, 0.7],
#     'n_estimators': [200],
#     'seed': [666]
# }, {
#     'objective': ['binary:logistic'],
#     'learning_rate': [0.02],
#     'max_depth': [8, 9],
#     'subsample': [0.75],
#     'colsample_bytree': [0.75, 0.7],
#     'n_estimators': [300],
#     'seed': [250]
# }, {
#     'objective': ['binary:logistic'],
#     'learning_rate': [0.02],
#     'max_depth': [8, 9],
#     'subsample': [0.7],
#     'colsample_bytree': [0.75, 0.7],
#     'n_estimators': [250],
#     'seed': [250]
# }]

In [9]:
# Run one grid search per entry in `params`. Each search's test-set
# probabilities become one column of the submission frame.
sub = pd.DataFrame({'test_id': test['test_id']})

for idx, p in enumerate(params):
    clf = select_model(p, X_train, y_train)
    print(idx, ':', clf.best_estimator_, '\n\n\n')
    # `grid_scores_` was removed in scikit-learn 0.20. `best_params_` and
    # `best_score_` report the same winning candidate and its mean
    # cross-validated score.
    print(clf.best_params_, clf.best_score_)
    # The search object predicts with its refitted best estimator; column 1
    # of the result is the probability of the positive label (1).
    test_probs = clf.predict_proba(X_test)
    sub['is_duplicate_{}'.format(idx)] = test_probs[:, 1]



Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=5)]: Done   2 out of   5 | elapsed: 13.3min remaining: 20.0min
[Parallel(n_jobs=5)]: Done   5 out of   5 | elapsed: 13.4min remaining:    0.0s
[Parallel(n_jobs=5)]: Done   5 out of   5 | elapsed: 13.4min finished


0 : XGBClassifier(base_score=0.2, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.75, gamma=0, learning_rate=0.0225,
       max_delta_step=0, max_depth=8, min_child_weight=1, missing=None,
       n_estimators=300, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=911, silent=True,
       subsample=0.7) 



{'base_score': 0.2, 'colsample_bytree': 0.75, 'learning_rate': 0.0225, 'max_depth': 8, 'n_estimators': 300, 'objective': 'binary:logistic', 'seed': 911, 'subsample': 0.7} -0.19940911393075803


In [10]:
# Preview the first rows of the submission frame: `test_id` plus one
# `is_duplicate_<idx>` probability column per grid searched above.
sub.head()

Unnamed: 0,test_id,is_duplicate_0
0,0,0.009606
1,1,0.110052
2,2,0.297908
3,3,0.000505
4,4,0.159587


In [11]:
# print('Saving :: submission file...')
# sub.to_csv('sub_11-30.csv', index=False)
# print('\nSubmission result done.')