In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost import XGBClassifier

In [2]:
# Raw training rows; only the `is_duplicate` label column is used in this cell.
df_train = pd.read_csv('data/train.csv')

# Pre-computed feature matrix. The CSV carries a leftover 'Unnamed: 0' column
# (a saved index), which is dropped so it is not fed to the model as a feature.
X_train = pd.read_csv('final_train_features.csv', index_col=False)
X_train = X_train.drop('Unnamed: 0', axis=1)

y_train = df_train['is_duplicate'].values

# Features and labels come from two different files and are later combined
# positionally (y_train is used as a boolean mask over X_train), so they must
# have the same number of rows.
assert len(X_train) == len(y_train), (
    'feature rows ({}) and labels ({}) are misaligned'.format(
        len(X_train), len(y_train)))

# Preview goes last so the notebook actually renders it; mid-cell it was
# evaluated and silently discarded.
X_train.head()

In [14]:
# Display the feature values of the row at positional index 20 of X_train.
# NOTE(review): this cell was executed as In [14], i.e. out of order with the
# cells around it, so it is unclear whether the saved output shows the raw or
# the resampled X_train -- confirm after a Restart & Run All.
X_train.iloc[20]

word_match                         0.666667
tfidf_wm                           0.665955
tfidf_wm_stops                     0.655016
jaccard                            0.545455
wc_diff                            1.000000
wc_ratio                           0.888889
wc_diff_unique                     1.000000
wc_ratio_unique                    0.888889
wc_diff_unq_stop                   0.000000
wc_ratio_unique_stop               1.000000
same_start                         1.000000
char_diff                          3.000000
char_diff_unq_stop                 5.000000
total_unique_words                11.000000
total_unq_words_stop               8.000000
char_ratio                         1.081081
len_q1                            45.000000
len_q2                            47.000000
diff_len                          -2.000000
len_char_q1                       20.000000
len_char_q2                       20.000000
len_word_q1                        9.000000
len_word_q2                     

In [3]:
# Rebalance the classes: keep only the first 80% of the positive rows and
# include every negative row twice, then rebuild the label vector to match
# the new row order (negatives, kept positives, negatives again).
# NOTE: X_train / y_train are overwritten in place, so running this cell a
# second time resamples the already-resampled data.
neg_train = X_train[y_train == 0]
pos_train = X_train[y_train == 1]

n_pos_kept = int(0.8 * len(pos_train))
pos_train = pos_train.iloc[:n_pos_kept]

X_train = pd.concat([neg_train, pos_train, neg_train])
y_train = np.array(
    [0] * len(neg_train) + [1] * len(pos_train) + [0] * len(neg_train))

# Share of positive labels after rebalancing.
print(np.mean(y_train))
del pos_train, neg_train, n_pos_kept

0.189701079013


In [4]:
def select_model(params, X_train, y_train, cv=4, n_jobs=4,
                 scoring='neg_log_loss'):
    """Grid-search an ``XGBClassifier`` and return the fitted search object.

    Parameters
    ----------
    params : dict or list of dict
        Parameter grid handed to ``GridSearchCV`` (each value is a list of
        candidate settings).
    X_train : array-like
        Training features.
    y_train : array-like
        Training labels.
    cv : int, default 4
        Number of cross-validation folds.
    n_jobs : int, default 4
        Number of parallel jobs used by the search.
    scoring : str, default 'neg_log_loss'
        Scorer used to rank the candidates.

    Returns
    -------
    GridSearchCV
        The fitted search. It is created with ``refit=True``, so it can be
        used directly for ``predict_proba`` and exposes ``best_estimator_``.
    """
    # ``sklearn.grid_search`` was deprecated in scikit-learn 0.18 and removed
    # in 0.20; ``sklearn.model_selection`` is its replacement.
    from sklearn.model_selection import GridSearchCV

    xgb_model = xgb.XGBClassifier()
    clf = GridSearchCV(
        xgb_model,
        params,
        n_jobs=n_jobs,
        cv=cv,
        scoring=scoring,
        verbose=2,
        refit=True)
    clf.fit(X_train, y_train)
    return clf

In [5]:
# Parameter grids for a brute-force scan; each dict in the list is searched
# separately. Tuning notes:
# - max_depth is usually 6, 7 or 8
# - learning_rate is around 0.05, but small changes can make a big difference
# - min_child_weight, subsample and colsample_bytree are the main knobs for
#   fighting overfitting
# - n_estimators is the number of boosting rounds
# - ensembling xgboost runs with several seeds may reduce variance
# NOTE(review): the saved outputs further down report learning_rate=0.05 and
# max_depth=10, so they appear to come from an earlier version of this grid.
params = [
    dict(
        objective=['binary:logistic'],
        learning_rate=[0.03],
        max_depth=[9],
        subsample=[0.75],
        colsample_bytree=[0.75],
        n_estimators=[500],
        seed=[911],
    ),
]
#     , {
#     'objective': ['binary:logistic'],
#     'learning_rate': [0.02],
#     'max_depth': [8, 9],
#     'subsample': [0.75],
#     'colsample_bytree': [0.75, 0.7],
#     'n_estimators': [500],
#     'seed': [666]
# }, {
#     'objective': ['binary:logistic'],
#     'learning_rate': [0.02],
#     'max_depth': [8, 9],
#     'subsample': [0.75],
#     'colsample_bytree': [0.75, 0.7],
#     'n_estimators': [500],
#     'seed': [250]
# }, {
#     'objective': ['binary:logistic'],
#     'learning_rate': [0.02],
#     'max_depth': [8, 9],
#     'subsample': [0.75],
#     'colsample_bytree': [0.75, 0.7],
#     'n_estimators': [250],
#     'seed': [250]
# }]

In [6]:
# Raw test rows (they supply `test_id` for the submission) and the
# pre-computed test features, minus the leftover 'Unnamed: 0' column.
df_test = pd.read_csv('./data/test.csv')
X_test = pd.read_csv('./final_test_features.csv').drop('Unnamed: 0', axis=1)
X_test.head()

Unnamed: 0,word_match,tfidf_wm,tfidf_wm_stops,jaccard,wc_diff,wc_ratio,wc_diff_unique,wc_ratio_unique,wc_diff_unq_stop,wc_ratio_unique_stop,...,cityblock_distance,canberra_distance,minkowski_distance,braycurtis_distance,skew_q1vec,skew_q2vec,kur_q1vec,q1_q2_intersect,q1_freq,q2_freq
0,0.266667,0.234251,0.274019,0.090909,3,1.272727,2,1.181818,3,1.5,...,12.00672,164.254264,0.407153,0.483565,0.04599,0.009958,0.039938,0,1,1
1,0.5,0.436043,0.480962,0.235294,7,0.5,7,0.5,2,0.714286,...,6.501658,115.512012,0.210239,0.245248,-0.017419,-0.046821,0.20758,0,2,2
2,0.444444,0.418727,0.468893,0.285714,8,0.428571,6,0.5,3,0.5,...,8.129246,125.626922,0.253134,0.299802,0.072835,-0.058206,-0.50531,0,1,1
3,0.0,0.0,0.0,0.0,1,0.75,1,0.75,0,1.0,...,10.918439,159.496524,0.366312,0.438647,0.060417,0.069599,-0.241064,0,1,1
4,0.8,0.841273,1.0,0.428571,2,1.5,2,1.5,1,0.666667,...,0.0,0.0,0.0,0.0,0.006085,0.006085,-0.081313,0,1,1


In [7]:
# Run one grid search per entry of `params` and store each search's test-set
# probabilities in its own `is_duplicate_<idx>` column of `sub`.
# `idx` stays a manual counter on purpose: a later cell reads it after the
# loop, where it equals the number of grids processed.
idx = 0
sub = pd.DataFrame()
sub['test_id'] = df_test['test_id']

for p in params:
    clf = select_model(p, X_train, y_train)
    print(idx, ':', clf.best_estimator_, '\n\n\n')
    # `grid_scores_` was removed in scikit-learn 0.20; `best_params_` and
    # `best_score_` report the same winning candidate and its mean CV score.
    best_parameters, score = clf.best_params_, clf.best_score_
    print(best_parameters, score)
    test_probs = clf.predict_proba(X_test)
    # Column 1 of predict_proba -- presumably P(is_duplicate = 1).
    sub['is_duplicate_{}'.format(idx)] = test_probs[:, 1]
    idx += 1



Fitting 4 folds for each of 1 candidates, totalling 4 fits


[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed: 96.9min finished


0 : XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.75, gamma=0, learning_rate=0.05,
       max_delta_step=0, max_depth=10, min_child_weight=1, missing=None,
       n_estimators=500, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=911, silent=True,
       subsample=0.75) 



{'colsample_bytree': 0.75, 'learning_rate': 0.05, 'max_depth': 10, 'n_estimators': 500, 'objective': 'binary:logistic', 'seed': 911, 'subsample': 0.75} -0.16003070837510683


In [33]:
# Score the test set again with the last fitted search and store column 1 of
# the probabilities (presumably P(is_duplicate = 1)) as `is_duplicate_<idx>`.
# NOTE(review): depends on `clf`, `idx` and `sub` left in the kernel by the
# training loop; after that loop `idx` has already been incremented, so this
# writes a new column rather than overwriting the loop's last one.
test_probs = clf.predict_proba(X_test)
sub['is_duplicate_{}'.format(idx)] = test_probs[:, 1]

In [34]:
# Preview the first 10 rows of the prediction frame.
sub.head(10)

Unnamed: 0,test_id,is_duplicate,is_duplicate_1
0,0,0.0,0.000157
1,1,0.184628,0.184628
2,2,0.292206,0.292206
3,3,0.0,2.4e-05
4,4,0.0,0.040773
5,5,0.0,0.000203
6,6,1.0,0.999624
7,7,0.0,0.028111
8,8,0.312459,0.312459
9,9,0.0,0.000172


In [35]:
# Display the repr of the last fitted grid search (its estimator and settings).
clf

GridSearchCV(cv=4, error_score='raise',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1),
       fit_params={}, iid=True, n_jobs=4,
       param_grid={'objective': ['binary:logistic'], 'learning_rate': [0.05], 'max_depth': [10], 'subsample': [0.75], 'colsample_bytree': [0.75], 'n_estimators': [500], 'seed': [911]},
       pre_dispatch='2*n_jobs', refit=True, scoring='neg_log_loss',
       verbose=2)

In [12]:
# Use the first model's probabilities as the final `is_duplicate` column and
# keep only the two columns that go into the submission file.
# NOTE: `sub` is rebound to the two-column frame, so re-running this cell
# raises KeyError unless `is_duplicate_0` is recreated first.
sub = sub.assign(is_duplicate=sub['is_duplicate_0'])[['test_id', 'is_duplicate']]
print('\nPrediction result:\n', sub.head(5))

print('Saving :: submission file...')
sub.to_csv('sub_11-30.csv', index=False)
print('\nSubmission result done.')


Prediction result:
    test_id  is_duplicate
0        0      0.000157
1        1      0.184628
2        2      0.292206
3        3      0.000024
4        4      0.040773
Saving :: submission file...

Submission result done.
