In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost import XGBClassifier

In [None]:
# Raw training data; only its `is_duplicate` label column is used below.
df_train = pd.read_csv('data/train.csv')

# Precomputed feature matrix for the training rows. The 'Unnamed: 0' column
# (presumably an index saved along with the CSV) is not a feature, so drop it.
X_train = pd.read_csv('./train_with_prob.csv', index_col=False)
X_train = X_train.drop('Unnamed: 0', axis=1)

y_train = df_train['is_duplicate'].values

# Features and labels are read from two different files: fail fast if they do
# not describe the same number of rows instead of failing later when masking.
assert len(X_train) == len(y_train), (
    'feature/label row mismatch: {} vs {}'.format(len(X_train), len(y_train)))

# Preview goes last: only the final expression of a cell is displayed.
X_train.head()

In [None]:
# Display the feature column names loaded from train_with_prob.csv.
X_train.columns

In [None]:
# Derive, for each (question 1, question 2) column pair, the absolute
# difference and the sum, stored as '<prefix>_diff' and '<prefix>_sum'.
for _prefix, _lhs, _rhs in (('q1_q2_freq', 'q1_freq', 'q2_freq'),
                            ('skew', 'skew_q1vec', 'skew_q2vec')):
    X_train[_prefix + '_diff'] = (X_train[_lhs] - X_train[_rhs]).abs()
    X_train[_prefix + '_sum'] = X_train[_lhs] + X_train[_rhs]

In [None]:
# X_train = X_train.drop('skew_q1vec', axis=1)
# X_train = X_train.drop('skew_q2vec', axis=1)
# X_train = X_train.drop('q1_freq', axis=1)
# X_train = X_train.drop('q2_freq', axis=1)

In [None]:
# Preview the training features, including the derived *_diff / *_sum columns.
X_train.head()

In [None]:
# Up/down sampling: every negative row is included twice and only the first
# 80% of the positive rows are kept.
negatives = X_train[y_train == 0]
positives = X_train[y_train == 1]
kept_positives = positives.iloc[:int(0.8 * len(positives))]
n_neg, n_pos = negatives.shape[0], kept_positives.shape[0]

X_train = pd.concat((negatives, kept_positives, negatives))
# Labels are laid out in the same block order as the concatenation above.
y_train = np.array([0] * n_neg + [1] * n_pos + [0] * n_neg)

# Resulting share of positive labels.
print(np.mean(y_train))
del positives, negatives, kept_positives, n_neg, n_pos

In [9]:
def select_model(params, X_train, y_train):
    """Grid-search an XGBClassifier over ``params`` and return the fitted search.

    Args:
        params: parameter grid (dict of lists, or list of such dicts) passed
            straight to ``GridSearchCV``.
        X_train: training feature matrix.
        y_train: training labels.

    Returns:
        The fitted ``GridSearchCV`` object. Because ``refit=True``, the best
        parameter combination is refit on all of ``X_train`` and the returned
        object can be used directly for prediction.
    """
    # sklearn.grid_search was removed in scikit-learn 0.20; model_selection is
    # available since 0.18, the release that introduced 'neg_log_loss'.
    from sklearn.model_selection import GridSearchCV
    xgb_model = xgb.XGBClassifier()
    clf = GridSearchCV(
        xgb_model,
        params,
        n_jobs=5,
        cv=5,
        scoring='neg_log_loss',
        verbose=2,
        refit=True)
    clf.fit(X_train, y_train)
    return clf

In [10]:
# Parameter grids for the brute-force grid search. Each dict in the list is
# searched separately; the active grid below holds a single value per key.
# Author's rules of thumb:
#   - max_depth is usually 6, 7 or 8
#   - learning_rate is around 0.05, but small changes can make a big difference
#   - tuning min_child_weight, subsample and colsample_bytree helps to fight
#     overfitting
#   - n_estimators is the number of boosting rounds
#   - ensembling XGBoost models trained with several seeds may reduce variance
params = [{
    'objective': ['binary:logistic'],
    'learning_rate': [0.0225],
    'max_depth': [8],
    'subsample': [0.7],
    'colsample_bytree': [0.75],
    'n_estimators': [300],
    'base_score': [0.2],
    'seed': [911]
}]
#     , {
#     'objective': ['binary:logistic'],
#     'learning_rate': [0.02],
#     'max_depth': [8, 9],
#     'subsample': [0.75],
#     'colsample_bytree': [0.75, 0.7],
#     'n_estimators': [500],
#     'seed': [666]
# }, {
#     'objective': ['binary:logistic'],
#     'learning_rate': [0.02],
#     'max_depth': [8, 9],
#     'subsample': [0.75],
#     'colsample_bytree': [0.75, 0.7],
#     'n_estimators': [500],
#     'seed': [250]
# }, {
#     'objective': ['binary:logistic'],
#     'learning_rate': [0.02],
#     'max_depth': [8, 9],
#     'subsample': [0.75],
#     'colsample_bytree': [0.75, 0.7],
#     'n_estimators': [250],
#     'seed': [250]
# }]

In [11]:
# Raw test data (source of `test_id`) and the precomputed test feature matrix,
# with the non-feature 'Unnamed: 0' column removed.
df_test = pd.read_csv('./data/test.csv')
X_test = pd.read_csv('./test_with_prob.csv').drop('Unnamed: 0', axis=1)

# Same derived columns as built for the training frame: absolute difference
# and sum for each (question 1, question 2) column pair.
for _prefix, _lhs, _rhs in (('q1_q2_freq', 'q1_freq', 'q2_freq'),
                            ('skew', 'skew_q1vec', 'skew_q2vec')):
    X_test[_prefix + '_diff'] = (X_test[_lhs] - X_test[_rhs]).abs()
    X_test[_prefix + '_sum'] = X_test[_lhs] + X_test[_rhs]

# X_test = X_test.drop('skew_q1vec', axis=1)
# X_test = X_test.drop('skew_q2vec', axis=1)
# X_test = X_test.drop('q1_freq', axis=1)
# X_test = X_test.drop('q2_freq', axis=1)

X_test.head()

Unnamed: 0,word_match,tfidf_wm,tfidf_wm_stops,jaccard,wc_diff,wc_ratio,wc_diff_unique,wc_ratio_unique,wc_diff_unq_stop,wc_ratio_unique_stop,...,clf_127,clf_128,clf_129,clf_130,clf_131,clf_132,q1_q2_freq_diff,q1_q2_freq_sum,skew_diff,skew_sum
0,0.266667,0.234251,0.274019,0.090909,3,1.272727,2,1.181818,3,1.5,...,0.0001,0.00666,0.0,0.001944,0.002864,0.0,0,2,0.036032,0.055948
1,0.5,0.436043,0.480962,0.235294,7,0.5,7,0.5,2,0.714286,...,0.296028,0.302897,0.30184,0.447476,0.394525,0.442324,0,4,0.029401,-0.06424
2,0.444444,0.418727,0.468893,0.285714,8,0.428571,6,0.5,3,0.5,...,0.385672,0.449783,0.421432,0.42732,0.394525,0.442324,0,2,0.131041,0.014628
3,0.0,0.0,0.0,0.0,1,0.75,1,0.75,0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,2,0.009182,0.130016
4,0.8,0.841273,1.0,0.428571,2,1.5,2,1.5,1,0.666667,...,0.390569,0.494767,0.530017,0.416894,0.426159,0.492876,0,2,0.0,0.01217


In [None]:
# Run one grid search per parameter grid and store the predicted probability
# of the positive class for the test set in an `is_duplicate_<idx>` column.
sub = pd.DataFrame()
sub['test_id'] = df_test['test_id']

for idx, p in enumerate(params):
    clf = select_model(p, X_train, y_train)
    print(idx, ':', clf.best_estimator_, '\n\n\n')
    # `grid_scores_` was removed in scikit-learn 0.20; `best_params_` and
    # `best_score_` report the same best candidate and its mean CV score.
    best_parameters, score = clf.best_params_, clf.best_score_
    print(best_parameters, score)
    test_probs = clf.predict_proba(X_test)
    # Column 1 of predict_proba is the probability of class 1.
    sub['is_duplicate_{}'.format(idx)] = test_probs[:, 1]



Fitting 5 folds for each of 1 candidates, totalling 5 fits


In [None]:
# Preview the first ten rows of the per-grid predictions.
sub.head(10)

In [None]:
# # sub = pd.DataFrame()
# # sub['test_id'] = df_test['test_id']
# Use the predictions of the first parameter grid as the final answer and
# keep only the two columns of the submission.
sub = sub.assign(is_duplicate=sub['is_duplicate_0'])[['test_id', 'is_duplicate']]
print('\nPrediction result:\n', sub.head(5))

In [None]:
# Write the submission to disk without the DataFrame index column.
print('Saving :: submission file...')
sub.to_csv(path_or_buf='sub_11-30.csv', index=False)
print('\nSubmission result done.')