In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

import xgboost as xgb

from dataio import getdata, writesub

In [9]:
import time

In [2]:
# Paths of the two input CSVs, resolved relative to the notebook's working directory.
trainpath, testpath = 'train.csv', 'test.csv'

# getdata is imported from the local dataio module; its result is unpacked
# as (training frame, test frame), in that order.
df_train, df_test = getdata(trainpath, testpath)

In [7]:
# Seed used for the hold-out split in this cell (and reused by later cells).
rs = 19683

# Competition test frame: remember the IDs for the submission file and
# keep every other column as the feature matrix.
test_id = df_test["ID"]
test = df_test.drop("ID", axis=1)

# Labelled frame: TARGET becomes the label vector, and both TARGET and ID
# are removed from the feature matrix.
y = df_train["TARGET"].values
X = df_train.drop(["ID", "TARGET"], axis=1)

# Hold back 20% of the labelled rows as a local validation split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=rs,
)

# Sanity check: all three matrices should have the same number of columns.
print(X_train.shape, X_test.shape, test.shape)


((60816, 306), (15204, 306), (75818, 306))


In [8]:
## # Feature selection
# Fit an extra-trees ensemble on the training split only and hand the fitted
# model to SelectFromModel. No threshold is passed, so the library default
# decides which features are kept.
selector = ExtraTreesClassifier(random_state=rs).fit(X_train, y_train)
clf = selector  # fit() returns the estimator itself; inspect clf.feature_importances_ for the ranking
fs = SelectFromModel(selector, prefit=True)

# NOTE: the three matrices are overwritten with their reduced versions, so
# re-running this cell without re-running the split cell first would select
# again from the already-reduced features.
X_train, X_test, test = map(fs.transform, (X_train, X_test, test))

print(X_train.shape, X_test.shape, test.shape)

((60816, 35), (15204, 35), (75818, 35))


In [None]:
# Exhaustive search over a small XGBoost hyper-parameter grid, scored by ROC AUC.
# The wall-clock time is reported at the end because this cell is slow.
start_time = time.time()

# 3 x 3 x 3 = 27 candidate settings.
param_grid = {
    'max_depth': [2, 4, 6],
    'learning_rate': [0.01, 0.1, 0.25],
    'n_estimators': [50, 200, 500],
}

xgb_model = xgb.XGBClassifier()
clf = GridSearchCV(
    xgb_model,
    param_grid,
    verbose=1,
    n_jobs=1,
    scoring='roc_auc',
)
clf.fit(X_train, y_train)

print('Best score = %.6f' % clf.best_score_)
print('Best params are as follows:')
print(clf.best_params_)

elapsed_minutes = (time.time() - start_time) / 60
print("Finished grid search. Took %.2f minutes" % elapsed_minutes)

In [None]:
## # Train Model
# XGBoost classifier with hand-set hyper-parameters: this cell does not read
# the grid-search result, so update the values here if the search suggests
# better ones.
m2_xgb = xgb.XGBClassifier(n_estimators=110, nthread=-1, seed=1729)

# The hold-out split is passed as eval_set so AUC is tracked during boosting.
# No early stopping is requested, so all 110 rounds are always trained, and
# verbose=False keeps the per-round log out of the cell output.
m2_xgb.fit(X_train, y_train, eval_metric="auc", verbose=False,
           eval_set=[(X_test, y_test)])

# Score the hold-out split. Column 1 of predict_proba is the positive class.
# A single %-formatted string is printed so the line reads the same on
# Python 2 and Python 3 (print("label", value) shows a tuple on Python 2).
holdout_probs = m2_xgb.predict_proba(X_test)[:, 1]
holdout_auc = roc_auc_score(y_test, holdout_probs, average='macro')
print("Roc AUC: %.6f" % holdout_auc)

## # Submission
# Predict the positive-class probability for every row of the competition
# test set and write it next to the IDs saved before the ID column was dropped.
probs = m2_xgb.predict_proba(test)

submission = pd.DataFrame({"ID": test_id, "TARGET": probs[:, 1]})
submission.to_csv("submission.csv", index=False)
