In [36]:
import numpy as np
import matplotlib.pyplot as plt
import random
import xgboost as xgb
from sklearn.metrics import auc, accuracy_score, confusion_matrix, mean_squared_error, classification_report, roc_curve, auc
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold, RandomizedSearchCV, train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [20]:
data_train = np.loadtxt('train_2008.csv', delimiter=',', skiprows = 1)
data_test = np.loadtxt('test_2008.csv', delimiter=',', skiprows = 1)
N_train = data_train.shape[0]
N_test = data_test.shape[0]
N_feat = data_test.shape[1]-4
X_train = data_train[:,3:-1] 
X_test = data_test[:,3:]
y_train = data_train[:, -1]
y_test = data_test[:, -1]

In [24]:
xgb_model = xgb.XGBClassifier(objective="binary:logistic", random_state=42)
xgb_model.fit(X_train, y_train)

y_pred = xgb_model.predict(X_test)
print(y_pred[:10], y_test[:10])
print(confusion_matrix(y_test[:5], y_pred[:5]))

[0. 0. 0. 0. 0. 0. 1. 1. 0. 0.] [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[[5]]


In [28]:
def error(y_pred, y_test):
    return np.count_nonzero(y_pred - y_test)/y_test.size


In [46]:
# CV
def CV(X, y, model, K):
    kf = KFold(n_splits=K, shuffle=True)
    kf.get_n_splits(X)
    E_cv = 0
    roc_auc = 0
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        E_cv += error(y_pred, y_test)
        #print(classification_report(y_test, y_pred))
        y_scores = model.predict_proba(X_test)[:,1]
        fpr, tpr, _ = roc_curve(y_test, y_scores)
        roc_auc += auc(fpr, tpr)
    return [E_cv/K, roc_auc/K]

In [51]:
xgb_model = xgb.XGBClassifier(objective="binary:logistic", random_state=42)
print(CV(X_train[:5000, :], y_train[:5000], xgb_model, 10))

[0.22139999999999999, 0.7640995787280715]


In [48]:
rfc_model = RandomForestClassifier(n_estimators=100, max_depth=2,random_state=0)
print(CV(X_train[:1000], y_train[:1000], xgb_model, 10))

[0.221, 0.7677729040293908]


In [49]:
gbc_model = GradientBoostingClassifier(n_estimators=20, max_features=2, max_depth = 2, random_state = 0)
print(CV(X_train[:1000], y_train[:1000], xgb_model, 10))

[0.231, 0.7696393074748668]


In [54]:
xgb_model = xgb.XGBClassifier(objective="binary:logistic", random_state=42)
print(CV(X_train, y_train, xgb_model, 10))

[0.21549999999999997, 0.7774930694774602]


In [None]:
xgb_model = xgb.XGBClassifier(objective="binary:logistic", random_state=42)
print(CV(X_train, y_train, xgb_model, 10))