In [None]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn import cross_validation, metrics   
from sklearn.metrics import accuracy_score
from sklearn.cross_validation import train_test_split

import matplotlib.pylab as plt
%matplotlib inline

# Remote CSV with no header row: one column per entry in `features`,
# followed by the class label in the last column.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data'
features = [
    "Times pregnant",
    "Plasma glucose",
    "Diastolic BP",
    "Triceps skin fold thickness",
    "2-Hour serum insulin",
    "BMI",
    "Diabetes pedigree function",
    "Age",
]

# Load the data, naming the columns so the frame is self-describing.
# header=None is kept explicit because the file's first row is data.
dataset = pd.read_csv(url, header=None, names=features + ["Outcome"])

# Convert to a plain ndarray, then split into predictors (X) and target (y).
# The slice bounds follow the feature list instead of hard-coded indices.
dataset = dataset.values
n_features = len(features)
X = dataset[:, :n_features]
y = dataset[:, n_features]


# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=1
)

# Metrics are recorded on both splits every boosting round: train first, test second.
eval_set = [(X_train, y_train), (X_test, y_test)]

# Train up to 500 rounds at a small learning rate, stopping once the monitored
# metric has not improved for 10 consecutive rounds.
# NOTE(review): the test split is part of eval_set, so early stopping is tuned on
# the same rows that are later scored for accuracy -- the reported figure may be
# optimistic. Consider a separate validation split.
model = XGBClassifier(n_estimators=500, learning_rate=0.015)
model.fit(
    X_train,
    y_train,
    eval_metric=["error", "logloss"],
    eval_set=eval_set,
    early_stopping_rounds=10,
    verbose=False,
)

# Predict class labels for the held-out rows.
y_pred = model.predict(X_test)

# Report the fraction of correct predictions as a percentage.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy: %.2f%%" % (100.0 * accuracy))

# Per-round metric history recorded during fit. 'validation_0' and
# 'validation_1' follow the order of eval_set (train split, then test split).
results = model.evals_result()
epochs = len(results['validation_0']['error'])
x_axis = range(0, epochs)

# Plot the log-loss learning curve for each split (the 'error' series can be
# plotted the same way).
for split_key, split_label in (('validation_0', 'Train'), ('validation_1', 'Test')):
    plt.plot(x_axis, results[split_key]['logloss'], label=split_label)
plt.title('XGBoost Log Loss')
plt.xlabel('Epoch')
plt.ylabel('Log Loss')
plt.legend()
plt.show()