#### PLEASE do the coding from this blog post with YOUR OWN comments on what is happening here:
https://machinelearningmastery.com/develop-first-xgboost-model-python-scikit-learn/

## 1. Install XGBoost for Use in Python

In [1]:
# Install or upgrade xgboost (if necessary)
# Both commands are intentionally left commented out so that re-running the
# notebook does not reinstall the package; uncomment one of them only if
# `import xgboost` fails or an upgrade is wanted.
# !pip install xgboost
# !pip install --upgrade xgboost

Collecting xgboost
[?25l  Downloading https://files.pythonhosted.org/packages/96/84/4e2cae6247f397f83d8adc5c2a2a0c5d7d790a14a4c7400ff6574586f589/xgboost-0.90.tar.gz (676kB)
[K    100% |████████████████████████████████| 686kB 3.2MB/s ta 0:00:01
[?25hRequirement not upgraded as not directly required: numpy in /Users/kacha/opt/anaconda3/envs/nf/lib/python3.6/site-packages (from xgboost) (1.17.2)
Requirement not upgraded as not directly required: scipy in /Users/kacha/opt/anaconda3/envs/nf/lib/python3.6/site-packages (from xgboost) (1.1.0)
Building wheels for collected packages: xgboost
  Running setup.py bdist_wheel for xgboost ... [?25ldone
[?25h  Stored in directory: /Users/kacha/Library/Caches/pip/wheels/e9/48/4d/de4187b5270dff71d3697c5a7857a1e2d9a0c63a28b3462eeb
Successfully built xgboost
Installing collected packages: xgboost
Successfully installed xgboost-0.90
[33mYou are using pip version 10.0.1, however version 19.3.1 is available.
You should consider upgrading via the 'pip 

## 2. Load and Prepare Data

In [3]:
# import packages
from numpy import loadtxt
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [8]:
# Read the comma-separated Pima Indians diabetes file directly from its URL;
# loadtxt returns the parsed values as a numpy array.
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv'
dataset = loadtxt(fname=url, delimiter=",")

# quick sanity check: (number of rows, number of columns)
dataset.shape

(768, 9)

In [12]:
# Slice the array column-wise into model inputs and target:
#   X = columns 0..7 (the 8 features), y = column 8 (the label)
X, y = dataset[:, 0:8], dataset[:, 8]

In [13]:
# Hold out 33% of the rows as a test set. Passing a fixed random_state makes
# the shuffle (and therefore every downstream number) reproducible.
seed, test_size = 7, 0.33
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=seed,
)

## 3. Train the XGBoost Model

In [14]:
# Build an XGBoost classifier with all-default hyper-parameters and train it
# on the training split only (the test split stays unseen until evaluation).
model = XGBClassifier()

# Leaving fit() as the cell's last expression displays the fitted estimator and
# its parameters; print(model) shows the same information.
model.fit(X=X_train, y=y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

## 4. Make Predictions with the XGBoost Model

In [18]:
# Predict on the held-out test rows.

# option (a): let the model return the class label (0/1) for each row directly
y_pred = model.predict(X_test)

# peek at the first five predicted labels
y_pred[0:5]

array([0., 1., 1., 0., 1.])

In [29]:
# option (b): get the predicted probabilities of y being 0/1 (one column per
# class), then round the second column ourselves to obtain the predicted class
y_pred_prob = model.predict_proba(X_test)
print(y_pred_prob[:5])

# row[1] is the second column of each probability row; rounding it yields 0.0 or 1.0
predictions = [round(row[1]) for row in y_pred_prob]
predictions[:5]

[[0.9545844  0.04541559]
 [0.05245447 0.9475455 ]
 [0.41897488 0.5810251 ]
 [0.9831998  0.0168002 ]
 [0.4119159  0.5880841 ]]


[0.0, 1.0, 1.0, 0.0, 1.0]

In [30]:
# Score the rounded predictions against the true test labels.
# The score is multiplied by 100 below so it prints as a percentage.
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 77.95%


---
## 5. Tie it All Together (in one cell)

In [31]:
# import packages
from numpy import loadtxt
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# End-to-end pipeline: load -> slice -> split -> fit -> predict -> score.
# Every variable used below is (re)defined in this cell, so together with the
# imports at the top of the cell it runs on a fresh kernel without relying on
# state left behind by earlier cells.

# load data (comma-separated numeric file read into a numpy array)
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv'
dataset = loadtxt(url, delimiter=",")

# split data into X (first 8 columns = features) and y (9th column = label)
X = dataset[:,0:8]
y = dataset[:,8]

# split data into train and test sets (fixed seed => reproducible split)
seed = 7
test_size = 0.33
# NOTE: the label array is the lowercase `y` defined above; passing an
# undefined uppercase `Y` here raises NameError on a fresh kernel.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=seed)

# fit model on training data
model = XGBClassifier()
model.fit(X_train, y_train)

# make predictions for test data; predict() already returns class labels,
# so no manual rounding step is needed
y_pred = model.predict(X_test)

# evaluate model accuracy against the predictions computed in THIS cell
# (`y_pred`), not a `predictions` list left over from an earlier cell
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 77.95%
