# XGBoost

In [1]:
import pandas as pd
# Load the small heart-disease example table and display it.
heart_csv = 'heart_disease.csv'
heart_data = pd.read_csv(heart_csv)
heart_data

Unnamed: 0,Is Chest Pain Present,Are any arteries blocked,Weight of the person,Is Heart Patient
0,YES,YES,205,YES
1,NO,YES,180,YES
2,YES,NO,210,YES
3,YES,YES,167,YES
4,NO,YES,156,NO
5,NO,YES,125,NO
6,YES,NO,168,NO
7,YES,YES,172,NO


# Python Implementation

In [2]:
!pip3 install --default-timeout=100 xgboost

Collecting xgboost
  Downloading xgboost-1.6.2-py3-none-win_amd64.whl (125.4 MB)
Installing collected packages: xgboost
Successfully installed xgboost-1.6.2


In [3]:
import pandas as pd
import numpy as np
import xgboost as xgb
import pickle
from sklearn import datasets
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

In [4]:
# Load the Pima Indians diabetes table; it holds both the feature columns and the label column.
diabetes_csv = 'pima-indians-diabetes.csv'
data = pd.read_csv(diabetes_csv)

In [5]:
# Preview the first five rows of the loaded table.
data.head(n=5)

Unnamed: 0,Number of times pregnant,Plasma glucose concentration,Diastolic blood pressure (mm Hg),Triceps skinfold thickness (mm),2-Hour serum insulin (mu U/ml),Body mass index (weight in kg/(height in m)^2),Diabetes pedigree function,Age,Is Diabetic
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [6]:
# List the column labels of the loaded frame (feature columns plus the 'Is Diabetic' label).
data.columns

Index(['Number of times pregnant', 'Plasma glucose concentration',
       'Diastolic blood pressure (mm Hg)', 'Triceps skinfold thickness (mm)',
       '2-Hour serum insulin (mu U/ml)',
       'Body mass index (weight in kg/(height in m)^2)',
       'Diabetes pedigree function', 'Age', 'Is Diabetic'],
      dtype='object')

In [7]:
# Columns whose zero entries are converted to NaN in the next cell
# (every feature except 'Number of times pregnant').
cols = [
    'Plasma glucose concentration',
    'Diastolic blood pressure (mm Hg)',
    'Triceps skinfold thickness (mm)',
    '2-Hour serum insulin (mu U/ml)',
    'Body mass index (weight in kg/(height in m)^2)',
    'Diabetes pedigree function',
    'Age',
]

In [8]:
# Per the data description, missing measurements were recorded as 0.
# Turn those zeros back into NaN, column by column, so they can be imputed afterwards.
zero_as_nan = {name: data[name].replace(0, np.nan) for name in cols}
data = data.assign(**zero_as_nan)

In [9]:
# Count the missing (NaN) entries in every column.
missing_per_column = data.isna().sum()
missing_per_column

Number of times pregnant                            0
Plasma glucose concentration                        5
Diastolic blood pressure (mm Hg)                   35
Triceps skinfold thickness (mm)                   227
2-Hour serum insulin (mu U/ml)                    374
Body mass index (weight in kg/(height in m)^2)     11
Diabetes pedigree function                          0
Age                                                 0
Is Diabetic                                         0
dtype: int64

In [10]:
# Impute the missing values: the most frequent value for glucose and blood pressure,
# the column mean for skinfold thickness, insulin and BMI.
# NOTE(review): these statistics are computed over all rows, before the train/test split
# further down, so held-out rows contribute to the fill values — confirm this is acceptable.
for mode_col in ('Plasma glucose concentration',
                 'Diastolic blood pressure (mm Hg)'):
    most_frequent = data[mode_col].mode()[0]
    data[mode_col] = data[mode_col].fillna(most_frequent)

for mean_col in ('Triceps skinfold thickness (mm)',
                 '2-Hour serum insulin (mu U/ml)',
                 'Body mass index (weight in kg/(height in m)^2)'):
    column_mean = data[mean_col].mean()
    data[mean_col] = data[mean_col].fillna(column_mean)



In [11]:
# Re-count NaNs per column to confirm the imputation left nothing missing.
remaining_missing = data.isna().sum()
remaining_missing

Number of times pregnant                          0
Plasma glucose concentration                      0
Diastolic blood pressure (mm Hg)                  0
Triceps skinfold thickness (mm)                   0
2-Hour serum insulin (mu U/ml)                    0
Body mass index (weight in kg/(height in m)^2)    0
Diabetes pedigree function                        0
Age                                               0
Is Diabetic                                       0
dtype: int64

In [12]:
# Split the frame into the feature matrix (every column except the label) and the label vector.
label_column = 'Is Diabetic'
y = data[label_column]
x = data.drop(columns=label_column)

x.head(2)

In [13]:
# The feature columns sit on very different numeric ranges, so standardize them.
# NOTE(review): the scaler is fitted on all rows, before the train/test split that follows,
# so test rows influence the scaling statistics — consider fitting on the training split only.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(x)
scaled_data = scaler.transform(x)


In [14]:
# Display the scaled feature matrix produced by the scaler in the previous cell.
scaled_data

array([[ 0.63994726,  0.86840303, -0.02442979, ...,  0.16629174,
         0.46849198,  1.4259954 ],
       [-0.84488505, -1.19914997, -0.52034382, ..., -0.85253118,
        -0.36506078, -0.19067191],
       [ 1.23388019,  2.01704359, -0.68564849, ..., -1.33283341,
         0.60439732, -0.10558415],
       ...,
       [ 0.3429808 , -0.01769112, -0.02442979, ..., -0.91074963,
        -0.68519336, -0.27575966],
       [-0.84488505,  0.14640039, -1.01625784, ..., -0.34311972,
        -0.37110101,  1.17073215],
       [-0.84488505, -0.93660356, -0.18973447, ..., -0.29945588,
        -0.47378505, -0.87137393]])

In [15]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed random_state keeps the split reproducible.
train_x, test_x, train_y, test_y = train_test_split(
    scaled_data,
    y,
    test_size=0.3,
    random_state=42,
)

In [16]:
# Baseline classifier: binary logistic objective, every other hyperparameter left at its default.
# fit() returns the fitted estimator, which is then displayed as the cell output.
model = XGBClassifier(objective='binary:logistic').fit(train_x, train_y)
model

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, ...)

In [17]:
# First a single-record prediction. reshape(1, -1) turns the first scaled row into a
# one-sample 2-D array without hard-coding the number of feature columns.
y_pred = model.predict(scaled_data[0].reshape(1, -1))
y_pred

array([1])

In [18]:
# Accuracy on the training split. predict() already yields discrete class labels,
# so they are scored directly instead of being rounded one value at a time.
y_pred = model.predict(train_x)
predictions = y_pred
accuracy = accuracy_score(train_y, predictions)
accuracy

1.0

In [19]:
# Inspect the predicted labels from the previous cell (predictions on the training split).
y_pred

array([0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0,
       0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1,
       0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
       0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0,
       1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0,
       0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1,
       1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1,

In [20]:
# Checking initial test accuracy (baseline model, before any hyperparameter tuning).
# predict() already yields discrete class labels, so no per-value rounding is needed.
y_pred = model.predict(test_x)
predictions = y_pred
accuracy = accuracy_score(test_y, predictions)
accuracy

0.7272727272727273

In [21]:
# Look at the first row of the scaled test features.
test_x[0]

array([ 0.63994726, -0.77251205, -1.18156252,  0.43784695,  0.40547846,
        0.22451019, -0.1264714 ,  0.83038113])

In [22]:
# Do hyperparameter tuning to increase the accuracy of the model.

In [23]:
from sklearn.model_selection import GridSearchCV

In [24]:
# Search space for the grid search: 5 learning rates x 4 depths x 4 ensemble sizes = 80 candidates.
param_grid = dict(
    learning_rate=[1, 0.5, 0.1, 0.01, 0.001],
    max_depth=[3, 5, 10, 20],
    n_estimators=[10, 50, 100, 200],
)

In [25]:
# Exhaustive search over param_grid with the same binary-logistic classifier as the baseline;
# verbose=3 prints one log line per fold and candidate.
search_estimator = XGBClassifier(objective='binary:logistic')
grid = GridSearchCV(
    estimator=search_estimator,
    param_grid=param_grid,
    verbose=3,
)

In [26]:
# Run the search on the training split only (the log reports 5 folds per candidate).
grid.fit(X=train_x, y=train_y)

Fitting 5 folds for each of 80 candidates, totalling 400 fits
[CV 1/5] END learning_rate=1, max_depth=3, n_estimators=10;, score=0.824 total time=   0.0s
[CV 2/5] END learning_rate=1, max_depth=3, n_estimators=10;, score=0.741 total time=   0.0s
[CV 3/5] END learning_rate=1, max_depth=3, n_estimators=10;, score=0.757 total time=   0.0s
[CV 4/5] END learning_rate=1, max_depth=3, n_estimators=10;, score=0.738 total time=   0.0s
[CV 5/5] END learning_rate=1, max_depth=3, n_estimators=10;, score=0.738 total time=   0.0s
[CV 1/5] END learning_rate=1, max_depth=3, n_estimators=50;, score=0.815 total time=   0.0s
[CV 2/5] END learning_rate=1, max_depth=3, n_estimators=50;, score=0.759 total time=   0.0s
[CV 3/5] END learning_rate=1, max_depth=3, n_estimators=50;, score=0.738 total time=   0.0s
[CV 4/5] END learning_rate=1, max_depth=3, n_estimators=50;, score=0.729 total time=   0.0s
[CV 5/5] END learning_rate=1, max_depth=3, n_estimators=50;, score=0.729 total time=   0.0s
[CV 1/5] END learn

[CV 4/5] END learning_rate=0.5, max_depth=3, n_estimators=50;, score=0.701 total time=   0.0s
[CV 5/5] END learning_rate=0.5, max_depth=3, n_estimators=50;, score=0.766 total time=   0.0s
[CV 1/5] END learning_rate=0.5, max_depth=3, n_estimators=100;, score=0.750 total time=   0.1s
[CV 2/5] END learning_rate=0.5, max_depth=3, n_estimators=100;, score=0.731 total time=   0.1s
[CV 3/5] END learning_rate=0.5, max_depth=3, n_estimators=100;, score=0.785 total time=   0.1s
[CV 4/5] END learning_rate=0.5, max_depth=3, n_estimators=100;, score=0.720 total time=   0.1s
[CV 5/5] END learning_rate=0.5, max_depth=3, n_estimators=100;, score=0.738 total time=   0.1s
[CV 1/5] END learning_rate=0.5, max_depth=3, n_estimators=200;, score=0.759 total time=   0.2s
[CV 2/5] END learning_rate=0.5, max_depth=3, n_estimators=200;, score=0.759 total time=   0.2s
[CV 3/5] END learning_rate=0.5, max_depth=3, n_estimators=200;, score=0.776 total time=   0.2s
[CV 4/5] END learning_rate=0.5, max_depth=3, n_estim

[CV 1/5] END learning_rate=0.1, max_depth=3, n_estimators=200;, score=0.778 total time=   0.2s
[CV 2/5] END learning_rate=0.1, max_depth=3, n_estimators=200;, score=0.741 total time=   0.2s
[CV 3/5] END learning_rate=0.1, max_depth=3, n_estimators=200;, score=0.757 total time=   0.2s
[CV 4/5] END learning_rate=0.1, max_depth=3, n_estimators=200;, score=0.720 total time=   0.2s
[CV 5/5] END learning_rate=0.1, max_depth=3, n_estimators=200;, score=0.757 total time=   0.3s
[CV 1/5] END learning_rate=0.1, max_depth=5, n_estimators=10;, score=0.824 total time=   0.0s
[CV 2/5] END learning_rate=0.1, max_depth=5, n_estimators=10;, score=0.778 total time=   0.0s
[CV 3/5] END learning_rate=0.1, max_depth=5, n_estimators=10;, score=0.776 total time=   0.0s
[CV 4/5] END learning_rate=0.1, max_depth=5, n_estimators=10;, score=0.692 total time=   0.0s
[CV 5/5] END learning_rate=0.1, max_depth=5, n_estimators=10;, score=0.710 total time=   0.0s
[CV 1/5] END learning_rate=0.1, max_depth=5, n_estimato

[CV 1/5] END learning_rate=0.01, max_depth=5, n_estimators=50;, score=0.824 total time=   0.0s
[CV 2/5] END learning_rate=0.01, max_depth=5, n_estimators=50;, score=0.769 total time=   0.0s
[CV 3/5] END learning_rate=0.01, max_depth=5, n_estimators=50;, score=0.776 total time=   0.0s
[CV 4/5] END learning_rate=0.01, max_depth=5, n_estimators=50;, score=0.664 total time=   0.0s
[CV 5/5] END learning_rate=0.01, max_depth=5, n_estimators=50;, score=0.692 total time=   0.0s
[CV 1/5] END learning_rate=0.01, max_depth=5, n_estimators=100;, score=0.815 total time=   0.1s
[CV 2/5] END learning_rate=0.01, max_depth=5, n_estimators=100;, score=0.787 total time=   0.1s
[CV 3/5] END learning_rate=0.01, max_depth=5, n_estimators=100;, score=0.785 total time=   0.1s
[CV 4/5] END learning_rate=0.01, max_depth=5, n_estimators=100;, score=0.673 total time=   0.1s
[CV 5/5] END learning_rate=0.01, max_depth=5, n_estimators=100;, score=0.701 total time=   0.1s
[CV 1/5] END learning_rate=0.01, max_depth=5,

[CV 2/5] END learning_rate=0.001, max_depth=5, n_estimators=100;, score=0.759 total time=   0.1s
[CV 3/5] END learning_rate=0.001, max_depth=5, n_estimators=100;, score=0.748 total time=   0.1s
[CV 4/5] END learning_rate=0.001, max_depth=5, n_estimators=100;, score=0.673 total time=   0.1s
[CV 5/5] END learning_rate=0.001, max_depth=5, n_estimators=100;, score=0.701 total time=   0.1s
[CV 1/5] END learning_rate=0.001, max_depth=5, n_estimators=200;, score=0.833 total time=   0.3s
[CV 2/5] END learning_rate=0.001, max_depth=5, n_estimators=200;, score=0.759 total time=   0.3s
[CV 3/5] END learning_rate=0.001, max_depth=5, n_estimators=200;, score=0.757 total time=   0.3s
[CV 4/5] END learning_rate=0.001, max_depth=5, n_estimators=200;, score=0.673 total time=   0.4s
[CV 5/5] END learning_rate=0.001, max_depth=5, n_estimators=200;, score=0.701 total time=   0.3s
[CV 1/5] END learning_rate=0.001, max_depth=10, n_estimators=10;, score=0.787 total time=   0.0s
[CV 2/5] END learning_rate=0.0

GridSearchCV(estimator=XGBClassifier(base_score=None, booster=None,
                                     callbacks=None, colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None,
                                     early_stopping_rounds=None,
                                     enable_categorical=False, eval_metric=None,
                                     gamma=None, gpu_id=None, grow_policy=None,
                                     importance_type=None,
                                     interaction_constraints=None,
                                     learning_rate=None, max_bin=None,
                                     max_cat_to_...,
                                     max_delta_step=None, max_depth=None,
                                     max_leaves=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                               

In [27]:
# Hyperparameter combination that the grid search selected as best.
grid.best_params_

{'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 50}

In [34]:
# Refit with the hyperparameters selected by the grid search rather than copying them by
# hand, so this cell stays correct if the search result changes on a re-run. The objective
# is stated explicitly to match the estimator that was tuned.
new_model = XGBClassifier(objective='binary:logistic', **grid.best_params_)
new_model.fit(train_x, train_y)

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.1, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=5, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=50, n_jobs=0,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, ...)

In [35]:
# Test accuracy of the tuned model (compare with the baseline test accuracy computed earlier).
# predict() already yields discrete class labels, so no per-value rounding is needed.
y_pred_new = new_model.predict(test_x)
predictions_new = y_pred_new
accuracy_new = accuracy_score(test_y, predictions_new)
accuracy_new

0.7445887445887446

In [36]:
# Save the model

In [37]:
# Persist the tuned classifier, then read it back to check the round trip.
# The `with` blocks guarantee both file handles are closed once the work is done.
filename = 'xgboost_model1.pickle'
with open(filename, 'wb') as model_file:
    pickle.dump(new_model, model_file)

# NOTE: unpickling can execute arbitrary code — only load pickle files you created or trust.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [38]:
# Persist the fitted scaler alongside the model so new samples can be scaled the same way,
# then read it back. The `with` blocks guarantee both file handles are closed.
filename_scaler = 'scaler_model1.pickle'
with open(filename_scaler, 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

# NOTE: unpickling can execute arbitrary code — only load pickle files you created or trust.
with open(filename_scaler, 'rb') as scaler_file:
    scaler_model = pickle.load(scaler_file)

In [39]:
# Try the reloaded scaler and model on one hand-written sample of 8 feature values
# (presumably in the same column order as `x` — verify before reusing with other inputs).
sample = [[6,148,72,35,80,33.6,0.627,50]]
d = scaler_model.transform(sample)
pred = loaded_model.predict(d)
print('This data belongs to class :', pred)

This data belongs to class : [1]


