If You are using Google Collab Uncomment the following to install required Python packages. 

In [1]:
# !pip install pandas
# !pip install numpy
# !pip install sklearn
# !pip install matplotlib
# !pip install seaborn

###Importing Necessary Libraries

In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, precision_score, recall_score, roc_auc_score
from sklearn.model_selection import train_test_split as tts

### Importing Dataset

In [3]:
pima_df=pd.read_csv("./content/sample_data/Pima Indians Diabetes Dataset - diabetes.csv")
pima_df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


### **Data Cleaning**

1.   Replacing all the 0 with null value.
2.   Fill all the null values with the mean value of that column.

This is done beacuse no a number row will not be counted in mean.



In [4]:
# Glucose
pima_df['Glucose'].replace(0,np.NaN,inplace=True)
mean_Glucose=pima_df['Glucose'].mean()
pima_df['Glucose'].fillna(mean_Glucose, inplace = True)

In [5]:
# BloodPressure
pima_df['BloodPressure'].replace(0,np.NaN,inplace=True)
mean_BloodPressure=pima_df['BloodPressure'].mean()
pima_df['BloodPressure'].fillna(mean_BloodPressure, inplace = True)

In [6]:
# SkinThickness
pima_df['SkinThickness'].replace(0,np.NaN,inplace=True)
mean_SkinThickness=pima_df['SkinThickness'].mean()
pima_df['SkinThickness'].fillna(mean_SkinThickness, inplace = True)

In [7]:
# Insulin
pima_df['Insulin'].replace(0,np.NaN,inplace=True)
mean_Insulin=pima_df['Insulin'].mean()
pima_df['Insulin'].fillna(mean_Insulin, inplace = True)

In [8]:
# BMI
pima_df['BMI'].replace(0,np.NaN,inplace=True)
mean_BMI=pima_df['BMI'].mean()
pima_df['BMI'].fillna(mean_BMI, inplace = True)

###Feature Scaling


1.   Preparing Feature Vector and Obesrvation Vector. 
2.   Normalizing them.



In [9]:
StSc=StandardScaler()

In [10]:
features_X=pima_df.drop(["Outcome"],axis = 1)
print(features_X.head())
features_X=  pd.DataFrame(StSc.fit_transform(features_X),columns=['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin','BMI', 'DiabetesPedigreeFunction', 'Age'])
print(features_X.head())

   Pregnancies  Glucose  BloodPressure  SkinThickness     Insulin   BMI  \
0            6    148.0           72.0       35.00000  155.548223  33.6   
1            1     85.0           66.0       29.00000  155.548223  26.6   
2            8    183.0           64.0       29.15342  155.548223  23.3   
3            1     89.0           66.0       23.00000   94.000000  28.1   
4            0    137.0           40.0       35.00000  168.000000  43.1   

   DiabetesPedigreeFunction  Age  
0                     0.627   50  
1                     0.351   31  
2                     0.672   32  
3                     0.167   21  
4                     2.288   33  
   Pregnancies   Glucose  BloodPressure  SkinThickness       Insulin  \
0     0.639947  0.865108      -0.033518   6.655021e-01 -3.345079e-16   
1    -0.844885 -1.206162      -0.529859  -1.746338e-02 -3.345079e-16   
2     1.233880  2.015813      -0.695306   8.087936e-16 -3.345079e-16   
3    -0.844885 -1.074652      -0.529859  -7.004289e

In [11]:
observation_Y=pima_df['Outcome']
print(observation_Y)

0      1
1      0
2      1
3      0
4      1
      ..
763    0
764    0
765    0
766    1
767    0
Name: Outcome, Length: 768, dtype: int64


### Train Test Split

In [12]:
tr_X, te_X, tr_Y,te_Y=tts(features_X,observation_Y, test_size=0.1, random_state=42,stratify=observation_Y)

In [13]:
print("Train X Size:",tr_X.shape)
print("Test X Size:",te_X.shape)
print("Train Y Size:",tr_Y.shape)
print("Test Y Size:",te_Y.shape)

Train X Size: (691, 8)
Test X Size: (77, 8)
Train Y Size: (691,)
Test Y Size: (77,)


### Gradient Boosting Algorightm

In [14]:
my_grdBst=GradientBoostingClassifier(random_state=42)

In [15]:
my_grdBst.fit(tr_X,tr_Y)

GradientBoostingClassifier(random_state=42)

In [16]:
pred_Y=my_grdBst.predict(te_X)

In [17]:
print("Mean Square Error:",mean_squared_error(te_Y,pred_Y))
print("Mean Absolute Error:",mean_absolute_error(te_Y,pred_Y))
print("Precision:", precision_score(te_Y,pred_Y))
print("recall_score:",recall_score(te_Y,pred_Y))
print("roc_auc_score:",roc_auc_score(te_Y,pred_Y))

Mean Square Error: 0.23376623376623376
Mean Absolute Error: 0.23376623376623376
Precision: 0.6956521739130435
recall_score: 0.5925925925925926
roc_auc_score: 0.7262962962962962
