In [1]:
# Importing the Dependencies
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score


In [2]:
# Data Collection and Analysis

# Load the diabetes dataset to a pandas DataFrame
diabetes_dataset = pd.read_csv('/content/diabete.csv')

In [3]:
# Printing the first 5 rows of the dataset
print(diabetes_dataset.head())

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  


In [4]:
# Number of rows and columns in this dataset
print(diabetes_dataset.shape)

(768, 9)


In [5]:
# Getting the statistical summary of the data
print(diabetes_dataset.describe())

       Pregnancies     Glucose  BloodPressure  SkinThickness     Insulin  \
count   768.000000  768.000000     768.000000     768.000000  768.000000   
mean      3.845052  120.894531      69.105469      20.536458   79.799479   
std       3.369578   31.972618      19.355807      15.952218  115.244002   
min       0.000000    0.000000       0.000000       0.000000    0.000000   
25%       1.000000   99.000000      62.000000       0.000000    0.000000   
50%       3.000000  117.000000      72.000000      23.000000   30.500000   
75%       6.000000  140.250000      80.000000      32.000000  127.250000   
max      17.000000  199.000000     122.000000      99.000000  846.000000   

              BMI  DiabetesPedigreeFunction         Age     Outcome  
count  768.000000                768.000000  768.000000  768.000000  
mean    31.992578                  0.471876   33.240885    0.348958  
std      7.884160                  0.331329   11.760232    0.476951  
min      0.000000                  

In [6]:
# Count of outcomes
print(diabetes_dataset['Outcome'].value_counts())
# 0 --> Non-Diabetic
# 1 --> Diabetic

Outcome
0    500
1    268
Name: count, dtype: int64


In [7]:
# Grouping by outcome to get mean values
print(diabetes_dataset.groupby('Outcome').mean())

         Pregnancies     Glucose  BloodPressure  SkinThickness     Insulin  \
Outcome                                                                      
0           3.298000  109.980000      68.184000      19.664000   68.792000   
1           4.865672  141.257463      70.824627      22.164179  100.335821   

               BMI  DiabetesPedigreeFunction        Age  
Outcome                                                  
0        30.304200                  0.429734  31.190000  
1        35.142537                  0.550500  37.067164  


In [8]:
# Separating the features and labels/target
X = diabetes_dataset.drop(columns='Outcome', axis=1)
Y = diabetes_dataset['Outcome']

In [9]:
# Data Standardization
scaler = StandardScaler()
scaler.fit(X)

In [10]:
# Transform the data
standardized_data = scaler.transform(X)

In [11]:
X = standardized_data
Y = diabetes_dataset['Outcome']

In [12]:
# Train-Test Split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)
print(X.shape, X_train.shape, X_test.shape)

(768, 8) (614, 8) (154, 8)


In [13]:
# Hyperparameter tuning for the Gradient Boosting Classifier
param_dist = {
    'n_estimators': [100, 200],
    'learning_rate': [0.1, 0.2],
    'max_depth': [3, 4]
}

In [14]:
# Initialize the Gradient Boosting Classifier
classifier = GradientBoostingClassifier()

In [15]:
# Randomized search for the best parameters(hyperparameter tuning)
randomized_search = RandomizedSearchCV(estimator=classifier, param_distributions=param_dist,
                                       n_iter=8, scoring='accuracy', cv=3, verbose=1, n_jobs=-1)
randomized_search.fit(X_train, Y_train)


Fitting 3 folds for each of 8 candidates, totalling 24 fits


In [16]:
# Best parameters
print("Best parameters found: ", randomized_search.best_params_)

Best parameters found:  {'n_estimators': 100, 'max_depth': 4, 'learning_rate': 0.1}


In [17]:
# Using the best estimator
best_classifier = randomized_search.best_estimator_

In [18]:
# Training the best Gradient Boosting Classifier
best_classifier.fit(X_train, Y_train)

In [19]:
# Accuracy score on the training data
X_train_prediction = best_classifier.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)
print('Accuracy score of the training data: ', training_data_accuracy)

Accuracy score of the training data:  0.9820846905537459


In [20]:
# Accuracy score on the test data
X_test_prediction = best_classifier.predict(X_test)
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)
print('Accuracy score of the test data: ', test_data_accuracy)

Accuracy score of the test data:  0.7532467532467533


In [21]:
# Making a Predictive System
input_data = (5, 166, 72, 19, 175, 25.8, 0.587, 51)

In [22]:
# Assuming these were the original feature names used for training
feature_names = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
                 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']

In [23]:
# Convert input data to a DataFrame with the correct feature names
input_data_as_df = pd.DataFrame([input_data], columns=feature_names)

In [24]:
# Standardize the input data
std_data = scaler.transform(input_data_as_df)
print(std_data)

[[ 0.3429808   1.41167241  0.14964075 -0.09637905  0.82661621 -0.78595734
   0.34768723  1.51108316]]


In [25]:
# Make prediction
prediction = best_classifier.predict(std_data)
print("Prediction:", prediction)

Prediction: [1]


In [26]:
# Output the result
if prediction[0] == 0:
    print('The person is not diabetic.')
else:
    print('The person is diabetic.')

The person is diabetic.
