# Importing Basic Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the raw dataset from a CSV file given by a path relative to the working directory.
df = pd.read_csv('diabetes_dataset.csv')

In [3]:
# Display the DataFrame; the rendered output is truncated to the first and last rows.
df

Unnamed: 0,age,gender,ethnicity,education_level,income_level,employment_status,smoking_status,alcohol_consumption_per_week,physical_activity_minutes_per_week,diet_score,...,hdl_cholesterol,ldl_cholesterol,triglycerides,glucose_fasting,glucose_postprandial,insulin_level,hba1c,diabetes_risk_score,diabetes_stage,diagnosed_diabetes
0,58,Male,Asian,Highschool,Lower-Middle,Employed,Never,0,215,5.7,...,41,160,145,136,236,6.36,8.18,29.6,Type 2,1
1,48,Female,White,Highschool,Middle,Employed,Former,1,143,6.7,...,55,50,30,93,150,2.00,5.63,23.0,No Diabetes,0
2,60,Male,Hispanic,Highschool,Middle,Unemployed,Never,1,57,6.4,...,66,99,36,118,195,5.07,7.51,44.7,Type 2,1
3,74,Female,Black,Highschool,Low,Retired,Never,0,49,3.4,...,50,79,140,139,253,5.28,9.03,38.2,Type 2,1
4,46,Male,White,Graduate,Middle,Retired,Never,1,109,7.2,...,52,125,160,137,184,12.74,7.20,23.5,Type 2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,46,Male,Other,Graduate,Upper-Middle,Unemployed,Former,1,136,8.3,...,45,150,116,113,109,14.58,5.55,26.0,Pre-Diabetes,0
99996,41,Female,White,Graduate,Middle,Employed,Never,3,76,8.8,...,55,123,146,96,146,9.02,5.97,24.4,Pre-Diabetes,0
99997,57,Female,Black,No formal,Upper-Middle,Employed,Former,4,121,9.9,...,50,111,184,93,132,2.57,5.21,27.6,No Diabetes,0
99998,47,Female,Black,Highschool,Lower-Middle,Retired,Never,3,52,5.9,...,68,91,116,106,117,9.81,5.53,26.4,Pre-Diabetes,0


In [4]:
# List the column labels to see which feature and outcome fields are available.
df.columns

Index(['age', 'gender', 'ethnicity', 'education_level', 'income_level',
       'employment_status', 'smoking_status', 'alcohol_consumption_per_week',
       'physical_activity_minutes_per_week', 'diet_score',
       'sleep_hours_per_day', 'screen_time_hours_per_day',
       'family_history_diabetes', 'hypertension_history',
       'cardiovascular_history', 'bmi', 'waist_to_hip_ratio', 'systolic_bp',
       'diastolic_bp', 'heart_rate', 'cholesterol_total', 'hdl_cholesterol',
       'ldl_cholesterol', 'triglycerides', 'glucose_fasting',
       'glucose_postprandial', 'insulin_level', 'hba1c', 'diabetes_risk_score',
       'diabetes_stage', 'diagnosed_diabetes'],
      dtype='object')

In [5]:
# Summarise per-column dtypes and non-null counts to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 31 columns):
 #   Column                              Non-Null Count   Dtype  
---  ------                              --------------   -----  
 0   age                                 100000 non-null  int64  
 1   gender                              100000 non-null  object 
 2   ethnicity                           100000 non-null  object 
 3   education_level                     100000 non-null  object 
 4   income_level                        100000 non-null  object 
 5   employment_status                   100000 non-null  object 
 6   smoking_status                      100000 non-null  object 
 7   alcohol_consumption_per_week        100000 non-null  int64  
 8   physical_activity_minutes_per_week  100000 non-null  int64  
 9   diet_score                          100000 non-null  float64
 10  sleep_hours_per_day                 100000 non-null  float64
 11  screen_time_hours_per_day  

In [6]:
# Feature matrix: every column except the label and the two other diabetes outcome columns.
# NOTE(review): 'diabetes_risk_score' and 'diabetes_stage' are presumably dropped because
# they would leak the label — confirm against the dataset description.
X = df.drop(columns=['diagnosed_diabetes', 'diabetes_risk_score', 'diabetes_stage'])

# Target vector: the 'diagnosed_diabetes' column.
y = df['diagnosed_diabetes']

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [8]:
# Columns with object or string dtype are treated as the categorical features.
categorical_columns = X.select_dtypes(include=['object','string']).columns
categorical_columns

Index(['gender', 'ethnicity', 'education_level', 'income_level',
       'employment_status', 'smoking_status'],
      dtype='object')

In [9]:
# Columns with int64 or float64 dtype are treated as the numerical features.
# NOTE(review): a numeric column stored with any other dtype would not be selected
# here — confirm that none exist in this dataset.
numerical_columns = X.select_dtypes(include=['int64' , 'float64']).columns
numerical_columns

Index(['age', 'alcohol_consumption_per_week',
       'physical_activity_minutes_per_week', 'diet_score',
       'sleep_hours_per_day', 'screen_time_hours_per_day',
       'family_history_diabetes', 'hypertension_history',
       'cardiovascular_history', 'bmi', 'waist_to_hip_ratio', 'systolic_bp',
       'diastolic_bp', 'heart_rate', 'cholesterol_total', 'hdl_cholesterol',
       'ldl_cholesterol', 'triglycerides', 'glucose_fasting',
       'glucose_postprandial', 'insulin_level', 'hba1c'],
      dtype='object')

# Feature Engineering Automation

In [10]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer  ## for handling missing values
from sklearn.preprocessing import StandardScaler  ## for feature scaling
from sklearn.preprocessing import OneHotEncoder   ## for categorical to numerical
from sklearn.compose import ColumnTransformer

In [11]:
## Numerical pipeline: fill missing values with the column median, then apply StandardScaler.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

In [12]:
## Categorical pipeline: fill missing values with the most frequent category, then one-hot encode.
cat_pipeline = Pipeline(
    steps=[
        ('imputer' , SimpleImputer(strategy='most_frequent')),
        # handle_unknown='ignore' encodes a category that was not seen during fit as
        # all zeros instead of raising, so the fitted preprocessor can still transform
        # new rows that contain an unseen label.
        ('onehotencoder' , OneHotEncoder(handle_unknown='ignore'))
    ]
)

In [13]:
### Combine both pipelines with a ColumnTransformer so that each pipeline is applied
### only to its own group of columns.
preprocessor = ColumnTransformer(
    transformers=[
        ('num_pipeline', num_pipeline, numerical_columns),
        ('cat_pipeline', cat_pipeline, categorical_columns),
    ]
)

In [14]:
# Fit the preprocessor on the training split only, then apply the fitted
# transformation to the test split.
# NOTE(review): both names are overwritten with the transformed output, so the raw
# splits are no longer available afterwards and re-running this cell on its own would
# feed already-transformed data back into the preprocessor — confirm this is acceptable.
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [15]:
# Using different Algorithms

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

In [16]:
### Candidate models, keyed by the name used when reporting their scores.
### The tree-based estimators are seeded so that repeated runs give the same comparison.

models = {
    'RandomForest': RandomForestClassifier(random_state=42),
    'DecisionTree': DecisionTreeClassifier(random_state=42),
    'LogisticRegression': LogisticRegression(),
    'SVC': SVC(),
    'GaussianNB': GaussianNB(),
}

In [17]:
from sklearn.metrics import accuracy_score

In [18]:
### Now we write a function to evaluate our model

def evaluate_model(X_train , y_train , X_test , y_test):
    """Fit every estimator in the module-level ``models`` dict and score it.

    Parameters
    ----------
    X_train, y_train
        Training features and labels passed to each estimator's ``fit``.
    X_test, y_test
        Held-out features and labels used to compute the score.

    Returns
    -------
    dict
        Maps each key of ``models`` to the test-set accuracy of its estimator.

    Notes
    -----
    The estimator objects stored in ``models`` are fitted in place.
    """
    score = {}

    # Walk the name/estimator pairs directly instead of rebuilding the key and
    # value lists on every iteration.
    for name, model in models.items():
        ## Fitting into model
        model.fit(X_train, y_train)

        ## Predicting testing data
        y_test_pred = model.predict(X_test)

        ## Model accuracy; accuracy_score expects (y_true, y_pred) in that order
        score[name] = accuracy_score(y_test, y_test_pred)

    return score

In [19]:
# Compare the test accuracy of every candidate model.
evaluate_model(X_train , y_train , X_test , y_test)

{'RandomForest': 0.9226969696969697,
 'DecisionTree': 0.8638181818181818,
 'LogisticRegression': 0.8583939393939394,
 'SVC': 0.8916363636363637,
 'GaussianNB': 0.856}

In [20]:
## RandomForestClassifier has the highest accuracy score above, so it is the model we tune.
## A fixed random_state makes the tuning results and the saved model reproducible.

classifier = RandomForestClassifier(random_state=42)

# Hyperparameter Tuning

In [21]:
# Search space for the random forest: every combination of these values is one candidate.
parameters = dict(
    n_estimators=[10, 20, 30],
    criterion=['gini', 'entropy', 'log_loss'],
    max_depth=range(1, 4),
    max_features=['sqrt', 'log2', None],
)

In [22]:
from sklearn.model_selection import GridSearchCV , RandomizedSearchCV

# Exhaustive search over `parameters` with 5-fold cross-validation;
# verbose=3 logs the score and timing of every individual fit.
cv1 = GridSearchCV(
    classifier,
    param_grid=parameters,
    cv=5,
    verbose=3,
)
cv1.fit(X_train, y_train)

Fitting 5 folds for each of 81 candidates, totalling 405 fits
[CV 1/5] END criterion=gini, max_depth=1, max_features=sqrt, n_estimators=10;, score=0.864 total time=   0.2s
[CV 2/5] END criterion=gini, max_depth=1, max_features=sqrt, n_estimators=10;, score=0.601 total time=   0.1s
[CV 3/5] END criterion=gini, max_depth=1, max_features=sqrt, n_estimators=10;, score=0.601 total time=   0.1s
[CV 4/5] END criterion=gini, max_depth=1, max_features=sqrt, n_estimators=10;, score=0.878 total time=   0.1s
[CV 5/5] END criterion=gini, max_depth=1, max_features=sqrt, n_estimators=10;, score=0.635 total time=   0.1s
[CV 1/5] END criterion=gini, max_depth=1, max_features=sqrt, n_estimators=20;, score=0.653 total time=   0.3s
[CV 2/5] END criterion=gini, max_depth=1, max_features=sqrt, n_estimators=20;, score=0.708 total time=   0.3s
[CV 3/5] END criterion=gini, max_depth=1, max_features=sqrt, n_estimators=20;, score=0.897 total time=   0.3s
[CV 4/5] END criterion=gini, max_depth=1, max_features=sqr

In [23]:
# Predict labels for the held-out test split with the fitted grid search.
y_prediction = cv1.predict(X_test)

In [24]:
from sklearn.metrics import accuracy_score
# accuracy_score takes (y_true, y_pred): ground-truth labels first, predictions second.
score = accuracy_score(y_test, y_prediction)
print(score)

0.9229090909090909


In [25]:
# Hyperparameter combination selected by the grid search.
cv1.best_params_

{'criterion': 'gini', 'max_depth': 2, 'max_features': None, 'n_estimators': 10}

# Model Serialization (Saving and Loading)

In [29]:
import joblib
import os

In [30]:
# Progress message emitted before the artifacts are written.
print("Starting save process")

Starting save process


In [31]:
# Write the preprocessor (fitted earlier on the training split) to disk.
joblib.dump(preprocessor , 'diabetes_preprocessor.pkl')

['diabetes_preprocessor.pkl']

In [32]:
# Progress message for the preprocessor artifact.
print("1. Preprocessor saved as 'diabetes_preprocessor.pkl'")

1. Preprocessor saved as 'diabetes_preprocessor.pkl'


In [34]:
# Write the best estimator found by the grid search to disk.
joblib.dump(cv1.best_estimator_ , 'diabetes_model.pkl')

['diabetes_model.pkl']

In [35]:
# Progress message for the model artifact.
print("2. Best Model saved as 'diabetes_model.pkl'")

2. Best Model saved as 'diabetes_model.pkl'


In [36]:
# Confirm that both artifact files exist on disk and report the outcome.
if all(os.path.exists(name) for name in ('diabetes_model.pkl', 'diabetes_preprocessor.pkl')):
    print("\n SUCCESS : Both files saved successfully. You may close this notebook. ")
else:
    print("\n ERROR : Something went wrong.  Files not found!")


 SUCCESS : Both files saved successfully. You may close this notebook. 


In [42]:
# Build one default value per feature: the median for numeric columns and the most
# frequent value for every other column.
# NOTE(review): the defaults are computed on the full feature frame X, not only on the
# training split — confirm that is intended.
defaults = {}

for col in X.columns:
    # Test for "numeric" rather than for the object dtype, so that string- or
    # category-typed columns also take the mode branch instead of failing in median().
    if pd.api.types.is_numeric_dtype(X[col]):
        defaults[col] = X[col].median()

    else:
        defaults[col] = X[col].mode()[0]

print("=" * 30)
print(defaults)
print("=" * 30)

{'age': 50.0, 'gender': 'Female', 'ethnicity': 'White', 'education_level': 'Highschool', 'income_level': 'Middle', 'employment_status': 'Employed', 'smoking_status': 'Never', 'alcohol_consumption_per_week': 2.0, 'physical_activity_minutes_per_week': 100.0, 'diet_score': 6.0, 'sleep_hours_per_day': 7.0, 'screen_time_hours_per_day': 6.0, 'family_history_diabetes': 0.0, 'hypertension_history': 0.0, 'cardiovascular_history': 0.0, 'bmi': 25.6, 'waist_to_hip_ratio': 0.86, 'systolic_bp': 116.0, 'diastolic_bp': 75.0, 'heart_rate': 70.0, 'cholesterol_total': 186.0, 'hdl_cholesterol': 54.0, 'ldl_cholesterol': 102.0, 'triglycerides': 121.0, 'glucose_fasting': 111.0, 'glucose_postprandial': 160.0, 'insulin_level': 8.79, 'hba1c': 6.52}
