# Importing Libraries

In [1]:
# Importing the relevant libraries
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from scipy.stats import randint

# Data Preprocessing

### Loading Dataset

In [2]:
# Read the diabetes records from disk (the CSV is decoded as Latin-1)
df = pd.read_csv(
    'diabetes.csv',
    encoding='latin1',
)

# Print the schema summary: one line per column with its non-null count and dtype
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4303 entries, 0 to 4302
Data columns (total 18 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             4303 non-null   int64  
 1   Gender          4303 non-null   int64  
 2   BMI             4303 non-null   float64
 3   SBP             4303 non-null   int64  
 4   DBP             4303 non-null   int64  
 5   FPG             4303 non-null   float64
 6   Chol            4303 non-null   float64
 7   Tri             4303 non-null   float64
 8   HDL             4303 non-null   float64
 9   LDL             4303 non-null   float64
 10  ALT             4303 non-null   float64
 11  BUN             4303 non-null   float64
 12  CCR             4303 non-null   float64
 13  FFPG            4303 non-null   float64
 14  smoking         4303 non-null   float64
 15  drinking        4303 non-null   float64
 16  family_histroy  4303 non-null   int64  
 17  Diabetes        4303 non-null   int64  

### Validation Checks

The smoking column can take on 1 of 3 values depending on smoker status  
* 1 - current  
* 2 - former  
* 3 - never  
  
  
The drinking column can take on 1 of 3 values depending on drinker status  
* 1 - current  
* 2 - former  
* 3 - never  
  
  
The family_histroy column (the dataset spells the name this way) can take on 1 of 2 values  
* 0 - no family history of diabetes  
* 1 - family history of diabetes 
  
  
The Diabetes column can take on 1 of 2 values  
* 0 - no diabetes  
* 1 - diabetes 

In [3]:
# Show how often each value occurs in the columns that may only hold a fixed
# set of codes, so unexpected values stand out.
coded_columns = ['Gender', 'smoking', 'drinking', 'family_histroy', 'Diabetes']

for position, column_name in enumerate(coded_columns):
    frequencies = df[column_name].value_counts()
    if position == 0:
        # The first table is printed without a separator
        print(frequencies)
    else:
        # Later tables are preceded by a blank line
        print("\n", frequencies)

Gender
1    2790
2    1513
Name: count, dtype: int64

 smoking
3.000000    2534
4.860753     888
1.000000     745
2.000000     136
Name: count, dtype: int64

 drinking
3.000000    2749
4.860753     888
2.000000     583
1.000000      83
Name: count, dtype: int64

 family_histroy
0    4038
1     265
Name: count, dtype: int64

 Diabetes
0    3000
1    1303
Name: count, dtype: int64


### Cleaning Data

In [4]:
# Removing rows with invalid values
#
# `smoking` and `drinking` are status codes that may only be 1 (current),
# 2 (former) or 3 (never). Any other value marks the row as invalid; the raw
# file contains 4.860753 in both columns.
#
# A row is rejected when EITHER column holds an invalid code. The previous
# row-by-row loop chained all six comparisons with `and`, so it only rejected
# a row when BOTH columns were invalid and would have kept a row with a single
# bad code. It also performed six Python-level `df.iloc[i]` lookups per row.
valid_codes = [1, 2, 3]
has_valid_codes = df['smoking'].isin(valid_codes) & df['drinking'].isin(valid_codes)

# Index labels of the rejected rows, kept for inspection
invalid = df.index[~has_valid_codes].tolist()

# Keep the valid rows, shuffle them reproducibly and renumber the index from 0
df = df[has_valid_codes].sample(frac=1, random_state=123).reset_index(drop=True)
df

Unnamed: 0,Age,Gender,BMI,SBP,DBP,FPG,Chol,Tri,HDL,LDL,ALT,BUN,CCR,FFPG,smoking,drinking,family_histroy,Diabetes
0,32,1,20.7,112,64,5.52,4.79,1.67,4.860753,4.860753,64.0,4.91,61.0,10.30,3.0,3.0,0,1
1,47,1,21.8,116,83,5.03,4.32,1.19,1.200000,2.830000,14.5,4.68,63.1,5.10,3.0,3.0,0,0
2,56,2,28.5,135,78,5.94,6.45,2.69,1.290000,3.990000,44.5,5.74,57.4,6.70,3.0,3.0,0,0
3,30,1,26.9,127,63,4.99,4.34,1.28,1.350000,2.480000,41.7,4.60,70.7,5.59,1.0,3.0,0,0
4,32,2,25.8,144,83,5.47,4.27,0.43,1.840000,2.330000,13.9,4.32,50.1,5.12,3.0,3.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3410,32,1,25.2,123,71,5.20,4.38,1.12,1.180000,2.530000,19.0,2.90,78.9,5.40,3.0,3.0,0,0
3411,60,2,20.7,121,74,6.16,5.51,1.00,4.860753,4.860753,12.7,5.48,55.4,7.00,3.0,3.0,0,1
3412,33,2,23.8,102,69,4.20,7.51,2.54,2.060000,4.160000,17.2,4.30,54.7,4.60,3.0,3.0,0,0
3413,31,1,20.9,121,75,4.08,3.59,0.58,1.430000,2.010000,18.7,4.65,66.6,4.92,2.0,3.0,0,0


### Creating Training and Testing Datasets

In [5]:
# Separate the target column (`Diabetes`) from the predictor columns
X1 = df.drop(columns='Diabetes')
y1 = df['Diabetes']

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X1_train, X1_test, y1_train, y1_test = train_test_split(
    X1,
    y1,
    test_size=0.2,
    random_state=42,
)

### Creating Training and Testing Datasets with Balanced Data

In [6]:
# Balancing the datasets by down-sampling the larger class
#
# NOTE: the names read inverted -- `df_pos` holds the Diabetes == 0 rows and
# `df_neg` holds the Diabetes == 1 rows. They are kept unchanged for
# compatibility.
df_pos = df[df['Diabetes']==0]
df_neg = df[df['Diabetes']==1]

# Each class contributes as many rows as the smaller class has. Previously the
# Diabetes == 0 rows were always sampled down to len(df_neg), which raises a
# ValueError as soon as Diabetes == 1 is the larger class.
n_per_class = min(len(df_pos), len(df_neg))
class0_rows = df_pos.sample(n=n_per_class, random_state=42)
# The Diabetes == 1 rows are only sampled when they actually need trimming, so
# the result is unchanged whenever Diabetes == 0 is the larger (or equal) class.
class1_rows = df_neg if len(df_neg) == n_per_class else df_neg.sample(n=n_per_class, random_state=42)

# Stack both classes, then shuffle so the classes are interleaved
df2 = pd.concat([class0_rows, class1_rows], axis=0)
df2 = df2.sample(frac=1, random_state=42).reset_index(drop=True)

# Creating the balanced training and testing datasets
y2 = df2['Diabetes']
X2 = df2.drop('Diabetes', axis=1)

X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.2, random_state=42)

# Random Forest

### Initial Random Forest Model

In [7]:
# Build a Random Forest with default settings and train it on the original training split
rf_model1 = RandomForestClassifier(random_state=42)
rf_model1.fit(X1_train, y1_train)

# Predict the held-out rows of the original split
y1_pred_rf = rf_model1.predict(X1_test)

# Report the test accuracy, then the per-class precision / recall / F1 table
rf1_accuracy = accuracy_score(y1_test, y1_pred_rf)
rf1_report = classification_report(y1_test, y1_pred_rf)
print("Accuracy on Test Set:", rf1_accuracy)
print("\nClassification Report:")
print(rf1_report)

Accuracy on Test Set: 0.9326500732064422

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.96      0.96       597
           1       0.74      0.71      0.73        86

    accuracy                           0.93       683
   macro avg       0.85      0.84      0.84       683
weighted avg       0.93      0.93      0.93       683



### Getting Important Features

In [8]:
# Getting important features
#
# Pair each training column with the importance reported by the first Random
# Forest and order the result from most to least important.
rf_feature_importance = pd.Series(rf_model1.feature_importances_, index=X1_train.columns).sort_values(ascending=False)

# Keep every feature whose importance exceeds the cut-off, in ranked order.
# A boolean mask replaces the old `rf_feature_importance[x]` integer lookups:
# the Series is indexed by column name, so integer keys relied on pandas'
# deprecated positional fallback. The `count` tally that was incremented but
# never read has been removed.
rf_importance_threshold = 0.05
rf_selected_features = rf_feature_importance[rf_feature_importance > rf_importance_threshold].index.tolist()

rf_selected_features

['FFPG', 'HDL', 'LDL', 'FPG', 'Age']

### Random Forest Model with Balanced Dataset and Important Features

In [9]:
# Keep only the Random-Forest-selected columns of the balanced train/test splits.
# NOTE(review): rf_selected_features comes from rf_model1, which was fit on
# X1_train. X2_test is an independent split of rows drawn from the same df, so
# it may contain rows that are also in X1_train -- confirm this overlap is acceptable.
rf_X2_train = X2_train[rf_selected_features]
rf_X2_test = X2_test[rf_selected_features]

# Train a fresh default Random Forest on the reduced, balanced training split
rf_model2 = RandomForestClassifier(random_state=42)
rf_model2.fit(rf_X2_train, y2_train)

# Predict the reduced, balanced test split
y2_pred_rf = rf_model2.predict(rf_X2_test)

# Report the test accuracy, then the per-class precision / recall / F1 table
print("Accuracy on Test Set:", accuracy_score(y2_test, y2_pred_rf))
print("\nClassification Report:", classification_report(y2_test, y2_pred_rf), sep="\n")

Accuracy on Test Set: 0.8674698795180723

Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.86      0.88        94
           1       0.83      0.88      0.85        72

    accuracy                           0.87       166
   macro avg       0.86      0.87      0.87       166
weighted avg       0.87      0.87      0.87       166



### HyperParameter Tuning

In [10]:
# Candidate Random Forest settings for the randomized search; each keyword is
# a RandomForestClassifier parameter and each list holds the values to try.
param_grid1 = dict(
    n_estimators=[50, 100, 150, 250],
    max_features=['sqrt', 'log2', None],
    min_samples_split=[5, 10, 20],
    max_depth=[5, 10, 15, 20],
)

In [11]:
# Randomized search over param_grid1: 10 sampled candidates, each scored with
# 3-fold cross-validation on the reduced, balanced training split.
random_search1 = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_distributions=param_grid1,
    n_iter=10,
    cv=3,
    random_state=42,
)
random_search1.fit(rf_X2_train, y2_train)

# Show the best-scoring combination
random_search1.best_params_

{'n_estimators': 150,
 'min_samples_split': 10,
 'max_features': 'log2',
 'max_depth': 5}

### Random Forest Model with Balanced Dataset, Important Features and Best HyperParameters

In [12]:
# Initializing Random Forest Model with the best hyperparameters and fitting it on the updated training set.
# The settings are read from `random_search1.best_params_` instead of being
# retyped by hand, so this model cannot drift from the search result if the
# grid, the data or the seed changes.
rf_model3 = RandomForestClassifier(**random_search1.best_params_, random_state=42).fit(rf_X2_train, y2_train)

# Making predictions with the Random Forest Model on the updated test data
y3_pred_rf = rf_model3.predict(rf_X2_test)

# Evaluating the model's performance
print("Accuracy on Test Set:", accuracy_score(y2_test, y3_pred_rf))
print("\nClassification Report:")
print(classification_report(y2_test, y3_pred_rf))

Accuracy on Test Set: 0.8855421686746988

Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.90      0.90        94
           1       0.87      0.86      0.87        72

    accuracy                           0.89       166
   macro avg       0.88      0.88      0.88       166
weighted avg       0.89      0.89      0.89       166



# Extreme Gradient Boosting

### Initial Extreme Gradient Boosting Model

In [13]:
# Build an XGBoost classifier with default settings and train it on the original training split
xgb_model1 = XGBClassifier(random_state=42)
xgb_model1.fit(X1_train, y1_train)

# Predict the held-out rows of the original split
y1_pred_xgb = xgb_model1.predict(X1_test)

# Report the test accuracy, then the per-class precision / recall / F1 table
xgb1_accuracy = accuracy_score(y1_test, y1_pred_xgb)
xgb1_report = classification_report(y1_test, y1_pred_xgb)
print("Accuracy on Test Set:", xgb1_accuracy)
print("\nClassification Report:")
print(xgb1_report)

Accuracy on Test Set: 0.9326500732064422

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.97      0.96       597
           1       0.76      0.69      0.72        86

    accuracy                           0.93       683
   macro avg       0.86      0.83      0.84       683
weighted avg       0.93      0.93      0.93       683



### Getting Important Features

In [14]:
# Getting important features
#
# Pair each training column with the importance reported by the first XGBoost
# model and order the result from most to least important.
xgb_feature_importance = pd.Series(xgb_model1.feature_importances_, index=X1_train.columns).sort_values(ascending=False)

# Keep every feature whose importance exceeds the cut-off, in ranked order.
# A boolean mask replaces the old `xgb_feature_importance[x]` integer lookups:
# the Series is indexed by column name, so integer keys relied on pandas'
# deprecated positional fallback. The `count` tally that was incremented but
# never read has been removed.
xgb_importance_threshold = 0.05
xgb_selected_features = xgb_feature_importance[xgb_feature_importance > xgb_importance_threshold].index.tolist()

xgb_selected_features

['FFPG', 'HDL', 'Age', 'FPG']

### Extreme Gradient Boosting Model with Balanced Dataset and Important Features

In [15]:
# Keep only the XGBoost-selected columns of the balanced train/test splits.
# NOTE(review): xgb_selected_features comes from xgb_model1, which was fit on
# X1_train. X2_test is an independent split of rows drawn from the same df, so
# it may contain rows that are also in X1_train -- confirm this overlap is acceptable.
xgb_X2_train = X2_train[xgb_selected_features]
xgb_X2_test = X2_test[xgb_selected_features]

# Train a fresh default XGBoost classifier on the reduced, balanced training split
xgb_model2 = XGBClassifier(random_state=42)
xgb_model2.fit(xgb_X2_train, y2_train)

# Predict the reduced, balanced test split
y2_pred_xgb = xgb_model2.predict(xgb_X2_test)

# Report the test accuracy, then the per-class precision / recall / F1 table
print("Accuracy on Test Set:", accuracy_score(y2_test, y2_pred_xgb))
print("\nClassification Report:", classification_report(y2_test, y2_pred_xgb), sep="\n")

Accuracy on Test Set: 0.8554216867469879

Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.83      0.87        94
           1       0.80      0.89      0.84        72

    accuracy                           0.86       166
   macro avg       0.85      0.86      0.85       166
weighted avg       0.86      0.86      0.86       166



### HyperParameter Tuning

In [16]:
# Candidate XGBoost settings for the randomized search; each keyword is an
# XGBClassifier parameter and each list holds the values to try.
param_grid2 = dict(
    max_depth=[3, 5, 7],
    min_child_weight=[1, 3, 5],
    gamma=[0.5, 1, 1.5],
    subsample=[0.7, 0.9],
    colsample_bytree=[0.7, 0.9],
    n_estimators=[100, 150],
    learning_rate=[0.05, 0.1, 0.15],
)

In [17]:
# Randomized search over param_grid2: 10 sampled candidates, each scored with
# 3-fold cross-validation on the reduced, balanced training split.
random_search2 = RandomizedSearchCV(
    estimator=XGBClassifier(random_state=42),
    param_distributions=param_grid2,
    n_iter=10,
    cv=3,
    random_state=42,
)
random_search2.fit(xgb_X2_train, y2_train)

# Show the best-scoring combination
random_search2.best_params_

{'subsample': 0.7,
 'n_estimators': 100,
 'min_child_weight': 3,
 'max_depth': 3,
 'learning_rate': 0.05,
 'gamma': 1.5,
 'colsample_bytree': 0.7}

### Extreme Gradient Boosting Model with Balanced Dataset, Important Features and Best HyperParameters

In [18]:
# Initializing Extreme Gradient Boosting Model with the best hyperparameters and fitting it on the updated training set.
# The settings are read from `random_search2.best_params_` instead of being
# retyped by hand, so this model cannot drift from the search result if the
# grid, the data or the seed changes.
xgb_model3 = XGBClassifier(**random_search2.best_params_, random_state=42).fit(xgb_X2_train, y2_train)

# Making predictions with the Extreme Gradient Boosting Model on the updated test data
y3_pred_xgb = xgb_model3.predict(xgb_X2_test)

# Evaluating the model's performance
print("Accuracy on Test Set:", accuracy_score(y2_test, y3_pred_xgb))
print("\nClassification Report:")
print(classification_report(y2_test, y3_pred_xgb))

Accuracy on Test Set: 0.8855421686746988

Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.90      0.90        94
           1       0.87      0.86      0.87        72

    accuracy                           0.89       166
   macro avg       0.88      0.88      0.88       166
weighted avg       0.89      0.89      0.89       166



# Logistic Regression

### Initial Logistic Regression Model

In [19]:
# Build a Logistic Regression (iteration cap raised to 5000) and train it on the original training split
lr_model1 = LogisticRegression(max_iter=5000, random_state=42)
lr_model1.fit(X1_train, y1_train)

# Predict the held-out rows of the original split
y1_pred_lr = lr_model1.predict(X1_test)

# Report the test accuracy, then the per-class precision / recall / F1 table
lr1_accuracy = accuracy_score(y1_test, y1_pred_lr)
lr1_report = classification_report(y1_test, y1_pred_lr)
print("Accuracy on Test Set:", lr1_accuracy)
print("\nClassification Report:")
print(lr1_report)

Accuracy on Test Set: 0.9341142020497804

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.98      0.96       597
           1       0.80      0.64      0.71        86

    accuracy                           0.93       683
   macro avg       0.87      0.81      0.84       683
weighted avg       0.93      0.93      0.93       683



### Getting Important Features

In [20]:
# Getting important features
#
# Run recursive feature elimination around the logistic regression estimator,
# asking it to keep 5 features of the original training split.
rfe = RFE(lr_model1, n_features_to_select=5).fit(X1_train, y1_train)

# Mask aligned with the columns of X1_train: truthy where RFE kept the feature
lr_feature_importance = rfe.support_

# Pick the kept column names straight from the mask. This replaces the loop
# that walked the columns while advancing a separate `count` index by hand.
lr_selected_features = X1_train.columns[lr_feature_importance].tolist()

lr_selected_features

['FPG', 'HDL', 'LDL', 'FFPG', 'family_histroy']

### Logistic Regression Model with Balanced Dataset and Important Features

In [21]:
# Keep only the RFE-selected columns of the balanced train/test splits.
# NOTE(review): lr_selected_features was chosen by RFE fitted on X1_train.
# X2_test is an independent split of rows drawn from the same df, so it may
# contain rows that are also in X1_train -- confirm this overlap is acceptable.
lr_X2_train = X2_train[lr_selected_features]
lr_X2_test = X2_test[lr_selected_features]

# Train a fresh Logistic Regression on the reduced, balanced training split
lr_model2 = LogisticRegression(max_iter=5000, random_state=42)
lr_model2.fit(lr_X2_train, y2_train)

# Predict the reduced, balanced test split
y2_pred_lr = lr_model2.predict(lr_X2_test)

# Report the test accuracy, then the per-class precision / recall / F1 table
print("Accuracy on Test Set:", accuracy_score(y2_test, y2_pred_lr))
print("\nClassification Report:", classification_report(y2_test, y2_pred_lr), sep="\n")

Accuracy on Test Set: 0.8614457831325302

Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.88      0.88        94
           1       0.85      0.83      0.84        72

    accuracy                           0.86       166
   macro avg       0.86      0.86      0.86       166
weighted avg       0.86      0.86      0.86       166



### HyperParameter Tuning

In [22]:
# Candidate Logistic Regression settings for the randomized search; each keyword
# is a LogisticRegression parameter and each list holds the values to try.
param_grid3 = dict(
    solver=['newton-cg', 'newton-cholesky', 'lbfgs', 'liblinear', 'sag', 'saga'],
    penalty=['l2'],
    C=[100, 10, 1, 0.1, 0.01, 0.001],
)

In [23]:
# Randomized search over param_grid3: 25 sampled candidates, each scored with
# 3-fold cross-validation on the reduced, balanced training split.
random_search3 = RandomizedSearchCV(
    estimator=LogisticRegression(max_iter=5000, random_state=42),
    param_distributions=param_grid3,
    n_iter=25,
    cv=3,
    random_state=42,
)
random_search3.fit(lr_X2_train, y2_train)

# Show the best-scoring combination
random_search3.best_params_

{'solver': 'newton-cholesky', 'penalty': 'l2', 'C': 0.1}

### Logistic Regression Model with Balanced Dataset, Important Features and Best HyperParameters

In [24]:
# Initializing Logistic Regression Model with the best hyperparameters and fitting it on the updated training set.
# The settings are read from `random_search3.best_params_` instead of being
# retyped by hand, so this model cannot drift from the search result if the
# grid, the data or the seed changes.
lr_model3 = LogisticRegression(**random_search3.best_params_, max_iter=5000, random_state=42).fit(lr_X2_train, y2_train)

# Making predictions with the Logistic Regression Model on the updated test data
y3_pred_lr = lr_model3.predict(lr_X2_test)

# Evaluating the model's performance
print("Accuracy on Test Set:", accuracy_score(y2_test, y3_pred_lr))
print("\nClassification Report:")
print(classification_report(y2_test, y3_pred_lr))

Accuracy on Test Set: 0.8734939759036144

Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.90      0.89        94
           1       0.87      0.83      0.85        72

    accuracy                           0.87       166
   macro avg       0.87      0.87      0.87       166
weighted avg       0.87      0.87      0.87       166

