# Converting Data into DataFrame

In [1]:
import pandas as pd

# Load the dataset
data_path = 'diabetes_prediction_dataset.csv'  # Make sure to replace this with the actual path to your dataset
diabetes_data = pd.read_csv(data_path)

# Display the first few rows of the DataFrame
diabetes_data.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


# Cleaning Data

In [2]:
diabetes_data_encoded = pd.get_dummies(diabetes_data, columns=['gender'], prefix='gender')

diabetes_data_encoded = pd.get_dummies(diabetes_data_encoded, columns=['smoking_history'], prefix='smoking')

# Displaying the first few rows of the DataFrame to show the result of one-hot encoding
diabetes_data_encoded.head()

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes,gender_Female,gender_Male,gender_Other,smoking_No Info,smoking_current,smoking_ever,smoking_former,smoking_never,smoking_not current
0,80.0,0,1,25.19,6.6,140,0,True,False,False,False,False,False,False,True,False
1,54.0,0,0,27.32,6.6,80,0,True,False,False,True,False,False,False,False,False
2,28.0,0,0,27.32,5.7,158,0,False,True,False,False,False,False,False,True,False
3,36.0,0,0,23.45,5.0,155,0,True,False,False,False,True,False,False,False,False
4,76.0,1,1,20.14,4.8,155,0,False,True,False,False,True,False,False,False,False


# Initial Training Random Forest Model

In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

# Preparing the data
X = diabetes_data_encoded.drop('diabetes', axis=1)
y = diabetes_data_encoded['diabetes']

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initializing the Random Forest classifier
rf_all = RandomForestClassifier(n_estimators=100, random_state=42)

# Fitting the model on the training set
rf_all.fit(X_train, y_train)

# Predicting diabetes with the Random Forest classifier on the test set
y_pred = rf_all.predict(X_test)

# Evaluating the model's performance
print("Accuracy on Test Set:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Accuracy on Test Set: 0.97005

Classification Report:
              precision    recall  f1-score   support

           0       0.97      1.00      0.98     18292
           1       0.95      0.69      0.80      1708

    accuracy                           0.97     20000
   macro avg       0.96      0.84      0.89     20000
weighted avg       0.97      0.97      0.97     20000



# Importance Feature Search Using Random Forest

In [4]:
X = diabetes_data_encoded.drop('diabetes', axis=1)
y = diabetes_data_encoded['diabetes']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Random Forest classifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Fit the model
rf.fit(X_train, y_train)

# Predict on the testing set
y_pred = rf.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)

# Get feature importances
feature_importances = pd.Series(rf.feature_importances_, index=X.columns).sort_values(ascending=False)

feature_importances

HbA1c_level            0.405538
blood_glucose_level    0.314021
bmi                    0.125966
age                    0.104367
hypertension           0.015910
heart_disease          0.011052
smoking_No Info        0.004464
smoking_former         0.003788
smoking_never          0.003326
smoking_current        0.002543
gender_Male            0.002361
gender_Female          0.002344
smoking_not current    0.002226
smoking_ever           0.002092
gender_Other           0.000003
dtype: float64

# Checking For Imbalance Dataset

In [5]:
# Checking the balance of the target variable 'diabetes'
target_balance = diabetes_data_encoded['diabetes'].value_counts(normalize=True)

target_balance

diabetes
0    0.915
1    0.085
Name: proportion, dtype: float64

# Second Training of Random Forest Model with Important Features and Balance Dataset

In [6]:
from sklearn.utils import resample

# Identify the number of instances with diabetes=1
n_diabetes_pos = diabetes_data_encoded[diabetes_data_encoded['diabetes'] == 1].shape[0]

# Downsample the instances where diabetes=0 to match the number of diabetes=1
df_majority = diabetes_data_encoded[diabetes_data_encoded['diabetes'] == 0]
df_minority = diabetes_data_encoded[diabetes_data_encoded['diabetes'] == 1]

df_majority_downsampled_8500 = resample(df_majority, 
                                        replace=False,    # sample without replacement
                                        n_samples=8500,   # to match exactly 8500 samples for diabetes class
                                        random_state=123) # reproducible results

# Combine the downsampled majority class with the original minority class
df_balanced_8500 = pd.concat([df_majority_downsampled_8500, df_minority])

# Split the balanced dataset into features (X) and target variable (y) using only the important features
important_features = ['HbA1c_level', 'blood_glucose_level', 'bmi', 'age']

X_balanced_8500 = df_balanced_8500[important_features]
y_balanced_8500 = df_balanced_8500['diabetes']

# Split the data into training and testing sets
X_train_bal_8500, X_test_bal_8500, y_train_bal_8500, y_test_bal_8500 = train_test_split(X_balanced_8500, y_balanced_8500, test_size=0.2, random_state=42)

rf_balanced_8500 = RandomForestClassifier(n_estimators=100, random_state=42)

# Fit the model on the balanced dataset
rf_balanced_8500.fit(X_train_bal_8500, y_train_bal_8500)

# Predict on the testing set
y_pred_bal_8500 = rf_balanced_8500.predict(X_test_bal_8500)

# Generate classification report on the balanced dataset
print("Accuracy on Test Set:", accuracy_score(y_test_bal_8500, y_pred_bal_8500))
print("\nClassification Report:")
print(classification_report(y_test_bal_8500, y_pred_bal_8500))

Accuracy on Test Set: 0.8970588235294118

Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.89      0.90      1687
           1       0.89      0.90      0.90      1713

    accuracy                           0.90      3400
   macro avg       0.90      0.90      0.90      3400
weighted avg       0.90      0.90      0.90      3400



# HyperParameter GridSearching for Random Forest

In [7]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Define the parameter grid for Random Search
param_dist = {
    'n_estimators': randint(100, 500),
    'max_depth': randint(10, 50),
    'min_samples_split': randint(2, 20),
    'max_features': ['sqrt', 'log2', None]
}

# Initialize the Random Search model
random_search = RandomizedSearchCV(
    RandomForestClassifier(random_state=42),
    param_distributions=param_dist,
    n_iter=10,  # Number of parameter settings that are sampled. Increase for better results but longer computation.
    cv=3,       # Cross-validation strategy. Increase for more reliable estimates but longer computation.
    random_state=42,
    n_jobs=-1   # Use all available cores
)

# Fit the model on the balanced dataset
random_search.fit(X_train_bal_8500, y_train_bal_8500)

# Best parameters found by Random Search
best_params = random_search.best_params_
best_score = random_search.best_score_

best_params, best_score

({'max_depth': 12,
  'max_features': 'sqrt',
  'min_samples_split': 8,
  'n_estimators': 120},
 0.9066178352653335)

# Final Training of Random Forest Model with Best HyperParameters

In [8]:
# Adjusted hyperparameters

rf_balanced_adjusted = RandomForestClassifier(
    n_estimators=120,  # Increased from 100 to 200
    max_depth=12,       # Increased depth
    min_samples_split=8,  # Require more samples to split
    random_state=42,
    max_features = 'sqrt',
    class_weight= 'balanced'
)

# Fit the model on the balanced dataset with adjusted hyperparameters
rf_balanced_adjusted.fit(X_train_bal_8500, y_train_bal_8500)

# Predict on the testing set
y_pred_bal_adjusted = rf_balanced_adjusted.predict(X_test_bal_8500)

# Calculate accuracy and generate classification report on the balanced dataset
accuracy_bal_adjusted = accuracy_score(y_test_bal_8500, y_pred_bal_adjusted)
report_bal_adjusted = classification_report(y_test_bal_8500, y_pred_bal_adjusted)

accuracy_bal_adjusted, report_bal_adjusted

print("Accuracy on Test Set:", accuracy_bal_adjusted)
print("\nClassification Report:")
print(report_bal_adjusted)

Accuracy on Test Set: 0.9073529411764706

Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.89      0.91      1687
           1       0.90      0.92      0.91      1713

    accuracy                           0.91      3400
   macro avg       0.91      0.91      0.91      3400
weighted avg       0.91      0.91      0.91      3400



--------------------------------------------------------------------------------------------------------------------------------

# Initial Training of Extreme Gradient Boosting Model

In [9]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

# Preparing the data
X = diabetes_data_encoded.drop('diabetes', axis=1)
y = diabetes_data_encoded['diabetes']

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initializing the XGBoost classifier
xgb_classifier = XGBClassifier(n_estimators=100, random_state=42, use_label_encoder=False, eval_metric='logloss')

# Fitting the model on the training set
xgb_classifier.fit(X_train, y_train)

# Predicting diabetes with the XGBoost classifier on the test set
y_pred = xgb_classifier.predict(X_test)

# Evaluating the model's performance
print("Accuracy on Test Set:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Accuracy on Test Set: 0.97145

Classification Report:
              precision    recall  f1-score   support

           0       0.97      1.00      0.98     18292
           1       0.95      0.70      0.81      1708

    accuracy                           0.97     20000
   macro avg       0.96      0.85      0.90     20000
weighted avg       0.97      0.97      0.97     20000



# Second Training of Extreme Gradient Boosting Model with Important Features and Balance DataSet

In [10]:
from sklearn.utils import resample
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

# Balancing the dataset
n_diabetes_pos = diabetes_data_encoded[diabetes_data_encoded['diabetes'] == 1].shape[0]
df_majority = diabetes_data_encoded[diabetes_data_encoded['diabetes'] == 0]
df_minority = diabetes_data_encoded[diabetes_data_encoded['diabetes'] == 1]

df_majority_downsampled_8500 = resample(df_majority, 
                                        replace=False,    # sample without replacement
                                        n_samples=8500,   # to match minority class count
                                        random_state=123) # reproducible results

df_balanced_8500 = pd.concat([df_majority_downsampled_8500, df_minority])

# Selecting important features
important_features = ['HbA1c_level', 'blood_glucose_level', 'bmi', 'age']
X_balanced_8500 = df_balanced_8500[important_features]
y_balanced_8500 = df_balanced_8500['diabetes']

# Splitting the data into training and testing sets
X_train_bal_8500, X_test_bal_8500, y_train_bal_8500, y_test_bal_8500 = train_test_split(X_balanced_8500, y_balanced_8500, test_size=0.2, random_state=42)

# Initializing and training the XGBoost classifier
xgb_classifier_bal_8500 = XGBClassifier(n_estimators=100, random_state=42, use_label_encoder=False, eval_metric='logloss')
xgb_classifier_bal_8500.fit(X_train_bal_8500, y_train_bal_8500)

# Making predictions on the test set
y_pred_bal_8500 = xgb_classifier_bal_8500.predict(X_test_bal_8500)

# Generating and printing the classification report and accuracy
print("Accuracy on Test Set:", accuracy_score(y_test_bal_8500, y_pred_bal_8500))
print("\nClassification Report:")
print(classification_report(y_test_bal_8500, y_pred_bal_8500))


Accuracy on Test Set: 0.8991176470588236

Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.88      0.90      1687
           1       0.89      0.91      0.90      1713

    accuracy                           0.90      3400
   macro avg       0.90      0.90      0.90      3400
weighted avg       0.90      0.90      0.90      3400



# Final Training of Extreme Gradient Boosting with Best HyperParameters

In [11]:
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score

# Define the parameter distribution to sample from
param_dist = {
    'max_depth': [3, 5, 7],                     # Limits the depth of the tree
    'min_child_weight': [1, 3, 5],              # Minimum sum of instance weight (hessian) needed in a child
    'gamma': [0.5, 1, 1.5],                     # Minimum loss reduction required to make a further partition on a leaf node
    'subsample': [0.7, 0.9],                    # Subsample ratio of the training instances
    'colsample_bytree': [0.7, 0.9],             # Subsample ratio of columns when constructing each tree
    'n_estimators': [100, 150],                 # Number of trees in the forest
    'learning_rate': [0.05, 0.1, 0.15]          # Step size shrinkage used to prevent overfitting
}

# Initialize the XGBClassifier and RandomizedSearchCV
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
random_search = RandomizedSearchCV(estimator=xgb_model, param_distributions=param_dist, 
                                   n_iter=10, cv=3, verbose=2, random_state=42, n_jobs=-1)

# Fit RandomizedSearchCV to the training data
random_search.fit(X_train_bal_8500, y_train_bal_8500)

# Extract the best model
best_xgb = random_search.best_estimator_

# Predict using the best model
y_pred_best = best_xgb.predict(X_test_bal_8500)

# Evaluate the best model
print("Best Parameters:", random_search.best_params_)
print("Accuracy on Test Set:", accuracy_score(y_test_bal_8500, y_pred_best))
print("\nClassification Report:")
print(classification_report(y_test_bal_8500, y_pred_best))


Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best Parameters: {'subsample': 0.9, 'n_estimators': 150, 'min_child_weight': 5, 'max_depth': 5, 'learning_rate': 0.15, 'gamma': 1.5, 'colsample_bytree': 0.9}
Accuracy on Test Set: 0.9061764705882352

Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.89      0.90      1687
           1       0.90      0.92      0.91      1713

    accuracy                           0.91      3400
   macro avg       0.91      0.91      0.91      3400
weighted avg       0.91      0.91      0.91      3400



--------------------------------------------------------------------------------------------------------------------------------

# Initial Training of Logistic Regression Model

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

# Preparing the data
X = diabetes_data_encoded.drop('diabetes', axis=1)
y = diabetes_data_encoded['diabetes']

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initializing the Logistic Regression classifier
lr_classifier = LogisticRegression(random_state=42, max_iter=1000)  # Increased max_iter for convergence

# Fitting the model on the training set
lr_classifier.fit(X_train, y_train)

# Predicting diabetes with the Logistic Regression classifier on the test set
y_pred = lr_classifier.predict(X_test)

# Evaluating the model's performance
print("Accuracy on Test Set:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Accuracy on Test Set: 0.959

Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.99      0.98     18292
           1       0.86      0.62      0.72      1708

    accuracy                           0.96     20000
   macro avg       0.91      0.80      0.85     20000
weighted avg       0.96      0.96      0.96     20000



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# Second Training of Logistic Regression Model with Important Features and Balance DataSet

In [13]:
from sklearn.utils import resample
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import pandas as pd  # Ensure pandas is imported to handle data operations

# Balancing the dataset
n_diabetes_pos = diabetes_data_encoded[diabetes_data_encoded['diabetes'] == 1].shape[0]
df_majority = diabetes_data_encoded[diabetes_data_encoded['diabetes'] == 0]
df_minority = diabetes_data_encoded[diabetes_data_encoded['diabetes'] == 1]

df_majority_downsampled_8500 = resample(df_majority, 
                                        replace=False,    # sample without replacement
                                        n_samples=8500,   # to match minority class count
                                        random_state=123) # reproducible results

df_balanced_8500 = pd.concat([df_majority_downsampled_8500, df_minority])

# Selecting important features
important_features = ['HbA1c_level', 'blood_glucose_level', 'bmi', 'age']
X_balanced_8500 = df_balanced_8500[important_features]
y_balanced_8500 = df_balanced_8500['diabetes']

# Splitting the data into training and testing sets
X_train_bal_8500, X_test_bal_8500, y_train_bal_8500, y_test_bal_8500 = train_test_split(X_balanced_8500, y_balanced_8500, test_size=0.2, random_state=42)

# Initializing and training the Logistic Regression classifier
lr_classifier_bal_8500 = LogisticRegression(max_iter=1000, random_state=42)  # Increased max_iter for convergence
lr_classifier_bal_8500.fit(X_train_bal_8500, y_train_bal_8500)

# Making predictions on the test set
y_pred_bal_8500 = lr_classifier_bal_8500.predict(X_test_bal_8500)

# Generating and printing the classification report and accuracy
print("Accuracy on Test Set:", accuracy_score(y_test_bal_8500, y_pred_bal_8500))
print("\nClassification Report:")
print(classification_report(y_test_bal_8500, y_pred_bal_8500))


Accuracy on Test Set: 0.8835294117647059

Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.89      0.88      1687
           1       0.89      0.88      0.88      1713

    accuracy                           0.88      3400
   macro avg       0.88      0.88      0.88      3400
weighted avg       0.88      0.88      0.88      3400



# Final Training of Extreme Gradient Boosting with Best HyperParameters

In [14]:
from sklearn.utils import resample
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score
import pandas as pd

# Assuming 'diabetes_data_encoded' is already loaded and available

# Balancing the dataset
df_majority = diabetes_data_encoded[diabetes_data_encoded['diabetes'] == 0]
df_minority = diabetes_data_encoded[diabetes_data_encoded['diabetes'] == 1]
df_majority_downsampled_8500 = resample(df_majority, 
                                        replace=False,    # sample without replacement
                                        n_samples=8500,   # to match minority class count
                                        random_state=123) # reproducible results
df_balanced_8500 = pd.concat([df_majority_downsampled_8500, df_minority])

# Selecting important features
important_features = ['HbA1c_level', 'blood_glucose_level', 'bmi', 'age']
X_balanced_8500 = df_balanced_8500[important_features]
y_balanced_8500 = df_balanced_8500['diabetes']

# Splitting the data
X_train_bal_8500, X_test_bal_8500, y_train_bal_8500, y_test_bal_8500 = train_test_split(X_balanced_8500, y_balanced_8500, test_size=0.2, random_state=42)

# Parameter grid
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],  # Regularization strength
    'penalty': ['l1', 'l2'],               # Types of regularization
    'solver': ['liblinear', 'saga']        # Solvers that support l1 penalties
}

# GridSearchCV setup
grid_search = GridSearchCV(LogisticRegression(max_iter=1000, random_state=42), param_grid, cv=5, verbose=2, n_jobs=-1)
grid_search.fit(X_train_bal_8500, y_train_bal_8500)

# Best model
best_lr = grid_search.best_estimator_

# Making predictions and evaluating the best model
y_pred_best = best_lr.predict(X_test_bal_8500)
print("Best Parameters:", grid_search.best_params_)
print("Accuracy on Test Set:", accuracy_score(y_test_bal_8500, y_pred_best))
print("\nClassification Report:")
print(classification_report(y_test_bal_8500, y_pred_best))


Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best Parameters: {'C': 1, 'penalty': 'l2', 'solver': 'liblinear'}
Accuracy on Test Set: 0.8844117647058823

Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.89      0.88      1687
           1       0.89      0.88      0.88      1713

    accuracy                           0.88      3400
   macro avg       0.88      0.88      0.88      3400
weighted avg       0.88      0.88      0.88      3400

