In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, recall_score
from sklearn.feature_selection import RFE
from sklearn.impute import SimpleImputer

# Load the diabetes survey extract; the path is relative to the notebook's working directory.
file_path = 'diabetes_binary_classification_data.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

# Show the first five rows as a quick sanity check on the columns that were read.
print(df.head(n=5))

   Diabetes_binary  HighBP  HighChol  CholCheck   BMI  Smoker  Stroke  \
0              0.0     1.0       1.0        1.0  40.0     1.0     0.0   
1              0.0     0.0       0.0        0.0  25.0     1.0     0.0   
2              0.0     1.0       1.0        1.0  28.0     0.0     0.0   
3              0.0     1.0       0.0        1.0  27.0     0.0     0.0   
4              0.0     1.0       1.0        1.0  24.0     0.0     0.0   

   HeartDiseaseorAttack  PhysActivity  Fruits  ...  AnyHealthcare  \
0                   0.0           0.0     0.0  ...            1.0   
1                   0.0           1.0     0.0  ...            0.0   
2                   0.0           0.0     1.0  ...            1.0   
3                   0.0           1.0     1.0  ...            1.0   
4                   0.0           1.0     1.0  ...            1.0   

   NoDocbcCost  GenHlth  MentHlth  PhysHlth  DiffWalk  Sex   Age  Education  \
0          0.0      5.0      18.0      15.0       1.0  0.0   9.0   

In [8]:
# Convert 'Diabetes_binary' into binary classification (0 = no diabetes, 1 = prediabetes or diabetes).
# Vectorised comparison instead of a row-wise apply: any value >= 1 becomes 1, everything else
# (including NaN, for which the comparison is False) becomes 0. Re-running this cell is idempotent.
df['Diabetes_binary'] = (df['Diabetes_binary'] >= 1).astype('int64')

# Handle categorical variables (most are already numerical, no need for additional encoding)

# Separate features and target variable
X = df.drop('Diabetes_binary', axis=1)  # Features
y = df['Diabetes_binary']               # Target

# Split the data into train and test sets FIRST, so that no statistic learned below
# (imputation means, scaling mean/std) is influenced by the held-out rows.
# stratify=y keeps the class ratio the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Handle missing values (if any): the column means are learned from the training rows only
# and then re-used, unchanged, to fill gaps in the test rows.
imputer = SimpleImputer(strategy='mean')  # You can change the strategy as needed
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

# Keep X as an imputed NumPy array, as before, but filled with the train-derived means
# so that it stays consistent with X_train / X_test.
X = imputer.transform(X)

# Feature scaling (especially for models like Logistic Regression); fitted on train only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Rank the features with Recursive Feature Elimination (RFE), using a logistic regression
# as the underlying estimator and keeping the 5 best features.
model_lr = LogisticRegression(max_iter=1000)
rfe = RFE(estimator=model_lr, n_features_to_select=5).fit(X_train_scaled, y_train)

# One rank per input column, in column order; the selected features carry rank 1.
print(f'RFE Ranking of Features: {rfe.ranking_}')

# Train various models and compare them on the held-out test split.
#
# Initialize models. The tree-based estimators draw random numbers while fitting
# (feature permutation at splits, bootstrap sampling, subsampling), so each one is
# given a fixed random_state; otherwise the metrics printed below, and the Random
# Forest feature importances read afterwards, change on every re-run of the notebook.
# The seed matches the one used for the train/test split.
# NOTE(review): the recorded test split is imbalanced (65500 negatives vs 10604 positives)
# and positive-class recall is low; consider class weighting or threshold tuning if
# minimising false negatives is the goal.
log_reg = LogisticRegression(max_iter=1000)
decision_tree = DecisionTreeClassifier(random_state=42)
random_forest = RandomForestClassifier(random_state=42)
gradient_boost = GradientBoostingClassifier(random_state=42)

# Display name -> estimator; the dict preserves insertion order, so reports print in this order.
models = {'Logistic Regression': log_reg,
          'Decision Tree': decision_tree,
          'Random Forest': random_forest,
          'Gradient Boosting': gradient_boost}

for name, model in models.items():
    # Train the model (every model receives the scaled features for a like-for-like comparison)
    model.fit(X_train_scaled, y_train)

    # Make predictions on the held-out rows
    y_pred = model.predict(X_test_scaled)

    # Confusion matrix and headline metrics; recall is reported for the positive class (label 1)
    cm = confusion_matrix(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)

    print(f'\nModel: {name}')
    print(f'Confusion Matrix:\n{cm}')
    print(f'Accuracy: {accuracy:.4f}')
    print(f'Recall (minimize false negatives): {recall:.4f}')
    print(f'Classification Report:\n{report}')

# Feature importance from Random Forest: pair each importance score of the fitted forest
# with its column label (every column of df except the target) and list them from most
# to least important.
importances = random_forest.feature_importances_
feature_importance = pd.Series(
    importances,
    index=df.columns.drop('Diabetes_binary'),
).sort_values(ascending=False)
print("\nFeature Importance from Random Forest:\n", feature_importance)


RFE Ranking of Features: [ 1  1  2  1 16 11  6 12 10 17  3 14 15  1 13  7  8  4  1  9  5]

Model: Logistic Regression
Confusion Matrix:
[[64027  1473]
 [ 8933  1671]]
Accuracy: 0.8633
Recall (minimize false negatives): 0.1576
Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.98      0.92     65500
           1       0.53      0.16      0.24     10604

    accuracy                           0.86     76104
   macro avg       0.70      0.57      0.58     76104
weighted avg       0.83      0.86      0.83     76104


Model: Decision Tree
Confusion Matrix:
[[57046  8454]
 [ 7112  3492]]
Accuracy: 0.7955
Recall (minimize false negatives): 0.3293
Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.87      0.88     65500
           1       0.29      0.33      0.31     10604

    accuracy                           0.80     76104
   macro avg       0.59      0.60      0.59     76104