LOADING DATASET

In [2]:
import pandas as pd

# Load the uploaded file into a DataFrame
df = pd.read_csv('diabetes_012_health_indicators_BRFSS2015.csv')

# Display the unique values for each column in the DataFrame
unique_values = {column: df[column].unique() for column in df.columns}
#unique_values
print(df.head());

   Diabetes_012  HighBP  HighChol  CholCheck   BMI  Smoker  Stroke  \
0           0.0     1.0       1.0        1.0  40.0     1.0     0.0   
1           0.0     0.0       0.0        0.0  25.0     1.0     0.0   
2           0.0     1.0       1.0        1.0  28.0     0.0     0.0   
3           0.0     1.0       0.0        1.0  27.0     0.0     0.0   
4           0.0     1.0       1.0        1.0  24.0     0.0     0.0   

   HeartDiseaseorAttack  PhysActivity  Fruits  ...  AnyHealthcare  \
0                   0.0           0.0     0.0  ...            1.0   
1                   0.0           1.0     0.0  ...            0.0   
2                   0.0           0.0     1.0  ...            1.0   
3                   0.0           1.0     1.0  ...            1.0   
4                   0.0           1.0     1.0  ...            1.0   

   NoDocbcCost  GenHlth  MentHlth  PhysHlth  DiffWalk  Sex   Age  Education  \
0          0.0      5.0      18.0      15.0       1.0  0.0   9.0        4.0   
1     

CLEANING DATASET

In [3]:
from sklearn.preprocessing import MinMaxScaler


# Initialize MinMaxScaler
scaler = MinMaxScaler()

# Fit and transform the data
df['BMI_scaled'] = scaler.fit_transform(df[['BMI']])
df.drop(columns=['BMI'], inplace=True)
df['Age_scaled'] = scaler.fit_transform(df[['Age']])
df.drop(columns=['Age'], inplace=True)


# Dropping all rows with prediabetes ('1') from the dataset
df_binary = df[df['Diabetes_012'] != 1]




TRAINING RANDOM FOREST

In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score


# Adjusting the target for binary classification (0: No Diabetes, 2: Diabetes)
# Note: Since we're excluding prediabetes, we'll map '2' (diabetes) to '1' for the binary classification
y_binary_no_prediabetes = df_binary['Diabetes_012'].replace({2: 1})

# Features remain the same, but filtered to match the rows of the updated target
X_binary_no_prediabetes = df_binary.drop('Diabetes_012', axis=1)


# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_binary_no_prediabetes, y_binary_no_prediabetes, test_size=0.2, random_state=42)

# Initializing the Random Forest classifier
rf_all = RandomForestClassifier(n_estimators=100, random_state=42)

# Fitting the model on the training set
rf_all.fit(X_train, y_train)

# Predicting diabetes with the Random Forest classifier on the test set
y_pred = rf_all.predict(X_test)

# Evaluating the model's performance
print("Accuracy on Test Set:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Accuracy on Test Set: 0.8589841397309778

Classification Report:
              precision    recall  f1-score   support

         0.0       0.88      0.97      0.92     42777
         1.0       0.50      0.20      0.28      7033

    accuracy                           0.86     49810
   macro avg       0.69      0.58      0.60     49810
weighted avg       0.83      0.86      0.83     49810



CHECKING FOR IMBALANCE DATASET

In [5]:

diabetes_counts = df_binary['Diabetes_012'].value_counts(normalize=True)

diabetes_counts


Diabetes_012
0.0    0.858076
2.0    0.141924
Name: proportion, dtype: float64

Balancing Dataset

In [6]:
from sklearn.utils import resample


# Identify the number of instances with diabetes=1
n_diabetes_pos = df_binary[df_binary['Diabetes_012'] == 2].shape[0]

# Downsample the instances where diabetes=0 to match the number of diabetes=1
df_majority = df_binary[df_binary['Diabetes_012'] == 0]
df_minority = df_binary[df_binary['Diabetes_012'] == 2]

df_majority_downsampled_8500 = resample(df_majority,
                                        replace=False,    # sample without replacement
                                        n_samples=8500,   # to match exactly 8500 samples for diabetes class
                                        random_state=123) # reproducible results

# Combine the downsampled majority class with the original minority class
df_balanced_8500 = pd.concat([df_majority_downsampled_8500, df_minority])

Getting important features

In [7]:
feature_importances = pd.Series(rf_all.feature_importances_, index=X_binary_no_prediabetes.columns).sort_values(ascending=False)

feature_importances

BMI_scaled              0.182659
Age_scaled              0.121870
Income                  0.096743
PhysHlth                0.082911
GenHlth                 0.073943
Education               0.069703
MentHlth                0.063143
HighBP                  0.044190
Fruits                  0.033596
Smoker                  0.033428
HighChol                0.028246
Sex                     0.027136
PhysActivity            0.026372
Veggies                 0.026136
DiffWalk                0.024677
HeartDiseaseorAttack    0.018386
NoDocbcCost             0.014579
Stroke                  0.012303
AnyHealthcare           0.008416
HvyAlcoholConsump       0.007881
CholCheck               0.003684
dtype: float64

SECOND TRAINING OF RANDOM FOREST MODEL WITH IMPORTANT FEATURES AND BALANCED DATASET

In [8]:

# Split the balanced dataset into features (X) and target variable (y) using only the important features
important_features = [ 'BMI_scaled', 'Age_scaled','Income','PhysHlth']

X_balanced_8500 = df_balanced_8500[important_features]
y_balanced_8500 = df_balanced_8500['Diabetes_012'].replace({2: 1})

# Split the data into training and testing sets
X_train_bal_8500, X_test_bal_8500, y_train_bal_8500, y_test_bal_8500 = train_test_split(X_balanced_8500, y_balanced_8500, test_size=0.2, random_state=42)
rf_balanced_8500 = RandomForestClassifier(n_estimators=100, random_state=42)

# Fit the model on the balanced dataset
rf_balanced_8500.fit(X_train_bal_8500, y_train_bal_8500)

# Predict on the testing set
y_pred_bal_8500 = rf_balanced_8500.predict(X_test_bal_8500)

# Generate classification report on the balanced dataset
print("Accuracy on Test Set:", accuracy_score(y_test_bal_8500, y_pred_bal_8500))
print("\nClassification Report:")
print(classification_report(y_test_bal_8500, y_pred_bal_8500))


Accuracy on Test Set: 0.8069555302166477

Classification Report:
              precision    recall  f1-score   support

         0.0       0.53      0.30      0.38      1759
         1.0       0.84      0.93      0.89      7011

    accuracy                           0.81      8770
   macro avg       0.69      0.62      0.63      8770
weighted avg       0.78      0.81      0.78      8770



HYPERPARAMETER TUNING

In [9]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Define the parameter grid for Random Search
param_dist = {
    'n_estimators': randint(100, 500),
    'max_depth': randint(10, 50),
    'min_samples_split': randint(2, 20),
    'max_features': ['sqrt', 'log2', None]
}

# Initialize the Random Search model
random_search = RandomizedSearchCV(
    RandomForestClassifier(random_state=42),
    param_distributions=param_dist,
    n_iter=10,  # Number of parameter settings that are sampled. Increase for better results but longer computation.
    cv=3,       # Cross-validation strategy. Increase for more reliable estimates but longer computation.
    random_state=42,
    n_jobs=-1   # Use all available cores
)

# Fit the model on the balanced dataset
random_search.fit(X_train_bal_8500, y_train_bal_8500)

# Best parameters found by Random Search
best_params = random_search.best_params_
best_score = random_search.best_score_

best_params, best_score

({'max_depth': 48,
  'max_features': 'sqrt',
  'min_samples_split': 16,
  'n_estimators': 206},
 0.8311095906032615)

FINAL TRAINING USING BALANCED DATASET, IMPORTANT FEATURES AND HYPERPARAMETER TUNING

In [10]:
# Adjusted hyperparameters

rf_balanced_adjusted = RandomForestClassifier(
    n_estimators=206,
    max_depth=48,
    min_samples_split=16,
    random_state=42,
    max_features = 'sqrt',
    class_weight= 'balanced'
)

# Fit the model on the balanced dataset with adjusted hyperparameters
rf_balanced_adjusted.fit(X_train_bal_8500, y_train_bal_8500)

# Predict on the testing set
y_pred_bal_adjusted = rf_balanced_adjusted.predict(X_test_bal_8500)

# Calculate accuracy and generate classification report on the balanced dataset
accuracy_bal_adjusted = accuracy_score(y_test_bal_8500, y_pred_bal_adjusted)
report_bal_adjusted = classification_report(y_test_bal_8500, y_pred_bal_adjusted)

accuracy_bal_adjusted, report_bal_adjusted

print("Accuracy on Test Set:", accuracy_bal_adjusted)
print("\nClassification Report:")
print(report_bal_adjusted)

Accuracy on Test Set: 0.7411630558722919

Classification Report:
              precision    recall  f1-score   support

         0.0       0.40      0.58      0.47      1759
         1.0       0.88      0.78      0.83      7011

    accuracy                           0.74      8770
   macro avg       0.64      0.68      0.65      8770
weighted avg       0.78      0.74      0.76      8770



INITIAL TRAINING FOR EXTREME GRADIENT BOOSTING MODEL

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier

# Adjusting the target for binary classification (0: No Diabetes, 2: Diabetes)
# Note: Since we're excluding prediabetes, we'll map '2' (diabetes) to '1' for the binary classification
y_binary_no_prediabetes = df_binary['Diabetes_012'].replace({2: 1})

# Features remain the same, but filtered to match the rows of the updated target
X_binary_no_prediabetes = df_binary.drop('Diabetes_012', axis=1)

# Splitting the data into training and test sets
X_train_bin_np, X_test_bin_np, y_train_bin_np, y_test_bin_np = train_test_split(
    X_binary_no_prediabetes,
    y_binary_no_prediabetes,
    test_size=0.2,
    random_state=42
)

# Reinitializing and fitting the XGBoost model for the adjusted binary classification
model_bin_np = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
model_bin_np.fit(X_train_bin_np, y_train_bin_np)

# Making predictions on the test set
y_pred_bin_np = model_bin_np.predict(X_test_bin_np)

# Calculating accuracy on the test set
accuracy_bin_np = accuracy_score(y_test_bin_np, y_pred_bin_np)

# Generating classification report for the binary classification
report_bin_np = classification_report(y_test_bin_np, y_pred_bin_np)

print("Accuracy on Test Set:", accuracy_bin_np)
print("\nClassification Report:")
print(report_bin_np)

Accuracy on Test Set: 0.8655089339490062

Classification Report:
              precision    recall  f1-score   support

         0.0       0.88      0.98      0.93     42777
         1.0       0.57      0.20      0.29      7033

    accuracy                           0.87     49810
   macro avg       0.72      0.59      0.61     49810
weighted avg       0.84      0.87      0.84     49810



Getting Important Features

In [12]:
feature_importances = pd.Series(model_bin_np.feature_importances_, index=X_binary_no_prediabetes.columns).sort_values(ascending=False)

feature_importances

HighBP                  0.526742
GenHlth                 0.129897
HighChol                0.061281
HvyAlcoholConsump       0.045658
CholCheck               0.037881
Age_scaled              0.027040
BMI_scaled              0.025858
HeartDiseaseorAttack    0.023390
DiffWalk                0.020704
Sex                     0.016098
Income                  0.010756
Stroke                  0.010615
AnyHealthcare           0.007941
NoDocbcCost             0.007522
PhysActivity            0.007489
Veggies                 0.007074
MentHlth                0.006870
Fruits                  0.006843
Education               0.006821
PhysHlth                0.006764
Smoker                  0.006755
dtype: float32

SECOND TRAINING

In [13]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

# Selecting important features
important_features = ['HighBP', 'GenHlth', 'HighChol', 'HvyAlcoholConsump']
X_balanced_8500 = df_balanced_8500[important_features]
y_balanced_8500 = df_balanced_8500['Diabetes_012'].replace({2: 1})

# Splitting the data into training and testing sets
X_train_bal_8500, X_test_bal_8500, y_train_bal_8500, y_test_bal_8500 = train_test_split(X_balanced_8500, y_balanced_8500, test_size=0.2, random_state=42)

# Initializing and training the XGBoost classifier
xgb_classifier_bal_8500 = XGBClassifier(n_estimators=100, random_state=42, use_label_encoder=False, eval_metric='logloss')
xgb_classifier_bal_8500.fit(X_train_bal_8500, y_train_bal_8500)

# Making predictions on the test set
y_pred_bal_8500 = xgb_classifier_bal_8500.predict(X_test_bal_8500)

# Generating and printing the classification report and accuracy
print("Accuracy on Test Set:", accuracy_score(y_test_bal_8500, y_pred_bal_8500))
print("\nClassification Report:")
print(classification_report(y_test_bal_8500, y_pred_bal_8500))


Accuracy on Test Set: 0.8366020524515393

Classification Report:
              precision    recall  f1-score   support

         0.0       0.68      0.35      0.46      1759
         1.0       0.85      0.96      0.90      7011

    accuracy                           0.84      8770
   macro avg       0.77      0.66      0.68      8770
weighted avg       0.82      0.84      0.82      8770



FINAL TRAINING USING BALANCED DATASET, IMPORTANT FEATURES AND HYPERPARAMETER TUNING

In [14]:
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score

# Define the parameter distribution to sample from
param_dist = {
    'max_depth': [3, 5, 7],                     # Limits the depth of the tree
    'min_child_weight': [1, 3, 5],              # Minimum sum of instance weight (hessian) needed in a child
    'gamma': [0.5, 1, 1.5],                     # Minimum loss reduction required to make a further partition on a leaf node
    'subsample': [0.7, 0.9],                    # Subsample ratio of the training instances
    'colsample_bytree': [0.7, 0.9],             # Subsample ratio of columns when constructing each tree
    'n_estimators': [100, 150],                 # Number of trees in the forest
    'learning_rate': [0.05, 0.1, 0.15]          # Step size shrinkage used to prevent overfitting
}

# Initialize the XGBClassifier and RandomizedSearchCV
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
random_search = RandomizedSearchCV(estimator=xgb_model, param_distributions=param_dist,
                                   n_iter=10, cv=3, verbose=2, random_state=42, n_jobs=-1)

# Fit RandomizedSearchCV to the training data
random_search.fit(X_train_bal_8500, y_train_bal_8500)

# Extract the best model
best_xgb = random_search.best_estimator_

# Predict using the best model
y_pred_best = best_xgb.predict(X_test_bal_8500)


# Evaluate the best model
print("Best Parameters:", random_search.best_params_)
print("Accuracy on Test Set:", accuracy_score(y_test_bal_8500, y_pred_best))
print("\nClassification Report:")
print(classification_report(y_test_bal_8500, y_pred_best))


Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best Parameters: {'subsample': 0.7, 'n_estimators': 100, 'min_child_weight': 3, 'max_depth': 3, 'learning_rate': 0.05, 'gamma': 1.5, 'colsample_bytree': 0.7}
Accuracy on Test Set: 0.8366020524515393

Classification Report:
              precision    recall  f1-score   support

         0.0       0.68      0.35      0.46      1759
         1.0       0.86      0.96      0.90      7011

    accuracy                           0.84      8770
   macro avg       0.77      0.66      0.68      8770
weighted avg       0.82      0.84      0.82      8770



INITIAL TRAINING FOR LOGISTIC REGRESSION MODEL

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

# Note: Since we're excluding prediabetes, we'll map '2' (diabetes) to '1' for the binary classification
y_binary_no_prediabetes = df_binary['Diabetes_012'].replace({2: 1})

# Features remain the same, but filtered to match the rows of the updated target
X_binary_no_prediabetes = df_binary.drop('Diabetes_012', axis=1)
# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_binary_no_prediabetes, y_binary_no_prediabetes, test_size=0.2, random_state=42)

# Initializing the Logistic Regression classifier
lr_classifier = LogisticRegression(random_state=42, max_iter=1000)

# Fitting the model on the training set
lr_classifier.fit(X_train, y_train)

# Predicting diabetes with the Logistic Regression classifier on the test set
y_pred = lr_classifier.predict(X_test)

# Evaluating the model's performance
print("Accuracy on Test Set:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Accuracy on Test Set: 0.8623368801445492

Classification Report:
              precision    recall  f1-score   support

         0.0       0.88      0.98      0.92     42777
         1.0       0.54      0.17      0.26      7033

    accuracy                           0.86     49810
   macro avg       0.71      0.57      0.59     49810
weighted avg       0.83      0.86      0.83     49810



GETTING IMPORTANT FEATURES

In [16]:
feature_importances = pd.Series(lr_classifier.coef_[0], index=X_binary_no_prediabetes.columns).sort_values(ascending=False)
feature_importances

BMI_scaled              5.319807
Age_scaled              1.492688
CholCheck               1.258332
HighBP                  0.781634
HighChol                0.589184
GenHlth                 0.546086
Sex                     0.268030
HeartDiseaseorAttack    0.227459
DiffWalk                0.125533
Stroke                  0.121459
AnyHealthcare           0.092399
NoDocbcCost             0.010388
MentHlth               -0.003762
PhysHlth               -0.007431
Smoker                 -0.019012
Education              -0.030769
Fruits                 -0.040981
PhysActivity           -0.046003
Income                 -0.054720
Veggies                -0.059343
HvyAlcoholConsump      -0.756774
dtype: float64

SECOND TRAINING

In [17]:
from sklearn.utils import resample
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import pandas as pd  # Ensure pandas is imported to handle data operations


# Selecting important features
important_features = ['BMI_scaled','Age_scaled','CholCheck','HighBP']
X_balanced_8500 = df_balanced_8500[important_features]
y_balanced_8500 = df_balanced_8500['Diabetes_012'].replace({2: 1})


# Splitting the data into training and testing sets
X_train_bal_8500, X_test_bal_8500, y_train_bal_8500, y_test_bal_8500 = train_test_split(X_balanced_8500, y_balanced_8500, test_size=0.2, random_state=42)

# Initializing and training the Logistic Regression classifier
lr_classifier_bal_8500 = LogisticRegression(max_iter=1000, random_state=42)  # Increased max_iter for convergence
lr_classifier_bal_8500.fit(X_train_bal_8500, y_train_bal_8500)

# Making predictions on the test set
y_pred_bal_8500 = lr_classifier_bal_8500.predict(X_test_bal_8500)

# Generating and printing the classification report and accuracy
print("Accuracy on Test Set:", accuracy_score(y_test_bal_8500, y_pred_bal_8500))
print("\nClassification Report:")
print(classification_report(y_test_bal_8500, y_pred_bal_8500))

Accuracy on Test Set: 0.8261117445838084

Classification Report:
              precision    recall  f1-score   support

         0.0       0.69      0.24      0.36      1759
         1.0       0.84      0.97      0.90      7011

    accuracy                           0.83      8770
   macro avg       0.76      0.61      0.63      8770
weighted avg       0.81      0.83      0.79      8770



FINAL TRAINING USING BALANCED DATASET, IMPORTANT FEATURES AND HYPERPARAMETER TUNING

In [18]:
from sklearn.utils import resample
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import classification_report, accuracy_score
import pandas as pd

# Parameter grid
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],  # Regularization strength
    'penalty': ['l1', 'l2'],               # Types of regularization
    'solver': ['liblinear', 'saga']        # Solvers that support l1 penalties
}

# RandomizedSearchCV setup
grid_search = RandomizedSearchCV(LogisticRegression(max_iter=1000, random_state=42), param_grid, cv=5, verbose=2, n_jobs=-1)
grid_search.fit(X_train_bal_8500, y_train_bal_8500)

# Best model
best_lr = grid_search.best_estimator_

# Making predictions and evaluating the best model
y_pred_best = best_lr.predict(X_test_bal_8500)
print("Best Parameters:", grid_search.best_params_)
print("Accuracy on Test Set:", accuracy_score(y_test_bal_8500, y_pred_best))
print("\nClassification Report:")
print(classification_report(y_test_bal_8500, y_pred_best))

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best Parameters: {'solver': 'saga', 'penalty': 'l1', 'C': 0.1}
Accuracy on Test Set: 0.8259977194982896

Classification Report:
              precision    recall  f1-score   support

         0.0       0.69      0.24      0.35      1759
         1.0       0.84      0.97      0.90      7011

    accuracy                           0.83      8770
   macro avg       0.76      0.61      0.63      8770
weighted avg       0.81      0.83      0.79      8770

