<a href="https://colab.research.google.com/github/MiaMetni/Behavioral-Health-Risk-Factor-Prediction-Model-for-Diabetes/blob/bettys_branch/diabetes_algorithm_script.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Load the BRFSS 2021 health-indicator extract into a DataFrame.
# NOTE(review): the path is rooted at "/", so the CSV must sit at the
# filesystem root of the runtime (presumably uploaded to Colab) — confirm.
diabetes_df = pd.read_csv(
    filepath_or_buffer='/SELECTED_diabetes_binary_5050split_health_indicators_BRFSS2021.csv'
)


In [5]:
# Print the column dtypes / non-null counts, then preview the first five rows
# (the trailing expression is rendered as the cell output).
diabetes_df.info()
diabetes_df.head(n=5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 67136 entries, 0 to 67135
Data columns (total 22 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Diabetes_binary       67136 non-null  float64
 1   HighBP                67136 non-null  int64  
 2   HighChol              67136 non-null  float64
 3   CholCheck             67136 non-null  int64  
 4   BMI                   67136 non-null  float64
 5   Smoker                67136 non-null  float64
 6   Stroke                67136 non-null  float64
 7   HeartDiseaseorAttack  67136 non-null  float64
 8   PhysActivity          67136 non-null  int64  
 9   Fruits                67136 non-null  int64  
 10  Veggies               67136 non-null  int64  
 11  HvyAlcoholConsump     67136 non-null  int64  
 12  AnyHealthcare         67136 non-null  int64  
 13  NoDocbcCost           67136 non-null  float64
 14  GenHlth               67136 non-null  float64
 15  MentHlth           

Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1,0.0,1,33.0,0.0,0.0,0.0,1,1,...,1,0.0,2.0,15.0,0.0,1.0,1,7,6.0,9.0
1,0.0,0,1.0,1,27.0,1.0,0.0,0.0,1,0,...,1,0.0,2.0,1.0,2.0,0.0,1,7,6.0,6.0
2,0.0,0,1.0,1,26.0,1.0,0.0,0.0,0,0,...,1,0.0,3.0,0.0,30.0,0.0,1,13,4.0,3.0
3,0.0,0,0.0,1,19.0,1.0,0.0,0.0,1,1,...,1,0.0,3.0,0.0,0.0,0.0,0,11,5.0,7.0
4,0.0,1,0.0,1,37.0,0.0,0.0,0.0,1,1,...,1,0.0,2.0,0.0,0.0,0.0,0,5,5.0,3.0


In [6]:
# Feature matrix: every column except the target label.
# drop() returns a new frame, so diabetes_df itself is left untouched and the
# cell can be re-run safely.
X = diabetes_df.drop(columns='Diabetes_binary')

# Target vector as a 1-D NumPy array.
# Series.ravel() is deprecated since pandas 2.2; to_numpy() yields the same
# array without the FutureWarning.
y = diabetes_df['Diabetes_binary'].to_numpy()

In [7]:
# Partition features and labels into train/test subsets. No test_size is
# passed, so scikit-learn's default proportion applies; the seed keeps the
# partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

# Learn the standardisation statistics from the training subset only.
# fit() hands back the scaler itself, so X_scaler and scaler are one object.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the training-derived scaling to both subsets.
X_train_scaled, X_test_scaled = (
    X_scaler.transform(subset) for subset in (X_train, X_test)
)

In [8]:
# Build the baseline random forest (1000 trees, sqrt(n_features) candidates
# per split, fixed seed) and train it on the scaled training data in one
# chained expression. fit() returns the estimator, so rf_model is the fitted
# forest. y_train is flattened with .ravel() to guarantee a 1-D label array.
rf_model = RandomForestClassifier(
    n_estimators=1000,
    max_features='sqrt',
    random_state=78,
).fit(X_train_scaled, y_train.ravel())

In [9]:
# Predict labels for the held-out test features.
predictions = rf_model.predict(X_test_scaled)

# Overall fraction of correct predictions.
acc_score = accuracy_score(y_true=y_test, y_pred=predictions)

# Confusion matrix wrapped in a labelled frame: rows are the true class,
# columns are the predicted class.
cm = confusion_matrix(y_true=y_test, y_pred=predictions)
cm_df = pd.DataFrame(
    data=cm,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)

# Show the results: matrix first, then accuracy and the per-class report.
print("Confusion Matrix")
display(cm_df)
print(
    f"Accuracy Score : {acc_score}",
    "Classification Report",
    classification_report(y_true=y_test, y_pred=predictions),
    sep="\n",
)

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,5835,2560
Actual 1,1815,6574


Accuracy Score : 0.7393350810295519
Classification Report
              precision    recall  f1-score   support

         0.0       0.76      0.70      0.73      8395
         1.0       0.72      0.78      0.75      8389

    accuracy                           0.74     16784
   macro avg       0.74      0.74      0.74     16784
weighted avg       0.74      0.74      0.74     16784



In [10]:
# Per-feature importance scores from the fitted baseline forest.
importances = rf_model.feature_importances_

# Pair each score with its column name and rank from most to least important.
# Tuples compare on the score first, so a descending sort orders by importance.
importances_sorted = list(zip(rf_model.feature_importances_, X.columns))
importances_sorted.sort(reverse=True)
importances_sorted

[(0.16273663623431323, 'BMI'),
 (0.13524123455929885, 'Age'),
 (0.10432422755132893, 'Income'),
 (0.09663720536050441, 'GenHlth'),
 (0.07334192201277255, 'HighBP'),
 (0.0624467798299716, 'PhysHlth'),
 (0.05995750534400458, 'MentHlth'),
 (0.05630025447337469, 'Education'),
 (0.038626608616266954, 'HighChol'),
 (0.0287803044488078, 'Fruits'),
 (0.028414938412015003, 'Smoker'),
 (0.025631443074276727, 'Sex'),
 (0.025364352374551518, 'DiffWalk'),
 (0.021913414563825423, 'PhysActivity'),
 (0.021157606196501668, 'Veggies'),
 (0.017553950807518818, 'HeartDiseaseorAttack'),
 (0.010257019841268606, 'HvyAlcoholConsump'),
 (0.009506677871715647, 'NoDocbcCost'),
 (0.009498085749452514, 'Stroke'),
 (0.006482102799060063, 'CholCheck'),
 (0.0058277298791705205, 'AnyHealthcare')]

In [8]:
# Build a reduced feature set for the second model: remove the target plus
# 'NoDocbcCost', 'Stroke', 'CholCheck' and 'AnyHealthcare' — the four features
# with the lowest importances in the baseline ranking.
# ('HvyAlcoholConsump' is intentionally NOT in this list; it stays in X_tuning.)
# drop() returns a new frame, so diabetes_df is not modified.
X_tuning = diabetes_df.drop(
    columns=['Diabetes_binary', 'NoDocbcCost', 'Stroke', 'CholCheck', 'AnyHealthcare']
)
X_tuning.head()

Unnamed: 0,HighBP,HighChol,BMI,Smoker,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,1,0.0,33.0,0.0,0.0,1,1,1,0,2.0,15.0,0.0,1.0,1,7,6.0,9.0
1,0,1.0,27.0,1.0,0.0,1,0,0,0,2.0,1.0,2.0,0.0,1,7,6.0,6.0
2,0,1.0,26.0,1.0,0.0,0,0,0,0,3.0,0.0,30.0,0.0,1,13,4.0,3.0
3,0,0.0,19.0,1.0,0.0,1,1,1,0,3.0,0.0,0.0,0.0,0,11,5.0,7.0
4,1,0.0,37.0,0.0,0.0,1,1,1,0,2.0,0.0,0.0,0.0,0,5,5.0,3.0


In [14]:
# Split the reduced feature set with the same seed as the baseline split.
X_train_tuning, X_test_tuning, y_train_tuning, y_test_tuning = train_test_split(
    X_tuning, y, random_state=78
)

# Fit a *new* scaler on the reduced training data. Re-fitting the shared
# `scaler` object would also overwrite the baseline `X_scaler` (fit() returns
# the same instance), leaving the baseline cells unusable on re-run.
X_scaler_tuning = StandardScaler().fit(X_train_tuning)

# Scale both subsets with the training-derived statistics.
X_train_scaled_tuning = X_scaler_tuning.transform(X_train_tuning)
X_test_scaled_tuning = X_scaler_tuning.transform(X_test_tuning)

# Create the second random forest (800 trees) under its own name so the
# baseline `rf_model` is not clobbered by this cell.
rf_model_tuning = RandomForestClassifier(
    n_estimators=800, max_features='sqrt', random_state=78
)

# Train on the scaled, reduced training data; .ravel() guarantees 1-D labels.
rf_model_tuning = rf_model_tuning.fit(X_train_scaled_tuning, y_train_tuning.ravel())

In [15]:
# Predict labels for the reduced-feature test set with the second forest.
predictions_tuning = rf_model_tuning.predict(X_test_scaled_tuning)

# Overall fraction of correct predictions.
acc_score_tuning = accuracy_score(y_true=y_test_tuning, y_pred=predictions_tuning)

# Confusion matrix wrapped in a labelled frame: rows are the true class,
# columns are the predicted class.
cm_tuning = confusion_matrix(y_true=y_test_tuning, y_pred=predictions_tuning)
cm_df_tuning = pd.DataFrame(
    data=cm_tuning,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)

In [16]:
# Report the second model's results: confusion matrix first, then the
# accuracy and the per-class precision/recall/F1 report.
print("Confusion Matrix")
display(cm_df_tuning)
print(
    f"Accuracy Score : {acc_score_tuning}",
    "Classification Report",
    classification_report(y_true=y_test_tuning, y_pred=predictions_tuning),
    sep="\n",
)

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,5798,2597
Actual 1,1881,6508


Accuracy Score : 0.7331982840800763
Classification Report
              precision    recall  f1-score   support

         0.0       0.76      0.69      0.72      8395
         1.0       0.71      0.78      0.74      8389

    accuracy                           0.73     16784
   macro avg       0.73      0.73      0.73     16784
weighted avg       0.73      0.73      0.73     16784



In [12]:
# Per-feature importance scores from the second (reduced-feature) forest.
importances_tuning = rf_model_tuning.feature_importances_

# Pair each score with its column name, rank from most to least important,
# and show the ten highest-ranked features.
importances_sorted_tuning = list(
    zip(rf_model_tuning.feature_importances_, X_tuning.columns)
)
importances_sorted_tuning.sort(reverse=True)
importances_sorted_tuning[:10]

[(0.1673428591384368, 'BMI'),
 (0.13877552608845156, 'Age'),
 (0.10907517949378058, 'Income'),
 (0.09798884440246217, 'GenHlth'),
 (0.07673131944254688, 'HighBP'),
 (0.06466273918273245, 'PhysHlth'),
 (0.06263244135910641, 'MentHlth'),
 (0.05954555649334187, 'Education'),
 (0.03869865405042571, 'HighChol'),
 (0.030257372651125367, 'Fruits')]