In [3]:
import pandas as pd

df = pd.read_csv('diabetes_binary_5050split_health_indicators_BRFSS2015.csv')


print(df.shape)
df.head()

(70692, 22)


Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,0.0,1.0,26.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,3.0,5.0,30.0,0.0,1.0,4.0,6.0,8.0
1,0.0,1.0,1.0,1.0,26.0,1.0,1.0,0.0,0.0,1.0,...,1.0,0.0,3.0,0.0,0.0,0.0,1.0,12.0,6.0,8.0
2,0.0,0.0,0.0,1.0,26.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,1.0,0.0,10.0,0.0,1.0,13.0,6.0,8.0
3,0.0,1.0,1.0,1.0,28.0,1.0,0.0,0.0,1.0,1.0,...,1.0,0.0,3.0,0.0,3.0,0.0,1.0,11.0,6.0,8.0
4,0.0,0.0,0.0,1.0,29.0,1.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,8.0,5.0,8.0


In [4]:
# Check for missing values
print(df.isnull().sum())

# Check the data types of each column
print(df.dtypes)

Diabetes_binary         0
HighBP                  0
HighChol                0
CholCheck               0
BMI                     0
Smoker                  0
Stroke                  0
HeartDiseaseorAttack    0
PhysActivity            0
Fruits                  0
Veggies                 0
HvyAlcoholConsump       0
AnyHealthcare           0
NoDocbcCost             0
GenHlth                 0
MentHlth                0
PhysHlth                0
DiffWalk                0
Sex                     0
Age                     0
Education               0
Income                  0
dtype: int64
Diabetes_binary         float64
HighBP                  float64
HighChol                float64
CholCheck               float64
BMI                     float64
Smoker                  float64
Stroke                  float64
HeartDiseaseorAttack    float64
PhysActivity            float64
Fruits                  float64
Veggies                 float64
HvyAlcoholConsump       float64
AnyHealthcare           float64

In [5]:
X = df.drop(columns=['Diabetes_binary'])
y = df['Diabetes_binary']

In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=23)


In [7]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [8]:
print(y.value_counts(normalize=True))


Diabetes_binary
0.0    0.5
1.0    0.5
Name: proportion, dtype: float64


In [9]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

# Define the parameter grid
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['lbfgs', 'liblinear', 'saga'],
    'max_iter': [100, 200, 300],
    'class_weight': ['balanced', None]
}

# Initialize the model
model = LogisticRegression(random_state=42)

# Set up the grid search with 5-fold cross-validation
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit grid search on the training data
grid_search.fit(X_train, y_train)

# Print the best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Score:", grid_search.best_score_)


Best Parameters: {'C': 0.01, 'class_weight': 'balanced', 'max_iter': 100, 'solver': 'liblinear'}
Best Cross-Validation Score: 0.7471504822223469


In [10]:
# Use the best estimator from the search
best_model = grid_search.best_estimator_  # Or random_search.best_estimator_ if using RandomizedSearchCV

# Predict on the test set
y_pred = best_model.predict(X_test)

# Evaluate the performance
from sklearn.metrics import accuracy_score, classification_report

print("Test Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Test Accuracy: 0.748868351565447
              precision    recall  f1-score   support

         0.0       0.76      0.73      0.75     10669
         1.0       0.74      0.77      0.75     10539

    accuracy                           0.75     21208
   macro avg       0.75      0.75      0.75     21208
weighted avg       0.75      0.75      0.75     21208



In [16]:
df.head()

Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,0.0,1.0,26.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,3.0,5.0,30.0,0.0,1.0,4.0,6.0,8.0
1,0.0,1.0,1.0,1.0,26.0,1.0,1.0,0.0,0.0,1.0,...,1.0,0.0,3.0,0.0,0.0,0.0,1.0,12.0,6.0,8.0
2,0.0,0.0,0.0,1.0,26.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,1.0,0.0,10.0,0.0,1.0,13.0,6.0,8.0
3,0.0,1.0,1.0,1.0,28.0,1.0,0.0,0.0,1.0,1.0,...,1.0,0.0,3.0,0.0,3.0,0.0,1.0,11.0,6.0,8.0
4,0.0,0.0,0.0,1.0,29.0,1.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,8.0,5.0,8.0


In [17]:
print(df['BMI'].describe())

count    70692.000000
mean        29.856985
std          7.113954
min         12.000000
25%         25.000000
50%         29.000000
75%         33.000000
max         98.000000
Name: BMI, dtype: float64


In [30]:
# Transform BMI
df['BMI_Category'] = pd.cut(
    df['BMI'], 
    bins=[0, 18.5, 24.9, 29.9, 100],  # Define meaningful ranges
    labels=['Underweight', 'Normal', 'Overweight', 'Obese'], 
    right=True
)

# Convert BMI categories to binary columns
df = pd.get_dummies(df, columns=['BMI_Category'], prefix='BMI')

# Transform GenHlth
df['Health_Category'] = pd.cut(
    df['GenHlth'], 
    bins=[1, 2, 3, 4, 5, 6], 
    labels=['Excellent', 'Very good', 'Good', 'Fair', 'Poor'], 
    right=True
)

# Transform MentHlth
df['MentHlth_Category'] = pd.cut(
    df['MentHlth'], 
    bins=[0, 1, 7, 30],  # No poor days, 1-7 poor days (some), 8-30 poor days (chronic)
    labels=['No poor days', 'Some poor days', 'Chronic issues'], 
    right=True
)

# Transform PhysHlth
df['PhysHlth_Category'] = pd.cut(
    df['PhysHlth'], 
    bins=[0, 1, 7, 30],  # Same logic as MentHlth
    labels=['No poor days', 'Some poor days', 'Chronic issues'], 
    right=True
)

# Transform Age
age_mapping = {
    1: "18-24", 2: "25-29", 3: "30-34", 4: "35-39",
    5: "40-44", 6: "45-49", 7: "50-54", 8: "55-59",
    9: "60-64", 10: "65-69", 11: "70-74", 12: "75-79", 
    13: "80 or older"
}
df['Age_Category'] = df['Age'].map(age_mapping)

# Transform Education
education_mapping = {
    1: "Never attended school or kindergarten", 
    2: "Grades 1-8 (Elementary)", 
    3: "Grades 9-11 (Some high school)", 
    4: "Grade 12 or GED (High school graduate)", 
    5: "College 1-3 years (Some college or technical school)", 
    6: "College 4+ years (College graduate)"
}
df['Education_Level'] = df['Education'].map(education_mapping)

# Transform Income
income_mapping = {
    1: "<$10,000", 2: "$10,000-$15,000", 3: "$15,000-$20,000", 
    4: "$20,000-$25,000", 5: "$25,000-$35,000", 
    6: "$35,000-$50,000", 7: "$50,000-$75,000", 
    8: ">$75,000"
}
df['Income_Category'] = df['Income'].map(income_mapping)

# Verify transformations
print(df[['MentHlth', 'MentHlth_Category']].head())
print(df[['PhysHlth', 'PhysHlth_Category']].head())
print(df[['Age', 'Age_Category']].head())
print(df[['Education', 'Education_Level']].head())
print(df[['Income', 'Income_Category']].head())


   MentHlth MentHlth_Category
0       5.0    Some poor days
1       0.0               NaN
2       0.0               NaN
3       0.0               NaN
4       0.0               NaN
   PhysHlth PhysHlth_Category
0      30.0    Chronic issues
1       0.0               NaN
2      10.0    Chronic issues
3       3.0    Some poor days
4       0.0               NaN
    Age Age_Category
0   4.0        35-39
1  12.0        75-79
2  13.0  80 or older
3  11.0        70-74
4   8.0        55-59
   Education                                    Education_Level
0        6.0                College 4+ years (College graduate)
1        6.0                College 4+ years (College graduate)
2        6.0                College 4+ years (College graduate)
3        6.0                College 4+ years (College graduate)
4        5.0  College 1-3 years (Some college or technical s...
   Income Income_Category
0     8.0        >$75,000
1     8.0        >$75,000
2     8.0        >$75,000
3     8.0        >$75,000


In [31]:
# Drop original integer columns
df_transformed = df.drop(columns=['BMI', 'MentHlth', 'PhysHlth', 'Age', 'Education', 'Income'])

# Convert categorical columns to binary
df_transactions = pd.get_dummies(df_transformed)

print(df_transactions.head())  # Preview the transactional dataset


   Diabetes_binary  HighBP  HighChol  CholCheck  Smoker  Stroke  \
0              0.0     1.0       0.0        1.0     0.0     0.0   
1              0.0     1.0       1.0        1.0     1.0     1.0   
2              0.0     0.0       0.0        1.0     0.0     0.0   
3              0.0     1.0       1.0        1.0     1.0     0.0   
4              0.0     0.0       0.0        1.0     1.0     0.0   

   HeartDiseaseorAttack  PhysActivity  Fruits  Veggies  ...  \
0                   0.0           1.0     0.0      1.0  ...   
1                   0.0           0.0     1.0      0.0  ...   
2                   0.0           1.0     1.0      1.0  ...   
3                   0.0           1.0     1.0      1.0  ...   
4                   0.0           1.0     1.0      1.0  ...   

   Education_Level_Grades 9-11 (Some high school)  \
0                                           False   
1                                           False   
2                                           False   
3     

In [35]:
# Set display options
pd.set_option('display.max_columns', None)

# View the DataFrame
print(df_transactions.head())

   Diabetes_binary  HighBP  HighChol  CholCheck  Smoker  Stroke  \
0              0.0     1.0       0.0        1.0     0.0     0.0   
1              0.0     1.0       1.0        1.0     1.0     1.0   
2              0.0     0.0       0.0        1.0     0.0     0.0   
3              0.0     1.0       1.0        1.0     1.0     0.0   
4              0.0     0.0       0.0        1.0     1.0     0.0   

   HeartDiseaseorAttack  PhysActivity  Fruits  Veggies  HvyAlcoholConsump  \
0                   0.0           1.0     0.0      1.0                0.0   
1                   0.0           0.0     1.0      0.0                0.0   
2                   0.0           1.0     1.0      1.0                0.0   
3                   0.0           1.0     1.0      1.0                0.0   
4                   0.0           1.0     1.0      1.0                0.0   

   AnyHealthcare  NoDocbcCost  GenHlth  DiffWalk  Sex  BMI_Underweight  \
0            1.0          0.0      3.0       0.0  1.0       