In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory lazily and print the full path of every file found.
input_file_paths = (
    os.path.join(current_dir, file_name)
    for current_dir, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/diabetes-health-indicators-dataset/diabetes_binary_5050split_health_indicators_BRFSS2015.csv
/kaggle/input/diabetes-health-indicators-dataset/diabetes_binary_health_indicators_BRFSS2015.csv
/kaggle/input/diabetes-health-indicators-dataset/diabetes_012_health_indicators_BRFSS2015.csv


# PLAN OF WORK

1. BMI ---> categorical (bins)
2. scale data (StandardScaler)
3. EDA:
    - correlation etc.
  


# Data preparation

## Converting a continuous feature into a set of binary ones (continuous -> categories -> one-hot encoding)

### Module imports

In [2]:
from sklearn.preprocessing import OneHotEncoder
from pandas import DataFrame, Series
from typing import Union, Literal



### Continuous -> discrete

In [3]:
def convertBMIIndexToObesityCategory(bmi_index: float, log_warnings: bool = False) -> Literal["Underweight", "Healthy weight","Overweight","Obese"]:
    """
    Converts BMI index to a corresponding obesity category.
    ---
    Parameters:

    - `bmi_index`: BMI value to classify.
    - `log_warnings`: when True, prints a notice for implausibly low (< 9)
      or implausibly high (> 186) values; the value is still classified.

    Returns:

    - "Underweight"    for BMI < 18.5
    - "Healthy weight" for 18.5 <= BMI < 25
    - "Overweight"     for 25 <= BMI < 30
    - "Obese"          for BMI >= 30

    Raises:

    - `ValueError` if `bmi_index` is NaN.
    """
    # NaN is the only value that is not equal to itself. Without this guard
    # every comparison below is False and a missing value would silently be
    # labelled "Obese".
    if bmi_index != bmi_index:
        raise ValueError("BMI index is NaN and cannot be categorized.")
    if bmi_index < 9 and log_warnings:
        print(f"Very low BMI index encountered: {bmi_index}.")
    if bmi_index < 18.5:
        return "Underweight"
    # Upper bounds are the start of the next band (25 and 30), not 24.9/29.9,
    # so values such as 24.95 or 29.95 do not fall into the wrong category.
    if bmi_index < 25:
        return "Healthy weight"
    if bmi_index < 30:
        return "Overweight"
    if bmi_index > 186 and log_warnings:
        print(f"Very high BMI index encountered: {bmi_index}.")
    return "Obese"
    
def convertBMIIndexColumnToObesityCategories(bmi_column: Series) -> Series:
    """
    Converts BMI indices in column to obesity category.
    ---
    Parameters:

    - `bmi_column`: Series of BMI values.

    Returns:

    A categorical Series with one obesity category per input value. When the
    input is a pandas Series its index is preserved, so the result can be
    assigned back to the originating DataFrame without misalignment.
    """
    category_list = [
        convertBMIIndexToObesityCategory(bmi_index, log_warnings = True)
        for bmi_index in bmi_column
    ]
    # Reuse the caller's index: a fresh RangeIndex would misalign (and produce
    # NaN) on assignment to a frame that was filtered, shuffled or re-indexed.
    # Other iterables have no row labels, so they keep the default index.
    result_index = bmi_column.index if isinstance(bmi_column, Series) else None
    return Series(category_list, index = result_index, dtype = 'category')

### Discrete -> one-hot encoded

In [4]:
def oneHotEncodeBMIColumn(dataset: DataFrame, feature_column_name: str) -> DataFrame:
    """
    Converts BMI column in dataset into series of one hot encoded obesity categories.
    ---
    Parameters:

    - `dataset`: DataFrame that contains column to be converted. It is not modified.
    - `feature_column_name`: name of the BMI column to categorize and encode.

    Returns:

    A new DataFrame in which the selected column is replaced by one
    `<feature_column_name>_<category>` indicator column per obesity category,
    appended after the remaining columns.

    Raises:

    - `TypeError` if `dataset` is not a pandas DataFrame.
    - `IndexError` if `feature_column_name` is not a column of `dataset`.
    """
    if not isinstance(dataset, DataFrame):
        raise TypeError("Dataset has incorrect type; use pandas.DataFrame.")
    try:
        bmi_column = dataset[feature_column_name]
    except KeyError as error:
        raise IndexError(f"Incorrect feature column name: {feature_column_name}") from error
    # Attach the categories positionally (as a plain array) under the dataset's
    # own index, so rows stay aligned even when the index is not 0..n-1.
    categories = convertBMIIndexColumnToObesityCategories(bmi_column)
    category_frame = DataFrame(
        {feature_column_name: categories.to_numpy()},
        index = dataset.index,
    )
    encoder = OneHotEncoder(sparse_output = False)
    feature_encoded = DataFrame(
        encoder.fit_transform(category_frame),
        columns = encoder.get_feature_names_out([feature_column_name]),
        index = dataset.index,
    )
    # Build a new frame instead of overwriting/dropping columns on the caller's
    # object, so re-running the calling cell works on the untouched input.
    remaining_columns = dataset.drop(columns = [feature_column_name])
    return pd.concat([remaining_columns, feature_encoded], axis = 1)

In [5]:
# Load the balanced (50/50 split) dataset and replace the numeric BMI column
# with its obesity category.

DIABETES_5050_CSV_PATH = "/kaggle/input/diabetes-health-indicators-dataset/diabetes_binary_5050split_health_indicators_BRFSS2015.csv"

diabetes_dataset_5050_split = pd.read_csv(DIABETES_5050_CSV_PATH).assign(
    BMI = lambda frame: convertBMIIndexColumnToObesityCategories(frame["BMI"])
)

print(diabetes_dataset_5050_split.head())

   Diabetes_binary  HighBP  HighChol  CholCheck         BMI  Smoker  Stroke  \
0              0.0     1.0       0.0        1.0  Overweight     0.0     0.0   
1              0.0     1.0       1.0        1.0  Overweight     1.0     1.0   
2              0.0     0.0       0.0        1.0  Overweight     0.0     0.0   
3              0.0     1.0       1.0        1.0  Overweight     1.0     0.0   
4              0.0     0.0       0.0        1.0  Overweight     1.0     0.0   

   HeartDiseaseorAttack  PhysActivity  Fruits  ...  AnyHealthcare  \
0                   0.0           1.0     0.0  ...            1.0   
1                   0.0           0.0     1.0  ...            1.0   
2                   0.0           1.0     1.0  ...            1.0   
3                   0.0           1.0     1.0  ...            1.0   
4                   0.0           1.0     1.0  ...            1.0   

   NoDocbcCost  GenHlth  MentHlth  PhysHlth  DiffWalk  Sex   Age  Education  \
0          0.0      3.0       5

In [6]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [7]:
# `df` is a second name for the same DataFrame object (plain assignment makes
# no copy), so in-place changes made through either name are visible via both.
df = diabetes_dataset_5050_split
df.head()



Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,0.0,1.0,Overweight,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,3.0,5.0,30.0,0.0,1.0,4.0,6.0,8.0
1,0.0,1.0,1.0,1.0,Overweight,1.0,1.0,0.0,0.0,1.0,...,1.0,0.0,3.0,0.0,0.0,0.0,1.0,12.0,6.0,8.0
2,0.0,0.0,0.0,1.0,Overweight,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,1.0,0.0,10.0,0.0,1.0,13.0,6.0,8.0
3,0.0,1.0,1.0,1.0,Overweight,1.0,0.0,0.0,1.0,1.0,...,1.0,0.0,3.0,0.0,3.0,0.0,1.0,11.0,6.0,8.0
4,0.0,0.0,0.0,1.0,Overweight,1.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,8.0,5.0,8.0


In [8]:
# Print the frame summary: row count, per-column non-null counts and dtypes.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70692 entries, 0 to 70691
Data columns (total 22 columns):
 #   Column                Non-Null Count  Dtype   
---  ------                --------------  -----   
 0   Diabetes_binary       70692 non-null  float64 
 1   HighBP                70692 non-null  float64 
 2   HighChol              70692 non-null  float64 
 3   CholCheck             70692 non-null  float64 
 4   BMI                   70692 non-null  category
 5   Smoker                70692 non-null  float64 
 6   Stroke                70692 non-null  float64 
 7   HeartDiseaseorAttack  70692 non-null  float64 
 8   PhysActivity          70692 non-null  float64 
 9   Fruits                70692 non-null  float64 
 10  Veggies               70692 non-null  float64 
 11  HvyAlcoholConsump     70692 non-null  float64 
 12  AnyHealthcare         70692 non-null  float64 
 13  NoDocbcCost           70692 non-null  float64 
 14  GenHlth               70692 non-null  float64 
 15  Me