In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input tree and echo the full path of every file found in it.
for current_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_path in (os.path.join(current_dir, name) for name in file_names):
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/diabetes-health-indicators-dataset/diabetes_binary_5050split_health_indicators_BRFSS2015.csv
/kaggle/input/diabetes-health-indicators-dataset/diabetes_binary_health_indicators_BRFSS2015.csv
/kaggle/input/diabetes-health-indicators-dataset/diabetes_012_health_indicators_BRFSS2015.csv


# Data preparation

## Converting a continuous feature into a set of binary features (continuous -> categories -> one-hot encoding)

### Module imports

In [2]:
from sklearn.preprocessing import OneHotEncoder
from pandas import DataFrame, Series
from typing import Union, Literal
from math import sqrt, ceil



### Continuous -> discrete

In [3]:
def determineBin(value: float|int, bin_lower_bounds: list) -> int|None:
    """
    Determines which bin the passed value fits into.
    
    Parameters
    ---
    - `value`: a numeric value to be labeled.
    - `bin_lower_bounds`: a list of histogram bin lower bounds.

    Example
    ---
    For these three bins:
        - 1.5 - 1.99
        - 2.0 - 2.49
        - 2.5 - 2.99

    bin lower bounds list would be `[1.5, 2.0, 2.5]`.
    """
    first_bin_lower_bound = bin_lower_bounds[0]
    if value < first_bin_lower_bound:
        return None
    last_bin_lower_bound = bin_lower_bounds[-1]
    if value > last_bin_lower_bound:
        last_bin_index = len(bin_lower_bounds)
        return last_bin_index
    for bin_index, bin_lower_bound in enumerate(bin_lower_bounds):
        if value < bin_lower_bound:
            return bin_index

def roundUpToAccuracy(value: float, multiple: float) -> float:
    return multiple * round(value / multiple)

def convertContinuousVariableIntoBins(feature: Series, measure_accuracy: float, bin_num: Literal["auto"]|int = "auto") -> None:
    """
    Converts continuous value vector into a vector histogram bin indices that each value fits into.

    Parameters
    ---
    - `feature`: a pandas.Series object that contains feature's value vector.
    - `measure_accuracy` - the lowest value that could be measured in feature, e.g. for value vector `[1.2, 1.3, 3.4, 2.1]` measure accuracy would be `0.1` and for vector `[1.5, 3.5, 0.5, 15.0]` measure accuracy would be `0.5`.
    """
    amount_of_values = len(feature)
    if amount_of_values == 0:
        raise ValueError("Feature vector is empty.")
    estimated_bin_amount = int(sqrt(amount_of_values))
    min_bin_amount = 5
    max_bin_amount = 30
    bin_amount: int = 0
    def determineBinAmount(min_bin_amount: int, estimated_bin_amount: int, max_bin_amount: int) -> int:
        if estimated_bin_amount < min_bin_amount:
            return min_bin_amount
        if estimated_bin_amount > max_bin_amount:
            return max_bin_amount
        return estimated_bin_amount
    bin_amount = determineBinAmount(min_bin_amount, estimated_bin_amount, max_bin_amount) if bin_num == "auto" else bin_num
    range_length = max(feature) - min(feature)
    bin_length = roundUpToAccuracy(range_length / bin_amount, measure_accuracy)
    first_bin_lower_bound_offset = ((bin_amount * bin_length) - range_length) /2
    first_bin_lower_bound = min(feature) - first_bin_lower_bound_offset
    bin_lower_bounds = [None] * bin_amount #appending values in loop is a bad practice
    for bin_bound_number, _ in enumerate(bin_lower_bounds):
        bin_lower_bounds[bin_bound_number] = first_bin_lower_bound + bin_bound_number * bin_length
    for sample_index, sample_feature_value in enumerate(feature):
        feature[sample_index] = determineBin(sample_feature_value, bin_lower_bounds)
    feature = feature.astype("category")
    

### Discrete -> one-hot encoded

In [4]:
def oneHotEncodeContinousVariable(dataset: DataFrame, feature_column_name: str, measure_accuracy: float = 1.0, bin_num: Literal["auto"]|int = "auto") -> DataFrame:
    """
    Converts continuous feature vector in dataset into series of one hot encoded categories; every column should hold data for only one feature.

    The passed `dataset` is not modified; a new DataFrame is returned, so the
    function can be called repeatedly on the same input.
    ---
    Parameters:

    - `dataset`: DataFrame that contains feature to be converted.
    - `feature_column_name`: name of feature's column.
    - `measure_accuracy`: the lowest value that could be measured in feature; passed on to `convertContinuousVariableIntoBins`.
    - `bin_num`: number of bins or `"auto"`; passed on to `convertContinuousVariableIntoBins`.

    Returns:

    A DataFrame with selected feature categorized and one hot encoded. The
    original feature column is removed and the encoded columns are appended
    after the remaining ones; row index of `dataset` is preserved.

    Raises:

    - `TypeError` when `dataset` is not a pandas.DataFrame.
    - `IndexError` when `dataset` has no column named `feature_column_name`.
    """
    if not isinstance(dataset, DataFrame):
        raise TypeError("Dataset has incorrect type; use pandas.DataFrame.")
    try:
        feature_column = dataset[feature_column_name]
    except KeyError as error:
        raise IndexError(f"Incorrect feature column name: {feature_column_name}") from error
    # Bin an independent copy with a fresh 0..n-1 index: the binning works in
    # place, and writing through a column selected from `dataset` is neither
    # guaranteed to reach the DataFrame nor safe for the caller's data.
    binned_feature = Series(feature_column.to_numpy(copy=True), name=feature_column_name)
    convertContinuousVariableIntoBins(feature = binned_feature, measure_accuracy = measure_accuracy, bin_num = bin_num)
    encoder = OneHotEncoder(sparse_output = False)
    encoded_values = encoder.fit_transform(binned_feature.to_frame(name = feature_column_name))
    # Reuse the dataset's index so that concat aligns rows one-to-one even
    # when the dataset does not use the default 0..n-1 index.
    feature_encoded = DataFrame(
        encoded_values,
        columns = encoder.get_feature_names_out([feature_column_name]),
        index = dataset.index,
    )
    remaining_columns = dataset.drop(columns = [feature_column_name])
    return pd.concat([remaining_columns, feature_encoded], axis=1)

In [5]:
# Data import and preprocessing
#
# Read the 50/50-split variant of the dataset (per the file name), replace the
# BMI column with one-hot encoded bins and preview the first rows of the result.

diabetes_dataset_5050_split = pd.read_csv(
    "/kaggle/input/diabetes-health-indicators-dataset/diabetes_binary_5050split_health_indicators_BRFSS2015.csv"
)
diabetes_dataset_5050_split_ohencoded = oneHotEncodeContinousVariable(
    dataset=diabetes_dataset_5050_split,
    feature_column_name="BMI",
)
print(diabetes_dataset_5050_split_ohencoded.head(n=5))

   Diabetes_binary  HighBP  HighChol  CholCheck  Smoker  Stroke  \
0              0.0     1.0       0.0        1.0     0.0     0.0   
1              0.0     1.0       1.0        1.0     1.0     1.0   
2              0.0     0.0       0.0        1.0     0.0     0.0   
3              0.0     1.0       1.0        1.0     1.0     0.0   
4              0.0     0.0       0.0        1.0     1.0     0.0   

   HeartDiseaseorAttack  PhysActivity  Fruits  Veggies  ...  BMI_21.0  \
0                   0.0           1.0     0.0      1.0  ...       0.0   
1                   0.0           0.0     1.0      0.0  ...       0.0   
2                   0.0           1.0     1.0      1.0  ...       0.0   
3                   0.0           1.0     1.0      1.0  ...       0.0   
4                   0.0           1.0     1.0      1.0  ...       0.0   

   BMI_22.0  BMI_23.0  BMI_24.0  BMI_25.0  BMI_26.0  BMI_27.0  BMI_28.0  \
0       0.0       0.0       0.0       0.0       0.0       0.0       0.0   
1       

# PLAN OF WORK

1. BMI ---> categorical (bins)
2. scale data (StandardScaler)
3. EDA:
    - correlation etc.
  
