In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the cleaned dataset into a DataFrame. The path is relative, so it is
# resolved against the notebook's current working directory.
file_path = '../data/clean_data.csv'
clean_data = pd.read_csv(filepath_or_buffer=file_path)

# 1. Polynomial and Interaction Features

In [None]:
# Interaction terms: element-wise products of two columns, added as new
# columns. BMI x Age and PhysHlth x BMI may capture combined health impacts
# that the individual columns do not express on their own.
clean_data['BMI_Age'] = clean_data['BMI'].mul(clean_data['Age'])
clean_data['PhysHlth_BMI'] = clean_data['PhysHlth'].mul(clean_data['BMI'])

# 2. Binning for Age and BMI

In [None]:
# Binned variables: Age and BMI are categorised into groups so that
# non-linear relationships can be captured.

# Age groups use right-closed intervals: (0, 30], (30, 45], (45, 60], (60, inf).
# The top edge is open-ended so that ages above 100 fall into 'Senior'
# instead of silently becoming NaN.
clean_data['Age_bin'] = pd.cut(
    clean_data['Age'],
    bins=[0, 30, 45, 60, np.inf],
    labels=['Young', 'Adult', 'MiddleAge', 'Senior'],
)

# BMI groups use left-closed intervals: [0, 18.5), [18.5, 25), [25, 30), [30, inf).
# right=False keeps a BMI of exactly 18.5 out of 'Underweight', and using 25 / 30
# as edges (rather than 24.9 / 29.9) stops values such as 24.95 or 29.95 from
# spilling into the next category. The open top edge avoids NaN for BMI > 100.
clean_data['BMI_bin'] = pd.cut(
    clean_data['BMI'],
    bins=[0, 18.5, 25, 30, np.inf],
    right=False,
    labels=['Underweight', 'Normal', 'Overweight', 'Obese'],
)

# 3. Aggregated Health Scores

In [9]:
# Lifestyle and Risk Scores: These aggregate scores
# simplify complex health information
# and may correlate with outcomes like diabetes.
#
# (Written as comments rather than a bare string literal: a string that is the
# last expression of a cell is echoed back as the cell's output.)

'Lifestyle and Risk Scores: These aggregate scores \nsimplify complex health information \nand may correlate with outcomes like diabetes.'

In [5]:
# Healthy lifestyle score: the row-wise sum of the positive behaviour columns
# (Fruits, Veggies, PhysActivity) minus the row-wise sum of the negative ones
# (Smoker, HvyAlcoholConsump). Higher means healthier.
# Assumes every input column is a 0/1 indicator, which would give a range of
# -2 to 3 -- TODO confirm against the data dictionary.
clean_data['HealthyLifestyleScore'] = (
    clean_data[['Fruits', 'Veggies', 'PhysActivity']].sum(axis='columns')
    - clean_data[['Smoker', 'HvyAlcoholConsump']].sum(axis='columns')
)

In [6]:
# Health risk score: row-wise total of the chronic-condition columns.
# If these are 0/1 flags (as the head() preview suggests for the ones shown),
# the score is the number of conditions present, from 0 to 5.
clean_data['HealthRiskScore'] = clean_data[
    [
        'HighBP',
        'HighChol',
        'Stroke',
        'HeartDiseaseorAttack',
        'DiffWalk',
    ]
].sum(axis='columns')

# 4. Scaling Continuous Variables

In [7]:
# Standardise the continuous columns and store each result next to its
# source column as '<name>_scaled'; the unscaled columns are left untouched.
# NOTE(review): the scaler is fitted on the whole frame. If a train/test split
# is made later, fit on the training portion only to avoid leakage -- confirm.
continuous_cols = ['BMI', 'Age', 'PhysHlth', 'MentHlth']
scaler = StandardScaler()
clean_data[[f'{col}_scaled' for col in continuous_cols]] = scaler.fit_transform(
    clean_data[continuous_cols]
)

# Display the updated DataFrame and save the dataset with the new features

In [8]:
# Preview the first rows of the engineered frame. The frame is left as the
# cell's last expression so Jupyter renders it as a table; print() would emit
# wrapped plain text that is hard to read for a frame this wide.
print("Data with New Features:")
clean_data.head()

Data with New Features:
     BMI  PhysHlth  Age  HighBP  HighChol  CholCheck  Smoker  Stroke  \
0  32.0       0.0   85       1         1          1       0       0   
1  24.0       0.0   85       1         0          1       0       0   
2  20.0       0.0   57       0         1          1       1       0   
3  35.0       0.0   72       1         1          1       1       0   
4  29.0       2.0   62       1         1          1       1       0   

   HeartDiseaseorAttack  PhysActivity  ...  BMI_Age  PhysHlth_BMI    Age_bin  \
0                     0             0  ...   2720.0           0.0     Senior   
1                     0             0  ...   2040.0           0.0     Senior   
2                     0             0  ...   1140.0           0.0  MiddleAge   
3                     0             0  ...   2520.0           0.0     Senior   
4                     0             1  ...   1798.0          58.0     Senior   

      BMI_bin  HealthyLifestyleScore  HealthRiskScore  BMI_scaled  

In [11]:
# Persist the frame, including every engineered column, as CSV.
# index=False leaves the row index out of the file.
output_path = '../data/clean_data_with_features.csv'
clean_data.to_csv(output_path, index=False)
print("Dataset with new features saved as 'clean_data_with_features.csv'")

Dataset with new features saved as 'clean_data_with_features.csv'
