In [None]:
# Remove outlier rows with the 1.5 * IQR rule.
# A row is kept only when EVERY considered numeric column lies inside
# [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]; a single out-of-range value drops the row.
numerical_dataset = df_original.select_dtypes(include=[np.number])
numeric_cols = numerical_dataset.columns.tolist()

Q1 = numerical_dataset.quantile(0.25)
Q3 = numerical_dataset.quantile(0.75)
IQR = Q3 - Q1

upper_bound = Q3 + 1.5 * IQR
lower_bound = Q1 - 1.5 * IQR

# Comparisons against NaN evaluate to False, so a missing value is explicitly
# treated as "not an outlier"; otherwise it would silently drop the row.
within_bounds = (
    (numerical_dataset >= lower_bound) & (numerical_dataset <= upper_bound)
) | numerical_dataset.isna()

# A column whose IQR is 0 (constant, or a 0/1 column dominated by one value)
# has lower_bound == upper_bound, so every other value would be flagged.
# Such columns carry no usable spread and are left out of the decision.
informative_cols = IQR[IQR > 0].index

# .all(axis=1), not .any(axis=1): with .any a row survived as soon as one
# column was in range, so rows containing outliers were almost never removed.
condition = within_bounds[informative_cols].all(axis=1)

# .copy() detaches the result from the unfiltered frame so that later column
# assignments operate on an independent DataFrame.
df_original = df_original.loc[condition].copy()

print("Shape after outlier removal:", df_original.shape)

In [None]:
# Create BMI categories using ORIGINAL (unscaled) values
def bmi_category(bmi):
    """Return the WHO weight-status label for a BMI value (kg/m²).

    The value is checked against ascending exclusive upper limits and the
    first limit it falls below decides the label:

        bmi < 18.5  -> 'Underweight'
        bmi < 25    -> 'Normal'
        bmi < 30    -> 'Overweight'
        otherwise   -> 'Obese'

    NOTE: a NaN input is not below any limit and therefore ends up as 'Obese'.
    """
    who_upper_limits = (
        (18.5, 'Underweight'),
        (25, 'Normal'),
        (30, 'Overweight'),
    )
    for upper_limit, label in who_upper_limits:
        if bmi < upper_limit:
            return label
    return 'Obese'
    
def age_group(age: float) -> str:
    """Categorize a person into an age group.

    Args:
        age: Age in years. May be an int or a float (a pandas column that
            contains missing values is stored as float).

    Returns:
        One of "Invalid", "Child" (< 13), "Teen" (< 20), "Young Adult" (< 36),
        "Adult" (< 51), "Middle-aged" (< 66) or "Senior" (66 and above).
        Negative and NaN ages yield "Invalid".
    """
    # `not age >= 0` is True for negative ages AND for NaN, because every
    # comparison with NaN is False. A plain `age < 0` lets NaN fall through
    # all branches and mislabels a missing age as "Senior".
    if not age >= 0:
        return "Invalid"
    elif age < 13:
        return "Child"
    elif age < 20:
        return "Teen"
    elif age < 36:
        return "Young Adult"
    elif age < 51:
        return "Adult"
    elif age < 66:
        return "Middle-aged"
    else:
        return "Senior"

# Feature engineering on the ORIGINAL (unscaled) clinical measurements.

# Small offset added to HDL wherever it is used as a denominator, so that an
# HDL of 0 cannot produce inf/NaN. Both HDL-based ratios share it; previously
# only LDL_HDL_Ratio was guarded and Cholesterol_Ratio divided by raw HDL.
HDL_EPSILON = 1e-5
hdl_denominator = df_original['HDL'] + HDL_EPSILON

# .assign returns a new frame (no in-place mutation) and appends the new
# columns in the order they are listed here.
df_original = df_original.assign(
    Pulse_Pressure=df_original['Systolic_BP'] - df_original['Diastolic_BP'],
    Cholesterol_Ratio=(
        df_original['LDL'] + df_original['HDL'] + df_original['Triglycerides'] / 5
    ) / hdl_denominator,
    Cholesterol_BMI=df_original['Cholesterol'] * df_original['BMI'],
    BP_Interaction=df_original['Systolic_BP'] * df_original['Diastolic_BP'],
    LDL_HDL_Ratio=df_original['LDL'] / hdl_denominator,
    Triglycerides_Glucose=df_original['Triglycerides'] * df_original['Glucose'],
)

# One-hot encode the BMI category and age group. The label Series are encoded
# directly, so no temporary 'BMI_Category' / 'Age_Group' column has to be
# added to the frame and dropped again afterwards.
bmi_category_dummies = pd.get_dummies(
    df_original['BMI'].apply(bmi_category), prefix="BMI_Category"
).astype(int)
age_category_dummies = pd.get_dummies(
    df_original['Age'].apply(age_group), prefix="Age_Group"
).astype(int)

# Append the encoded columns: BMI categories first, then age groups.
df_original = pd.concat(
    [df_original, bmi_category_dummies, age_category_dummies], axis=1
)

# Report the new shape
print("\nAfter Feature Engineering:")
print("Dataset shape:", df_original.shape)

df_original.to_csv("results/outputs/hypertension_dataset(encoded-balanced-feature_engineered).csv", index=False)

In [None]:
# Distribution of each engineered feature: histogram with a KDE overlay,
# laid out on a 3 x 2 grid and saved as a single image.
numeric_features = [
    'Pulse_Pressure',
    'Cholesterol_Ratio',
    'Cholesterol_BMI',
    'BP_Interaction',
    'LDL_HDL_Ratio',
    'Triglycerides_Glucose',
]

fig, axes = plt.subplots(3, 2, figsize=(15, 10))
# axes.flat walks the grid row by row, matching subplot indices 1..6.
for ax, feature in zip(axes.flat, numeric_features):
    sns.histplot(df_original[feature], kde=True, bins=30, color='skyblue', ax=ax)
    ax.set_title(f'Distribution of {feature}')
fig.tight_layout()
fig.savefig('./results/eda_visualizations/9_Pulse_Pressure,Cholesterol_Ratio,Cholesterol_BMI,BP_Interaction,LDL_HDL_Ratio,Triglycerides_Glucose.png')
plt.show()


In [None]:
# Box plot of every engineered feature, split by the target class.
# Relies on `numeric_features` defined in the distribution-plot cell.
target = 'Hypertension'  # replace if your target column has a different name

fig, axes = plt.subplots(3, 2, figsize=(15, 10))
# axes.flat walks the grid row by row, matching subplot indices 1..6.
for ax, feature in zip(axes.flat, numeric_features):
    sns.boxplot(x=target, y=df_original[feature], data=df_original, ax=ax)
    ax.set_title(f'{feature} vs {target}')
fig.tight_layout()
fig.savefig('./results/eda_visualizations/10_Hypertension_vc_Pulse_Pressure,Cholesterol_Ratio,Cholesterol_BM,BP_Interaction,LDL_HDL_Ratio,Triglycerides_Glucose.png')
plt.show()