In [None]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from collections import Counter
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, VarianceThreshold, mutual_info_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [None]:
# Load the raw hypertension dataset (path is relative to the notebook's working directory)
df_original = pd.read_csv("./data/raw/hypertension_dataset.csv")

# print first 5 rows
print("First 5 rows : ")
print(df_original.head())

# DataFrame.shape is a (rows, columns) tuple
print("\nDF (Rows, Columns) :", df_original.shape)

In [None]:
# Handle missing values: report the NaN count of every column first
print("\nMissing values per column:")
print(df_original.isnull().sum())

# Remove missing values.
# NOTE(review): axis=1 drops every COLUMN that contains at least one NaN, not the
# affected rows. Later cells index columns such as "Gender" and "Country" by name,
# so confirm that column-wise removal is intended; dropna(axis=0) would drop rows.
df_original = df_original.dropna(axis=1)

# head must be called: printing the bare attribute shows the bound-method repr
# instead of a five-row preview.
print(df_original.head())


In [None]:
# Missing Values Heatmap: one cell per DataFrame entry, coloured by isnull().
# NOTE(review): df_original has already been passed through dropna(axis=1) earlier in
# the notebook, so this plot is expected to show no missing cells -- confirm whether
# it was meant to run on the raw frame instead.
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(df_original.isnull(), cbar=False, cmap='viridis', ax=ax)
ax.set_title('Missing Values Heatmap')
fig.savefig('./results/eda_visualizations/1_missing_values_heatmap.png')
plt.show()

In [None]:
# Bar chart of row counts per Gender value, drawn while the column still holds
# its original (un-encoded) labels.
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(x="Gender", data=df_original, ax=ax)
ax.set_title("Gender Distribution Before Encoding")
fig.savefig('./results/eda_visualizations/2_Gender_Distribution_Before_Encodingvisualization.png')
plt.show()

In [None]:
# Count plots for the remaining categorical columns, drawn before any encoding.
# Each entry: (column, figure size, title, x-tick rotation or None, output file).
categorical_plots = [
    (
        "Country",
        (8, 5),
        "Country Distribution (Before Encoding)",
        45,  # only this plot rotates its x tick labels
        './results/eda_visualizations/3_Country_Distribution_Before_Encoding.png',
    ),
    (
        "Smoking_Status",
        (6, 4),
        "Smoking Status Distribution (Before Encoding)",
        None,
        './results/eda_visualizations/4_Smoking_Status_Distribution_Before Encoding.png',
    ),
    (
        "Employment_Status",
        (6, 4),
        "Employment Status Distribution (Before Encoding)",
        None,
        './results/eda_visualizations/5_Employment_Status_Distribution_Before_Encoding.png',
    ),
]

for column, figsize, title, rotation, out_path in categorical_plots:
    plt.figure(figsize=figsize)
    sns.countplot(x=df_original[column])
    plt.title(title)
    if rotation is not None:
        plt.xticks(rotation=rotation)
    # Save before show() so the figure is written while it is still current.
    plt.savefig(out_path)
    plt.show()


In [None]:
# Encode
print("Before Label Encoding:")
print(df_original.dtypes)

# Label Encode

# Define mappings for ordinal / binary features
hypertension_map = {'Low': 0, 'High': 1}
activity_map = {'Low': 0, 'Moderate': 1, 'High': 2}
education_level_map = {'Primary': 0, 'Secondary': 1, 'Tertiary': 2}
diabetes_map = {'No': 0, 'Yes': 1}
gender_map = {'Female': 0, 'Male': 1}
family_history_map = {'No': 0, 'Yes': 1}

# Column name -> mapping to apply to that column
label_mappings = {
    'Hypertension': hypertension_map,
    'Physical_Activity_Level': activity_map,
    'Education_Level': education_level_map,
    'Diabetes': diabetes_map,
    'Gender': gender_map,
    'Family_History': family_history_map,
}

# Phase 1: compute and validate every encoded column before touching the frame.
# Series.map() yields NaN for any value missing from the dict, which would silently
# wipe a column if a category is spelled differently or if this cell is re-run on
# already-encoded data. Fail loudly instead, and leave df_original unmodified.
encoded_columns = {}
for column, mapping in label_mappings.items():
    encoded = df_original[column].map(mapping)
    unmapped = df_original.loc[encoded.isna() & df_original[column].notna(), column]
    if not unmapped.empty:
        raise ValueError(
            f"Column {column!r} contains values with no encoding: "
            f"{unmapped.unique().tolist()} (expected one of {list(mapping)})"
        )
    encoded_columns[column] = encoded

# Phase 2: all columns validated, apply the mappings
for column, encoded in encoded_columns.items():
    df_original[column] = encoded

print("\nAfter Label Encoding:")
print(df_original.head())

In [None]:
# Show each column's dtype at this point in the notebook (after the label-encoding cell)
print(df_original.dtypes)

In [None]:
# One-Hot Encoding for multi-category columns: one integer 0/1 indicator column per
# category value, named "<prefix>_<value>".
country_dummies = pd.get_dummies(df_original["Country"], prefix="Country", dtype=int)
smoking_status_dummies = pd.get_dummies(
    df_original["Smoking_Status"], prefix="Smoking_Status", dtype=int
)
employment_status_dummies = pd.get_dummies(
    df_original["Employment_Status"], prefix="Employment_Status", dtype=int
)

# Append the indicator columns to the right of the existing ones (column-wise concat)
df_original = pd.concat(
    [df_original, country_dummies, smoking_status_dummies, employment_status_dummies],
    axis=1,
)

# Remove the source columns now that their indicator columns are in place
df_original = df_original.drop(columns=["Country", "Smoking_Status", "Employment_Status"])

print("\nAfter One-Hot Encoding:")
print("Dataset shape:", df_original.shape)

# Persist the encoded frame without the index column
df_original.to_csv("results/outputs/hypertension_dataset(encoded).csv", index=False)

In [None]:
# Gender distribution after encoding (the label-encoding cell maps Female -> 0, Male -> 1)
plt.figure(figsize=(8,5))
sns.countplot(x="Gender", data=df_original)
plt.title("Gender Distribution After Encoding")
# File name now matches the plot: the previous "..._Before_Encoding.png" name
# mislabelled this post-encoding figure.
plt.savefig('./results/eda_visualizations/6_Gender_Distribution_After_Encoding.png')
plt.show()