In [None]:
# Feature Selection
# Pull out the 'Hypertension' target first; every remaining column of
# df_original is used as a predictor.
y = df_original['Hypertension']
X = df_original.drop(columns='Hypertension')

# Hold out 20% of the rows for testing. The fixed random_state keeps the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print(f"\nTraining set shape: {X_train.shape}")
print(f"Testing set shape: {X_test.shape}")

# Last expression of the cell: summary statistics of the full frame are
# rendered as the cell output.
df_original.describe()

In [None]:
# Variance Threshold
# Fit the filter on the training split only, then read back which columns
# passed the 0.8 variance threshold.
selector = VarianceThreshold(threshold=0.8)
selector.fit(X_train)
X_high_var = selector.transform(X_train)

# get_support() is a boolean mask aligned with X_train.columns.
var_features = X_train.columns[selector.get_support()]

print(f"Original features: {X_train.shape[1]}")
print(f"Features after variance threshold: {len(var_features)}")

print("Features by variance threshold :", var_features.tolist())

In [None]:
# Bar chart of the per-feature variances measured by the fitted
# VarianceThreshold selector.
# NOTE: `selector` must still be the VarianceThreshold instance here; a later
# cell rebinds the same name to a SelectKBest object, so run cells in order.
variances = selector.variances_
plt.figure(figsize=(12, 14))
sns.barplot(x=variances, y=X_train.columns)
# Draw the cut-off at the threshold the selector was actually built with,
# instead of a hard-coded value that can disagree with the selection.
plt.axvline(x=selector.threshold, color='r', linestyle='--')
plt.title('Feature Variances')
plt.tight_layout()
plt.savefig('./results/eda_visualizations/13_feature_variance.png')
plt.show()

In [None]:
#  Mutual Information
# Score every training feature against the target. mutual_info_classif uses
# random numbers internally, so it is seeded to make the ranking repeatable.
mi_scores = mutual_info_classif(X_train, y_train, random_state=42)
mi_scores = pd.Series(mi_scores, index=X_train.columns)

# Select the top_k_mi features with the HIGHEST mutual information.
# Sorting must be descending: an ascending sort followed by head() would
# return the least informative features instead.
top_k_mi = 25
mi_features = mi_scores.sort_values(ascending=False).head(top_k_mi).index.tolist()
print(f"Top {top_k_mi} features by Mutual Information: {mi_features}")

In [None]:
# Visualize mutual information scores
# Sort once and take BOTH the bar lengths and the bar labels from the same
# sorted Series; pairing sorted values with the unsorted index would attach
# the wrong feature name to each bar.
mi_sorted = mi_scores.sort_values(ascending=False)
plt.figure(figsize=(12, 14))
sns.barplot(x=mi_sorted.values, y=mi_sorted.index)
plt.title('Feature Importance (Mutual Information)')
plt.tight_layout()
plt.savefig('./results/eda_visualizations/14_feature_importance_mutual_information.png')
plt.show()

In [None]:
# Random Forest
# Train a seeded 100-tree forest on the training split; its
# feature_importances_ attribute supplies one score per column.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

feature_names = X_train.columns
importances = rf.feature_importances_

# Label the scores with their column names and rank from most to least
# important.
forest_importances = pd.Series(importances, index=feature_names)
forest_importances = forest_importances.sort_values(ascending=False)

# Keep the top_k_rf highest-ranked features.
top_k_rf = 25
rf_features = forest_importances.iloc[:top_k_rf].index.tolist()
print(f"Top {top_k_rf} features by Random Forest: {rf_features}")


In [None]:
# Bar chart of the ranked Random Forest importances (values and labels both
# come from the same sorted Series, so they stay aligned).
plt.figure(figsize=(12, 20))
sns.barplot(x=forest_importances.values, y=forest_importances.index)
plt.title('Feature Importances (Random Forest)')
# Apply the layout BEFORE saving so the PNG on disk gets the same spacing as
# the figure shown in the notebook.
plt.tight_layout()
plt.savefig('./results/eda_visualizations/15_feature_importance_random_forest.png')
plt.show()

In [None]:
# SelectKBest
top_k = 25


def seeded_mutual_info(X, y):
    """Score features with mutual_info_classif using a fixed random_state.

    SelectKBest calls its score function as score_func(X, y) and gives no way
    to forward extra arguments, so the seed is bound here to make the
    selection repeatable between runs.
    """
    return mutual_info_classif(X, y, random_state=42)


# NOTE: this rebinds `selector`, which an earlier cell used for the
# VarianceThreshold object; cells that read `selector.variances_` must be run
# before this one.
selector = SelectKBest(score_func=seeded_mutual_info, k=top_k)
X_new = selector.fit_transform(X_train, y_train)

# get_support() is a boolean mask aligned with X_train.columns.
kbest_features = X_train.columns[selector.get_support()].tolist()
# Report the configured k rather than a hard-coded number.
print(f"Top {top_k} features by SelectKBest: {kbest_features}")

In [None]:
# Pair every training column with the score the fitted SelectKBest selector
# assigned to it.
feature_scores = pd.DataFrame({
    'Feature': X_train.columns,
    'Score': selector.scores_
})

# Drop features without a score and rank the rest from best to worst.
feature_scores = feature_scores.dropna().sort_values(by="Score", ascending=False)

plt.figure(figsize=(12,14))
sns.barplot(x="Score", y="Feature", data=feature_scores)
plt.title('Feature Scores (SelectKBest with mutual_info_classif)')
# Apply the layout BEFORE saving so the PNG on disk gets the same spacing as
# the figure shown in the notebook.
plt.tight_layout()
plt.savefig('./results/eda_visualizations/16_feature_importance_select_k_best.png')
plt.show()

In [None]:
# select features
# Pool the picks of the four selection methods into one flat list, then
# count how many methods chose each feature.
all_features = [
    *var_features.tolist(),
    *mi_features,
    *kbest_features,
    *rf_features,
]
feature_counts = Counter(all_features)
print(feature_counts)

In [None]:
# Split the Counter into two parallel lists: the feature labels and the
# number of methods that selected each one.
feature_names = [name for name in feature_counts]
counts = [feature_counts[name] for name in feature_names]

# Feature selection summary: one bar per feature, height = selection count.
plt.figure(figsize=(12, 6))
plt.bar(feature_names, counts)
plt.title("Feature Selection Frequency (across methods)")
plt.xlabel("Features")
plt.ylabel("Selection Count")
# Feature names are long, so rotate the tick labels to keep them readable.
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()

In [None]:
# Keep only the features that at least two selection methods agreed on,
# in the order they were first counted.
final_features = [
    name for name in feature_counts
    if feature_counts[name] >= 2
]

print("Final selected features:", final_features)
print("Count:", len(final_features))

In [None]:
# Create new DataFrame with only the final selected features plus the
# 'Hypertension' target column.
df_selected = df_original[final_features + ['Hypertension']]

print("Shape of new dataset:", df_selected.shape)

# Persist the reduced dataset (without the row index).
df_selected.to_csv("results/outputs/hypertension_dataset(encoded-balanced-feature_engineered-scaled-selected).csv", index=False)

# Keep the preview as the LAST expression of the cell: a bare expression in
# the middle of a cell is evaluated but never rendered by the notebook.
df_selected.head()