**Feature Engineering**

In [1]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem; the data paths used below live under this mount point.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import PolynomialFeatures, LabelEncoder
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from sklearn.ensemble import RandomForestClassifier


In [3]:
# Read the preprocessed (cleaned) heart-disease dataset from Drive into `df`
cleaned_csv_path = "/content/drive/MyDrive/Heart_Disease_Prediction/data/processed/heart_disease_cleaned.csv"
df = pd.read_csv(cleaned_csv_path)

**Identify and Encode Categorical Columns**

In [4]:
# Columns treated as categorical and converted to integer codes
categorical_columns = ['dataset', 'cp', 'restecg', 'slope', 'thal']

# One fitted LabelEncoder per column, kept in a dict keyed by column name
# so the same mapping can be reused or inverted later.
label_encoders = {name: LabelEncoder() for name in categorical_columns}
for name, encoder in label_encoders.items():
    # Fit on the column's values and overwrite the column with its codes
    df[name] = encoder.fit_transform(df[name])


**Create Polynomial Interaction Features**

In [5]:
# Create pairwise interaction features (products of two distinct input columns).
# Every column except the target 'num' is passed to PolynomialFeatures.
feature_df = df.drop(columns=['num'])
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
interaction_features = poly.fit_transform(feature_df)
interaction_df = pd.DataFrame(
    interaction_features,
    columns=poly.get_feature_names_out(feature_df.columns),
    index=df.index,  # reuse df's index so the column-wise concat aligns row for row
)

# PolynomialFeatures also returns the degree-1 terms, i.e. copies of the input
# columns under their original names. Keep only the genuinely new columns so
# that df does not end up with duplicated column labels.
existing_columns = set(df.columns)
new_interaction_columns = [c for c in interaction_df.columns if c not in existing_columns]

# Merge the new features with the original dataset
df = pd.concat([df, interaction_df[new_interaction_columns]], axis=1)


**Feature Selection using ANOVA F-value**

In [6]:
# Split the frame into the feature matrix and the target column 'num'
y = df['num']
X = df.drop(columns=['num'])

# Score every feature with the ANOVA F-test and keep the 10 best
selector_anova = SelectKBest(score_func=f_classif, k=10)
X_selected_anova = selector_anova.fit_transform(X, y)

# Boolean support mask -> names of the retained columns
anova_mask = selector_anova.get_support()
selected_features_anova = X.columns[anova_mask]


**Feature Selection using Mutual Information**

In [7]:
# Select top features using Mutual Information.
# mutual_info_classif is stochastic, so the scorer is wrapped with a fixed seed
# (the same seed the Random Forest cell uses) to make the selected feature set
# reproducible across runs.
def _mutual_info_seeded(features, target):
    """Score features with mutual_info_classif using a fixed random_state."""
    return mutual_info_classif(features, target, random_state=42)


selector_mi = SelectKBest(score_func=_mutual_info_seeded, k=10)
X_selected_mi = selector_mi.fit_transform(X, y)
selected_features_mi = X.columns[selector_mi.get_support()]


**Feature Importance using Random Forest**

In [8]:
# Fit a seeded 100-tree Random Forest on all features to obtain importances
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X, y)

# Importance per feature name, ordered from most to least important
importance_by_feature = pd.Series(rf.feature_importances_, index=X.columns)
feature_importance = importance_by_feature.sort_values(ascending=False)

# Names of the 10 most important features
top_rf = feature_importance.nlargest(10)
selected_features_rf = top_rf.index


**Final Feature Selection**

In [9]:
# Select final features: those chosen by all three methods (ANOVA, MI, Random Forest).
common_features = (
    set(selected_features_anova)
    & set(selected_features_mi)
    & set(selected_features_rf)
)
if not common_features:
    # Without this check the cell would silently produce a dataset holding only the target.
    raise ValueError(
        "No feature was selected by all three methods; "
        "increase k or combine the selections differently."
    )

# A set has no defined order, so follow the ANOVA selection's order instead.
# dict.fromkeys also collapses any repeated labels. The result is a
# deterministic column order in the saved file.
final_selected_features = [
    name for name in dict.fromkeys(selected_features_anova) if name in common_features
]

# Guard against repeated column labels in df (first occurrence wins) so that
# each selected feature, and the target, appears exactly once in the output.
df_unique = df.loc[:, ~df.columns.duplicated()]
df_selected = df_unique[final_selected_features + ['num']]


In [10]:
# Folder on Drive that holds the processed datasets
processed_path = "/content/drive/MyDrive/Heart_Disease_Prediction/data/processed/"

# Write the selected features plus the target to CSV, without the index column
output_csv = processed_path + "heart_disease_features_selected.csv"
df_selected.to_csv(output_csv, index=False)

print("Feature Engineering & Selection completed. Selected features saved successfully!")


Feature Engineering & Selection completed. Selected features saved successfully!
