# Import pandas and load the dataset

In [7]:
import pandas as pd

# Load the data
# Read the CSV (path is relative to the notebook's working directory) into a
# DataFrame that the following cells operate on.
file_path = 'heart.csv'
data = pd.read_csv(file_path)

# Print the column/dtype/non-null summary. DataFrame.info() writes its report
# to stdout and returns None, so it is called as a plain statement instead of
# being bundled into a tuple (which would display a stray `None`).
data.info()

# Display the first few rows of the dataset as the cell's (rich) output
data.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1025 entries, 0 to 1024
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1025 non-null   int64  
 1   sex       1025 non-null   int64  
 2   cp        1025 non-null   int64  
 3   trestbps  1025 non-null   int64  
 4   chol      1025 non-null   int64  
 5   fbs       1025 non-null   int64  
 6   restecg   1025 non-null   int64  
 7   thalach   1025 non-null   int64  
 8   exang     1025 non-null   int64  
 9   oldpeak   1025 non-null   float64
 10  slope     1025 non-null   int64  
 11  ca        1025 non-null   int64  
 12  thal      1025 non-null   int64  
 13  target    1025 non-null   int64  
dtypes: float64(1), int64(13)
memory usage: 112.2 KB


(   age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope  \
 0   52    1   0       125   212    0        1      168      0      1.0      2   
 1   53    1   0       140   203    1        0      155      1      3.1      0   
 2   70    1   0       145   174    0        1      125      1      2.6      0   
 3   61    1   0       148   203    0        1      161      0      0.0      2   
 4   62    0   0       138   294    1        1      106      0      1.9      1   
 
    ca  thal  target  
 0   2     3       0  
 1   0     3       0  
 2   0     3       0  
 3   1     3       0  
 4   3     2       0  ,
 None)

# Feature Engineering

In [8]:


# Derive four additional columns in one assign() call and bind the result back
# to `data`, so later cells see the original columns followed by the new ones.
age_edges = [0, 30, 40, 50, 60, 70, 80, 90]
data = data.assign(
    # Integer index of the age bracket (labels=False returns bin codes)
    age_bin=pd.cut(data['age'], bins=age_edges, labels=False),
    # Cholesterol divided by age
    chol_age_ratio=data['chol'].div(data['age']),
    # Maximum heart rate divided by age
    thalach_age_ratio=data['thalach'].div(data['age']),
    # Chest pain type multiplied by resting blood pressure
    cp_trestbps_product=data['cp'].mul(data['trestbps']),
)

# Preview the first rows to confirm the new columns are present
data.head()


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target,age_bin,chol_age_ratio,thalach_age_ratio,cp_trestbps_product
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0,3,4.076923,3.230769,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0,3,3.830189,2.924528,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0,4,2.485714,1.785714,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0,4,3.327869,2.639344,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0,4,4.741935,1.709677,0


# Applying PCA

In [11]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Split the frame: every column except 'target' is a feature, 'target' is the label
y = data['target']
X = data.drop(columns=['target'])

# Fit the scaler on the features, then apply it (zero mean / unit variance per column)
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# A float n_components asks PCA to keep as many components as are needed to
# explain that fraction (here 95%) of the variance
pca = PCA(n_components=0.95)
X_pca = pca.fit_transform(X_scaled)

# How many components PCA ended up keeping
pca.n_components_


12

In [13]:
from sklearn.ensemble import RandomForestClassifier
import numpy as np

# Fit a 100-tree random forest on the unscaled features; fit() returns the
# estimator itself, so construction and training can be chained. The fixed
# random_state keeps the result repeatable.
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X, y)

# Per-feature importance scores, in the same order as X.columns
feature_importances = rf.feature_importances_

# Pair each feature name with its score, then rank from most to least important
importance_table = pd.DataFrame(
    {'Feature': X.columns, 'Importance': feature_importances}
)
feature_importances_df = importance_table.sort_values(
    by='Importance', ascending=False
)

feature_importances_df


Unnamed: 0,Feature,Importance
16,cp_trestbps_product,0.122862
12,thal,0.108623
2,cp,0.100768
11,ca,0.09654
9,oldpeak,0.091939
15,thalach_age_ratio,0.082371
7,thalach,0.067798
0,age,0.05411
14,chol_age_ratio,0.051953
4,chol,0.047495
