In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.svm import SVR, SVC
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

In [12]:
df = pd.read_csv('heart.csv') #create the data frame and read data from csv file

# Data Cleansing

In [13]:
df['chol'] = pd.to_numeric(df['chol'],errors='coerce') #convert non numerical values to NaN to be dropped
df.dropna(inplace=True,how='any')

df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 292 entries, 0 to 309
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       292 non-null    float64
 1   sex       292 non-null    float64
 2   cp        292 non-null    float64
 3   trestbps  292 non-null    float64
 4   chol      292 non-null    float64
 5   fbs       292 non-null    float64
 6   restecg   292 non-null    float64
 7   thalach   292 non-null    float64
 8   exang     292 non-null    int64  
 9   oldpeak   292 non-null    float64
 10  slope     292 non-null    int64  
 11  ca        292 non-null    float64
 12  thal      292 non-null    float64
 13  target    292 non-null    int64  
dtypes: float64(11), int64(3)
memory usage: 34.2 KB


In [14]:
df.drop_duplicates(inplace=True) #drop any duplicates

#print the description after removing the duplicates and transposes the matrix to show all columns
df.describe(include='all')

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,285.0,285.0,285.0,285.0,285.0,285.0,285.0,285.0,285.0,285.0,285.0,285.0,285.0,285.0
mean,54.42807,0.691228,0.961404,131.954386,246.729825,0.147368,0.519298,149.273684,0.326316,1.047719,1.389474,0.722807,2.319298,0.529825
std,9.048729,0.462799,1.038986,17.585446,52.115196,0.355096,0.521185,23.041348,0.469689,1.174091,0.621698,1.005446,0.604979,0.499988
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,48.0,0.0,0.0,120.0,211.0,0.0,0.0,133.0,0.0,0.0,1.0,0.0,2.0,0.0
50%,56.0,1.0,1.0,130.0,240.0,0.0,1.0,152.0,0.0,0.8,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,276.0,0.0,1.0,165.0,1.0,1.6,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0


In [15]:
#calculate the IQR to remove outliers
Q1 = df.quantile(0.25)
Q3 = df.quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

df = df.clip(lower=lower_bound, upper=upper_bound, axis=1)

df.describe(include='all')

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,285.0,285.0,285.0,285.0,285.0,285.0,285.0,285.0,285.0,285.0,285.0,285.0,285.0,285.0
mean,54.42807,0.691228,0.961404,131.624561,245.594737,0.0,0.519298,149.322807,0.326316,1.031579,1.389474,0.670175,2.321053,0.529825
std,9.048729,0.462799,1.038986,16.668984,47.765914,0.0,0.521185,22.888301,0.469689,1.120575,0.621698,0.879133,0.598924,0.499988
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,85.0,0.0,0.0,0.0,0.0,0.5,0.0
25%,48.0,0.0,0.0,120.0,211.0,0.0,0.0,133.0,0.0,0.0,1.0,0.0,2.0,0.0
50%,56.0,1.0,1.0,130.0,240.0,0.0,1.0,152.0,0.0,0.8,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,276.0,0.0,1.0,165.0,1.0,1.6,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,170.0,373.5,0.0,2.0,202.0,1.0,4.0,2.0,2.5,3.0,1.0


# Feature Selection

In [16]:
# Calculate Z-scores for all numerical columns
numeric_cols = df.select_dtypes(include=['number']).columns
z_scores = df[numeric_cols].apply(lambda x: np.abs((x - x.mean()) / x.std()))
mean_z_scores = z_scores.mean().sort_values(ascending=False)
# Display Z-scores for each column
print("Z-scores for each column:")
print(mean_z_scores)

Z-scores for each column:
target      0.996467
restecg     0.978900
exang       0.936082
sex         0.922351
slope       0.909670
cp          0.883121
thal        0.883024
ca          0.877330
age         0.823116
oldpeak     0.818106
thalach     0.810144
chol        0.805329
trestbps    0.796282
fbs              NaN
dtype: float64


In [17]:
n_features = len(mean_z_scores)
high_priority = mean_z_scores[:n_features//3].index.tolist()
medium_priority = mean_z_scores[n_features//3 : 2*(n_features//3)].index.tolist()
low_priority = mean_z_scores[2*(n_features//3):].index.tolist()

print(high_priority)
print(medium_priority)
print(low_priority)
low_priority_to_drop = [col for col in low_priority if col in df.columns]
df.drop(low_priority_to_drop, axis=1, inplace=True)

df.columns.tolist()

['target', 'restecg', 'exang', 'sex']
['slope', 'cp', 'thal', 'ca']
['age', 'oldpeak', 'thalach', 'chol', 'trestbps', 'fbs']


['sex', 'cp', 'restecg', 'exang', 'slope', 'ca', 'thal', 'target']

In [None]:
features = high_priority + low_priority

#Extract the X and Y values for the target
xTarget = df.drop('target',axis=1)
yTarget = df['target']

#split the data into train data and test data
xtrain, xtest, ytrain, ytest = train_test_split(xTarget, yTarget, test_size=0.2, random_state=42)

features = high_priority + medium_priority
xtrain_selected = xtrain[features]
xtest_selected = xtest[features]