In [42]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.svm import SVR, SVC
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

In [43]:
df = pd.read_csv('heart.csv') #create the data frame and read data from csv file

# Data Cleansing

In [44]:
df['chol'] = pd.to_numeric(df['chol'],errors='coerce') #convert non numerical values to NaN to be dropped
df.dropna(inplace=True,how='any')

df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 292 entries, 0 to 309
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       292 non-null    float64
 1   sex       292 non-null    float64
 2   cp        292 non-null    float64
 3   trestbps  292 non-null    float64
 4   chol      292 non-null    float64
 5   fbs       292 non-null    float64
 6   restecg   292 non-null    float64
 7   thalach   292 non-null    float64
 8   exang     292 non-null    int64  
 9   oldpeak   292 non-null    float64
 10  slope     292 non-null    int64  
 11  ca        292 non-null    float64
 12  thal      292 non-null    float64
 13  target    292 non-null    int64  
dtypes: float64(11), int64(3)
memory usage: 34.2 KB


In [45]:
df.drop_duplicates(inplace=True) #drop any duplicates

#print the description after removing the duplicates and transposes the matrix to show all columns
df.describe(include='all')

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,285.0,285.0,285.0,285.0,285.0,285.0,285.0,285.0,285.0,285.0,285.0,285.0,285.0,285.0
mean,54.42807,0.691228,0.961404,131.954386,246.729825,0.147368,0.519298,149.273684,0.326316,1.047719,1.389474,0.722807,2.319298,0.529825
std,9.048729,0.462799,1.038986,17.585446,52.115196,0.355096,0.521185,23.041348,0.469689,1.174091,0.621698,1.005446,0.604979,0.499988
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,48.0,0.0,0.0,120.0,211.0,0.0,0.0,133.0,0.0,0.0,1.0,0.0,2.0,0.0
50%,56.0,1.0,1.0,130.0,240.0,0.0,1.0,152.0,0.0,0.8,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,276.0,0.0,1.0,165.0,1.0,1.6,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0


In [46]:
#Extract the X and Y values for the target
xTarget = df.drop('target',axis=1)
yTarget = df['target']

#split the data into train data and test data
xtrain, xtest, ytrain, ytest = train_test_split(xTarget, yTarget, test_size=0.2, random_state=42)

In [47]:
# Remove missing values and duplicates from TRAINING data
xtrain_clean = xtrain.dropna(how='any')
xtrain_clean = xtrain_clean.drop_duplicates()

xtest_clean = xtest.dropna(how='any')
xtest_clean = xtest_clean.drop_duplicates()

In [48]:
#calculate the IQR to remove outliers
Q1 = df.quantile(0.25)
Q3 = df.quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR


# Clip outliers in TRAINING data
xtest_clean = xtest_clean.clip(lower=lower_bound, upper=upper_bound, axis=1)


df.describe(include='all')

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,285.0,285.0,285.0,285.0,285.0,285.0,285.0,285.0,285.0,285.0,285.0,285.0,285.0,285.0
mean,54.42807,0.691228,0.961404,131.954386,246.729825,0.147368,0.519298,149.273684,0.326316,1.047719,1.389474,0.722807,2.319298,0.529825
std,9.048729,0.462799,1.038986,17.585446,52.115196,0.355096,0.521185,23.041348,0.469689,1.174091,0.621698,1.005446,0.604979,0.499988
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,48.0,0.0,0.0,120.0,211.0,0.0,0.0,133.0,0.0,0.0,1.0,0.0,2.0,0.0
50%,56.0,1.0,1.0,130.0,240.0,0.0,1.0,152.0,0.0,0.8,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,276.0,0.0,1.0,165.0,1.0,1.6,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0


In [49]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.svm import SVR, SVC
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

In [50]:
df = pd.read_csv('heart.csv') #create the data frame and read data from csv file

# Data Cleansing

In [51]:
df['chol'] = pd.to_numeric(df['chol'],errors='coerce') #convert non numerical values to NaN to be dropped
df.dropna(inplace=True,how='any')

df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 292 entries, 0 to 309
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       292 non-null    float64
 1   sex       292 non-null    float64
 2   cp        292 non-null    float64
 3   trestbps  292 non-null    float64
 4   chol      292 non-null    float64
 5   fbs       292 non-null    float64
 6   restecg   292 non-null    float64
 7   thalach   292 non-null    float64
 8   exang     292 non-null    int64  
 9   oldpeak   292 non-null    float64
 10  slope     292 non-null    int64  
 11  ca        292 non-null    float64
 12  thal      292 non-null    float64
 13  target    292 non-null    int64  
dtypes: float64(11), int64(3)
memory usage: 34.2 KB


In [52]:
df.drop_duplicates(inplace=True) #drop any duplicates

#print the description after removing the duplicates and transposes the matrix to show all columns
df.describe(include='all')

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,285.0,285.0,285.0,285.0,285.0,285.0,285.0,285.0,285.0,285.0,285.0,285.0,285.0,285.0
mean,54.42807,0.691228,0.961404,131.954386,246.729825,0.147368,0.519298,149.273684,0.326316,1.047719,1.389474,0.722807,2.319298,0.529825
std,9.048729,0.462799,1.038986,17.585446,52.115196,0.355096,0.521185,23.041348,0.469689,1.174091,0.621698,1.005446,0.604979,0.499988
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,48.0,0.0,0.0,120.0,211.0,0.0,0.0,133.0,0.0,0.0,1.0,0.0,2.0,0.0
50%,56.0,1.0,1.0,130.0,240.0,0.0,1.0,152.0,0.0,0.8,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,276.0,0.0,1.0,165.0,1.0,1.6,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0


In [53]:
#calculate the IQR to remove outliers
Q1 = df.quantile(0.25)
Q3 = df.quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR


# Clip outliers in TRAINING data
xtest = xtest_clean.clip(lower=lower_bound, upper=upper_bound, axis=1)


df.describe(include='all')

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,285.0,285.0,285.0,285.0,285.0,285.0,285.0,285.0,285.0,285.0,285.0,285.0,285.0,285.0
mean,54.42807,0.691228,0.961404,131.954386,246.729825,0.147368,0.519298,149.273684,0.326316,1.047719,1.389474,0.722807,2.319298,0.529825
std,9.048729,0.462799,1.038986,17.585446,52.115196,0.355096,0.521185,23.041348,0.469689,1.174091,0.621698,1.005446,0.604979,0.499988
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,48.0,0.0,0.0,120.0,211.0,0.0,0.0,133.0,0.0,0.0,1.0,0.0,2.0,0.0
50%,56.0,1.0,1.0,130.0,240.0,0.0,1.0,152.0,0.0,0.8,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,276.0,0.0,1.0,165.0,1.0,1.6,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0


# Feature Selection

In [54]:
# Calculate Z-scores for all numerical columns
numeric_cols = xtrain_clean.select_dtypes(include=['number']).columns
numeric_cols = [col for col in numeric_cols if col != 'target']  # Exclude 'target'
z_scores = xtrain_clean[numeric_cols].apply(lambda x: np.abs((x - x.mean()) / x.std()))
mean_z_scores = z_scores.mean().sort_values(ascending=False)

# Display Z-scores for each column
print("Z-scores for each column:")
print(mean_z_scores)

Z-scores for each column:
restecg     0.997190
exang       0.931039
slope       0.921129
sex         0.916777
thal        0.894054
cp          0.884907
ca          0.826986
age         0.826731
thalach     0.813026
chol        0.788981
trestbps    0.785764
oldpeak     0.781349
fbs         0.727683
dtype: float64


In [55]:
n_features = len(mean_z_scores)
high_priority = mean_z_scores[:n_features//3].index.tolist()
medium_priority = mean_z_scores[n_features//3 : 2*(n_features//3)].index.tolist()
low_priority = mean_z_scores[2*(n_features//3):].index.tolist()

print(high_priority)
print(medium_priority)
print(low_priority)
low_priority_to_drop = [col for col in low_priority if col in xtrain_clean.columns]
xtrain_selected = xtrain_clean.drop(low_priority_to_drop, axis=1)
xtest_selected = xtest_clean.drop(low_priority_to_drop, axis=1)

features = high_priority + medium_priority
print(features)

['restecg', 'exang', 'slope', 'sex']
['thal', 'cp', 'ca', 'age']
['thalach', 'chol', 'trestbps', 'oldpeak', 'fbs']
['restecg', 'exang', 'slope', 'sex', 'thal', 'cp', 'ca', 'age']


# Model Training

# Feature Selection

In [56]:
# Calculate Z-scores for all numerical columns
numeric_cols = xtrain_clean.select_dtypes(include=['number']).columns
numeric_cols = [col for col in numeric_cols if col != 'target']  # Exclude 'target'
z_scores = xtrain_clean[numeric_cols].apply(lambda x: np.abs((x - x.mean()) / x.std()))
mean_z_scores = z_scores.mean().sort_values(ascending=False)

# Display Z-scores for each column
print("Z-scores for each column:")
print(mean_z_scores)

Z-scores for each column:
restecg     0.997190
exang       0.931039
slope       0.921129
sex         0.916777
thal        0.894054
cp          0.884907
ca          0.826986
age         0.826731
thalach     0.813026
chol        0.788981
trestbps    0.785764
oldpeak     0.781349
fbs         0.727683
dtype: float64


In [57]:
n_features = len(mean_z_scores)
high_priority = mean_z_scores[:n_features//3].index.tolist()
medium_priority = mean_z_scores[n_features//3 : 2*(n_features//3)].index.tolist()
low_priority = mean_z_scores[2*(n_features//3):].index.tolist()

print(high_priority)
print(medium_priority)
print(low_priority)
low_priority_to_drop = [col for col in low_priority if col in xtrain_clean.columns]
xtrain_selected = xtrain_clean.drop(low_priority_to_drop, axis=1)
xtest_selected = xtest_clean.drop(low_priority_to_drop, axis=1)

features = high_priority + medium_priority
print(features)

['restecg', 'exang', 'slope', 'sex']
['thal', 'cp', 'ca', 'age']
['thalach', 'chol', 'trestbps', 'oldpeak', 'fbs']
['restecg', 'exang', 'slope', 'sex', 'thal', 'cp', 'ca', 'age']


#                      Scalling Data

In [58]:
#converts the data into ranges between 0 and 1 used for KNN and SVM
minMaxScaler = MinMaxScaler()

#fitting the High Priority Data
xTrain_highmd = minMaxScaler.fit_transform(xtrain_selected)
xTest_highmd = minMaxScaler.transform(xtest_selected)

#converting the fitted data into data frames to be easier for tracking
xTrain_df = pd.DataFrame(xTrain_highmd,columns=features,index=xtrain.index)
xTest_df = pd.DataFrame(xTest_highmd, columns=features, index=xtest.index)


In [59]:
#to convert data into Z-Score values to be used in logistic regression and decision tree
scaler = StandardScaler()

#fitting the High Priority Data
X_train_highmed= scaler.fit_transform(xtrain_selected)
X_test_highmed = scaler.transform(xtest_selected)

#recreating the data into data frames to be easier for tracking
x_train_scaled = pd.DataFrame(X_train_highmed, columns=features, index=xtrain.index)
X_test_scaled = pd.DataFrame(X_test_highmed, columns=features, index=xtest.index)