<h1><center> Heart Failure Prediction - Pre-processing & Training</center></h1>

# 1. Data Preprocessing

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
pd.set_option("display.max_rows",None) #full dataset loading in notebook view
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
from sklearn.neighbors import KNeighborsClassifier
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import datasets, linear_model, metrics
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification

In [2]:
# The supplied CSV data file is the raw_data directory
# Create a Dataframe 

heart_df = pd.read_csv('../HeartFailurePrediction/data/heart.csv')

In [3]:
# Checking the columns and the first 5 rows 
heart_df.head(10)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
5,39,M,NAP,120,339,0,Normal,170,N,0.0,Up,0
6,45,F,ATA,130,237,0,Normal,170,N,0.0,Up,0
7,54,M,ATA,110,208,0,Normal,142,N,0.0,Up,0
8,37,M,ASY,140,207,0,Normal,130,Y,1.5,Flat,1
9,48,F,ATA,120,284,0,Normal,120,N,0.0,Up,0


In [4]:
def outliers_graph(df_column):
    Q75, Q25 = np.percentile(df_column, [75 ,25]) 
    IQR = Q75 - Q25
    print('Q25: ',Q25)
    print('Q75: ',Q75)
    print('Inter Quartile Range: ',IQR)
    print('Outliers lie before', Q25-1.5*IQR, 'and beyond', Q75+1.5*IQR)
    print('Number of Rows with Left Extreme Outliers:', len(heart_df[df_column <Q25-1.5*IQR]))
    print('Number of Rows with Right Extreme Outliers:', len(heart_df[df_column>Q75+1.5*IQR]))

In [5]:
outliers_graph(heart_df['Age'])

Q25:  47.0
Q75:  60.0
Inter Quartile Range:  13.0
Outliers lie before 27.5 and beyond 79.5
Number of Rows with Left Extreme Outliers: 0
Number of Rows with Right Extreme Outliers: 0


In [6]:
outliers_graph(heart_df['RestingBP'])

Q25:  120.0
Q75:  140.0
Inter Quartile Range:  20.0
Outliers lie before 90.0 and beyond 170.0
Number of Rows with Left Extreme Outliers: 2
Number of Rows with Right Extreme Outliers: 26


In [7]:
heart_df = heart_df[heart_df.RestingBP>=90]
len(heart_df)

916

In [8]:
outliers_graph(heart_df['Cholesterol'])

Q25:  174.75
Q75:  267.0
Inter Quartile Range:  92.25
Outliers lie before 36.375 and beyond 405.375
Number of Rows with Left Extreme Outliers: 170
Number of Rows with Right Extreme Outliers: 12


In [9]:
#due more value in the right extreme outlier and '0' cholesterol level in dataset

heart_df = heart_df[heart_df.Cholesterol<=450]  
len(heart_df)

908

In [10]:
outliers_graph(heart_df['FastingBS'])

Q25:  0.0
Q75:  0.0
Inter Quartile Range:  0.0
Outliers lie before 0.0 and beyond 0.0
Number of Rows with Left Extreme Outliers: 0
Number of Rows with Right Extreme Outliers: 211


In [11]:
outliers_graph(heart_df['MaxHR'])

Q25:  120.0
Q75:  156.0
Inter Quartile Range:  36.0
Outliers lie before 66.0 and beyond 210.0
Number of Rows with Left Extreme Outliers: 2
Number of Rows with Right Extreme Outliers: 0


In [12]:
heart_df = heart_df[heart_df.MaxHR>=70]
len(heart_df)

905

In [13]:
outliers_graph(heart_df['Oldpeak'])

Q25:  0.0
Q75:  1.5
Inter Quartile Range:  1.5
Outliers lie before -2.25 and beyond 3.75
Number of Rows with Left Extreme Outliers: 1
Number of Rows with Right Extreme Outliers: 15


In [14]:
print('Mean: ', heart_df['Cholesterol'].mean())
print('Median: ', heart_df['Cholesterol'].median())

Mean:  197.12817679558012
Median:  222.0


In [15]:
# Mean value of Cholesterol without including the cholesterol=0

mc = heart_df[heart_df['Cholesterol'] > 0].Cholesterol.mean() 
print('Mean of Cholesterol>0: ', mc)

Mean of Cholesterol>0:  241.7357723577236


In [16]:
heart_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Age,905.0,53.522652,9.434672,28.0,47.0,54.0,60.0,77.0
RestingBP,905.0,132.632044,17.98981,92.0,120.0,130.0,140.0,200.0
Cholesterol,905.0,197.128177,105.005105,0.0,175.0,222.0,266.0,417.0
FastingBS,905.0,0.232044,0.42237,0.0,0.0,0.0,0.0,1.0
MaxHR,905.0,137.101657,25.156164,70.0,120.0,138.0,156.0,202.0
Oldpeak,905.0,0.889834,1.07097,-2.6,0.0,0.6,1.5,6.2
HeartDisease,905.0,0.551381,0.497628,0.0,0.0,1.0,1.0,1.0


In [17]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, precision_recall_fscore_support, f1_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay


In [18]:
feature_cols = ['Age', 'Sex', 'ChestPainType', 'RestingBP', 'Cholesterol', 'FastingBS',
       'RestingECG', 'MaxHR', 'ExerciseAngina', 'Oldpeak', 'ST_Slope']
target_cols = 'HeartDisease'

X = heart_df[feature_cols]
y = heart_df[target_cols]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42, stratify=y)

In [19]:
def data_preprocessing(X_train):
    ''' Replace outliers with mean to make the distribution more normal. '''    
    mean_chol = X_train.loc[X_train['Cholesterol'] > 0, 'Cholesterol'].mean()
    mean_rest = X_train.loc[X_train['RestingBP'] > 0, 'RestingBP'].mean()  # Corrected column name
    
    # Replace outliers and zero values with mean
    X_train.loc[X_train['Cholesterol'] == 0, 'Cholesterol'] = mean_chol.astype(X_train['Cholesterol'].dtype)
    X_train.loc[X_train['Cholesterol'] >= 500, 'Cholesterol'] = mean_chol.astype(X_train['Cholesterol'].dtype)
    X_train.loc[X_train['RestingBP'] == 0, 'RestingBP'] = int(mean_rest)  # Cast mean_rest to int
    X_train.loc[X_train['RestingBP'] == 244.635389, 'RestingBP'] = int(mean_rest)  # Cast mean_rest to int

    return X_train

X_train = data_preprocessing(X_train)


In [20]:
heart_df.columns

Index(['Age', 'Sex', 'ChestPainType', 'RestingBP', 'Cholesterol', 'FastingBS',
       'RestingECG', 'MaxHR', 'ExerciseAngina', 'Oldpeak', 'ST_Slope',
       'HeartDisease'],
      dtype='object')

In [21]:
y_train.value_counts()

HeartDisease
1    424
0    345
Name: count, dtype: int64

### 1.1 One Hot Encoding

In [22]:
def OneHotEncoding(dfcolumn):
    global heart_df
    dfcolumn.nunique()
    len(heart_df.columns)
    finallencol = (dfcolumn.nunique() - 1) + (len(heart_df.columns)-1)
    dummies = pd.get_dummies(dfcolumn, drop_first=True, prefix=dfcolumn.name)
    heart_df=pd.concat([heart_df, dummies],axis='columns')
    heart_df.drop(columns=dfcolumn.name,axis=1,inplace=True) 
    if(finallencol==len(heart_df.columns)):
        print('OneHotEncoding is sucessfull') 
        print('')
    else:
        print('Unsucessfull')
    return heart_df.head(5)

In [23]:
OneHotEncoding(heart_df['ChestPainType'])
OneHotEncoding(heart_df['Sex'])
OneHotEncoding(heart_df['RestingECG'])
OneHotEncoding(heart_df['ExerciseAngina'])
OneHotEncoding(heart_df['ST_Slope'])

OneHotEncoding is sucessfull

OneHotEncoding is sucessfull

OneHotEncoding is sucessfull

OneHotEncoding is sucessfull

OneHotEncoding is sucessfull



Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,Sex_M,RestingECG_Normal,RestingECG_ST,ExerciseAngina_Y,ST_Slope_Flat,ST_Slope_Up
0,40,140,289,0,172,0.0,0,True,False,False,True,True,False,False,False,True
1,49,160,180,0,156,1.0,1,False,True,False,False,True,False,False,True,False
2,37,130,283,0,98,0.0,0,True,False,False,True,False,True,False,False,True
3,48,138,214,0,108,1.5,1,False,False,False,False,True,False,True,True,False
4,54,150,195,0,122,0.0,0,False,True,False,True,True,False,False,False,True


In [24]:
heart_df.describe().columns.to_list()

['Age',
 'RestingBP',
 'Cholesterol',
 'FastingBS',
 'MaxHR',
 'Oldpeak',
 'HeartDisease']

### 1.2 Imputation

In [25]:
from sklearn.impute import KNNImputer

# Replace zero values of cholesterol with NaN
heart_df['Cholesterol'].replace(to_replace=0, value=np.nan, inplace=False)  # No inplace=True

# Apply KNNImputer
KNN_imputed = KNNImputer(n_neighbors=5)
I = KNN_imputed.fit_transform(heart_df)

# Extract imputed values
Cholesterol = [i[2] for i in I]
heart_df['Cholesterol'] = Cholesterol


### 2.0 Feature Scaling

In [26]:
# altering the DataFrame
heart_df = heart_df[['Age',
 'RestingBP',
 'Cholesterol',
 'FastingBS',
 'MaxHR',
 'Oldpeak',
 'ChestPainType_ATA',
 'ChestPainType_NAP',
 'ChestPainType_TA',
 'Sex_M',
 'RestingECG_Normal',
 'RestingECG_ST',
 'ExerciseAngina_Y',
 'ST_Slope_Flat',
 'ST_Slope_Up',
 'HeartDisease',]]
 
# printing the altered DataFrame
heart_df.head(5)

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,Sex_M,RestingECG_Normal,RestingECG_ST,ExerciseAngina_Y,ST_Slope_Flat,ST_Slope_Up,HeartDisease
0,40,140,289.0,0,172,0.0,True,False,False,True,True,False,False,False,True,0
1,49,160,180.0,0,156,1.0,False,True,False,False,True,False,False,True,False,1
2,37,130,283.0,0,98,0.0,True,False,False,True,False,True,False,False,True,0
3,48,138,214.0,0,108,1.5,False,False,False,False,True,False,True,True,False,1
4,54,150,195.0,0,122,0.0,False,True,False,True,True,False,False,False,True,0


In [27]:
scaler = StandardScaler()
scaler.fit(heart_df.drop('HeartDisease',axis = 1))

In [28]:
scaled_features = scaler.transform(heart_df.drop('HeartDisease',axis = 1))
df_feat = pd.DataFrame(scaled_features,columns = heart_df.columns[:-1])
df_feat.head()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,Sex_M,RestingECG_Normal,RestingECG_ST,ExerciseAngina_Y,ST_Slope_Flat,ST_Slope_Up
0,-1.434086,0.409789,0.875411,-0.549689,1.388035,-0.831327,2.064371,-0.532624,-0.23141,0.517211,0.820261,-0.493084,-0.825927,-0.996691,1.141402
1,-0.47963,1.522144,-0.163208,-0.549689,0.751656,0.102922,-0.484409,1.877498,-0.23141,-1.933448,0.820261,-0.493084,-0.825927,1.00332,-0.876115
2,-1.752238,-0.146388,0.818239,-0.549689,-1.555216,-0.831327,2.064371,-0.532624,-0.23141,0.517211,-1.219124,2.028052,-0.825927,-0.996691,1.141402
3,-0.585681,0.298554,0.160765,-0.549689,-1.15748,0.570047,-0.484409,-0.532624,-0.23141,-1.933448,0.820261,-0.493084,1.21076,1.00332,-0.876115
4,0.050623,0.965967,-0.020279,-0.549689,-0.600648,-0.831327,-0.484409,1.877498,-0.23141,0.517211,0.820261,-0.493084,-0.825927,-0.996691,1.141402


In [29]:
heart_df.head(5)

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,Sex_M,RestingECG_Normal,RestingECG_ST,ExerciseAngina_Y,ST_Slope_Flat,ST_Slope_Up,HeartDisease
0,40,140,289.0,0,172,0.0,True,False,False,True,True,False,False,False,True,0
1,49,160,180.0,0,156,1.0,False,True,False,False,True,False,False,True,False,1
2,37,130,283.0,0,98,0.0,True,False,False,True,False,True,False,False,True,0
3,48,138,214.0,0,108,1.5,False,False,False,False,True,False,True,True,False,1
4,54,150,195.0,0,122,0.0,False,True,False,True,True,False,False,False,True,0
