## Heart Attack Analysis & Prediction

👉 This study will examine the dataset named as **"Heart Attack Analysis & Prediction"**  at Kaggle website [external link text](https://www.kaggle.com/datasets/rashikrahmanpritom/heart-attack-analysis-prediction-dataset?select=o2Saturation.csv).


**Features' Descriptions:**

age - Age of the patient

sex - Sex of the patient

cp - Chest pain type ~ 0 = Typical Angina, 1 = Atypical Angina, 2 = Non-anginal Pain, 3 = Asymptomatic

trtbps - Resting blood pressure (in mm Hg)

chol - Cholestoral in mg/dl fetched via BMI sensor

fbs - (fasting blood sugar > 120 mg/dl) ~ 1 = True, 0 = False

restecg - Resting electrocardiographic results ~ 0 = Normal, 1 = ST-T wave normality, 2 = Left ventricular hypertrophy

thalachh - Maximum heart rate achieved

oldpeak - Previous peak

slp - Slope

caa - Number of major vessels

thall - Thalium Stress Test result ~ (0,3)

exng - Exercise induced angina ~ 1 = Yes, 0 = No

output - Target variable

In [2]:
import pandas as pd
import numpy as np

In [3]:
df=pd.read_csv('heart.csv')

In [4]:
df.shape

(303, 14)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trtbps    303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalachh  303 non-null    int64  
 8   exng      303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slp       303 non-null    int64  
 11  caa       303 non-null    int64  
 12  thall     303 non-null    int64  
 13  output    303 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 33.3 KB


In [6]:
df.sample(10)

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
114,55,1,1,130,262,0,1,155,0,0.0,2,0,2,1
140,51,0,2,120,295,0,0,157,0,0.6,2,0,2,1
5,57,1,0,140,192,0,1,148,0,0.4,1,0,1,1
168,63,1,0,130,254,0,0,147,0,1.4,1,1,3,0
56,48,1,0,122,222,0,0,186,0,0.0,2,0,2,1
225,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
44,39,1,2,140,321,0,0,182,0,0.0,2,0,2,1
294,44,1,0,120,169,0,1,144,1,2.8,0,0,1,0
7,44,1,1,120,263,0,1,173,0,0.0,2,0,3,1
246,56,0,0,134,409,0,0,150,1,1.9,1,2,3,0


In [7]:
df.corr()

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
age,1.0,-0.098447,-0.068653,0.279351,0.213678,0.121308,-0.116211,-0.398522,0.096801,0.210013,-0.168814,0.276326,0.068001,-0.225439
sex,-0.098447,1.0,-0.049353,-0.056769,-0.197912,0.045032,-0.058196,-0.04402,0.141664,0.096093,-0.030711,0.118261,0.210041,-0.280937
cp,-0.068653,-0.049353,1.0,0.047608,-0.076904,0.094444,0.044421,0.295762,-0.39428,-0.14923,0.119717,-0.181053,-0.161736,0.433798
trtbps,0.279351,-0.056769,0.047608,1.0,0.123174,0.177531,-0.114103,-0.046698,0.067616,0.193216,-0.121475,0.101389,0.06221,-0.144931
chol,0.213678,-0.197912,-0.076904,0.123174,1.0,0.013294,-0.15104,-0.00994,0.067023,0.053952,-0.004038,0.070511,0.098803,-0.085239
fbs,0.121308,0.045032,0.094444,0.177531,0.013294,1.0,-0.084189,-0.008567,0.025665,0.005747,-0.059894,0.137979,-0.032019,-0.028046
restecg,-0.116211,-0.058196,0.044421,-0.114103,-0.15104,-0.084189,1.0,0.044123,-0.070733,-0.05877,0.093045,-0.072042,-0.011981,0.13723
thalachh,-0.398522,-0.04402,0.295762,-0.046698,-0.00994,-0.008567,0.044123,1.0,-0.378812,-0.344187,0.386784,-0.213177,-0.096439,0.421741
exng,0.096801,0.141664,-0.39428,0.067616,0.067023,0.025665,-0.070733,-0.378812,1.0,0.288223,-0.257748,0.115739,0.206754,-0.436757
oldpeak,0.210013,0.096093,-0.14923,0.193216,0.053952,0.005747,-0.05877,-0.344187,0.288223,1.0,-0.577537,0.222682,0.210244,-0.430696


In [8]:
abs(df.corr()['output']).sort_values(ascending=False)

output      1.000000
exng        0.436757
cp          0.433798
oldpeak     0.430696
thalachh    0.421741
caa         0.391724
slp         0.345877
thall       0.344029
sex         0.280937
age         0.225439
trtbps      0.144931
restecg     0.137230
chol        0.085239
fbs         0.028046
Name: output, dtype: float64

In [9]:
df.sex=df['sex'].replace([1,0],['E','K'])

In [10]:
df.sex.unique()

array(['E', 'K'], dtype=object)

In [11]:
df.sample(10)

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
234,70,E,0,130,322,0,0,109,0,2.4,1,3,2,0
222,65,E,3,138,282,1,0,174,0,1.4,1,1,2,0
144,76,K,2,140,197,0,2,116,0,1.1,1,0,2,1
146,44,K,2,118,242,0,1,149,0,0.3,1,1,2,1
181,65,K,0,150,225,0,0,114,0,1.0,1,3,3,0
283,40,E,0,152,223,0,1,181,0,0.0,2,0,3,0
94,45,K,1,112,160,0,1,138,0,0.0,1,0,2,1
246,56,K,0,134,409,0,0,150,1,1.9,1,2,3,0
79,58,E,2,105,240,0,0,154,1,0.6,1,0,3,1
151,71,K,0,112,149,0,1,125,0,1.6,1,0,2,1


In [13]:
df.thall=df.thall.astype('object')

In [14]:
df['t']=220-df.age

In [15]:
df['riskthalachh']=((220-df.age)-df.thalachh)

In [16]:
df.sample(5)

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output,t,riskthalachh
56,48,E,0,122,222,0,0,186,0,0.0,2,0,2,1,172,-14
275,52,E,0,125,212,0,1,168,0,1.0,2,2,3,0,168,0
272,67,E,0,120,237,0,1,71,0,1.0,1,0,2,0,153,82
130,54,K,2,160,201,0,1,163,0,0.0,2,1,2,1,166,3
239,35,E,0,126,282,0,0,156,1,0.0,2,0,3,0,185,29


In [17]:
df.thall.unique()

array([1, 2, 3, 0], dtype=object)

In [18]:
df.cp=df.cp.astype("object")

In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 16 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   age           303 non-null    int64  
 1   sex           303 non-null    object 
 2   cp            303 non-null    object 
 3   trtbps        303 non-null    int64  
 4   chol          303 non-null    int64  
 5   fbs           303 non-null    int64  
 6   restecg       303 non-null    int64  
 7   thalachh      303 non-null    int64  
 8   exng          303 non-null    int64  
 9   oldpeak       303 non-null    float64
 10  slp           303 non-null    int64  
 11  caa           303 non-null    int64  
 12  thall         303 non-null    object 
 13  output        303 non-null    int64  
 14  t             303 non-null    int64  
 15  riskthalachh  303 non-null    int64  
dtypes: float64(1), int64(12), object(3)
memory usage: 38.0+ KB


In [20]:
df.oldpeak.value_counts(ascending=False)

0.0    99
1.2    17
1.0    14
0.6    14
1.4    13
0.8    13
0.2    12
1.6    11
1.8    10
0.4     9
2.0     9
0.1     7
2.8     6
2.6     6
1.5     5
3.0     5
1.9     5
0.5     5
3.6     4
2.2     4
2.4     3
0.9     3
3.4     3
4.0     3
0.3     3
2.3     2
3.2     2
2.5     2
4.2     2
1.1     2
3.1     1
0.7     1
3.5     1
6.2     1
1.3     1
5.6     1
2.9     1
2.1     1
3.8     1
4.4     1
Name: oldpeak, dtype: int64

In [21]:
df.isnull().sum()

age             0
sex             0
cp              0
trtbps          0
chol            0
fbs             0
restecg         0
thalachh        0
exng            0
oldpeak         0
slp             0
caa             0
thall           0
output          0
t               0
riskthalachh    0
dtype: int64

In [22]:
x=df.drop(['output','chol','fbs','t','trtbps','thalachh'], axis=1)

In [23]:
x.sample()

Unnamed: 0,age,sex,cp,restecg,exng,oldpeak,slp,caa,thall,riskthalachh
168,63,E,0,0,0,1.4,1,1,3,10


In [24]:
y=df.output

In [25]:
x=pd.get_dummies(x,drop_first=True)

In [None]:
#x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.2, random_state = 42)

In [26]:
def classification_funct(x,y):
    from sklearn.naive_bayes import GaussianNB
    from sklearn.naive_bayes import BernoulliNB
    from sklearn.svm import SVC
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.linear_model import LogisticRegression
    from xgboost import XGBClassifier
    from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score
    from sklearn.metrics import confusion_matrix,classification_report
    from sklearn.model_selection import train_test_split   
    x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

    g=GaussianNB()
    b=BernoulliNB()
    KN=KNeighborsClassifier()
    SVC=SVC() 
    D=DecisionTreeClassifier()
    R=RandomForestClassifier()
    Log=LogisticRegression()
    XGB=XGBClassifier()
    
    algos=[g,b,KN,SVC,D,R,Log,XGB]
    algo_names=['GaussianNB','BernoulliNB','KNeighborsClassifier','SVC','DecisionTreeClassifier','RandomForestClassifier','LogisticRegression','XGBClassifier']
   
    accuracy_scored=[]

    
    for item in algos:
        item.fit(x_train,y_train)
        item.predict(x_test)
        
        accuracy_scored.append(accuracy_score(y_test,item.predict(x_test)))
       
    result=pd.DataFrame(columns=['accuracy_score'],index=algo_names)
    result['accuracy_score']=accuracy_scored
    return result.sort_values('accuracy_score',ascending=False)

In [27]:
classification_funct(x,y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Unnamed: 0,accuracy_score
LogisticRegression,0.901639
GaussianNB,0.868852
BernoulliNB,0.852459
RandomForestClassifier,0.819672
XGBClassifier,0.786885
SVC,0.721311
DecisionTreeClassifier,0.704918
KNeighborsClassifier,0.672131


In [None]:
#For the higher precision scores I need more and various data of heart attack issue.Best accuracy score is from Logistic Regression.