In [1]:
import pandas as pd

Download the dataset

In [2]:
import os
import wget

dataset_url = 'https://github.com/GopalSaraf/Practicals/releases/download/ML-Datasets/heart.csv'
# Local file name used both for the existence check and as the download target.
dataset_path = 'heart.csv'

# Download only when the file is not already present, so re-running this cell
# does not fetch the data again. Passing `out` pins the saved file name instead
# of letting wget derive it from the URL, which keeps it in sync with the check.
if not os.path.exists(dataset_path):
    wget.download(dataset_url, out=dataset_path)

Create the dataframe

In [3]:
# Load the CSV into a DataFrame; index_col=0 uses the file's first column as
# the row index instead of treating it as a data column.
df = pd.read_csv("heart.csv", index_col=0)

In [4]:
# Print a summary of the frame: index range, per-column non-null counts,
# dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 303 entries, 1 to 303
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Age        303 non-null    int64  
 1   Sex        303 non-null    int64  
 2   ChestPain  303 non-null    object 
 3   RestBP     303 non-null    int64  
 4   Chol       303 non-null    int64  
 5   Fbs        303 non-null    int64  
 6   RestECG    303 non-null    int64  
 7   MaxHR      303 non-null    int64  
 8   ExAng      303 non-null    int64  
 9   Oldpeak    303 non-null    float64
 10  Slope      303 non-null    int64  
 11  Ca         299 non-null    float64
 12  Thal       301 non-null    object 
 13  AHD        303 non-null    object 
dtypes: float64(2), int64(9), object(3)
memory usage: 35.5+ KB


In [5]:
# Display the frame; as the last expression in the cell it becomes the output.
df

Unnamed: 0,Age,Sex,ChestPain,RestBP,Chol,Fbs,RestECG,MaxHR,ExAng,Oldpeak,Slope,Ca,Thal,AHD
1,63,1,typical,145,233,1,2,150,0,2.3,3,0.0,fixed,No
2,67,1,asymptomatic,160,286,0,2,108,1,1.5,2,3.0,normal,Yes
3,67,1,asymptomatic,120,229,0,2,129,1,2.6,2,2.0,reversable,Yes
4,37,1,nonanginal,130,250,0,0,187,0,3.5,3,0.0,normal,No
5,41,0,nontypical,130,204,0,2,172,0,1.4,1,0.0,normal,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299,45,1,typical,110,264,0,0,132,0,1.2,2,0.0,reversable,Yes
300,68,1,asymptomatic,144,193,1,0,141,0,3.4,2,2.0,reversable,Yes
301,57,1,asymptomatic,130,131,0,0,115,1,1.2,2,1.0,reversable,Yes
302,57,0,nontypical,130,236,0,2,174,0,0.0,2,1.0,normal,Yes


Shape of data

In [6]:
# (rows, columns) of the frame as loaded, before any rows are dropped.
df.shape

(303, 14)

Missing values

In [7]:
# Boolean mask with the same shape as df: True wherever a value is missing.
df.isna()

Unnamed: 0,Age,Sex,ChestPain,RestBP,Chol,Fbs,RestECG,MaxHR,ExAng,Oldpeak,Slope,Ca,Thal,AHD
1,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False,False
5,False,False,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299,False,False,False,False,False,False,False,False,False,False,False,False,False,False
300,False,False,False,False,False,False,False,False,False,False,False,False,False,False
301,False,False,False,False,False,False,False,False,False,False,False,False,False,False
302,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [8]:
# Number of missing values in each column (True counts as 1 when summed).
df.isna().sum()

Age          0
Sex          0
ChestPain    0
RestBP       0
Chol         0
Fbs          0
RestECG      0
MaxHR        0
ExAng        0
Oldpeak      0
Slope        0
Ca           4
Thal         2
AHD          0
dtype: int64

Removing rows with missing (NULL) values

In [9]:
# Discard every row that has a missing value in any column.
df = df.dropna(axis=0, how="any")

In [10]:
# Re-count missing values per column to confirm none remain after dropna().
df.isna().sum()

Age          0
Sex          0
ChestPain    0
RestBP       0
Chol         0
Fbs          0
RestECG      0
MaxHR        0
ExAng        0
Oldpeak      0
Slope        0
Ca           0
Thal         0
AHD          0
dtype: int64

Column datatypes

In [11]:
# dtype of each column; text columns are reported as `object`.
df.dtypes

Age            int64
Sex            int64
ChestPain     object
RestBP         int64
Chol           int64
Fbs            int64
RestECG        int64
MaxHR          int64
ExAng          int64
Oldpeak      float64
Slope          int64
Ca           float64
Thal          object
AHD           object
dtype: object

Count the number of zeros in each column

In [12]:
# Per-column count of cells whose value equals 0.
df.eq(0).sum()

Age            0
Sex           96
ChestPain      0
RestBP         0
Chol           0
Fbs          254
RestECG      147
MaxHR          0
ExAng        200
Oldpeak       96
Slope          0
Ca           174
Thal           0
AHD            0
dtype: int64

Mean of age

In [13]:
# Mean of the Age column, computed over the rows remaining after dropna().
df['Age'].mean()

54.54208754208754

Now extract only the Age, Sex, ChestPain, RestBP, and Chol columns as features, then randomly divide the dataset into training (75%) and testing (25%) sets.

In [14]:
# Columns used as model inputs.
feature_columns = ['Age', 'Sex', 'ChestPain', 'RestBP', 'Chol']

# Feature matrix: only the selected columns, same rows and index as df.
X = df[feature_columns]
X

Unnamed: 0,Age,Sex,ChestPain,RestBP,Chol
1,63,1,typical,145,233
2,67,1,asymptomatic,160,286
3,67,1,asymptomatic,120,229
4,37,1,nonanginal,130,250
5,41,0,nontypical,130,204
...,...,...,...,...,...
298,57,0,asymptomatic,140,241
299,45,1,typical,110,264
300,68,1,asymptomatic,144,193
301,57,1,asymptomatic,130,131


In [15]:
# Target labels. Selecting with a one-element list keeps y as a single-column
# DataFrame (two-dimensional) rather than a Series.
y = df.loc[:, ['AHD']]
y

Unnamed: 0,AHD
1,No
2,Yes
3,Yes
4,No
5,No
...,...
298,Yes
299,Yes
300,Yes
301,Yes


Splitting the data

In [16]:
# NOTE(review): this imports from a module named `train_test_split`, not from
# scikit-learn — presumably a project-local helper; confirm it is on the path.
from train_test_split import train_test_split

# Hold out 25% of the rows for testing and keep the rest for training.
# NOTE(review): no seed is passed, so the split presumably changes between
# runs — TODO confirm whether the helper accepts a seed argument.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

In [17]:
# Report the dimensions of each split, in the order:
# training features, test features, training labels, test labels.
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

(223, 5)
(74, 5)
(223, 1)
(74, 1)


In [18]:
# Inspect the training features returned by train_test_split.
X_train

array([[44, 0, 'nonanginal', 108, 141],
       [46, 1, 'nontypical', 101, 197],
       [62, 0, 'asymptomatic', 160, 164],
       ...,
       [63, 1, 'asymptomatic', 130, 330],
       [50, 1, 'asymptomatic', 150, 243],
       [64, 1, 'asymptomatic', 145, 212]], dtype=object)

In [19]:
# Inspect the training labels returned by train_test_split.
y_train

array([['No'],
       ['No'],
       ['Yes'],
       ['No'],
       ['Yes'],
       ['No'],
       ['No'],
       ['Yes'],
       ['Yes'],
       ['No'],
       ['No'],
       ['Yes'],
       ['No'],
       ['Yes'],
       ['No'],
       ['No'],
       ['Yes'],
       ['Yes'],
       ['Yes'],
       ['Yes'],
       ['No'],
       ['No'],
       ['Yes'],
       ['No'],
       ['Yes'],
       ['Yes'],
       ['Yes'],
       ['Yes'],
       ['No'],
       ['Yes'],
       ['Yes'],
       ['No'],
       ['Yes'],
       ['No'],
       ['Yes'],
       ['No'],
       ['Yes'],
       ['Yes'],
       ['Yes'],
       ['Yes'],
       ['No'],
       ['Yes'],
       ['No'],
       ['Yes'],
       ['Yes'],
       ['No'],
       ['No'],
       ['Yes'],
       ['No'],
       ['No'],
       ['Yes'],
       ['Yes'],
       ['No'],
       ['Yes'],
       ['No'],
       ['No'],
       ['No'],
       ['No'],
       ['No'],
       ['Yes'],
       ['Yes'],
       ['Yes'],
       ['Yes'],
       ['No'],
      

In [20]:
# Inspect the held-out test features returned by train_test_split.
X_test

array([[52, 1, 'typical', 118, 186],
       [42, 0, 'asymptomatic', 102, 265],
       [56, 1, 'nonanginal', 130, 256],
       [59, 1, 'asymptomatic', 138, 271],
       [59, 1, 'nontypical', 140, 221],
       [44, 1, 'asymptomatic', 110, 197],
       [38, 1, 'typical', 120, 231],
       [62, 0, 'asymptomatic', 140, 394],
       [69, 1, 'nonanginal', 140, 254],
       [46, 0, 'asymptomatic', 138, 243],
       [65, 1, 'asymptomatic', 110, 248],
       [57, 1, 'nonanginal', 150, 168],
       [66, 1, 'asymptomatic', 160, 228],
       [57, 1, 'nonanginal', 128, 229],
       [58, 1, 'nontypical', 120, 284],
       [67, 0, 'nonanginal', 152, 277],
       [61, 1, 'asymptomatic', 140, 207],
       [51, 0, 'nonanginal', 120, 295],
       [58, 1, 'asymptomatic', 146, 218],
       [62, 1, 'nontypical', 120, 281],
       [54, 1, 'asymptomatic', 120, 188],
       [54, 1, 'asymptomatic', 110, 239],
       [41, 1, 'nontypical', 110, 235],
       [57, 1, 'nonanginal', 150, 126],
       [66, 1, 'asymptom

In [21]:
# Inspect the held-out test labels returned by train_test_split.
y_test

array([['No'],
       ['No'],
       ['Yes'],
       ['No'],
       ['No'],
       ['Yes'],
       ['Yes'],
       ['No'],
       ['Yes'],
       ['No'],
       ['Yes'],
       ['No'],
       ['No'],
       ['Yes'],
       ['Yes'],
       ['No'],
       ['Yes'],
       ['No'],
       ['Yes'],
       ['Yes'],
       ['Yes'],
       ['Yes'],
       ['No'],
       ['No'],
       ['No'],
       ['Yes'],
       ['Yes'],
       ['No'],
       ['No'],
       ['No'],
       ['Yes'],
       ['Yes'],
       ['Yes'],
       ['Yes'],
       ['No'],
       ['Yes'],
       ['No'],
       ['Yes'],
       ['Yes'],
       ['No'],
       ['No'],
       ['Yes'],
       ['Yes'],
       ['No'],
       ['No'],
       ['Yes'],
       ['Yes'],
       ['No'],
       ['No'],
       ['No'],
       ['Yes'],
       ['Yes'],
       ['Yes'],
       ['No'],
       ['Yes'],
       ['Yes'],
       ['No'],
       ['No'],
       ['No'],
       ['No'],
       ['Yes'],
       ['No'],
       ['Yes'],
       ['No'],
       [