In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Dataset Description
| Variable | Description |
| -------- | ----------- |
| Age      | Age of the patient |
| Sex      | Sex of the patient |
| cp       | Chest pain type: <br>Value 1: typical angina <br>Value 2: atypical angina <br>Value 3: non-anginal pain <br>Value 4: asymptomatic |
|trtbps|resting blood pressure (in mm Hg)|
|chol|cholesterol in mg/dl fetched via BMI sensor|
|fbs|(fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)|
|restecg|resting electrocardiographic results: <br>Value 0: normal<br>Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)<br>Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria|
|thalachh|maximum heart rate achieved|
|output|0 = less chance of heart attack, 1 = more chance of heart attack|

In [2]:
# Read the heart dataset from a path relative to the notebook's working
# directory, then preview the first rows to sanity-check the columns.
df = pd.read_csv('./Data/heart.csv')
df.head()

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [3]:
# (rows, columns) of the frame as loaded, before any cleaning
df.shape

(303, 14)

# Data Cleansing

### Check duplicates

In [5]:
# Collect every row that exactly repeats an earlier one; keep='first' (the
# pandas default) leaves the first occurrence unflagged, so only the extra
# copies are shown.
duplicates = df.loc[df.duplicated(keep='first')]
duplicates

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
164,38,1,2,138,175,0,1,173,0,0.0,2,4,2,1


In [6]:
# Drop the repeated rows, retaining the first occurrence of each.
# The index is not renumbered, so labels keep a gap where a row was removed.
df = df.drop_duplicates(keep='first')

In [7]:
# Re-check the dimensions to confirm how many rows de-duplication removed
df.shape

(302, 14)

### Listing Null Values from data

In [8]:
# Count missing entries per column (isna is the canonical spelling of isnull)
df.isna().sum()

age         0
sex         0
cp          0
trtbps      0
chol        0
fbs         0
restecg     0
thalachh    0
exng        0
oldpeak     0
slp         0
caa         0
thall       0
output      0
dtype: int64

In [9]:
# Summarize index range, per-column non-null counts and dtypes, and memory use
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 302 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       302 non-null    int64  
 1   sex       302 non-null    int64  
 2   cp        302 non-null    int64  
 3   trtbps    302 non-null    int64  
 4   chol      302 non-null    int64  
 5   fbs       302 non-null    int64  
 6   restecg   302 non-null    int64  
 7   thalachh  302 non-null    int64  
 8   exng      302 non-null    int64  
 9   oldpeak   302 non-null    float64
 10  slp       302 non-null    int64  
 11  caa       302 non-null    int64  
 12  thall     302 non-null    int64  
 13  output    302 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 35.4 KB
