## 1. importing libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from scipy import stats

---

In [2]:
warnings.filterwarnings("ignore")
plt.style.use("seaborn-v0_8-darkgrid")
sns.set_palette("husl")
pd.set_option("display.max_columns", None)

plt.rcParams['figure.figsize'] = (12, 6)

---

## 2. load dataset

In [None]:
## load data
df = pd.read_csv('../data/raw/heart.csv')

In [4]:
print(f"shape = {df.shape}")
print(f"total rows = {df.shape[0]}")
print(f"total columns = {df.shape[1]}")

shape = (920, 16)
total rows = 920
total columns = 16


---

## 3. data overview

In [6]:
## preview data
print("displying first 10 rows of data")
display(df.head(10))

displying first 10 rows of data


Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0
5,6,56,Male,Cleveland,atypical angina,120.0,236.0,False,normal,178.0,False,0.8,upsloping,0.0,normal,0
6,7,62,Female,Cleveland,asymptomatic,140.0,268.0,False,lv hypertrophy,160.0,False,3.6,downsloping,2.0,normal,3
7,8,57,Female,Cleveland,asymptomatic,120.0,354.0,False,normal,163.0,True,0.6,upsloping,0.0,normal,0
8,9,63,Male,Cleveland,asymptomatic,130.0,254.0,False,lv hypertrophy,147.0,False,1.4,flat,1.0,reversable defect,2
9,10,53,Male,Cleveland,asymptomatic,140.0,203.0,True,lv hypertrophy,155.0,True,3.1,downsloping,0.0,reversable defect,1


In [None]:
## info dataset
print(f"dataset info")
df.info()

dataset info
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 920 entries, 0 to 919
Data columns (total 16 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        920 non-null    int64  
 1   age       920 non-null    int64  
 2   sex       920 non-null    object 
 3   dataset   920 non-null    object 
 4   cp        920 non-null    object 
 5   trestbps  861 non-null    float64
 6   chol      890 non-null    float64
 7   fbs       830 non-null    object 
 8   restecg   918 non-null    object 
 9   thalch    865 non-null    float64
 10  exang     865 non-null    object 
 11  oldpeak   858 non-null    float64
 12  slope     611 non-null    object 
 13  ca        309 non-null    float64
 14  thal      434 non-null    object 
 15  num       920 non-null    int64  
dtypes: float64(5), int64(3), object(8)
memory usage: 115.1+ KB


In [7]:
## statistik deskriptif
print(f"descriptive statistic")
display(df.describe())

descriptive statistic


Unnamed: 0,id,age,trestbps,chol,thalch,oldpeak,ca,num
count,920.0,920.0,861.0,890.0,865.0,858.0,309.0,920.0
mean,460.5,53.51087,132.132404,199.130337,137.545665,0.878788,0.676375,0.995652
std,265.725422,9.424685,19.06607,110.78081,25.926276,1.091226,0.935653,1.142693
min,1.0,28.0,0.0,0.0,60.0,-2.6,0.0,0.0
25%,230.75,47.0,120.0,175.0,120.0,0.0,0.0,0.0
50%,460.5,54.0,130.0,223.0,140.0,0.5,0.0,1.0
75%,690.25,60.0,140.0,268.0,157.0,1.5,1.0,2.0
max,920.0,77.0,200.0,603.0,202.0,6.2,3.0,4.0


## 4. data quality check

In [9]:
# missing value
missing = df.isnull().sum()
missing_pct = (missing / len(df)) * 100
missing_df = pd.DataFrame({
    'Column' : missing.index,
    'Missing_Count' : missing.values,
    'Percentage' : missing_pct.values
})
missing_df = missing_df[missing_df['Missing_Count'] > 0].sort_values('Missing_Count', ascending=False)

if len(missing_df) > 0:
    display(missing_df)
else:
    print("No missing value")

Unnamed: 0,Column,Missing_Count,Percentage
13,ca,611,66.413043
14,thal,486,52.826087
12,slope,309,33.586957
7,fbs,90,9.782609
11,oldpeak,62,6.73913
5,trestbps,59,6.413043
10,exang,55,5.978261
9,thalch,55,5.978261
6,chol,30,3.26087
8,restecg,2,0.217391


In [10]:
# duplicate rows
duplicates = df.duplicated().sum()
print(f"\nduplicate rows : {duplicates}")
if duplicates > 0:
    print(f"percentage : {(duplicates / len(df)) * 100:.2f}%")


duplicate rows : 0


In [11]:
# unique value per column
unique_count = pd.DataFrame({
    'Column' : df.columns,
    'Unique_Values' : [df[col].nunique() for col in df.columns],
    'Data_Type' : df.dtypes.values
})
display(unique_count)

Unnamed: 0,Column,Unique_Values,Data_Type
0,id,920,int64
1,age,50,int64
2,sex,2,object
3,dataset,4,object
4,cp,4,object
5,trestbps,61,float64
6,chol,217,float64
7,fbs,2,object
8,restecg,3,object
9,thalch,119,float64
