# Importing Data

In [1]:
import pandas as pd
import numpy as np


df = pd.read_csv('heart.csv')


print("Original dataset shape:", df.shape)

Original dataset shape: (303, 14)


# Removing some data

In [2]:
columns_to_unbalance = ['age', 'thalachh', 'chol', 'oldpeak', 'trtbps']

In [3]:
for col in columns_to_unbalance:
    for i in range(np.random.randint(70,100)):
        random_row_index = np.random.choice(df.index)
        df.loc[random_row_index, col] = np.nan

In [4]:
missing_val_count_by_column = (df.isnull().sum())
print(missing_val_count_by_column)

age         64
sex          0
cp           0
trtbps      79
chol        65
fbs          0
restecg      0
thalachh    71
exng         0
oldpeak     85
slp          0
caa          0
thall        0
output       0
dtype: int64


In [5]:
df.to_csv('heart_unbalanced.csv', index=False)

# Method 1:

# Dropping columns

In [6]:
dfp = df

In [7]:
columns_to_drop = ['age', 'thalachh', 'chol', 'oldpeak', 'trtbps']
dfp.drop(columns=columns_to_drop, inplace=True)

In [8]:
dfp

Unnamed: 0,sex,cp,fbs,restecg,exng,slp,caa,thall,output
0,1,3,1,0,0,0,0,1,1
1,1,2,0,1,0,0,0,2,1
2,0,1,0,0,0,2,0,2,1
3,1,1,0,1,0,2,0,2,1
4,0,0,0,1,1,2,0,2,1
...,...,...,...,...,...,...,...,...,...
298,0,0,0,1,1,1,0,3,0
299,1,3,0,1,0,1,0,3,0
300,1,0,1,1,0,1,2,3,0
301,1,0,0,1,1,1,1,3,0


In [9]:
dfp.to_csv('heart_dropped_cols.csv', index=False)

# Method 2

# Replacing messing data

In [10]:
df

Unnamed: 0,sex,cp,fbs,restecg,exng,slp,caa,thall,output
0,1,3,1,0,0,0,0,1,1
1,1,2,0,1,0,0,0,2,1
2,0,1,0,0,0,2,0,2,1
3,1,1,0,1,0,2,0,2,1
4,0,0,0,1,1,2,0,2,1
...,...,...,...,...,...,...,...,...,...
298,0,0,0,1,1,1,0,3,0
299,1,3,0,1,0,1,0,3,0
300,1,0,1,1,0,1,2,3,0
301,1,0,0,1,1,1,1,3,0


In [11]:
missing_val_count_by_column = (df.isnull().sum())
print(missing_val_count_by_column)

sex        0
cp         0
fbs        0
restecg    0
exng       0
slp        0
caa        0
thall      0
output     0
dtype: int64


In [12]:
import pandas as pd
from sklearn.impute import SimpleImputer


df = pd.read_csv('heart_unbalanced.csv')

print("Dataset Info Before Imputation:")
print(df.info())



Dataset Info Before Imputation:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       239 non-null    float64
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trtbps    224 non-null    float64
 4   chol      238 non-null    float64
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalachh  232 non-null    float64
 8   exng      303 non-null    int64  
 9   oldpeak   218 non-null    float64
 10  slp       303 non-null    int64  
 11  caa       303 non-null    int64  
 12  thall     303 non-null    int64  
 13  output    303 non-null    int64  
dtypes: float64(5), int64(9)
memory usage: 33.3 KB
None


# Using Mean, median, most frequent and constant technique

In [13]:
imputer_mean = SimpleImputer(strategy='mean')
imputer_median = SimpleImputer(strategy='median')
imputer_most_frequent = SimpleImputer(strategy='most_frequent')
imputer_constant = SimpleImputer(strategy='constant', fill_value=0)  

df_mean = pd.DataFrame(imputer_mean.fit_transform(df), columns=df.columns)
df_median = pd.DataFrame(imputer_median.fit_transform(df), columns=df.columns)
df_most_frequent = pd.DataFrame(imputer_most_frequent.fit_transform(df), columns=df.columns)
df_constant = pd.DataFrame(imputer_constant.fit_transform(df), columns=df.columns)


df_mean.to_csv('heart_imputed_mean.csv', index=False)
df_median.to_csv('heart_imputed_median.csv', index=False)
df_most_frequent.to_csv('heart_imputed_most_frequent.csv', index=False)
df_constant.to_csv('heart_imputed_constant.csv', index=False)


# Using KNN technique

In [14]:
from sklearn.impute import KNNImputer

In [15]:
imputer_knn = KNNImputer()

df_knn = pd.DataFrame(imputer_knn.fit_transform(df), columns=df.columns)

df_knn.to_csv('heart_imputed_knn.csv', index=False)