In [1]:
# Importing necessary libraries for data cleaning
import pandas as pd

# Read the heart-disease CSV (given as a relative path) into the working DataFrame
data = pd.read_csv(filepath_or_buffer="../data/HeartDisease.csv")


In [2]:
# Step 1: Count the null entries in each column before any cleaning is applied
print("Missing Values in Each Column Before Cleaning:")
missing_per_column = data.isnull().sum()
print(missing_per_column)

Missing Values in Each Column Before Cleaning:
age                              0
sex                              0
chest_pain_type                  0
resting_blood_pressure           0
cholestoral                      0
fasting_blood_sugar              0
rest_ecg                         0
Max_heart_rate                   0
exercise_induced_angina          0
oldpeak                          0
slope                            0
vessels_colored_by_flourosopy    0
thalassemia                      0
target                           0
dtype: int64


In [3]:
# Step 2: Handling Missing Values
# No missing values were found, so nothing is changed here; the strategies below are listed for reference only.
# Common strategies include:
# - Dropping rows/columns with many missing values
# - Imputing with mean/median/mode for numerical columns or most frequent for categorical

# Step 3: Checking and Handling Inconsistent or Incorrect Data Entries

In [4]:
# Normalise the casing of the 'sex' labels: first character upper-case, the rest lower-case
sex_labels = data['sex']
data['sex'] = sex_labels.str.capitalize()

In [5]:
# Names of the remaining categorical columns whose text labels get standardized
categorical_columns = [
    'chest_pain_type',
    'fasting_blood_sugar',
    'rest_ecg',
    'exercise_induced_angina',
    'slope',
    'vessels_colored_by_flourosopy',
    'thalassemia',
]

In [6]:
# Lower-case the text labels of every column named in categorical_columns,
# processing the columns one Series at a time via DataFrame.apply
data[categorical_columns] = data[categorical_columns].apply(
    lambda labels: labels.str.lower()
)

# Step 4: Handling Outliers using IQR method

In [7]:
# Cap outliers in each numeric column at the IQR fences
# [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]: values beyond a fence are replaced by
# that fence, values inside the fences are left as they are.
numeric_columns = ['age', 'resting_blood_pressure', 'cholestoral', 'Max_heart_rate', 'oldpeak']
for col in numeric_columns:
    # One quantile call yields both quartiles of the current column
    Q1, Q3 = data[col].quantile([0.25, 0.75])
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    # Cap outliers with the vectorized Series.clip instead of a per-element
    # Python lambda; NaN entries pass through unchanged, as before
    data[col] = data[col].clip(lower=lower_bound, upper=upper_bound)

# Step 5: Verifying Data Consistency Post-Cleaning

In [8]:
# Show the cleaned frame's structure (dtypes, non-null counts), then its first rows
print("\nData Information After Cleaning:")
data.info()
first_rows = data.head()
print("\nFirst 5 Rows After Cleaning:", first_rows, sep="\n")


Data Information After Cleaning:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1025 entries, 0 to 1024
Data columns (total 14 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   age                            1025 non-null   int64  
 1   sex                            1025 non-null   object 
 2   chest_pain_type                1025 non-null   object 
 3   resting_blood_pressure         1025 non-null   float64
 4   cholestoral                    1025 non-null   float64
 5   fasting_blood_sugar            1025 non-null   object 
 6   rest_ecg                       1025 non-null   object 
 7   Max_heart_rate                 1025 non-null   float64
 8   exercise_induced_angina        1025 non-null   object 
 9   oldpeak                        1025 non-null   float64
 10  slope                          1025 non-null   object 
 11  vessels_colored_by_flourosopy  1025 non-null   object 
 12  thalassemia   

In [9]:
# Summary of the data after cleaning
print("\nMissing Values After Cleaning:")
print(data.isnull().sum())
# data.duplicated() flags rows that repeat an earlier row. No visible cell
# drops those rows, so the count is reported as "found" rather than "removed".
# NOTE(review): if de-duplication is intended, apply data.drop_duplicates()
# before this summary — confirm that dropping these rows is appropriate.
print("\nNumber of Duplicate Rows Found (not removed):", data.duplicated().sum())


Missing Values After Cleaning:
age                              0
sex                              0
chest_pain_type                  0
resting_blood_pressure           0
cholestoral                      0
fasting_blood_sugar              0
rest_ecg                         0
Max_heart_rate                   0
exercise_induced_angina          0
oldpeak                          0
slope                            0
vessels_colored_by_flourosopy    0
thalassemia                      0
target                           0
dtype: int64

Number of Duplicates Removed: 723
