In [1]:
# Import libraries
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# Suppress every warning category for the rest of the session so that cell
# output stays compact.
# NOTE(review): a blanket 'ignore' also hides deprecation notices raised by the
# libraries imported above -- consider narrowing it to specific categories.
warnings.filterwarnings(action='ignore')

# Banner marking the start of the cleaning run.
print("Starting Data Cleaning...")

Starting Data Cleaning...


In [2]:
# Load dataset
# Column names are supplied explicitly through `names=`, so the first line of
# the file is read as data; '?' entries are parsed as NaN.
column_names = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
                'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target']

df = pd.read_csv('../data/raw/processed.cleveland.data',
                 names=column_names, na_values='?')

# Convert target to binary: any value greater than 0 becomes 1, everything
# else becomes 0. A vectorised comparison replaces the per-row Python lambda;
# the explicit 'int64' keeps the dtype the same on every platform.
df['target'] = (df['target'] > 0).astype('int64')

print(f"Original shape: {df.shape}")
print("\nMissing values before cleaning:")
# Count nulls once and reuse the result for both the filter and the display.
missing_per_column = df.isnull().sum()
print(missing_per_column[missing_per_column > 0])

Original shape: (303, 14)

Missing values before cleaning:
ca      4
thal    2
dtype: int64


In [3]:
# Inspect the records that contain at least one NaN
print("Rows with missing values:")

# Row-wise flag: True where any column of that row is null.
has_missing = df.isnull().any(axis=1)
missing_rows = df[has_missing]
print(f"Total rows with missing data: {missing_rows.shape[0]}")

# Only the two affected columns plus the label are shown (first 10 rows).
columns_to_show = ['ca', 'thal', 'target']
print(missing_rows[columns_to_show].head(10))

Rows with missing values:
Total rows with missing data: 6
      ca  thal  target
87   0.0   NaN       0
166  NaN   3.0       0
192  NaN   7.0       1
266  0.0   NaN       1
287  NaN   7.0       0
302  NaN   3.0       0


In [4]:
# Fill missing ca with median
print("Handling 'ca' column (number of vessels):")
print(f"Missing before: {df['ca'].isnull().sum()}")

# Median of the observed values (pandas skips NaN by default).
ca_median = df['ca'].median()
# Assign the filled column back instead of calling fillna(inplace=True) on the
# df['ca'] selection: that chained in-place form is deprecated in pandas 2.x
# and does not update df once Copy-on-Write is enabled.
df['ca'] = df['ca'].fillna(ca_median)

print(f"Missing after: {df['ca'].isnull().sum()}")
print(f"Filled with median: {ca_median}")

Handling 'ca' column (number of vessels):
Missing before: 4
Missing after: 0
Filled with median: 0.0


In [5]:
# Fill missing thal with mode
print("Handling 'thal' column (thalassemia):")
print(f"Missing before: {df['thal'].isnull().sum()}")

# mode() returns a Series of the most frequent value(s); take the first one.
thal_mode = df['thal'].mode()[0]
# Assign the filled column back instead of calling fillna(inplace=True) on the
# df['thal'] selection: that chained in-place form is deprecated in pandas 2.x
# and does not update df once Copy-on-Write is enabled.
df['thal'] = df['thal'].fillna(thal_mode)

print(f"Missing after: {df['thal'].isnull().sum()}")
print(f"Filled with mode: {thal_mode}")

Handling 'thal' column (thalassemia):
Missing before: 2
Missing after: 0
Filled with mode: 3.0


In [6]:
# Confirm that no NaN is left anywhere in the frame
# Total null count across all columns, computed once and reused below.
remaining_missing = df.isnull().sum().sum()

print("Missing values after cleaning:")
print(remaining_missing)

# Pick the status line that matches the count.
status_message = (
    "\n All missing values handled successfully!"
    if remaining_missing == 0
    else "\n Still have missing values!"
)
print(status_message)

Missing values after cleaning:
0

 All missing values handled successfully!


In [7]:
# Look for fully duplicated rows and drop them if any exist
duplicates = df.duplicated().sum()
print(f"Duplicate rows: {duplicates}")

if duplicates == 0:
    # Nothing to remove; df is left untouched.
    print("No duplicates found!")
else:
    print(f"Removing {duplicates} duplicates...")
    # Drops the repeated rows from df itself.
    df.drop_duplicates(inplace=True)
    print(f"Shape after removing duplicates: {df.shape}")

Duplicate rows: 0
No duplicates found!


In [8]:
# Cholesterol: report values beyond the IQR fences, then cap them
print("Cholesterol outliers:")
Q1, Q3 = df['chol'].quantile(0.25), df['chol'].quantile(0.75)
IQR = Q3 - Q1

# Fences sit 1.5 * IQR beyond the first and third quartiles.
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

print(f"Normal range: {lower_bound:.1f} to {upper_bound:.1f}")
below_fence = df['chol'] < lower_bound
above_fence = df['chol'] > upper_bound
print(f"Values outside range: {(below_fence | above_fence).sum()}")

# Clip to the fences rather than dropping rows, so the row count is unchanged.
df['chol'] = df['chol'].clip(lower_bound, upper_bound)

chol_capped = df['chol']
print(f"After capping - Max cholesterol: {chol_capped.max():.1f}")
print(f"After capping - Min cholesterol: {chol_capped.min():.1f}")

Cholesterol outliers:
Normal range: 115.0 to 371.0
Values outside range: 5
After capping - Max cholesterol: 371.0
After capping - Min cholesterol: 126.0


In [9]:
# Report each column's dtype after cleaning
print("Data types after cleaning:")
print(df.dtypes)

# Every column is expected to be numeric for ML: compare the number of
# numeric-typed columns with the total number of columns.
numeric_column_count = df.select_dtypes(include=[np.number]).shape[1]
all_numeric = numeric_column_count == df.shape[1]
print(f"\nAll numeric: {all_numeric}")

Data types after cleaning:
age         float64
sex         float64
cp          float64
trestbps    float64
chol        float64
fbs         float64
restecg     float64
thalach     float64
exang       float64
oldpeak     float64
slope       float64
ca          float64
thal        float64
target        int64
dtype: object

All numeric: True


In [10]:
# Snapshot the cleaned frame under its own name; later cells add columns to
# df_clean rather than to df.
df_clean = df.copy(deep=True)
print("Clean dataset shape: {}".format(df_clean.shape))
print("\nFirst few rows:")
# Bare last expression so the notebook renders the preview as a table.
df_clean.head(5)

Clean dataset shape: (303, 14)

First few rows:


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,1
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


In [11]:
# Binary age feature: 1 when age is strictly greater than 50, otherwise 0
is_above_50 = df_clean['age'] > 50
df_clean['age_above_50'] = is_above_50.astype(int)
print("Created feature: age_above_50")
# Class balance of the new flag.
print(df_clean['age_above_50'].value_counts())

Created feature: age_above_50
age_above_50
1    209
0     94
Name: count, dtype: int64


In [12]:
# Binary cholesterol feature: 1 when chol is strictly greater than 200,
# otherwise 0
is_high_chol = df_clean['chol'] > 200
df_clean['high_chol'] = is_high_chol.astype(int)
print("Created feature: high_chol")
# Class balance of the new flag.
print(df_clean['high_chol'].value_counts())

Created feature: high_chol
high_chol
1    253
0     50
Name: count, dtype: int64


In [13]:
# Binary heart-rate feature: 1 when thalach is strictly below 140, otherwise 0.
# The author's rationale is that lower values looked riskier in a correlation
# analysis (that analysis is not part of this notebook).
is_low_heart_rate = df_clean['thalach'] < 140
df_clean['low_heart_rate'] = is_low_heart_rate.astype(int)
print("Created feature: low_heart_rate")
# Class balance of the new flag.
print(df_clean['low_heart_rate'].value_counts())

Created feature: low_heart_rate
low_heart_rate
0    218
1     85
Name: count, dtype: int64


In [14]:
# Show new columns
# The "before" count is read from df (the frame df_clean was copied from)
# instead of a hard-coded 14, so the report stays correct if the input
# columns ever change.
original_feature_count = df.shape[1]
engineered_feature_count = df_clean.shape[1]

print(f"Original features: {original_feature_count}")
print(f"After feature engineering: {engineered_feature_count}")
print(f"\nNew features added: {engineered_feature_count - original_feature_count}")
print("\nAll columns:")
print(df_clean.columns.tolist())

Original features: 14
After feature engineering: 17

New features added: 3

All columns:
['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target', 'age_above_50', 'high_chol', 'low_heart_rate']


In [15]:
# Save to CSV
# Single source of truth for the destination, so the file written and the path
# reported below cannot drift apart.
output_path = Path('../data/processed/heart_disease_cleaned.csv')

# to_csv does not create missing directories; make sure the target exists so
# the notebook also runs on a fresh checkout.
output_path.parent.mkdir(parents=True, exist_ok=True)

# index=False keeps the row index out of the file.
df_clean.to_csv(output_path, index=False)
print(f"Cleaned dataset saved to: {output_path}")
print(f"\nFinal shape: {df_clean.shape}")

Cleaned dataset saved to: data/processed/heart_disease_cleaned.csv

Final shape: (303, 17)


In [16]:
# Cleaning summary: assemble the report lines first, then emit them in one go
# NOTE(review): the record/feature "original" figures and the counts inside the
# numbered steps are literals, not computed from the data -- keep them in sync
# with the cells above when the input changes.
rule = "=" * 70
summary_lines = [
    rule,
    "DATA CLEANING SUMMARY",
    rule,
    "Original records: 303",
    f"Final records: {df_clean.shape[0]}",
    "Original features: 14",
    f"Final features: {df_clean.shape[1]}",
    "\nCleaning steps performed:",
    "1. Filled ca missing values (4) with median",
    "2. Filled thal missing values (2) with mode",
    "3. Checked for duplicates: 0 found",
    "4. Capped cholesterol outliers (5) within IQR range",
    "5. Created 3 new features: age_above_50, high_chol, low_heart_rate",
    "\n Dataset ready for ML modeling!",
    # Final null count across the whole cleaned frame.
    f"Missing values: {df_clean.isnull().sum().sum()}",
]
print("\n".join(summary_lines))

DATA CLEANING SUMMARY
Original records: 303
Final records: 303
Original features: 14
Final features: 17

Cleaning steps performed:
1. Filled ca missing values (4) with median
2. Filled thal missing values (2) with mode
3. Checked for duplicates: 0 found
4. Capped cholesterol outliers (5) within IQR range
5. Created 3 new features: age_above_50, high_chol, low_heart_rate

 Dataset ready for ML modeling!
Missing values: 0
