In [1]:
from google.colab import drive

# Mount Google Drive so the dataset paths under /content/drive are reachable.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


**Import necessary libraries**

In [2]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [3]:
# Load the raw CSV from the mounted Drive and preview the first rows.
raw_csv_path = '/content/drive/MyDrive/Heart_Disease_Prediction/data/raw_data/heart_disease_uci.csv'
df = pd.read_csv(raw_csv_path)
df.head()

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0


In [4]:
# Missing-value audit: count the null entries in every column and print
# the per-column totals.
missing_values = df.isna().sum()
print("Missing values in each column:\n", missing_values)


Missing values in each column:
 id            0
age           0
sex           0
dataset       0
cp            0
trestbps     59
chol         30
fbs          90
restecg       2
thalch       55
exang        55
oldpeak      62
slope       309
ca          611
thal        486
num           0
dtype: int64


**Handle missing values**

In [5]:
# Handle missing values (numeric columns):
# learn each column's median, then use it to fill that column's gaps.
num_cols = ["trestbps", "chol", "thalch", "oldpeak"]
imputer = SimpleImputer(strategy="median")
imputer.fit(df[num_cols])
df[num_cols] = imputer.transform(df[num_cols])

In [6]:
# Fill categorical missing values with each column's most frequent value.
#
# Every categorical column that the missing-value report flags is listed here.
# fbs, restecg and exang are label-encoded in the next step; if their NaNs are
# left in place, "missing" is turned into a spurious extra class (and older
# scikit-learn releases may raise on the mixed bool/float values instead).
cat_cols = ["fbs", "restecg", "exang", "slope", "ca", "thal"]
imputer_cat = SimpleImputer(strategy="most_frequent")
df[cat_cols] = imputer_cat.fit_transform(df[cat_cols])

# The imputer processes these mixed-type columns as one object array, so the
# numeric `ca` column may come back with object dtype; restore it to float.
df["ca"] = df["ca"].astype(float)

In [7]:
# Encode categorical variables as integer labels.
# One LabelEncoder is fitted per column and kept in `label_encoders`
# (keyed by column name) so the mapping can be reused or inverted later.
categorical_columns = ["sex", "cp", "fbs", "restecg", "exang", "slope", "thal"]
label_encoders = {}
for col in categorical_columns:
    le = label_encoders[col] = LabelEncoder()
    df[col] = le.fit_transform(df[col])

In [8]:
# Feature scaling: standardize the numeric features with StandardScaler.
# The fitted scaler stays available as `scaler`.
numeric_features = ["age", "trestbps", "chol", "thalch", "oldpeak"]
scaler = StandardScaler()
scaler.fit(df[numeric_features])
df[numeric_features] = scaler.transform(df[numeric_features])

In [9]:
import os

# Output directory for the processed dataset; created on first run.
file_path = "/content/drive/MyDrive/Heart_Disease_Prediction/data/processed"
os.makedirs(file_path, exist_ok=True)

# Write the preprocessed frame without the pandas index column.
cleaned_csv = f"{file_path}/heart_disease_cleaned.csv"
df.to_csv(cleaned_csv, index=False)

print(f"Data preprocessing completed. Cleaned data saved successfully at {file_path}/heart_disease_cleaned.csv")


Data preprocessing completed. Cleaned data saved successfully at /content/drive/MyDrive/Heart_Disease_Prediction/data/processed/heart_disease_cleaned.csv
