In [51]:
# Import Libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [52]:
# Load the data
df = pd.read_csv('heart_disease_uci.csv')
df.head()

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0


# 1. Clean the data by handling missing values, duplicates, and outliers

In [53]:
# Check for missing values
print(df.isnull().sum())

id            0
age           0
sex           0
dataset       0
cp            0
trestbps     59
chol         30
fbs          90
restecg       2
thalch       55
exang        55
oldpeak      62
slope       309
ca          611
thal        486
num           0
dtype: int64


In [54]:
# Handle missing values
df = df.dropna()

In [55]:
# Check for duplicates
print(df.duplicated().sum())

0


In [56]:
# Remove duplicates
df = df.drop_duplicates()

In [57]:
# Handle outliers using IQR method
for col in df.columns:
    if df[col].dtype != 'object':
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr
        df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]

# 2. Standardize data formats and units to ensure consistency

In [58]:
# Encode categorical features
le = LabelEncoder()
for col in df.columns:
    if df[col].dtype == 'object':
        df[col] = le.fit_transform(df[col])

# 3. Perform data transformation, such as log scaling or normalization, for improved model performance

In [59]:
scaler = StandardScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)

In [60]:
# Save the cleaned data to a new CSV file
cleaned_file_path = 'cleaned_heart_disease_uci.csv'
df_scaled.to_csv(cleaned_file_path, index=False)

In [61]:
print("Data cleaning and preprocessing complete. Cleaned data saved to:", cleaned_file_path)

Data cleaning and preprocessing complete. Cleaned data saved to: cleaned_heart_disease_uci.csv


In [62]:
# Load the data
df = pd.read_csv('cleaned_heart_disease_uci.csv')
df.head()

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,-1.66131,1.043902,0.728149,0.0,1.840504,1.00837,-0.226973,2.619555,-1.101934,-0.179366,-0.604375,1.547301,-2.471829,-0.631285,-2.244407,-0.655189
1,-1.639082,1.482832,0.728149,0.0,-1.060198,-0.606609,-0.318243,-0.381744,-1.101934,-1.180307,1.654601,1.860795,-0.814249,2.431119,1.372022,0.767507
2,-1.627967,-1.809142,0.728149,0.0,0.873603,0.039383,0.160923,-0.381744,0.876341,1.584195,-0.604375,2.801275,-2.471829,-0.631285,-0.436192,-0.655189
3,-1.616853,-1.370212,-1.373345,0.0,-0.093297,0.039383,-0.888678,-0.381744,-1.101934,0.869237,-0.604375,0.606821,0.84333,-0.631285,-0.436192,-0.655189
4,-1.605739,0.275775,0.728149,0.0,-0.093297,-0.606609,-0.158521,-0.381744,0.876341,1.15522,-0.604375,-0.020166,0.84333,-0.631285,-0.436192,-0.655189


In [63]:
# Check for missing values
print(df.isnull().sum())

id          0
age         0
sex         0
dataset     0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalch      0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
num         0
dtype: int64
