# 1. Importing Required Libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# 2. Loading the Dataset
The dataset is loaded from the processed Cleveland data file.

In [None]:
# The data file ships without a header row and marks missing values with '?'.
# header=None keeps the first record as data instead of consuming it as column
# labels; na_values='?' parses the placeholders as NaN at load time.
df = pd.read_csv('Dataset/processed.cleveland.data', header=None, na_values='?')
df.head()

# 3. Assigning Column Names
Because the data file has no header row, we take the column names from `cleve.mod` and assign them manually.

In [None]:
# Labels for the 14 columns, listed in file order (names taken from `cleve.mod`).
attributes = [
    'age', 'gender', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
    'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'result',
]
df.columns = attributes

# 4. Data Type Conversion
Converting all columns to appropriate numeric data types.

In [None]:
# Coerce each column to a numeric dtype; entries that cannot be parsed
# (errors='coerce') are replaced by NaN rather than raising.
df = df.apply(lambda column: pd.to_numeric(column, errors='coerce'))
df.dtypes

# 5. Handling Missing Values
The dataset marks missing values with '?', notably in the `ca` and `thal` columns; after the numeric conversion above these entries are NaN. We impute them with each column's most frequent value (0 for `ca`, 3 for `thal`).

In [None]:
# Assign the filled Series back to the frame. Calling fillna(inplace=True) on
# df[col] is chained assignment: it operates on an intermediate object, emits a
# FutureWarning in pandas 2.x and leaves df unchanged under Copy-on-Write.
df['ca'] = df['ca'].fillna(0)
df['thal'] = df['thal'].fillna(3)

# 6. Target Variable Binarization
Converting `result` into a binary format: 0 = No Heart Disease, 1 = Presence of Heart Disease (any original value greater than 0).

In [None]:
# Vectorized binarization: values greater than 0 become 1, everything else
# (including NaN, for which the comparison is False) becomes 0.
df['result'] = (df['result'] > 0).astype('int64')

# 7. Exploratory Data Analysis (EDA)
Understanding feature distributions and relationships.

In [None]:
# Count plot of target
# NOTE(review): recent seaborn releases warn when `palette` is passed without
# `hue`; the documented replacement is hue='result', legend=False — confirm the
# installed seaborn version supports `legend` before switching.
sns.countplot(x='result', data=df, palette='Set2')
# Render explicitly, as the other plotting cells do, so the figure is shown
# outside notebooks and the Axes repr is not echoed as cell output.
plt.show()

In [None]:
# Categorical features vs target: one grouped count plot per feature,
# with bars split by the `result` column.
categorical_features = ('gender', 'cp', 'exang', 'thal')
for col in categorical_features:
    ax = sns.countplot(data=df, x=col, hue='result')
    ax.set_title(f'{col} vs Heart Disease')
    plt.show()

In [None]:
# Numerical features vs target: one box plot per feature, grouped by the
# `result` column on the x-axis.
numerical_features = ('age', 'chol', 'thalach')
for col in numerical_features:
    ax = sns.boxplot(data=df, x='result', y=col)
    ax.set_title(f'{col} by Heart Disease')
    plt.show()

In [None]:
# Correlation heatmap: pairwise correlations of all columns, drawn as an
# annotated heatmap on a 10x6 figure.
plt.figure(figsize=(10, 6))
corr_matrix = df.corr()
heatmap_ax = sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
heatmap_ax.set_title('Feature Correlation Matrix')