# Heart Disease Prediction - Exploratory Data Analysis

This notebook performs comprehensive exploratory data analysis on the heart disease dataset to understand patterns, distributions, and relationships in the data.

## Table of Contents
1. Data Loading and Overview
2. Data Quality Assessment
3. Target Variable Creation and Analysis
4. Univariate Analysis
5. Bivariate Analysis - Features vs Target
6. Key Insights and Summary


In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
sys.path.append('../src')
from utils import load_and_create_target, plot_confusion_matrix, plot_roc_curve

# Plot appearance. Keep this order: the style sheet is applied first so that
# the palette and figure size set afterwards are not overwritten by it.
plt.style.use('default')
sns.set_palette("husl")
plt.rcParams.update({'figure.figsize': (10, 6)})

# Suppress all Python warnings so cell output stays compact.
# NOTE(review): this is a blanket filter, so it hides deprecation warnings too.
import warnings
warnings.filterwarnings(action='ignore')

# Report which library versions this run is using.
for status_line in (
    "Libraries imported successfully!",
    f"Pandas version: {pd.__version__}",
    f"NumPy version: {np.__version__}",
):
    print(status_line)


## 1. Data Loading and Overview


In [None]:
# Read the CSV into the notebook's working frame.
df = pd.read_csv('../data/heart_dataset.csv')

# df.shape is (rows, columns); rows are reported as samples, columns as features.
n_samples, n_features = df.shape

print("=== DATASET OVERVIEW ===")
print(f"Dataset shape: {df.shape}")
print(f"Number of features: {n_features}")
print(f"Number of samples: {n_samples}")

print("\n=== COLUMN INFORMATION ===")
print("Columns:", df.columns.tolist())

print("\n=== FIRST FEW ROWS ===")
# Left as the final expression so Jupyter renders the preview as a table.
df.head()


In [None]:
# Per-column dtypes, printed under their heading.
print("=== DATA TYPES ===", df.dtypes, sep="\n")

print("\n=== BASIC STATISTICS ===")
# describe() is the cell's last expression so it is shown as rich output.
df.describe()


## 2. Data Quality Assessment


In [None]:
# --- Missing values ---------------------------------------------------------
print("=== MISSING VALUES ===")
missing_values = df.isnull().sum()  # missing count per column
print(missing_values)

total_missing = missing_values.sum()
# The share of missing data is measured against every cell (rows x columns).
# Dividing by the row count alone overstates it and can exceed 100% when
# several columns have gaps. An empty frame reports 0% instead of dividing
# by zero.
total_cells = df.size
missing_pct = (total_missing / total_cells) * 100 if total_cells else 0.0

print(f"\nTotal missing values: {total_missing}")
print(f"Percentage of missing data: {missing_pct:.2f}%")

# --- Duplicates -------------------------------------------------------------
print("\n=== DUPLICATES ===")
duplicates = df.duplicated().sum()  # rows identical to an earlier row
print(f"Number of duplicate rows: {duplicates}")

# --- Feature types ----------------------------------------------------------
# Split columns by dtype: numeric vs. object/bool.
print("\n=== FEATURE TYPES ===")
numeric_features = df.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = df.select_dtypes(include=['object', 'bool']).columns.tolist()

print(f"Numeric features ({len(numeric_features)}): {numeric_features}")
print(f"Categorical features ({len(categorical_features)}): {categorical_features}")


## 3. Target Variable Creation and Analysis


In [None]:
# Build the labelled frame. load_and_create_target is imported from the
# project's utils module; its labelling rules are not visible in this notebook.
df_with_target = load_and_create_target('../data/heart_dataset.csv')

# Pin the class order explicitly. value_counts() orders by frequency, so
# without this the fixed "No Disease"/"Disease" labels and colours below would
# be attached to the wrong class whenever class 1 outnumbers class 0.
# fill_value keeps the lookups from raising if one class is absent.
# Assumes 'target' holds integer 0/1 labels - TODO confirm against utils.
class_order = [0, 1]
class_labels = ['No Disease', 'Disease']
class_colors = ['lightblue', 'salmon']

target_counts = (
    df_with_target['target'].value_counts().reindex(class_order, fill_value=0)
)
target_percentages = (
    df_with_target['target']
    .value_counts(normalize=True)
    .reindex(class_order, fill_value=0.0)
    * 100
)

print("=== TARGET VARIABLE CREATED ===")
print("Target distribution:")
print(target_counts)

print("\nTarget percentages:")
print(f"No Heart Disease (0): {target_percentages.loc[0]:.1f}%")
print(f"Heart Disease (1): {target_percentages.loc[1]:.1f}%")

# Visualize target distribution: counts on the left, proportions on the right.
fig, (ax_bar, ax_pie) = plt.subplots(1, 2, figsize=(12, 5))

target_counts.plot(kind='bar', ax=ax_bar, color=class_colors)
ax_bar.set_title('Target Variable Distribution')
ax_bar.set_xlabel('Heart Disease')
ax_bar.set_ylabel('Count')
ax_bar.set_xticks([0, 1])
ax_bar.set_xticklabels(class_labels, rotation=0)

ax_pie.pie(target_counts.values, labels=class_labels,
           autopct='%1.1f%%', colors=class_colors)
ax_pie.set_title('Target Variable Proportion')

fig.tight_layout()
plt.show()

# From here on the working dataframe is the labelled one (a copy, so later
# edits to df do not touch df_with_target).
df = df_with_target.copy()


## 4. Univariate Analysis


In [None]:
# Columns treated as numeric in this section.
numeric_features = ['age', 'trestbps', 'chol', 'thalch', 'oldpeak', 'ca']

# One histogram per feature on a 2x3 grid. Panels are paired with features by
# position, so a feature absent from df leaves its panel empty.
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.ravel()

for ax, col in zip(axes, numeric_features):
    if col not in df.columns:
        continue
    ax.hist(df[col], bins=20, alpha=0.7, color='skyblue', edgecolor='black')
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

# Text summary: the same five statistics for every available feature.
print("=== NUMERIC FEATURES SUMMARY ===")
for col in numeric_features:
    if col not in df.columns:
        continue
    series = df[col]
    print(f"\n{col.upper()}:")
    for stat_label, stat_value in (
        ("Mean", series.mean()),
        ("Median", series.median()),
        ("Std", series.std()),
        ("Min", series.min()),
        ("Max", series.max()),
    ):
        print(f"  {stat_label}: {stat_value:.2f}")


In [None]:
# Columns treated as categorical in this section: the sex_/cp_ indicator
# columns plus the remaining coded features.
categorical_features = ['sex_Female', 'sex_Male', 'cp_asymptomatic', 'cp_atypical angina', 
                        'cp_non-anginal', 'cp_typical angina', 'fbs', 'restecg', 'exang', 'slope', 'thal']

# The sex_/cp_ indicator columns among the list above.
binary_features = [col for col in categorical_features if col.startswith(('sex_', 'cp_'))]

print("=== CATEGORICAL FEATURES ANALYSIS ===")

# Gender distribution. Summing an indicator column counts the rows in that
# category (assumes the sex_* columns are 0/1 or boolean - TODO confirm).
sex_cols = [col for col in df.columns if col.startswith('sex_')]
if sex_cols:
    gender_counts = {col.replace('sex_', ''): df[col].sum() for col in sex_cols}

    plt.figure(figsize=(8, 5))
    # Pass plain lists rather than dict views so matplotlib receives sequences.
    plt.bar(list(gender_counts.keys()), list(gender_counts.values()),
            color=['pink', 'lightblue'])
    plt.title('Gender Distribution')
    plt.ylabel('Count')
    plt.show()

# Chest pain type distribution, counted the same way from the cp_* columns.
cp_cols = [col for col in df.columns if col.startswith('cp_')]
if cp_cols:
    cp_counts = {col.replace('cp_', ''): df[col].sum() for col in cp_cols}

    plt.figure(figsize=(12, 5))
    plt.bar(list(cp_counts.keys()), list(cp_counts.values()), color='lightgreen')
    plt.title('Chest Pain Type Distribution')
    plt.ylabel('Count')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

# Other categorical features: one value-count bar chart per feature.
other_cat_features = ['fbs', 'restecg', 'exang', 'slope', 'thal']
fig, axes = plt.subplots(2, 3, figsize=(15, 8))
axes = axes.ravel()

# Plot only the features that exist in df (capped at the number of panels) so
# the grid fills left to right without gaps.
present_cat_features = [
    feature for feature in other_cat_features if feature in df.columns
][:len(axes)]

for ax, feature in zip(axes, present_cat_features):
    value_counts = df[feature].value_counts()
    positions = list(range(len(value_counts)))
    ax.bar(positions, value_counts.values, color='coral')
    ax.set_title(f'{feature} Distribution')
    ax.set_xlabel(feature)
    ax.set_ylabel('Count')
    ax.set_xticks(positions)
    ax.set_xticklabels(value_counts.index, rotation=45)

# Hide every panel that received no plot, not just the last one; otherwise a
# feature missing from df would leave an empty frame on the figure.
for ax in axes[len(present_cat_features):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()


## 5. Bivariate Analysis - Features vs Target


In [None]:
# Numeric columns to compare across the two target classes.
numeric_features = ['age', 'trestbps', 'chol', 'thalch', 'oldpeak', 'ca']

fig, axes = plt.subplots(2, 3, figsize=(18, 12))
axes = axes.ravel()

# One grouped box plot per feature. Titles and labels are set after
# df.boxplot() so they replace whatever pandas put on the axes.
for ax, col in zip(axes, numeric_features):
    if col not in df.columns:
        continue
    df.boxplot(column=col, by='target', ax=ax)
    ax.set_title(f'{col} by Heart Disease Status')
    ax.set_xlabel('Heart Disease (0=No, 1=Yes)')
    ax.set_ylabel(col)

plt.suptitle('Numeric Features by Target Variable', y=1.02)
plt.tight_layout()
plt.show()

# Per-class means and their difference (disease minus no disease).
print("=== FEATURE MEANS BY TARGET ===")
no_disease_rows = df['target'] == 0
disease_rows = df['target'] == 1
for col in numeric_features:
    if col not in df.columns:
        continue
    mean_no_disease = df.loc[no_disease_rows, col].mean()
    mean_disease = df.loc[disease_rows, col].mean()
    print(f"{col}:")
    print(f"  No Disease: {mean_no_disease:.2f}")
    print(f"  Disease: {mean_disease:.2f}")
    print(f"  Difference: {mean_disease - mean_no_disease:.2f}")
    print()


In [None]:
# Pairwise correlation over the numeric-dtype columns only.
numeric_cols = df.select_dtypes(include=[np.number]).columns
correlation_matrix = df[numeric_cols].corr()

# Heatmap of the lower triangle; the mask hides the diagonal and the mirrored
# upper half.
plt.figure(figsize=(12, 10))
mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))
sns.heatmap(correlation_matrix, mask=mask, annot=True, cmap='coolwarm', center=0,
            square=True, linewidths=0.5, cbar_kws={"shrink": .8})
plt.title('Feature Correlation Matrix')
plt.tight_layout()
plt.show()

# Rank features by the magnitude of their correlation with the target (the
# sign is discarded by abs()).
target_correlations = correlation_matrix['target'].abs().sort_values(ascending=False)

# Remove the target's self-correlation by label, not by slicing off the first
# row: if another feature ties at 1.0, or the target's own entry is NaN, the
# target is not guaranteed to sort first and a positional slice would drop a
# real feature while listing the target as its own predictor.
feature_correlations = target_correlations.drop(labels='target')

print("=== FEATURES MOST CORRELATED WITH TARGET ===")
print(feature_correlations)

# Top correlations
print("\nTop 5 features correlated with heart disease:")
for rank, (feature, corr) in enumerate(feature_correlations.head(5).items(), start=1):
    print(f"{rank}. {feature}: {corr:.3f}")


## 6. Key Insights and Summary


In [None]:
# Closing report. Each section is a heading plus its bullet lines; the first
# two sections are computed from df, the last two are fixed narrative text.
summary_sections = [
    ("DATASET OVERVIEW:", [
        f"   • {df.shape[0]} samples, {df.shape[1]} features",
        f"   • Target balance: {df['target'].value_counts().to_dict()}",
    ]),
    ("DATA QUALITY:", [
        f"   • Missing values: {df.isnull().sum().sum()}",
        f"   • Duplicates: {df.duplicated().sum()}",
    ]),
    ("KEY FINDINGS:", [
        "   • Dataset contains heart disease risk factors",
        "   • Mix of numerical and categorical features",
        "   • Target variable created based on medical risk factors",
        "   • Features show varying correlations with heart disease risk",
    ]),
    ("READY FOR MODELING:", [
        "   • Data is clean and preprocessed",
        "   • Target variable is well-defined",
        "   • Features show discriminative power",
        "   • Dataset is suitable for machine learning",
    ]),
]

print("=== EXPLORATORY DATA ANALYSIS SUMMARY ===")
print()
for heading, bullet_lines in summary_sections:
    print(heading)
    for bullet_line in bullet_lines:
        print(bullet_line)
    # Blank line after every section, including the one before the banner.
    print()

banner = "=" * 60
print(banner)
print("NEXT STEPS: Proceed to model training and evaluation!")
print(banner)
