In [None]:
# Heart Disease Dataset - Exploratory Data Analysis
# Team: Mercy Thokozani Ngwenya & Mediator Nhongo  
# Date: 2025-10-27

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# NOTE(review): with no category or module filter this suppresses every
# warning raised for the rest of the session, including deprecation notices
# from pandas / matplotlib / seaborn.
warnings.filterwarnings('ignore')

# Set style for better visualizations
# 'seaborn-v0_8' is applied through matplotlib's style mechanism and "husl"
# through seaborn's palette setter; both affect every figure created below.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
# IPython line magic (not plain Python syntax): this script is expected to run
# inside an IPython/Jupyter kernel.
%matplotlib inline

# 1. Setup and Data Loading

# Column labels applied to the file, in file order. Because `names=` is given
# without `header=`, pandas treats the first line of the file as data.
column_names = (
    "age sex cp trestbps chol fbs restecg "
    "thalach exang oldpeak slope ca thal num"
).split()

# Any field equal to '?' is parsed as NaN; whitespace after delimiters is skipped.
df = pd.read_csv(
    '../data/cleveland.data',
    names=column_names,
    na_values='?',
    skipinitialspace=True,
)

print("Dataset loaded successfully!")
print(f"Shape: {df.shape}")

# 2. Basic Dataset Information

n_rows, n_cols = df.shape

print("=== DATASET OVERVIEW ===")
print(f"Number of patients: {n_rows}")
print(f"Number of features: {n_cols}")
print(f"\nColumn names: {df.columns.tolist()}")

print("\n=== DATA TYPES ===")
print(df.dtypes)

print("\n=== FIRST 5 ROWS ===")
print(df.head(5))

# Missing values analysis: per-column NaN count and its share of all rows.
print("=== MISSING VALUES ===")
missing_data = df.isna().sum()
missing_percent = (missing_data / len(df)) * 100

missing_df = pd.concat(
    [missing_data, missing_percent],
    axis=1,
    keys=['Missing Count', 'Missing Percentage'],
)
# Only columns that actually contain missing entries are shown.
print(missing_df.loc[missing_data > 0])

# 3. Target Variable Analysis

# Binary label: 1 when the original `num` value is above 0, otherwise 0.
# A missing `num` compares False against 0 and therefore maps to 0, exactly as
# the previous element-wise lambda did.
df['target'] = (df['num'] > 0).astype('int64')

print("=== TARGET VARIABLE DISTRIBUTION ===")
# value_counts() orders by frequency, not by class. sort_index() pins the
# order to class 0 then class 1 so the table does not depend on which class
# happens to be larger.
target_counts = df['target'].value_counts().sort_index()
target_percent = df['target'].value_counts(normalize=True).sort_index() * 100

target_summary = pd.DataFrame({
    'Count': target_counts,
    'Percentage': target_percent
})
print(target_summary)

print("\n=== ORIGINAL TARGET DISTRIBUTION (num) ===")
original_counts = df['num'].value_counts().sort_index()
print(original_counts)

# Visualize target distribution
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

colors = ['lightcoral', 'lightblue']
# The labels below are positional, so the counts are reindexed on the explicit
# class order [0, 1]. This keeps every slice under the right label even when
# class 1 is the majority or one class is absent.
pie_counts = target_counts.reindex([0, 1], fill_value=0)
ax1.pie(pie_counts.values, labels=['No Disease', 'Disease'], autopct='%1.1f%%',
        colors=colors, startangle=90)
ax1.set_title('Heart Disease Distribution (Binary)')

ax2.bar(original_counts.index, original_counts.values, color='skyblue', alpha=0.7)
ax2.set_xlabel('Heart Disease Severity (0-4)')
ax2.set_ylabel('Number of Patients')
ax2.set_title('Original Heart Disease Severity Distribution')
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# 4. Feature Distributions

numerical_features = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']

# 2x3 grid flattened to a 1-D array so each feature takes the next free cell.
fig, axes = plt.subplots(2, 3, figsize=(18, 10))
axes = axes.ravel()

for ax, feature in zip(axes, numerical_features):
    ax.hist(df[feature], bins=20, alpha=0.7, color='skyblue', edgecolor='black')
    ax.set_title(f'Distribution of {feature}')
    ax.set_xlabel(feature)
    ax.set_ylabel('Frequency')
    ax.grid(True, alpha=0.3)

# Remove every grid cell that did not receive a feature. The range is derived
# from the feature list instead of a fixed index, so editing
# `numerical_features` cannot delete a used panel or leave an empty one.
for ax in axes[len(numerical_features):]:
    fig.delaxes(ax)

plt.tight_layout()
plt.show()

categorical_features = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']

# 3x3 grid, flattened; one bar chart per categorical feature.
fig, axes = plt.subplots(3, 3, figsize=(18, 12))
axes = axes.ravel()

for ax, feature in zip(axes, categorical_features):
    # Level frequencies ordered by level value (NaN levels are not counted).
    level_counts = df[feature].value_counts().sort_index()
    ax.bar(
        level_counts.index.astype(str),
        level_counts.values,
        color='lightgreen',
        alpha=0.7,
        edgecolor='black',
    )
    ax.set_title(f'Distribution of {feature}')
    ax.set_xlabel(feature)
    ax.set_ylabel('Count')
    ax.tick_params(axis='x', rotation=45)

    # Write each count just above its bar; bars sit at positions 0, 1, 2, ...
    for position, count in enumerate(level_counts.values):
        ax.text(position, count + 1, str(count), ha='center', va='bottom')

# Drop the grid cells that were not used.
for unused_ax in axes[len(categorical_features):]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()

# 5. Statistical Summary

# describe() over the numerical columns only.
print("=== NUMERICAL FEATURES STATISTICAL SUMMARY ===")
numerical_summary = df.loc[:, numerical_features].describe()
print(numerical_summary)

# One frequency table per categorical column, ordered by level value.
print("=== CATEGORICAL FEATURES SUMMARY ===")
for feature in categorical_features:
    level_counts = df[feature].value_counts().sort_index()
    print(f"\n{feature}:")
    print(level_counts)

# 6. Correlation Analysis

plt.figure(figsize=(16, 12))

# Pairwise correlation over every column of df (pandas' default method).
correlation_matrix = df.corr()

# Mask the upper triangle (diagonal included) so each pair is drawn once.
mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))
sns.heatmap(correlation_matrix, mask=mask, annot=True, cmap='coolwarm',
            center=0, square=True, fmt='.2f',
            cbar_kws={'shrink': 0.8})

plt.title('Feature Correlation Matrix', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

print("=== TOP CORRELATIONS WITH TARGET ===")
# Signed, descending, with 'target' itself in first position. This series is
# left in that shape because later code skips its first entry.
target_correlations = correlation_matrix['target'].sort_values(ascending=False)

# For the printed ranking, leave out 'target' (self-correlation of 1.0) and
# 'num' (the column the binary target is derived from), and order by magnitude
# so strong negative relationships are not pushed out of the top rows.
ranked = target_correlations.drop(labels=['target', 'num'], errors='ignore')
ranked = ranked.reindex(ranked.abs().sort_values(ascending=False).index)

target_correlations_df = pd.DataFrame({
    'Feature': ranked.index,
    'Correlation': ranked.values
})
print(target_correlations_df.head(10))

# 7. Feature vs Target Analysis

fig, axes = plt.subplots(2, 3, figsize=(18, 10))
axes = axes.ravel()

group_labels = ['No Disease', 'Disease']

for ax, feature in zip(axes, numerical_features):
    # One sample per class, in the same order as `group_labels`. Missing
    # values are dropped first so a single NaN cannot blank out a whole box.
    data_to_plot = [
        df.loc[df['target'] == cls, feature].dropna()
        for cls in (0, 1)
    ]
    ax.boxplot(data_to_plot)
    # Tick labels are set after plotting rather than through the `labels=`
    # keyword, which newer matplotlib releases deprecate in favour of
    # `tick_labels=`; this form works on both.
    ax.set_xticklabels(group_labels)
    ax.set_title(f'{feature} vs Heart Disease')
    ax.set_ylabel(feature)
    ax.grid(True, alpha=0.3)

# Remove unused grid cells; derived from the feature list rather than a fixed
# index so the grid stays correct if the list changes.
for ax in axes[len(numerical_features):]:
    fig.delaxes(ax)

plt.tight_layout()
plt.show()

fig, axes = plt.subplots(3, 3, figsize=(20, 15))
axes = axes.ravel()

for ax, feature in zip(axes, categorical_features):
    # Row-normalised cross-tabulation: for each level of the feature, the
    # percentage of rows falling in each target class.
    cross_tab = pd.crosstab(df[feature], df['target'], normalize='index') * 100
    cross_tab.plot.bar(ax=ax, color=['lightcoral', 'lightblue'])
    ax.set_title(f'{feature} vs Heart Disease')
    ax.set_xlabel(feature)
    ax.set_ylabel('Percentage (%)')
    ax.legend(['No Disease', 'Disease'])
    ax.tick_params(axis='x', rotation=45)
    ax.grid(True, alpha=0.3)

# Drop the grid cells that were not used.
for unused_ax in axes[len(categorical_features):]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()

# 8. Missing Value Analysis

print("=== DETAILED MISSING VALUE ANALYSIS ===")
null_counts = df.isnull().sum()
missing_summary = pd.DataFrame({
    'Feature': df.columns,
    'Missing_Count': null_counts,
    'Missing_Percentage': (null_counts / len(df)) * 100
})

# Keep only columns with at least one missing entry, worst first.
has_missing = missing_summary['Missing_Count'] > 0
missing_summary = missing_summary.loc[has_missing].sort_values(
    by='Missing_Percentage', ascending=False
)

if missing_summary.empty:
    print("No missing values found in the dataset!")
else:
    print(missing_summary)

    plt.figure(figsize=(10, 6))
    bars = plt.bar(
        missing_summary['Feature'],
        missing_summary['Missing_Percentage'],
        color='salmon',
        alpha=0.7,
        edgecolor='darkred',
    )
    plt.title('Missing Values Percentage by Feature', fontsize=14, fontweight='bold')
    plt.xlabel('Features')
    plt.ylabel('Missing Percentage (%)')
    plt.xticks(rotation=45)
    plt.grid(True, alpha=0.3)

    # Annotate every bar with its percentage, centred just above the bar top.
    for rect in bars:
        bar_top = rect.get_height()
        x_centre = rect.get_x() + rect.get_width()/2.
        plt.text(x_centre, bar_top + 0.5, f'{bar_top:.1f}%',
                 ha='center', va='bottom')

    plt.tight_layout()
    plt.show()

# 9. Outlier Detection

print("=== OUTLIER DETECTION (IQR METHOD) ===")

outlier_summary = []

for feature in numerical_features:
    column = df[feature]

    # Fences at 1.5 * IQR below the first and above the third quartile.
    Q1 = column.quantile(0.25)
    Q3 = column.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Rows strictly outside the fences; NaN compares False on both sides and
    # is therefore never counted.
    is_outlier = (column < lower_bound) | (column > upper_bound)
    outlier_count = int(is_outlier.sum())
    outlier_percentage = (outlier_count / len(df)) * 100

    outlier_summary.append(dict(
        Feature=feature,
        Outlier_Count=outlier_count,
        Outlier_Percentage=outlier_percentage,
        Lower_Bound=lower_bound,
        Upper_Bound=upper_bound,
    ))

outlier_df = pd.DataFrame(outlier_summary)
print(outlier_df)

# 10. Key Insights and Conclusions

print("=== KEY INSIGHTS ===")
print(f"1. Dataset contains {df.shape[0]} patients and {df.shape[1]} features")

# Share of rows with target == 1.
disease_percentage = (df['target'].sum() / len(df)) * 100
print(f"2. {disease_percentage:.1f}% of patients have heart disease")

total_missing = df.isnull().sum().sum()
print(f"3. Total missing values: {total_missing}")

# Rank predictors by the magnitude of their correlation with the target.
# 'target' (self-correlation) and 'num' (the column the target is derived
# from) are removed by name: skipping only the first entry would still report
# 'num' as the "top feature", and a signed sort would hide strong negative
# correlations.
feature_correlations = target_correlations.drop(labels=['target', 'num'], errors='ignore')
top_corr_features = feature_correlations.abs().sort_values(ascending=False).index[:3]
print(f"4. Top features correlated with heart disease: {list(top_corr_features)}")

avg_age = df['age'].mean()
print(f"5. Average patient age: {avg_age:.1f} years")

male_percentage = (df['sex'].sum() / len(df)) * 100  # sex=1 is male
print(f"6. Male patients: {male_percentage:.1f}%")

# 11. Data Quality Assessment

print("=== DATA QUALITY ASSESSMENT ===")

# Each metric is computed once and reused for both the 'Value' and 'Status'
# columns, so the two can never disagree.
n_samples = len(df)
n_columns = len(df.columns)
missing_pct = (df.isnull().sum().sum() / (n_samples * n_columns)) * 100
duplicate_rows = df.duplicated().sum()
# "Consistent" means the frame uses at most two distinct dtypes.
dtypes_consistent = len(df.dtypes.unique()) <= 2
mean_outlier_pct = outlier_df['Outlier_Percentage'].mean()

quality_metrics = {
    'Metric': [
        'Total Samples',
        'Total Features',
        'Missing Values Percentage',
        'Duplicate Rows',
        'Class Balance (Disease/No Disease)',
        'Data Types Consistency',
        'Outlier Percentage (Average)'
    ],
    'Value': [
        f"{n_samples:,}",
        f"{n_columns}",
        f"{missing_pct:.2f}%",
        f"{duplicate_rows}",
        f"{disease_percentage:.1f}% / {100-disease_percentage:.1f}%",
        "Consistent" if dtypes_consistent else "Check Needed",
        f"{mean_outlier_pct:.1f}%"
    ],
    'Status': [
        '✅ Good' if n_samples > 100 else '⚠ Small',
        '✅ Good' if n_columns >= 10 else '⚠ Limited',
        '✅ Good' if missing_pct < 5 else '⚠ High',
        '✅ Good' if duplicate_rows == 0 else '⚠ Duplicates Found',
        '✅ Balanced' if 40 <= disease_percentage <= 60 else '⚠ Imbalanced',
        # Previously always '✅ Good', even when the value read "Check Needed".
        '✅ Good' if dtypes_consistent else '⚠ Check Needed',
        '✅ Good' if mean_outlier_pct < 10 else '⚠ High Outliers'
    ]
}

quality_df = pd.DataFrame(quality_metrics)
print(quality_df)

# Summary
# NOTE(review): the text below is a fixed string; none of its statements are
# computed from the metrics produced earlier in this script, so it should be
# re-read whenever the data or the analysis changes.
closing_summary = """
---
📋 Summary

This exploratory analysis provides comprehensive insights into the Heart Disease dataset. Key findings include:

- Dataset Quality: Good overall with minimal missing values
- Class Distribution: Slightly imbalanced but manageable
- Feature Relationships: Several strong correlations with target variable
- Data Types: Appropriate for machine learning
- Next Steps: Proceed with preprocessing and model development

The dataset appears suitable for building predictive models for heart disease diagnosis.
"""
print(closing_summary)
