# Comprehensive Exploratory Data Analysis (EDA)

This notebook provides a comprehensive exploratory data analysis of the Cardiovascular Disease Dataset, including:
- Data Quality Assessment
- Missing Value Analysis
- Data Profiling
- Advanced Visualizations
- Pattern Recognition
- Outlier Analysis
- Data Distribution Analysis
- Feature Engineering Insights
- Summary and Recommendations


In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import zscore
import warnings
warnings.filterwarnings('ignore')

# Plot appearance: seaborn whitegrid theme plus default figure and font size
sns.set_style("whitegrid")
plt.rcParams.update({
    'figure.figsize': (14, 8),
    'font.size': 10,
})

# Read the dataset from its relative path into the notebook-wide frame `df`
df = pd.read_csv('../../data/Cardiovascular_Disease_Dataset.csv')

# Banner, then shape and column names of the loaded frame
print("=" * 80, "COMPREHENSIVE EXPLORATORY DATA ANALYSIS", "=" * 80, sep="\n")
print("\nDataset loaded successfully!")
print(f"Shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")
print("\nFirst few rows:")

# Last expression of the cell, so the notebook renders the preview as a table
df.head()


## 1. Data Quality Assessment


In [None]:
# Data quality overview: size, dtypes, missing values, duplicates, cardinality
print("=" * 80, "DATA QUALITY ASSESSMENT", "=" * 80, sep="\n")

# 1. Size of the frame and its in-memory footprint (deep=True also counts
#    the contents of object columns)
memory_mb = df.memory_usage(deep=True).sum() / 1024**2
print("\n1. Dataset Dimensions:")
print(f"   Rows: {df.shape[0]:,}")
print(f"   Columns: {df.shape[1]}")
print(f"   Memory usage: {memory_mb:.2f} MB")

# 2. Column dtypes as pandas inferred them
print("\n2. Data Types:")
print(df.dtypes)

# 3. Per-column null counts and their share of all rows; only columns that
#    actually contain nulls are reported
print("\n3. Missing Values Analysis:")
missing_data = df.isnull().sum()
missing_percent = missing_data / len(df) * 100
missing_df = pd.DataFrame({
    'Missing Count': missing_data,
    'Missing Percentage': missing_percent
})
missing_df = missing_df.loc[missing_df['Missing Count'] > 0]
if not missing_df.empty:
    print(missing_df)
else:
    print("   ✓ No missing values found in the dataset!")

# 4. Rows that repeat an earlier row across every column
print("\n4. Duplicate Records:")
duplicates = df.duplicated().sum()
print(f"   Duplicate rows: {duplicates}")
if duplicates > 0:
    print(f"   Percentage: {(duplicates/len(df))*100:.2f}%")
else:
    print("   ✓ No duplicate records found!")

# 5. Number of distinct non-null values in each column
print("\n5. Unique Values per Column:")
for col, unique_count in df.nunique().items():
    print(f"   {col}: {unique_count} unique values")


In [None]:
# Data validity checks: sign/zero checks for the numeric columns, the category
# codes actually present, and min/max ranges for a plausibility review.
print("=" * 80)
print("DATA VALIDITY CHECKS")
print("=" * 80)

# Check for invalid values in numerical columns
numerical_cols = ['age', 'restingBP', 'serumcholestrol', 'maxheartrate', 'oldpeak']
print("\n1. Numerical Variables Validity:")
for col in numerical_cols:
    if col not in df.columns:
        continue
    if col == 'oldpeak':
        # oldpeak is exempt from the sign check (negative and zero readings are
        # treated as acceptable here), so only missing entries are counted, and
        # the message says so instead of reusing the negative/zero wording.
        invalid_count = df[col].isna().sum()
        print(f"   {col}: {invalid_count} invalid values (missing)")
    else:
        # The report promises "negative/zero", so zeros must be counted too;
        # a strict `< 0` comparison would silently let zero readings through.
        invalid_count = (df[col] <= 0).sum()
        print(f"   {col}: {invalid_count} invalid values (negative/zero where not expected)")

# Check for invalid values in categorical columns by listing the codes present
categorical_cols = ['gender', 'chestpain', 'fastingbloodsugar', 'restingrelectro',
                    'exerciseangia', 'slope', 'noofmajorvessels', 'target']
print("\n2. Categorical Variables Validity:")
for col in categorical_cols:
    if col in df.columns:
        unique_vals = sorted(df[col].unique())
        print(f"   {col}: {unique_vals}")

# Min/max of each numeric column so unrealistic extremes can be spotted by eye
print("\n3. Unrealistic Value Checks:")
print(f"   Age range: {df['age'].min()} - {df['age'].max()} years")
print(f"   Resting BP range: {df['restingBP'].min()} - {df['restingBP'].max()} mmHg")
print(f"   Serum cholesterol range: {df['serumcholestrol'].min()} - {df['serumcholestrol'].max()} mg/dl")
print(f"   Max heart rate range: {df['maxheartrate'].min()} - {df['maxheartrate'].max()} bpm")
print(f"   Oldpeak range: {df['oldpeak'].min():.2f} - {df['oldpeak'].max():.2f}")


## 2. Data Profiling and Summary Statistics


In [None]:
# Summary statistics: describe() for the numeric columns, then counts and
# percentage shares for every categorical column
print("=" * 80, "COMPREHENSIVE SUMMARY STATISTICS", "=" * 80, sep="\n")

numerical_cols = ['age', 'restingBP', 'serumcholestrol', 'maxheartrate', 'oldpeak']
print("\nNumerical Variables Summary:")
# Transposed so each variable is a row and each statistic a column
print(df[numerical_cols].describe().T)

print("\n\nCategorical Variables Summary:")
categorical_cols = ['gender', 'chestpain', 'fastingbloodsugar', 'restingrelectro', 
                    'exerciseangia', 'slope', 'noofmajorvessels', 'target']
for col in categorical_cols:
    if col not in df.columns:
        continue
    # Counts ordered by category code rather than by frequency
    value_counts = df[col].value_counts().sort_index()
    print(f"\n{col.upper()}:")
    print(value_counts)
    print("Percentage distribution:")
    # Shares are relative to all rows of the frame, not only non-null ones
    for val, share in (value_counts / len(df) * 100).items():
        print(f"  {val}: {share:.2f}%")


## 3. Target Variable Analysis


In [None]:
# Target variable analysis: class counts, class shares, and a balance verdict
print("=" * 80)
print("TARGET VARIABLE ANALYSIS")
print("=" * 80)

# value_counts() orders its result by frequency, not by class label. The bar
# and pie charts below attach hard-coded labels positionally, so the order is
# pinned to class 0 then class 1; otherwise the labels are swapped whenever
# class 1 is the majority. fill_value keeps a class that never occurs at zero
# instead of raising on lookup.
class_order = [0, 1]
class_labels = ['No Disease (0)', 'Disease (1)']
target_counts = df['target'].value_counts().reindex(class_order, fill_value=0)
target_percent = df['target'].value_counts(normalize=True).reindex(class_order, fill_value=0.0) * 100

print(f"\nTarget Distribution:")
print(f"  No Disease (0): {target_counts[0]} ({target_percent[0]:.2f}%)")
print(f"  Disease (1): {target_counts[1]} ({target_percent[1]:.2f}%)")
print(f"  Balance ratio: {min(target_counts)/max(target_counts):.3f}")

# Visualize target distribution
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Bar chart of absolute counts, each bar annotated with its count
axes[0].bar(class_labels, target_counts.values,
            color=['skyblue', 'coral'], alpha=0.8, edgecolor='black')
axes[0].set_title('Target Variable Distribution', fontsize=14, fontweight='bold')
axes[0].set_ylabel('Count', fontsize=12)
axes[0].set_xlabel('Target', fontsize=12)
for i, v in enumerate(target_counts.values):
    axes[0].text(i, v + 10, str(v), ha='center', fontsize=12, fontweight='bold')
axes[0].grid(True, alpha=0.3, axis='y')

# Pie chart of the same counts, shown as percentages
axes[1].pie(target_counts.values, labels=class_labels,
            autopct='%1.2f%%', colors=['skyblue', 'coral'], startangle=90)
axes[1].set_title('Target Variable Distribution (Percentage)', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()

# "Balanced" here means the two class shares differ by fewer than 10 percentage points
print(f"\n✓ Dataset is {'balanced' if abs(target_percent[0] - target_percent[1]) < 10 else 'imbalanced'}")


## 4. Distribution Analysis


In [None]:
# Distribution analysis: one figure row per numeric variable, with a
# histogram + density curve on the left and a box plot on the right
numerical_cols = ['age', 'restingBP', 'serumcholestrol', 'maxheartrate', 'oldpeak']

n_vars = len(numerical_cols)
fig, axes = plt.subplots(n_vars, 2, figsize=(16, 4*n_vars))

for idx, col in enumerate(numerical_cols):
    if col not in df.columns:
        continue

    hist_ax, box_ax = axes[idx, 0], axes[idx, 1]
    values = df[col].dropna()
    col_mean = df[col].mean()
    col_median = df[col].median()

    # Left panel: density-normalised histogram, density curve, mean/median markers
    hist_ax.hist(values, bins=30, density=True, alpha=0.7,
                 color='steelblue', edgecolor='black')
    values.plot.density(ax=hist_ax, color='red', linewidth=2)
    hist_ax.axvline(col_mean, color='green', linestyle='--',
                    label=f'Mean: {col_mean:.2f}', linewidth=2)
    hist_ax.axvline(col_median, color='orange', linestyle='--',
                    label=f'Median: {col_median:.2f}', linewidth=2)
    hist_ax.set_title(f'Distribution of {col}', fontsize=12, fontweight='bold')
    hist_ax.set_xlabel(col, fontsize=10)
    hist_ax.set_ylabel('Density', fontsize=10)
    hist_ax.legend()
    hist_ax.grid(True, alpha=0.3)

    # Right panel: filled box plot
    bp = box_ax.boxplot(values, vert=True, patch_artist=True)
    bp['boxes'][0].set_facecolor('lightblue')
    box_ax.set_title(f'Box Plot of {col}', fontsize=12, fontweight='bold')
    box_ax.set_ylabel(col, fontsize=10)
    box_ax.grid(True, alpha=0.3, axis='y')

    # Statistics box placed to the right of the box, at the median's height
    stats_text = "\n".join([
        f"Mean: {col_mean:.2f}",
        f"Median: {col_median:.2f}",
        f"Std: {df[col].std():.2f}",
        f"Skew: {df[col].skew():.2f}",
    ])
    box_ax.text(1.1, col_median, stats_text,
                verticalalignment='center', fontsize=9,
                bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

plt.tight_layout()
plt.show()


## 5. Outlier Analysis


In [None]:
# Outlier analysis with two rules: Tukey fences (1.5 * IQR) and |z| > 3
print("=" * 80, "OUTLIER ANALYSIS", "=" * 80, sep="\n")

numerical_cols = ['age', 'restingBP', 'serumcholestrol', 'maxheartrate', 'oldpeak']

# IQR method: a value is an outlier when it lies outside
# [Q1 - 1.5*IQR, Q3 + 1.5*IQR]
print("\n1. Outliers using IQR Method (Q1 - 1.5*IQR, Q3 + 1.5*IQR):")
outlier_summary = []
for col in numerical_cols:
    if col not in df.columns:
        continue

    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Null entries compare False on both sides, so they never count as outliers
    below_fence = df[col] < lower_bound
    above_fence = df[col] > upper_bound
    outliers = df[below_fence | above_fence]
    outlier_count = len(outliers)
    outlier_percent = (outlier_count / len(df)) * 100

    outlier_summary.append({
        'Variable': col,
        'Lower Bound': lower_bound,
        'Upper Bound': upper_bound,
        'Outlier Count': outlier_count,
        'Outlier Percentage': outlier_percent
    })

    print(f"\n{col}:")
    print(f"  Q1: {Q1:.2f}, Q3: {Q3:.2f}, IQR: {IQR:.2f}")
    print(f"  Lower bound: {lower_bound:.2f}, Upper bound: {upper_bound:.2f}")
    print(f"  Outliers: {outlier_count} ({outlier_percent:.2f}%)")

# One table row per variable, in the order analysed
outlier_df = pd.DataFrame(outlier_summary)
print("\n\nOutlier Summary Table:")
print(outlier_df.to_string(index=False))

# Z-score method: share of non-null values more than 3 standard deviations
# away from the column mean
print("\n\n2. Outliers using Z-Score Method (|Z| > 3):")
for col in numerical_cols:
    if col not in df.columns:
        continue
    non_null = df[col].dropna()
    z_scores = np.abs(zscore(non_null))
    outliers_z = (z_scores > 3).sum()
    outlier_percent_z = (outliers_z / len(non_null)) * 100
    print(f"  {col}: {outliers_z} outliers ({outlier_percent_z:.2f}%)")


In [None]:
# Box plots of each numeric variable, annotated with its IQR-rule outlier count.
# NOTE: `numerical_cols` is not defined in this cell; it must already exist in
# the kernel from an earlier cell.
fig, axes = plt.subplots(2, 3, figsize=(18, 10))
axes = axes.flatten()

for ax, col in zip(axes, numerical_cols):
    if col not in df.columns:
        continue

    data = df[col].dropna()

    # Box plot with the fliers (outlier points) drawn
    bp = ax.boxplot(data, vert=True, patch_artist=True, showfliers=True)
    bp['boxes'][0].set_facecolor('lightblue')
    ax.set_title(f'Outliers in {col}', fontsize=12, fontweight='bold')
    ax.set_ylabel(col, fontsize=10)
    ax.grid(True, alpha=0.3, axis='y')

    # Count the points outside the 1.5 * IQR fences
    Q1 = data.quantile(0.25)
    Q3 = data.quantile(0.75)
    IQR = Q3 - Q1
    outside_fences = (data < Q1 - 1.5 * IQR) | (data > Q3 + 1.5 * IQR)
    outliers = data[outside_fences]

    # Label anchored at 95% of the current upper y-limit
    label_y = ax.get_ylim()[1]*0.95
    ax.text(1.05, label_y,
            f'Outliers: {len(outliers)}',
            verticalalignment='top', fontsize=9,
            bbox=dict(boxstyle='round', facecolor='yellow', alpha=0.5))

# Drop the first grid slot that has no variable assigned to it
if len(numerical_cols) < len(axes):
    fig.delaxes(axes[len(numerical_cols)])

plt.tight_layout()
plt.show()


## 6. Correlation Analysis


In [None]:
# Correlation analysis: pairwise correlations among the numeric variables and
# the target, then a ranking of the features by correlation strength with target.
# A dedicated name is used for the column list so the shared `numerical_cols`
# (features only) is not overwritten with a list that also contains 'target'.
corr_cols = ['age', 'restingBP', 'serumcholestrol', 'maxheartrate', 'oldpeak', 'target']
correlation_matrix = df[corr_cols].corr()

# Correlation heatmap, colour scale fixed to [-1, 1] and centred on 0
plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, fmt='.3f', cmap='coolwarm', center=0,
            square=True, linewidths=1, cbar_kws={"shrink": 0.8},
            vmin=-1, vmax=1, annot_kws={'size': 10})
plt.title('Correlation Matrix Heatmap', fontsize=16, fontweight='bold', pad=20)
plt.tight_layout()
plt.show()

# Correlation with target
print("=" * 80)
print("CORRELATION WITH TARGET VARIABLE")
print("=" * 80)
# Rank by magnitude so the listing matches its "sorted by absolute value"
# heading; a plain descending sort would bury strong negative correlations
# at the bottom of the list.
target_corr = correlation_matrix['target'].sort_values(key=abs, ascending=False)
print("\nCorrelation with Target (sorted by absolute value):")
for var, corr in target_corr.items():
    if var != 'target':
        print(f"  {var}: {corr:.4f}")

# Visualize correlations with target (the target's self-correlation is dropped)
plt.figure(figsize=(10, 6))
target_corr_sorted = target_corr.drop('target')
# Red bars for negative correlations, green for everything else
colors = ['red' if x < 0 else 'green' for x in target_corr_sorted.values]
plt.barh(target_corr_sorted.index, target_corr_sorted.values, color=colors, alpha=0.7)
plt.xlabel('Correlation Coefficient', fontsize=12)
plt.title('Correlation of Features with Target Variable', fontsize=14, fontweight='bold')
plt.axvline(x=0, color='black', linestyle='-', linewidth=0.8)
plt.grid(True, alpha=0.3, axis='x')
plt.tight_layout()
plt.show()


## 7. Relationship Analysis: Features vs Target


In [None]:
# Violin plots comparing each numeric feature between the two target classes
numerical_cols = ['age', 'restingBP', 'serumcholestrol', 'maxheartrate', 'oldpeak']

fig, axes = plt.subplots(2, 3, figsize=(18, 10))
axes = axes.flatten()

# Row subsets for the two classes, built once and reused for every feature
no_disease_rows = df[df['target'] == 0]
disease_rows = df[df['target'] == 1]
label_box = dict(boxstyle='round', facecolor='white', alpha=0.8)

for ax, col in zip(axes, numerical_cols):
    if col not in df.columns:
        continue

    # One violin per class at x = 0 and x = 1, with mean and median markers
    data_to_plot = [no_disease_rows[col].dropna(), disease_rows[col].dropna()]
    parts = ax.violinplot(data_to_plot, positions=[0, 1],
                          showmeans=True, showmedians=True)
    for body in parts['bodies']:
        body.set_facecolor('lightblue')
        body.set_alpha(0.7)

    ax.set_xticks([0, 1])
    ax.set_xticklabels(['No Disease', 'Disease'])
    ax.set_title(f'{col} by Target', fontsize=12, fontweight='bold')
    ax.set_ylabel(col, fontsize=10)
    ax.grid(True, alpha=0.3, axis='y')

    # Print each class mean just above its position inside the violin
    mean_0 = no_disease_rows[col].mean()
    mean_1 = disease_rows[col].mean()
    ax.text(0, mean_0, f'M={mean_0:.1f}', ha='center', va='bottom', bbox=label_box)
    ax.text(1, mean_1, f'M={mean_1:.1f}', ha='center', va='bottom', bbox=label_box)

# Drop the first grid slot that has no variable assigned to it
if len(numerical_cols) < len(axes):
    fig.delaxes(axes[len(numerical_cols)])

plt.tight_layout()
plt.show()


In [None]:
# Grouped bar charts: for every categorical feature, the percentage of each
# target class within each category level
categorical_vars = ['gender', 'chestpain', 'fastingbloodsugar', 'restingrelectro', 
                    'exerciseangia', 'slope', 'noofmajorvessels']

fig, axes = plt.subplots(3, 3, figsize=(18, 15))
axes = axes.flatten()

for ax, var in zip(axes, categorical_vars):
    if var not in df.columns:
        continue

    # normalize='index' makes every category level (row) sum to 100%
    crosstab = pd.crosstab(df[var], df['target'], normalize='index') * 100
    crosstab.plot(kind='bar', ax=ax, color=['skyblue', 'coral'], alpha=0.8)

    ax.set_title(f'{var} vs Target', fontsize=12, fontweight='bold')
    ax.set_xlabel(var, fontsize=10)
    ax.set_ylabel('Percentage', fontsize=10)
    ax.legend(['No Disease', 'Disease'], fontsize=9)
    ax.grid(True, alpha=0.3, axis='y')
    ax.tick_params(axis='x', rotation=45)

# Remove the grid slots left over after the last categorical variable
for spare_ax in axes[len(categorical_vars):]:
    fig.delaxes(spare_ax)

plt.tight_layout()
plt.show()


## 8. Advanced Visualizations


In [None]:
# Pair plot of the key numeric variables, coloured by target class
print("Creating pair plot (this may take a moment)...")

key_vars = ['age', 'restingBP', 'maxheartrate', 'oldpeak', 'target']

# Plot at most 500 rows; the fixed random_state makes the sample repeatable
sample_size = min(500, len(df))
sample_df = df[key_vars].sample(sample_size, random_state=42)

sns.pairplot(
    sample_df,
    hue='target',
    diag_kind='kde',
    palette='Set2',
    plot_kws={'alpha': 0.6, 's': 30},
    height=2.5,
)
# y slightly above 1 lifts the title clear of the top row of panels
plt.suptitle('Pair Plot of Key Variables by Target', y=1.02, fontsize=16, fontweight='bold')
plt.show()


In [None]:
# Per-feature comparison of class means (disease vs no disease), printed as a
# table and drawn as a horizontal bar chart of the percentage change
numerical_cols = ['age', 'restingBP', 'serumcholestrol', 'maxheartrate', 'oldpeak']

is_no_disease = df['target'] == 0
is_disease = df['target'] == 1

relationship_data = []
for col in numerical_cols:
    if col not in df.columns:
        continue
    no_disease_mean = df[is_no_disease][col].mean()
    disease_mean = df[is_disease][col].mean()
    difference = disease_mean - no_disease_mean
    # Change is expressed relative to the no-disease mean
    pct_change = (difference / no_disease_mean) * 100
    relationship_data.append({
        'Variable': col,
        'No Disease Mean': no_disease_mean,
        'Disease Mean': disease_mean,
        'Difference': difference,
        'Percentage Change': pct_change
    })

# Largest absolute difference in means first
relationship_df = pd.DataFrame(relationship_data)
relationship_df = relationship_df.sort_values('Difference', key=abs, ascending=False)

print("=" * 80, "FEATURE DIFFERENCES BY TARGET", "=" * 80, sep="\n")
print(relationship_df.to_string(index=False))

# Bar chart: red for a decrease in the disease group, green otherwise
pct_values = relationship_df['Percentage Change'].values
colors = ['red' if x < 0 else 'green' for x in pct_values]
y_pos = np.arange(len(relationship_df))

fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(y_pos, pct_values, color=colors, alpha=0.7)
ax.set_yticks(y_pos)
ax.set_yticklabels(relationship_df['Variable'])
ax.set_xlabel('Percentage Change (%)', fontsize=12)
ax.set_title('Percentage Change in Features: Disease vs No Disease', fontsize=14, fontweight='bold')
ax.axvline(x=0, color='black', linestyle='-', linewidth=0.8)
ax.grid(True, alpha=0.3, axis='x')
plt.tight_layout()
plt.show()


## 9. Feature Engineering Insights


In [None]:
# Feature engineering ideas: bin age, resting BP and max heart rate into
# labelled bands, add an ad-hoc risk score, and compare each against the target.
# The new columns are written onto `df` itself.
print("=" * 80)
print("FEATURE ENGINEERING INSIGHTS")
print("=" * 80)

# The last bin of every banding is open-ended (np.inf). pd.cut returns NaN for
# values outside the outermost edges, so with a finite top edge any larger value
# would silently disappear from the crosstabs even though the top labels
# ('70+', 'High', 'Very High') describe open-ended bands.

# Age groups
df['age_group'] = pd.cut(df['age'], bins=[0, 40, 50, 60, 70, np.inf],
                         labels=['<40', '40-50', '50-60', '60-70', '70+'])
print("\n1. Age Groups Distribution:")
age_group_target = pd.crosstab(df['age_group'], df['target'], normalize='index') * 100
print(age_group_target)
print("\nDisease rate by age group:")
# observed=False is stated explicitly so every age band is listed (empty ones
# included) without relying on the deprecated implicit default for
# categorical group keys
print((df.groupby('age_group', observed=False)['target'].mean() * 100).round(2))

# BP categories
df['bp_category'] = pd.cut(df['restingBP'],
                           bins=[0, 90, 120, 140, np.inf],
                           labels=['Low', 'Normal', 'Elevated', 'High'])
print("\n\n2. Blood Pressure Categories:")
bp_target = pd.crosstab(df['bp_category'], df['target'], normalize='index') * 100
print(bp_target)

# Heart rate zones
df['hr_zone'] = pd.cut(df['maxheartrate'],
                       bins=[0, 100, 130, 160, np.inf],
                       labels=['Low', 'Moderate', 'High', 'Very High'])
print("\n\n3. Heart Rate Zones:")
hr_target = pd.crosstab(df['hr_zone'], df['target'], normalize='index') * 100
print(hr_target)

# Risk score (simple combination): hand-picked weights, not fitted to the data;
# age, resting BP and oldpeak add to the score, max heart rate subtracts from it
df['risk_score'] = (df['age'] / 10) + (df['restingBP'] / 20) + (df['oldpeak'] * 2) - (df['maxheartrate'] / 10)
print("\n\n4. Risk Score Statistics:")
print(df.groupby('target')['risk_score'].describe())


## 10. Summary and Recommendations


In [None]:
# Final summary: key findings and recommendations. The headline numbers are
# computed from `df` so the text cannot drift away from the data.
print("=" * 80)
print("COMPREHENSIVE EDA SUMMARY AND RECOMMENDATIONS")
print("=" * 80)

# Columns created by the feature-engineering cell are helper columns rather
# than dataset features, so they are excluded from the counts when present.
engineered_cols = ['age_group', 'bp_category', 'hr_zone', 'risk_score']
source_cols = [c for c in df.columns if c not in engineered_cols]

n_records = len(df)
n_features = len(source_cols)
n_missing = int(df[source_cols].isnull().sum().sum())
n_duplicates = int(df[source_cols].duplicated().sum())

# Class shares in percent; .get() keeps a class that never occurs at 0
target_share = df['target'].value_counts(normalize=True)
share_0 = target_share.get(0, 0.0) * 100
share_1 = target_share.get(1, 0.0) * 100
# Same criterion as the target analysis cell: shares differ by < 10 points
balance_label = 'balanced' if abs(share_0 - share_1) < 10 else 'imbalanced'

# Sentences are prepared up front so the f-string below stays free of nested quoting
missing_note = ("No missing values detected" if n_missing == 0
                else f"{n_missing:,} missing values detected")
duplicate_note = ("No duplicate records found" if n_duplicates == 0
                  else f"{n_duplicates:,} duplicate records found")
quality_note = ("Data appears to be clean and ready for analysis"
                if n_missing == 0 and n_duplicates == 0
                else "Data needs cleaning before analysis")

# The text must be an f-string: in a plain string the {...} placeholders are
# printed literally instead of being evaluated.
print(f"""
## Key Findings:

### 1. Data Quality:
   - Dataset contains {n_records:,} records with {n_features} features
   - {missing_note}
   - {duplicate_note}
   - {quality_note}

### 2. Target Variable:
   - Target distribution: {share_0:.2f}% no disease (0) / {share_1:.2f}% disease (1)
   - Dataset is {balance_label}
   - This affects model selection and evaluation strategies

### 3. Key Features:
   - Age, restingBP, maxheartrate, and oldpeak show significant relationships with target
   - Categorical features like chestpain, exerciseangia, and slope are important predictors
   - Correlation analysis reveals which features are most predictive

### 4. Outliers:
   - Some outliers detected in numerical variables
   - Consider outlier treatment strategies based on domain knowledge
   - Evaluate impact of outliers on model performance

### 5. Feature Engineering Opportunities:
   - Age groups can capture non-linear relationships
   - Blood pressure and heart rate categories may improve model performance
   - Risk score combinations could be valuable
   - Interaction terms between features might be beneficial

## Recommendations:

### 1. Preprocessing:
   - Scale numerical features for distance-based algorithms
   - Encode categorical variables appropriately
   - Consider outlier treatment if they represent errors

### 2. Feature Selection:
   - Focus on features with high correlation with target
   - Consider feature importance from tree-based models
   - Evaluate feature interactions

### 3. Model Selection:
   - Try tree-based models (Random Forest, XGBoost, LightGBM) for non-linear relationships
   - Use logistic regression as baseline
   - Consider ensemble methods for better performance

### 4. Evaluation:
   - Use appropriate metrics based on class distribution
   - Consider cross-validation for robust evaluation
   - Monitor for overfitting

### 5. Next Steps:
   - Proceed with machine learning model development
   - Implement feature engineering based on insights
   - Perform hyperparameter tuning
   - Evaluate model interpretability
""")

# Create a summary dataframe with the same headline numbers in tabular form
summary_data = {
    'Metric': ['Total Records', 'Total Features', 'Missing Values', 'Duplicate Records',
               'Target Balance', 'Key Numerical Features', 'Key Categorical Features'],
    'Value': [n_records, n_features, n_missing, n_duplicates,
              f"{share_0:.1f}% / {share_1:.1f}%",
              'age, restingBP, maxheartrate, oldpeak', 'chestpain, exerciseangia, slope']
}

summary_df = pd.DataFrame(summary_data)
print("\n\nDataset Summary:")
print(summary_df.to_string(index=False))
