# Heart Disease Dataset - Univariate, Bivariate, and Multivariate Analysis

This notebook performs an exploratory analysis of the heart disease dataset in three stages:
- **Univariate Analysis**: Analysis of individual variables
- **Bivariate Analysis**: Analysis of relationships between two variables
- **Multivariate Analysis**: Analysis of relationships among three or more variables considered together


In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import normaltest, shapiro
import warnings
# Suppress every library warning so the cell outputs below stay readable.
warnings.filterwarnings('ignore')

# Plot defaults shared by all figures: seaborn grid theme and a wide canvas.
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})

# Read the CSV into `df`, the frame every later cell works on.
data_path = '../../data/heart-disease.csv'
df = pd.read_csv(data_path)

# Quick sanity report: dimensions and column names.
frame_shape = df.shape
column_names = list(df.columns)
print("Dataset loaded successfully!")
print(f"Shape: {frame_shape}")
print(f"\nColumns: {column_names}")

# Trailing expression: shown as the cell's rich output.
df.head()


## 1. Univariate Analysis

Univariate analysis examines each variable individually to understand its distribution, central tendencies, and variability.


In [None]:
# Column groups reused by the later cells of this notebook.
numerical_cols = ['age', 'rest_bp', 'chol', 'max_hr', 'st_depr']
categorical_cols = ['sex', 'chest_pain', 'heart_disease']

print("="*60)
print("UNIVARIATE ANALYSIS - NUMERICAL VARIABLES")
print("="*60)

# Standard pandas summary (count, mean, std, min, quartiles, max).
print("\nDescriptive Statistics:")
print(df[numerical_cols].describe())

# Per-column report of statistics that describe() does not include.
print("\n\nAdditional Statistics:")
for col in numerical_cols:
    series = df[col]

    # mode() can return several values; report the first one, or 'N/A' if
    # it returns nothing.
    modes = series.mode()
    mode_text = modes.values[0] if len(modes) > 0 else 'N/A'

    lower_quartile = series.quantile(0.25)
    upper_quartile = series.quantile(0.75)

    print(f"\n{col.upper()}:")
    print(f"  Mean: {series.mean():.2f}")
    print(f"  Median: {series.median():.2f}")
    print(f"  Mode: {mode_text}")
    print(f"  Std Dev: {series.std():.2f}")
    print(f"  Variance: {series.var():.2f}")
    print(f"  Skewness: {series.skew():.2f}")
    print(f"  Kurtosis: {series.kurtosis():.2f}")
    print(f"  Range: [{series.min():.2f}, {series.max():.2f}]")
    print(f"  IQR: {upper_quartile - lower_quartile:.2f}")


In [None]:
# Histograms and density plots for numerical variables.
#
# The grid is sized from numerical_cols (three panels per row) rather than a
# fixed 2x3 layout, so adding or removing a column neither raises an
# IndexError nor deletes a panel that is in use. With the current five
# columns this is still a 2x3 grid on an 18x10 canvas.
n_panels = len(numerical_cols)
n_grid_cols = 3
n_grid_rows = max(1, int(np.ceil(n_panels / n_grid_cols)))
fig, axes = plt.subplots(n_grid_rows, n_grid_cols,
                         figsize=(18, 5 * n_grid_rows), squeeze=False)
axes = axes.ravel()

for idx, col in enumerate(numerical_cols):
    ax = axes[idx]

    # Plot observed values only; this is a no-op when the column has no
    # missing entries, and pandas' mean/median skip NaN anyway.
    values = df[col].dropna()
    mean_val = values.mean()
    median_val = values.median()

    # Histogram (normalised so it shares the y-axis with the density curve)
    ax.hist(values, bins=30, alpha=0.7, edgecolor='black', density=True, label='Histogram')
    # Density curve
    values.plot.density(ax=ax, color='red', linewidth=2, label='Density')
    # Mean and median lines
    ax.axvline(mean_val, color='green', linestyle='--', linewidth=2, label=f'Mean: {mean_val:.2f}')
    ax.axvline(median_val, color='blue', linestyle='--', linewidth=2, label=f'Median: {median_val:.2f}')
    ax.set_title(f'Distribution of {col}', fontsize=12, fontweight='bold')
    ax.set_xlabel(col)
    ax.set_ylabel('Density')
    ax.legend()
    ax.grid(True, alpha=0.3)

# Remove every panel that did not receive a variable.
for unused_ax in axes[n_panels:]:
    fig.delaxes(unused_ax)

plt.suptitle('Univariate Analysis: Numerical Variable Distributions', fontsize=16, fontweight='bold', y=1.02)
plt.tight_layout()
plt.show()


In [None]:
# Box plots for numerical variables, one panel per column.
#
# The panel count follows numerical_cols instead of a hard-coded 5 (the
# canvas stays 20x5 for the current five columns). squeeze=False keeps
# `axes` an array even when there is a single column.
n_panels = max(1, len(numerical_cols))
fig, axes = plt.subplots(1, n_panels, figsize=(4 * n_panels, 5), squeeze=False)
axes = axes.ravel()

for idx, col in enumerate(numerical_cols):
    ax = axes[idx]

    # Use observed values only so the drawn box and the outlier count below
    # describe the same data (pandas' quantile already skips NaN).
    values = df[col].dropna()

    # Vertical orientation is the boxplot default, so it is not passed.
    box_plot = ax.boxplot(values, patch_artist=True,
                          showmeans=True, meanline=True)
    ax.set_title(f'Box Plot: {col}', fontsize=12, fontweight='bold')
    ax.set_ylabel(col)
    ax.grid(True, alpha=0.3, axis='y')

    # Color the boxes
    for patch in box_plot['boxes']:
        patch.set_facecolor('lightblue')
        patch.set_alpha(0.7)

    # Outlier count via the 1.5 * IQR rule; the quartiles are computed once
    # rather than being re-evaluated inside the filter expression.
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_fence = q1 - 1.5 * iqr
    upper_fence = q3 + 1.5 * iqr
    outliers = int(((values < lower_fence) | (values > upper_fence)).sum())
    ax.text(0.5, 0.95, f'Outliers: {outliers}', transform=ax.transAxes,
            ha='center', va='top', bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

plt.suptitle('Univariate Analysis: Box Plots for Numerical Variables', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()


In [None]:
# Frequency bar chart and printed counts for each categorical variable.
print("="*60)
print("UNIVARIATE ANALYSIS - CATEGORICAL VARIABLES")
print("="*60)

fig, axes = plt.subplots(1, 3, figsize=(18, 5))

for idx, col in enumerate(categorical_cols):
    ax = axes[idx]

    # Counts per category, ordered by category value rather than frequency.
    value_counts = df[col].value_counts().sort_index()
    categories = value_counts.index
    counts = value_counts.values
    positions = range(len(value_counts))

    ax.bar(positions, counts, alpha=0.7, edgecolor='black')
    ax.set_xticks(positions)
    ax.set_xticklabels([str(category) for category in categories], rotation=0)
    ax.set_title(f'Distribution of {col}', fontsize=12, fontweight='bold')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')
    ax.grid(True, alpha=0.3, axis='y')

    # Write each count just above its bar.
    for position, count in zip(positions, counts):
        ax.text(position, count, str(count), ha='center', va='bottom', fontweight='bold')

    # Text report; proportions are percentages of all rows in df.
    shares = (counts / len(df) * 100).round(2)
    print(f"\n{col.upper()}:")
    print(f"  Categories: {categories.tolist()}")
    print(f"  Counts: {counts.tolist()}")
    print(f"  Proportions: {shares.tolist()}%")

plt.suptitle('Univariate Analysis: Categorical Variable Distributions', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()


## 2. Bivariate Analysis

Bivariate analysis examines the relationship between two variables to identify patterns, correlations, and associations.


In [None]:
# Numerical vs Target (Heart Disease): violin plot, two-sample t-test and
# per-group mean/std for every numerical column.
print("="*60)
print("BIVARIATE ANALYSIS: NUMERICAL VARIABLES vs HEART DISEASE")
print("="*60)

# Grid sized from numerical_cols (three panels per row) so the layout and
# the unused-panel cleanup stay correct if the column list changes. With
# five columns this is the same 2x3 grid on an 18x10 canvas.
n_panels = len(numerical_cols)
n_grid_cols = 3
n_grid_rows = max(1, int(np.ceil(n_panels / n_grid_cols)))
fig, axes = plt.subplots(n_grid_rows, n_grid_cols,
                         figsize=(18, 5 * n_grid_rows), squeeze=False)
axes = axes.ravel()

# Row masks for the two outcome groups, built once instead of once per use.
no_hd_mask = df['heart_disease'] == 0
hd_mask = df['heart_disease'] == 1

for idx, col in enumerate(numerical_cols):
    ax = axes[idx]

    # Observed values per group. Dropping NaN keeps the violin, the t-test
    # and the printed mean/std (which skip NaN in pandas) on the same data.
    no_hd_values = df.loc[no_hd_mask, col].dropna()
    hd_values = df.loc[hd_mask, col].dropna()

    # Violin plots
    parts = ax.violinplot([no_hd_values, hd_values], positions=[0, 1],
                          showmeans=True, showmedians=True)

    # Customize violins
    for pc in parts['bodies']:
        pc.set_facecolor('lightblue')
        pc.set_alpha(0.7)

    ax.set_xticks([0, 1])
    ax.set_xticklabels(['No Heart Disease', 'Heart Disease'])
    ax.set_title(f'{col} vs Heart Disease', fontsize=12, fontweight='bold')
    ax.set_ylabel(col)
    ax.grid(True, alpha=0.3, axis='y')

    # Two-sample t-test through the `stats` module imported in the first
    # cell (no import inside the loop). Called with scipy's defaults, as
    # before.
    # NOTE(review): scipy's default is the equal-variance (Student) test;
    # consider equal_var=False (Welch) if group variances differ.
    p_value = stats.ttest_ind(no_hd_values, hd_values).pvalue
    ax.text(0.5, 0.95, f'p-value: {p_value:.4f}', transform=ax.transAxes,
            ha='center', va='top', bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

    print(f"\n{col}:")
    print(f"  No Heart Disease - Mean: {no_hd_values.mean():.2f}, Std: {no_hd_values.std():.2f}")
    print(f"  Heart Disease - Mean: {hd_values.mean():.2f}, Std: {hd_values.std():.2f}")
    print(f"  T-test p-value: {p_value:.4f}")

# Remove every panel that did not receive a variable.
for unused_ax in axes[n_panels:]:
    fig.delaxes(unused_ax)

plt.suptitle('Bivariate Analysis: Numerical Variables vs Heart Disease', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()


In [None]:
# Categorical vs Target: row-percentage bar chart and chi-square test of
# independence for each categorical predictor against heart_disease.
print("="*60)
print("BIVARIATE ANALYSIS: CATEGORICAL VARIABLES vs HEART DISEASE")
print("="*60)

# (column, panel title / report heading, x-axis label). Adding a predictor
# here adds a panel and a test; the two previously duplicated code paths
# are now one loop.
categorical_panels = [
    ('sex', 'Sex vs Heart Disease', 'Sex'),
    ('chest_pain', 'Chest Pain vs Heart Disease', 'Chest Pain Type'),
]

fig, axes = plt.subplots(1, len(categorical_panels), figsize=(15, 5), squeeze=False)
axes = axes.ravel()

for ax, (col, title, xlabel) in zip(axes, categorical_panels):
    # Percentage of each outcome within every category (rows sum to 100).
    row_percentages = pd.crosstab(df[col], df['heart_disease'], normalize='index') * 100
    row_percentages.plot(kind='bar', ax=ax, alpha=0.7, edgecolor='black',
                         color=['#FF6B6B', '#4ECDC4'])
    ax.set_title(title, fontsize=12, fontweight='bold')
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Percentage')
    # Legend entries are positional: they follow the crosstab column order.
    ax.legend(['No Heart Disease', 'Heart Disease'])
    ax.grid(True, alpha=0.3, axis='y')
    ax.tick_params(axis='x', rotation=0)

    # Chi-square test on the raw counts, through the `stats` module
    # imported in the first cell. Called with scipy's defaults, as before.
    # NOTE(review): by default scipy applies Yates' continuity correction
    # to tables with one degree of freedom (e.g. a 2x2 table).
    contingency = pd.crosstab(df[col], df['heart_disease'])
    test_result = stats.chi2_contingency(contingency)
    chi2_stat, p_value = test_result[0], test_result[1]
    print(f"\n{title}:")
    print(f"  Chi-square statistic: {chi2_stat:.4f}")
    print(f"  P-value: {p_value:.4f}")
    print(f"  Significant: {'Yes' if p_value < 0.05 else 'No'}")

plt.suptitle('Bivariate Analysis: Categorical Variables vs Heart Disease', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()


In [None]:
# Pairwise correlations among the numerical columns, drawn as an annotated
# heatmap and then printed as a table.
print("="*60)
print("BIVARIATE ANALYSIS: CORRELATION MATRIX")
print("="*60)

correlation_matrix = df[numerical_cols].corr()

# Diverging palette centred on zero so the sign of a correlation is visible.
heatmap_options = dict(
    annot=True,
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
    cbar_kws={"shrink": 0.8},
    fmt='.2f',
    annot_kws={'size': 10},
)

plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Correlation Matrix - Numerical Variables', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

print("\nCorrelation Matrix:")
print(correlation_matrix)


In [None]:
# Scatter plots for selected variable pairs, coloured by heart disease status.
print("="*60)
print("BIVARIATE ANALYSIS: PAIRWISE SCATTER PLOTS")
print("="*60)

# (x column, y column) pairs to draw, one panel each.
key_pairs = [('age', 'chol'), ('age', 'max_hr'), ('chol', 'max_hr'),
             ('rest_bp', 'st_depr'), ('age', 'rest_bp')]

# (target value, legend label, marker colour), drawn in this order.
group_styles = [
    (0, 'No Heart Disease', '#FF6B6B'),
    (1, 'Heart Disease', '#4ECDC4'),
]

fig, axes = plt.subplots(2, 3, figsize=(18, 10))
axes = axes.ravel()

for idx, (x_col, y_col) in enumerate(key_pairs):
    ax = axes[idx]

    for status, group_label, colour in group_styles:
        group_rows = df[df['heart_disease'] == status]
        ax.scatter(group_rows[x_col], group_rows[y_col],
                   alpha=0.6, label=group_label, color=colour, s=50)

    ax.set_xlabel(x_col)
    ax.set_ylabel(y_col)
    ax.set_title(f'{x_col} vs {y_col}', fontsize=12, fontweight='bold')
    ax.legend()
    ax.grid(True, alpha=0.3)

    # Correlation over all rows (both groups together), shown in the corner.
    corr = df[x_col].corr(df[y_col])
    ax.text(0.05, 0.95, f'r = {corr:.3f}', transform=ax.transAxes,
            ha='left', va='top', bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

# Five pairs on a 2x3 grid: drop the sixth, unused panel.
fig.delaxes(axes[5])
plt.suptitle('Bivariate Analysis: Pairwise Scatter Plots', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()


## 3. Multivariate Analysis

Multivariate analysis examines relationships between multiple variables simultaneously to understand complex patterns and interactions.


In [None]:
# Correlation heatmap over the numerical columns plus the target, followed
# by a printed list of each column's correlation with the target.
print("="*60)
print("MULTIVARIATE ANALYSIS: COMPREHENSIVE CORRELATION HEATMAP")
print("="*60)

columns_with_target = numerical_cols + ['heart_disease']
correlation_matrix_full = df[columns_with_target].corr()

full_heatmap_options = dict(
    annot=True,
    cmap='RdYlBu_r',
    center=0,
    square=True,
    linewidths=2,
    cbar_kws={"shrink": 0.8},
    fmt='.2f',
    annot_kws={'size': 11},
)

plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix_full, **full_heatmap_options)
plt.title('Multivariate Correlation Heatmap (Including Target)', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

print("\nCorrelations with Heart Disease:")
target = df['heart_disease']
for col in numerical_cols:
    corr = df[col].corr(target)
    print(f"  {col}: {corr:.4f}")


In [None]:
# Scatter-matrix of the numerical columns with KDE diagonals, split by
# heart disease status.
print("="*60)
print("MULTIVARIATE ANALYSIS: PAIRPLOT")
print("="*60)

pairplot_frame = df[numerical_cols + ['heart_disease']]

# Two marker shapes are supplied, one per hue level.
pair_grid = sns.pairplot(
    pairplot_frame,
    hue='heart_disease',
    diag_kind='kde',
    markers=['o', 's'],
    palette='Set2',
    plot_kws={'alpha': 0.6, 's': 30},
)

# y > 1 places the title above the top row of panels.
plt.suptitle('Multivariate Pairwise Relationships', fontsize=14, fontweight='bold', y=1.02)
plt.tight_layout()
plt.show()


In [None]:
# Four scatter panels on a 2x2 grid. The first panel splits the points by
# heart disease status and sex; the other three split by status only.
print("="*60)
print("MULTIVARIATE ANALYSIS: FACETED PLOTS")
print("="*60)

fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# Panel [0, 0]: age vs cholesterol, one series per (status, sex) combination.
age_chol_ax = axes[0, 0]
sex_values = df['sex'].unique()
for hd_status in [0, 1]:
    for sex in sex_values:
        in_group = (df['heart_disease'] == hd_status) & (df['sex'] == sex)
        subset = df[in_group]
        age_chol_ax.scatter(subset['age'], subset['chol'], alpha=0.6,
                            label=f'HD={hd_status}, {sex}', s=50)
age_chol_ax.set_xlabel('Age')
age_chol_ax.set_ylabel('Cholesterol')
age_chol_ax.set_title('Age vs Cholesterol by Heart Disease and Sex', fontweight='bold')
age_chol_ax.legend(fontsize=8)
age_chol_ax.grid(True, alpha=0.3)

# Remaining panels share one recipe:
# (axes, x column, y column, x label, y label, title)
status_only_panels = [
    (axes[0, 1], 'age', 'max_hr', 'Age', 'Max Heart Rate',
     'Age vs Max Heart Rate by Heart Disease'),
    (axes[1, 0], 'chol', 'max_hr', 'Cholesterol', 'Max Heart Rate',
     'Cholesterol vs Max Heart Rate by Heart Disease'),
    (axes[1, 1], 'rest_bp', 'st_depr', 'Resting Blood Pressure', 'ST Depression',
     'Rest BP vs ST Depression by Heart Disease'),
]

for panel_ax, x_name, y_name, x_label, y_label, panel_title in status_only_panels:
    for hd_status in [0, 1]:
        subset = df[df['heart_disease'] == hd_status]
        panel_ax.scatter(subset[x_name], subset[y_name], alpha=0.6,
                         label=f'HD={hd_status}', s=50)
    panel_ax.set_xlabel(x_label)
    panel_ax.set_ylabel(y_label)
    panel_ax.set_title(panel_title, fontweight='bold')
    panel_ax.legend()
    panel_ax.grid(True, alpha=0.3)

plt.suptitle('Multivariate Analysis: Faceted Scatter Plots', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()


In [None]:
# Summary tables of the numerical columns, grouped by combinations of the
# categorical columns.
print("="*60)
print("MULTIVARIATE ANALYSIS: GROUPED STATISTICS")
print("="*60)

# Aggregations applied to the two-key groupings below.
summary_funcs = ['mean', 'std', 'median']

print("\n1. Statistics by Sex and Heart Disease:")
by_sex_and_target = df.groupby(['sex', 'heart_disease'])
grouped_stats1 = by_sex_and_target[numerical_cols].agg(summary_funcs)
print(grouped_stats1)

print("\n\n2. Statistics by Chest Pain and Heart Disease:")
by_pain_and_target = df.groupby(['chest_pain', 'heart_disease'])
grouped_stats2 = by_pain_and_target[numerical_cols].agg(summary_funcs)
print(grouped_stats2)

# The three-key grouping reports means only.
print("\n\n3. Statistics by Sex, Chest Pain, and Heart Disease:")
by_all_three = df.groupby(['sex', 'chest_pain', 'heart_disease'])
grouped_stats3 = by_all_three[numerical_cols].mean()
print(grouped_stats3)


In [None]:
# Closing recap: a fixed description of each analysis stage, then the
# numerical columns ranked by the strength of their correlation with the target.
print("="*60)
print("SUMMARY OF KEY FINDINGS")
print("="*60)

# (heading, bullet lines) per stage, printed in order.
summary_sections = [
    ("\n1. UNIVARIATE ANALYSIS:", [
        "   - Examined distributions, central tendencies, and variability of each variable",
        "   - Identified outliers and skewness in numerical variables",
        "   - Analyzed frequency distributions of categorical variables",
    ]),
    ("\n2. BIVARIATE ANALYSIS:", [
        "   - Compared numerical variables between heart disease groups",
        "   - Tested associations between categorical variables and heart disease",
        "   - Examined correlations between numerical variables",
    ]),
    ("\n3. MULTIVARIATE ANALYSIS:", [
        "   - Analyzed complex relationships between multiple variables",
        "   - Identified patterns across different variable combinations",
        "   - Examined grouped statistics by multiple categorical variables",
    ]),
]

for heading, bullets in summary_sections:
    print(heading)
    for bullet in bullets:
        print(bullet)

print("\n4. KEY INSIGHTS:")
# Absolute correlations with the target, strongest first (the sign is dropped).
corr_with_target = (
    df[numerical_cols]
    .corrwith(df['heart_disease'])
    .abs()
    .sort_values(ascending=False)
)
print("   - Strongest correlations with heart disease:")
for col, corr in corr_with_target.items():
    print(f"     * {col}: {corr:.4f}")

print("\nAnalysis complete!")
