# Heart Disease Dataset - Statistical Analysis

## Descriptive, Inferential, and Exploratory Statistical Analysis

This notebook contains a comprehensive statistical analysis of the heart disease dataset, including:
- Descriptive Statistics
- Inferential Statistics
- Exploratory Data Analysis


In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import ttest_ind, chi2_contingency, pearsonr, normaltest, shapiro
import warnings
warnings.filterwarnings('ignore')

# Plot appearance: seaborn grid theme and a wide default figure size
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})

# Read the CSV; the relative path is resolved against the kernel's working directory
DATA_PATH = '../../data/heart-disease.csv'
df = pd.read_csv(DATA_PATH)
frame_shape = df.shape
print("Dataset loaded successfully!")
print(f"Shape: {frame_shape}")
# Last expression of the cell: preview of the first rows
df.head()


## 1. Descriptive Statistics


In [None]:
# Structural overview of the dataset: size, column inventory, dtypes,
# and two data-quality counts (missing values and duplicated rows)
banner = "=" * 60
n_records, n_features = df.shape

print(banner)
print("DATASET INFORMATION")
print(banner)
print(f"Total number of records: {n_records}")
print(f"Total number of features: {n_features}")
print(f"\nColumn names: {df.columns.tolist()}")
print(f"\nData types:\n{df.dtypes}")

# Per-column count of null entries
missing_per_column = df.isnull().sum()
print(f"\nMissing values:\n{missing_per_column}")

# Rows that repeat an earlier row (the first occurrence is not counted)
n_duplicates = df.duplicated().sum()
print(f"\nDuplicate rows: {n_duplicates}")


In [None]:
# Summary statistics for the numeric columns.
# `numerical_cols` and `desc_stats` stay at notebook scope because later cells reuse the column list.
numerical_cols = ['age', 'rest_bp', 'chol', 'max_hr', 'st_depr']
numeric_frame = df[numerical_cols]
desc_stats = numeric_frame.describe()

print("=" * 60, "DESCRIPTIVE STATISTICS - NUMERICAL VARIABLES", "=" * 60, sep="\n")
print(desc_stats)

# Shape of each distribution: asymmetry (skewness) and tail weight
# (pandas reports excess kurtosis, i.e. a normal distribution scores 0)
print("\nAdditional Statistics:")
skewness = numeric_frame.skew()
excess_kurtosis = numeric_frame.kurtosis()
print(f"\nSkewness:\n{skewness}")
print(f"\nKurtosis:\n{excess_kurtosis}")


In [None]:
# Frequency tables (absolute counts and percentages) for the categorical columns
categorical_cols = ['sex', 'chest_pain', 'heart_disease']
print("=" * 60, "DESCRIPTIVE STATISTICS - CATEGORICAL VARIABLES", "=" * 60, sep="\n")

divider = "-" * 40
for category in categorical_cols:
    values = df[category]
    counts = values.value_counts()
    # normalize=True yields fractions; scale to percentages
    percentages = values.value_counts(normalize=True) * 100

    print(f"\n{category.upper()}:")
    print(counts)
    print(f"Proportions:\n{percentages}")
    print(divider)


## 2. Inferential Statistics


In [None]:
# Hypothesis testing: compare the mean of every numerical variable between
# the heart-disease group and the no-heart-disease group.
print("="*60)
print("HYPOTHESIS TESTING: T-TESTS")
print("="*60)

# Split data by heart disease.
# Both frames stay at notebook scope: the group-summary cell at the end reuses them.
heart_disease = df[df['heart_disease'] == 1]
no_heart_disease = df[df['heart_disease'] == 0]

# Test each numerical variable
for col in numerical_cols:
    # equal_var=False selects Welch's t-test, which does not assume that the two
    # groups share a variance (the groups are not guaranteed to be the same size
    # or spread, and Student's test is unreliable in that situation).
    # nan_policy='omit' drops missing values; with the default ('propagate') a
    # single NaN turns the p-value into NaN, which would then be reported as
    # "no significant difference" by the comparison below.
    stat, p_value = ttest_ind(
        heart_disease[col],
        no_heart_disease[col],
        equal_var=False,
        nan_policy='omit',
    )
    print(f"\n{col.upper()}:")
    print(f"  Mean (Heart Disease): {heart_disease[col].mean():.2f}")
    print(f"  Mean (No Heart Disease): {no_heart_disease[col].mean():.2f}")
    print(f"  T-statistic: {stat:.4f}")
    print(f"  P-value: {p_value:.4f}")
    if np.isnan(p_value):
        # Happens when a group is empty or has too few observations / zero variance
        print(f"  Result: Test could not be computed (insufficient data)")
    elif p_value < 0.05:
        print(f"  Result: Significant difference (p < 0.05)")
    else:
        print(f"  Result: No significant difference (p >= 0.05)")


In [None]:
# Chi-square tests of independence between categorical variables and heart disease
def chi_square_report(data, factor, heading, outcome='heart_disease'):
    """Run and print a chi-square test of independence for one categorical column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding both columns.
    factor : str
        Name of the categorical column to cross-tabulate against `outcome`.
    heading : str
        Line printed before the contingency table (leading newlines included).
    outcome : str, default 'heart_disease'
        Name of the outcome column.

    Returns
    -------
    tuple
        (contingency_table, chi2_statistic, p_value, degrees_of_freedom,
        expected_frequencies), in the order returned by `chi2_contingency`,
        prefixed with the observed table.
    """
    table = pd.crosstab(data[factor], data[outcome])
    chi2_stat, p, degrees, expected_counts = chi2_contingency(table)

    print(heading)
    print(table)
    print(f"Chi-square statistic: {chi2_stat:.4f}")
    print(f"P-value: {p:.4f}")
    print(f"Degrees of freedom: {degrees}")
    if p < 0.05:
        print("Result: Significant association (p < 0.05)")
    else:
        print("Result: No significant association (p >= 0.05)")

    # The chi-square approximation is commonly considered unreliable when
    # expected cell counts are small; surface that instead of hiding it.
    if (expected_counts < 5).any():
        print("Note: some expected cell counts are below 5; "
              "the chi-square approximation may be unreliable")

    return table, chi2_stat, p, degrees, expected_counts


print("="*60)
print("CHI-SQUARE TESTS FOR CATEGORICAL VARIABLES")
print("="*60)

# Test sex vs heart_disease (result names kept so any later use keeps working)
contingency_table, chi2, p_value, dof, expected = chi_square_report(
    df, 'sex', "\nSex vs Heart Disease:"
)

# Test chest_pain vs heart_disease
contingency_table2, chi2_2, p_value_2, dof_2, expected_2 = chi_square_report(
    df, 'chest_pain', "\n\nChest Pain vs Heart Disease:"
)


In [None]:
# Correlation analysis
print("="*60)
print("CORRELATION ANALYSIS")
print("="*60)

# Pearson correlation matrix (DataFrame.corr excludes missing values pairwise)
correlation_matrix = df[numerical_cols + ['heart_disease']].corr()
print("\nCorrelation Matrix:")
print(correlation_matrix)

# Test significant correlations with heart_disease
print("\n\nCorrelations with Heart Disease:")
for col in numerical_cols:
    # pearsonr does not skip missing values the way DataFrame.corr does, so
    # drop incomplete pairs first; this keeps r consistent with the matrix
    # above and avoids NaN / error results when a column has gaps.
    paired = df[[col, 'heart_disease']].dropna()
    if len(paired) < 2:
        # pearsonr needs at least two observations
        print(f"{col}: not enough paired observations to compute a correlation")
        continue

    corr, p_value = pearsonr(paired[col], paired['heart_disease'])
    print(f"{col}: r = {corr:.4f}, p-value = {p_value:.4f}")
    if p_value < 0.05:
        print(f"  → Significant correlation (p < 0.05)")
    else:
        print(f"  → No significant correlation (p >= 0.05)")


In [None]:
# Normality tests
print("="*60)
print("NORMALITY TESTS (Shapiro-Wilk Test)")
print("="*60)

for col in numerical_cols:
    # Missing values must be removed first: a NaN in the input yields a NaN
    # p-value, which the comparison below would report as "normally distributed".
    values = df[col].dropna()

    if len(values) < 3:
        # Shapiro-Wilk is undefined for fewer than three observations
        print(f"\n{col}:")
        print(f"  Result: Not enough observations for the Shapiro-Wilk test (need at least 3)")
        continue

    # Cap the sample at 5000 observations: SciPy documents that the p-value
    # may not be accurate above that size. The fixed seed keeps reruns reproducible.
    sample_data = values.sample(min(5000, len(values)), random_state=42)
    stat, p_value = shapiro(sample_data)
    print(f"\n{col}:")
    print(f"  Shapiro-Wilk statistic: {stat:.4f}")
    print(f"  P-value: {p_value:.4f}")
    if p_value < 0.05:
        print(f"  Result: Data is NOT normally distributed (p < 0.05)")
    else:
        print(f"  Result: Data appears normally distributed (p >= 0.05)")


## 3. Exploratory Statistical Analysis


In [None]:
# Distribution plots for numerical variables
# The grid is derived from the number of columns instead of a fixed 2x3 layout,
# so changing `numerical_cols` neither raises an IndexError nor deletes a real plot.
n_plots = len(numerical_cols)
grid_cols = 3
grid_rows = max(1, -(-n_plots // grid_cols))  # ceiling division
# squeeze=False always returns a 2-D array, so ravel() works for a single row too
fig, axes = plt.subplots(grid_rows, grid_cols, figsize=(18, 5 * grid_rows), squeeze=False)
axes = axes.ravel()

for ax, col in zip(axes, numerical_cols):
    values = df[col]
    # Computed once and reused for both the marker line and its legend label
    mean_value = values.mean()
    median_value = values.median()

    ax.hist(values, bins=30, edgecolor='black', alpha=0.7)
    ax.set_title(f'Distribution of {col}', fontsize=12, fontweight='bold')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')
    ax.axvline(mean_value, color='red', linestyle='--', label=f'Mean: {mean_value:.2f}')
    ax.axvline(median_value, color='green', linestyle='--', label=f'Median: {median_value:.2f}')
    ax.legend()

# Remove every subplot that did not receive a variable
for unused_ax in axes[n_plots:]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()


In [None]:
# Box plots comparing heart disease groups
group_labels = ['No Heart Disease', 'Heart Disease']
# Filter the two groups once instead of re-filtering the frame for every column
groups = [df[df['heart_disease'] == 0], df[df['heart_disease'] == 1]]

# Grid derived from the number of columns rather than a fixed 2x3 layout
n_plots = len(numerical_cols)
grid_cols = 3
grid_rows = max(1, -(-n_plots // grid_cols))  # ceiling division
# squeeze=False always returns a 2-D array, so ravel() works for a single row too
fig, axes = plt.subplots(grid_rows, grid_cols, figsize=(18, 5 * grid_rows), squeeze=False)
axes = axes.ravel()

for ax, col in zip(axes, numerical_cols):
    # Drop missing values: boxplot does not skip NaNs, and a single NaN
    # makes the box statistics NaN so nothing is drawn for that group.
    data_to_plot = [group[col].dropna() for group in groups]
    ax.boxplot(data_to_plot)
    # The `labels=` keyword of boxplot is deprecated since Matplotlib 3.9
    # (renamed `tick_labels`); setting the tick labels on the axis works on
    # both old and new releases.
    ax.set_xticklabels(group_labels)
    ax.set_title(f'{col} by Heart Disease Status', fontsize=12, fontweight='bold')
    ax.set_ylabel(col)

# Remove every subplot that did not receive a variable
for unused_ax in axes[n_plots:]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()


In [None]:
# Annotated heatmap of the pairwise correlations between the numeric
# columns and the heart_disease column
heatmap_columns = numerical_cols + ['heart_disease']
correlation_matrix = df[heatmap_columns].corr()

heatmap_fig, heatmap_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    ax=heatmap_ax,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,  # keep zero correlation at the neutral midpoint of the diverging palette
    square=True,
    linewidths=1,
    cbar_kws={"shrink": 0.8},
)
heatmap_ax.set_title('Correlation Heatmap', fontsize=14, fontweight='bold')
heatmap_fig.tight_layout()
plt.show()


In [None]:
# Descriptive statistics of the numeric columns, reported separately per group.
# Relies on the `no_heart_disease` / `heart_disease` frames created in the
# t-test cell, so that cell has to run first.
print("=" * 60, "SUMMARY STATISTICS BY HEART DISEASE STATUS", "=" * 60, sep="\n")

group_reports = (
    ("\nNO HEART DISEASE:", no_heart_disease),
    ("\n\nHEART DISEASE:", heart_disease),
)
for heading, group_frame in group_reports:
    print(heading)
    print(group_frame[numerical_cols].describe())
