# Statistical Analysis: Descriptive, Inferential, and Exploratory

This notebook contains a comprehensive statistical analysis of the Cardiovascular Disease Dataset, including:
- Descriptive Statistics
- Inferential Statistics
- Exploratory Data Analysis
- Hypothesis Testing
- Confidence Intervals


In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import ttest_ind, chi2_contingency, f_oneway, normaltest, shapiro
import warnings
warnings.filterwarnings('ignore')

# Plot defaults shared by every figure in this notebook
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})

# Read the raw CSV; the path is resolved relative to the notebook's working directory
df = pd.read_csv('../../data/Cardiovascular_Disease_Dataset.csv')

# Report what was loaded, then preview the first rows as the cell output
print(
    "Dataset loaded successfully!",
    f"Shape: {df.shape}",
    f"\nColumns: {df.columns.tolist()}",
    sep="\n",
)
df.head()


## 1. Descriptive Statistics


In [None]:
# Basic information about the dataset: size, missing values and column dtypes
print("=" * 80)
print("DATASET INFORMATION")
print("=" * 80)
print(f"\nTotal records: {len(df)}")
print(f"Total features: {len(df.columns)}")
print(f"Missing values: {df.isnull().sum().sum()}")
print(f"\nData types:\n{df.dtypes}")

# Summary statistics for numerical variables
print("\n" + "=" * 80)
print("DESCRIPTIVE STATISTICS - NUMERICAL VARIABLES")
print("=" * 80)
# Exclude the patient ID from the numeric columns. Filtering (instead of
# list.remove) means a dataset without a 'patientid' column no longer raises
# ValueError, matching the `if col in df.columns` guards used in later cells.
numerical_cols = [
    name
    for name in df.select_dtypes(include=[np.number]).columns.tolist()
    if name != 'patientid'
]
print(df[numerical_cols].describe())


In [None]:
# Frequency table and mode for each categorical column
print("=" * 80)
print("DESCRIPTIVE STATISTICS - CATEGORICAL VARIABLES")
print("=" * 80)

categorical_cols = [
    'gender', 'chestpain', 'fastingbloodsugar', 'restingrelectro',
    'exerciseangia', 'slope', 'noofmajorvessels', 'target',
]

for col in categorical_cols:
    # Columns missing from the loaded frame are skipped silently
    if col not in df.columns:
        continue

    modes = df[col].mode()
    most_common = 'N/A' if modes.empty else modes[0]

    print(f"\n{col.upper()}:")
    print(df[col].value_counts())
    print(f"Mode: {most_common}")


In [None]:
# Mean, median, mode and shape statistics (skewness, kurtosis) per numeric column
print("=" * 80)
print("MEASURES OF CENTRAL TENDENCY")
print("=" * 80)

numerical_cols_for_stats = ['age', 'restingBP', 'serumcholestrol', 'maxheartrate', 'oldpeak']

for col in numerical_cols_for_stats:
    # Columns missing from the loaded frame are skipped silently
    if col not in df.columns:
        continue

    series = df[col]
    modes = series.mode()
    most_common = 'N/A' if modes.empty else modes[0]

    report = [
        f"\n{col.upper()}:",
        f"  Mean: {series.mean():.2f}",
        f"  Median: {series.median():.2f}",
        f"  Mode: {most_common}",
        f"  Skewness: {series.skew():.4f}",
        f"  Kurtosis: {series.kurtosis():.4f}",
    ]
    print("\n".join(report))


In [None]:
# Spread statistics per numeric column: range, variance, std, CV and IQR
print("=" * 80)
print("MEASURES OF DISPERSION")
print("=" * 80)

for col in numerical_cols_for_stats:
    # Columns missing from the loaded frame are skipped silently
    if col not in df.columns:
        continue

    series = df[col]
    value_range = series.max() - series.min()
    std_dev = series.std()
    # Coefficient of variation: standard deviation as a percentage of the mean
    coeff_var = std_dev / series.mean() * 100
    iqr = series.quantile(0.75) - series.quantile(0.25)

    print(f"\n{col.upper()}:")
    print(f"  Range: {value_range:.2f}")
    print(f"  Variance: {series.var():.2f}")
    print(f"  Standard Deviation: {std_dev:.2f}")
    print(f"  Coefficient of Variation: {coeff_var:.2f}%")
    print(f"  IQR: {iqr:.2f}")


## 2. Inferential Statistics


In [None]:
# Normality Tests: Shapiro-Wilk on each numeric column (alpha = 0.05)
print("=" * 80)
print("NORMALITY TESTS (Shapiro-Wilk Test)")
print("=" * 80)

for col in numerical_cols_for_stats:
    if col in df.columns:
        # Drop missing values first: a NaN in the input makes the test return
        # NaN, which the p-value comparison below would report as "No".
        values = df[col].dropna()

        # scipy's shapiro needs at least 3 observations; report and move on
        # instead of raising in the middle of the loop.
        if len(values) < 3:
            print(f"\n{col.upper()}:")
            print("  Skipped: fewer than 3 non-missing values")
            continue

        # Cap the sample at 5000 observations: scipy documents that the
        # Shapiro-Wilk p-value may be inaccurate for N > 5000. The fixed seed
        # keeps the draw reproducible.
        sample_size = min(5000, len(values))
        sample_data = values.sample(n=sample_size, random_state=42)

        statistic, p_value = shapiro(sample_data)
        is_normal = "Yes" if p_value > 0.05 else "No"

        print(f"\n{col.upper()}:")
        print(f"  Statistic: {statistic:.4f}")
        print(f"  p-value: {p_value:.4f}")
        print(f"  Normally distributed: {is_normal} (α = 0.05)")


In [None]:
# T-test: Compare means between groups (disease vs no disease)
print("=" * 80)
print("T-TEST: COMPARING MEANS BETWEEN DISEASE AND NO DISEASE GROUPS")
print("=" * 80)

# Split the rows once by the binary outcome column
disease_0 = df[df['target'] == 0]
disease_1 = df[df['target'] == 1]

for col in numerical_cols_for_stats:
    if col in df.columns:
        group_0 = disease_0[col].dropna()
        group_1 = disease_1[col].dropna()

        # Only test when both groups have at least one observation
        if len(group_0) > 0 and len(group_1) > 0:
            # Welch's t-test (equal_var=False): the two groups are not checked
            # for equal variances and may differ in size, so the pooled-variance
            # Student test (scipy's default) is not a safe assumption here.
            statistic, p_value = ttest_ind(group_0, group_1, equal_var=False)

            print(f"\n{col.upper()}:")
            print(f"  Mean (No Disease): {group_0.mean():.2f}")
            print(f"  Mean (Disease): {group_1.mean():.2f}")
            print(f"  t-statistic: {statistic:.4f}")
            print(f"  p-value: {p_value:.4f}")
            print(f"  Significant difference: {'Yes' if p_value < 0.05 else 'No'} (α = 0.05)")

In [None]:
# Chi-square test of independence between each categorical column and the target
print("=" * 80)
print("CHI-SQUARE TEST: ASSOCIATION BETWEEN CATEGORICAL VARIABLES AND TARGET")
print("=" * 80)

categorical_vars = [
    'gender', 'chestpain', 'fastingbloodsugar', 'restingrelectro',
    'exerciseangia', 'slope', 'noofmajorvessels',
]

for var in categorical_vars:
    # Columns missing from the loaded frame are skipped; 'target' is not in
    # the list above, so no separate exclusion is needed.
    if var not in df.columns:
        continue

    # Observed counts: rows are the variable's levels, columns the target classes
    contingency_table = pd.crosstab(df[var], df['target'])
    chi2, p_value, dof, expected = chi2_contingency(contingency_table)
    verdict = 'Yes' if p_value < 0.05 else 'No'

    print(f"\n{var.upper()} vs TARGET:")
    print(f"  Chi-square statistic: {chi2:.4f}")
    print(f"  p-value: {p_value:.4f}")
    print(f"  Degrees of freedom: {dof}")
    print(f"  Significant association: {verdict} (α = 0.05)")
    print(f"\n  Contingency Table:")
    print(contingency_table)


In [None]:
# One-way ANOVA: does mean age differ across chest pain types?
print("=" * 80)
print("ANOVA: COMPARING MEANS ACROSS CHEST PAIN TYPES")
print("=" * 80)

# Ages keyed by chest pain code, in ascending code order
pain_types = sorted(df['chestpain'].unique())
ages_by_pain = {code: df.loc[df['chestpain'] == code, 'age'] for code in pain_types}
chestpain_groups = [ages.dropna() for ages in ages_by_pain.values()]

# The test is only run when there are more than two groups
if len(chestpain_groups) > 2:
    f_statistic, p_value = f_oneway(*chestpain_groups)
    verdict = 'Yes' if p_value < 0.05 else 'No'

    print(f"F-statistic: {f_statistic:.4f}")
    print(f"p-value: {p_value:.4f}")
    print(f"Significant difference: {verdict} (α = 0.05)")

    print("\nMean age by chest pain type:")
    for code, ages in ages_by_pain.items():
        print(f"  Chest Pain Type {code}: {ages.mean():.2f}")


## 3. Exploratory Data Analysis


In [None]:
# Distribution of target variable: bar chart of class counts plus class percentages
plt.figure(figsize=(10, 6))
# value_counts() orders by frequency (largest first), not by class label, so
# the counts are reindexed to the fixed order [0, 1]. Without this the
# positional bar labels below are swapped whenever class 1 is the majority.
target_counts = df['target'].value_counts().reindex([0, 1], fill_value=0)
plt.bar(['No Disease (0)', 'Disease (1)'], target_counts.values, color=['skyblue', 'coral'])
plt.title('Distribution of Target Variable', fontsize=16, fontweight='bold')
plt.ylabel('Count', fontsize=12)
plt.xlabel('Target', fontsize=12)
# Write each count just above its bar
for i, v in enumerate(target_counts.values):
    plt.text(i, v + 10, str(v), ha='center', fontsize=12, fontweight='bold')
plt.tight_layout()
plt.show()

# With a 0/1 target, the column sum is the number of disease cases
print(f"Percentage of patients with disease: {(df['target'].sum() / len(df) * 100):.2f}%")
print(f"Percentage of patients without disease: {((1 - df['target'].sum() / len(df)) * 100):.2f}%")


In [None]:
# Histograms of the numeric columns on a 2x3 grid, with mean and median markers
fig, axes = plt.subplots(2, 3, figsize=(18, 10))
axes = axes.ravel()

for idx, col in enumerate(numerical_cols_for_stats):
    # A column missing from the frame leaves its grid slot empty
    if col not in df.columns:
        continue

    ax = axes[idx]
    mean_value = df[col].mean()
    median_value = df[col].median()

    ax.hist(df[col].dropna(), bins=30, color='steelblue', edgecolor='black', alpha=0.7)
    ax.set_title(f'Distribution of {col}', fontsize=12, fontweight='bold')
    ax.set_xlabel(col, fontsize=10)
    ax.set_ylabel('Frequency', fontsize=10)
    ax.axvline(mean_value, color='red', linestyle='--', label=f'Mean: {mean_value:.2f}')
    ax.axvline(median_value, color='green', linestyle='--', label=f'Median: {median_value:.2f}')
    ax.legend()

# Drop the sixth (last) panel when fewer than six columns are configured
if len(numerical_cols_for_stats) < 6:
    fig.delaxes(axes[5])

plt.tight_layout()
plt.show()


In [None]:
# Side-by-side box plots of each numeric column, split by target class
fig, axes = plt.subplots(2, 3, figsize=(18, 10))
axes = axes.ravel()

# Row masks for the two outcome classes, reused for every column
no_disease_rows = df['target'] == 0
disease_rows = df['target'] == 1

for idx, col in enumerate(numerical_cols_for_stats):
    # A column missing from the frame leaves its grid slot empty
    if col not in df.columns:
        continue

    ax = axes[idx]
    data_to_plot = [
        df.loc[no_disease_rows, col].dropna(),
        df.loc[disease_rows, col].dropna(),
    ]
    ax.boxplot(data_to_plot, labels=['No Disease', 'Disease'])
    ax.set_title(f'{col} by Target', fontsize=12, fontweight='bold')
    ax.set_ylabel(col, fontsize=10)
    ax.grid(True, alpha=0.3)

# Drop the sixth (last) panel when fewer than six columns are configured
if len(numerical_cols_for_stats) < 6:
    fig.delaxes(axes[5])

plt.tight_layout()
plt.show()


In [None]:
# Annotated heatmap of pairwise correlations between the numeric columns
fig, ax = plt.subplots(figsize=(14, 10))
correlation_matrix = df[numerical_cols].corr()
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,  # centre the diverging colormap on zero correlation
    square=True,
    linewidths=1,
    cbar_kws={"shrink": 0.8},
    ax=ax,
)
ax.set_title('Correlation Matrix of Numerical Variables', fontsize=16, fontweight='bold', pad=20)
plt.tight_layout()
plt.show()


In [None]:
# describe() of the numeric columns, reported separately for each target class
print("=" * 80)
print("SUMMARY STATISTICS BY TARGET VARIABLE")
print("=" * 80)

sections = (
    ("\nNO DISEASE (Target = 0):", 0),
    ("\n\nDISEASE (Target = 1):", 1),
)
for heading, target_value in sections:
    print(heading)
    print(df.loc[df['target'] == target_value, numerical_cols_for_stats].describe())


## 4. Confidence Intervals


In [None]:
# t-based 95% confidence interval for the mean of each numeric column
print("=" * 80)
print("95% CONFIDENCE INTERVALS FOR MEANS")
print("=" * 80)

confidence_level = 0.95
alpha = 1 - confidence_level
# Two-sided interval: the critical value sits at the upper (1 + level) / 2 quantile
upper_quantile = (1 + confidence_level) / 2

for col in numerical_cols_for_stats:
    # Columns missing from the loaded frame are skipped silently
    if col not in df.columns:
        continue

    data = df[col].dropna()
    mean = data.mean()
    # Half-width = standard error of the mean times the t critical value (n - 1 dof)
    t_critical = stats.t.ppf(upper_quantile, len(data) - 1)
    h = stats.sem(data) * t_critical

    print(f"\n{col.upper()}:")
    print(f"  Mean: {mean:.2f}")
    print(f"  95% CI: [{mean - h:.2f}, {mean + h:.2f}]")


## 5. Key Findings and Conclusions

### Descriptive Statistics:
- Dataset contains 1,000 records with 14 features
- Whether the target variable is balanced should be read from the class counts and percentages reported in Section 3 (Exploratory Data Analysis)
- Numerical variables show various distributions

### Inferential Statistics:
- T-tests reveal significant differences between disease and no-disease groups for several variables
- Chi-square tests show associations between categorical variables and target
- ANOVA tests whether mean age differs across chest pain types (see the F-statistic and p-value in Section 2)

### Exploratory Findings:
- Correlation analysis reveals relationships between variables
- Distribution plots show data characteristics
- Box plots reveal outliers and group differences
