# Statistical Analysis: Descriptive, Inferential, and Exploratory

## Overview
This notebook contains comprehensive statistical analysis including:
- Descriptive Statistics
- Inferential Statistics
- Exploratory Data Analysis


In [None]:
# Import necessary libraries
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from scipy.stats import chi2_contingency, ttest_ind, mannwhitneyu
warnings.filterwarnings('ignore')

# Plot defaults: seaborn's white grid theme and a wide default figure size
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})

# Read the dataset, then report its dimensions and column names
df = pd.read_csv('../data/health_data.csv')
print(f"Dataset shape: {df.shape}")
print(f"\nColumn names: {df.columns.tolist()}")


## 1. Data Cleaning and Preprocessing


In [None]:
# Drop the leftover index column if the CSV carried one
if 'Unnamed: 0' in df.columns:
    df = df.drop(columns=['Unnamed: 0'])

# Derived features: age converted from days to years, and BMI computed as
# weight over squared height (height is divided by 100 first, presumably
# cm -> m; confirm against the data dictionary)
df['age_years'] = df['age'] / 365.25
height_m = df['height'] / 100
df['bmi'] = df['weight'] / height_m ** 2

# Keep only rows whose measurements fall inside plausible ranges.
# Series.between() is inclusive on both ends, matching ">= low & <= high".
plausible = (
    df['ap_hi'].between(80, 250)          # systolic pressure range
    & df['ap_lo'].between(40, 150)        # diastolic pressure range
    & (df['ap_hi'] >= df['ap_lo'])        # systolic must not be below diastolic
    & df['height'].between(100, 220)      # height outliers
    & df['weight'].between(30, 200)       # weight outliers
    & df['bmi'].between(10, 60)           # BMI outliers
)
df = df[plausible]

print(f"Dataset shape after cleaning: {df.shape}")
print(f"\nMissing values:\n{df.isnull().sum()}")


## 2. Descriptive Statistics


In [None]:
# Summary statistics for the continuous variables
numerical_cols = ['age_years', 'height', 'weight', 'ap_hi', 'ap_lo', 'bmi']
rule = "=" * 80

print(rule)
print("DESCRIPTIVE STATISTICS - NUMERICAL VARIABLES")
print(rule)
desc_stats = df[numerical_cols].describe()
print(desc_stats)

# Shape and robust-spread measures that describe() does not report
print("\n" + rule)
print("ADDITIONAL STATISTICS")
print(rule)
for name in numerical_cols:
    series = df[name]
    observed = series.dropna()  # the scipy moment functions get NaN-free input
    lower_quartile = series.quantile(0.25)
    upper_quartile = series.quantile(0.75)

    print(f"\n{name.upper()}:")
    print(f"  Skewness: {stats.skew(observed):.4f}")
    print(f"  Kurtosis: {stats.kurtosis(observed):.4f}")
    print(f"  Median: {series.median():.4f}")
    print(f"  IQR: {upper_quartile - lower_quartile:.4f}")


In [None]:
# Frequency tables (absolute counts and proportions) for the categorical variables
categorical_cols = ['gender', 'cholesterol', 'gluc', 'smoke', 'alco', 'active', 'cardio']
rule = "=" * 80

print(rule)
print("DESCRIPTIVE STATISTICS - CATEGORICAL VARIABLES")
print(rule)
for name in categorical_cols:
    levels = df[name]
    # sort_index() lists the levels in label order rather than by frequency
    counts = levels.value_counts().sort_index()
    shares = levels.value_counts(normalize=True).sort_index()

    print(f"\n{name.upper()}:")
    print(counts)
    print("Proportions:")
    print(shares)


## 3. Inferential Statistics


In [None]:
# T-test: Compare age between cardio and non-cardio patients
cardio_yes = df.loc[df['cardio'] == 1, 'age_years']
cardio_no = df.loc[df['cardio'] == 0, 'age_years']

# equal_var=False selects Welch's t-test. The notebook never checks that the
# two groups share a variance, so the pooled-variance (Student) default would
# rest on an unverified assumption.
t_stat, p_value = ttest_ind(cardio_yes, cardio_no, equal_var=False)
print("=" * 80)
print("T-TEST: Age difference between cardio and non-cardio patients")
print("=" * 80)
print(f"T-statistic: {t_stat:.4f}")
# Use general/scientific formatting: fixed-point ".4f" renders any p-value
# below 5e-5 as "0.0000", which reads as an impossible p-value of exactly zero.
print(f"P-value: {p_value:.4g}")
print(f"Mean age (Cardio=1): {cardio_yes.mean():.2f} years")
print(f"Mean age (Cardio=0): {cardio_no.mean():.2f} years")
if p_value < 0.05:
    print("Result: Significant difference (p < 0.05)")
else:
    print("Result: No significant difference (p >= 0.05)")


In [None]:
# Chi-square test of independence between each categorical variable and cardio
print("=" * 80)
print("CHI-SQUARE TESTS: Association between categorical variables and cardio")
print("=" * 80)

categorical_vars = ['gender', 'cholesterol', 'gluc', 'smoke', 'alco', 'active']
for var in categorical_vars:
    # Rows: levels of the variable; columns: cardio outcome
    contingency_table = pd.crosstab(df[var], df['cardio'])
    chi2, p_value, dof, expected = chi2_contingency(contingency_table)
    print(f"\n{var.upper()} vs Cardio:")
    print(f"  Chi-square statistic: {chi2:.4f}")
    # Use general/scientific formatting: fixed-point ".4f" renders any p-value
    # below 5e-5 as "0.0000", hiding its magnitude and suggesting p == 0.
    print(f"  P-value: {p_value:.4g}")
    print(f"  Degrees of freedom: {dof}")
    if p_value < 0.05:
        print("  Result: Significant association (p < 0.05)")
    else:
        print("  Result: No significant association (p >= 0.05)")


In [None]:
# Correlation analysis
print("=" * 80)
print("CORRELATION ANALYSIS")
print("=" * 80)
correlation_cols = ['age_years', 'height', 'weight', 'ap_hi', 'ap_lo', 'bmi', 'cardio']
correlation_matrix = df[correlation_cols].corr()
print("\nCorrelation Matrix:")
print(correlation_matrix)

# Visualize correlation matrix
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0,
            square=True, linewidths=1, cbar_kws={"shrink": 0.8})
plt.title('Correlation Matrix of Numerical Variables', fontsize=14, fontweight='bold')
plt.tight_layout()

# savefig does not create missing directories; make sure the output folder
# exists so the cell also runs on a fresh checkout.
correlation_figure_path = Path('../figures/correlation_matrix.png')
correlation_figure_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(correlation_figure_path, dpi=300, bbox_inches='tight')
plt.show()


## 4. Exploratory Data Analysis


In [None]:
# Distribution of target variable
# value_counts() orders by frequency, not by class label. Sort by label so
# that bar position, colour and count annotation always refer to the same
# class (0 -> skyblue, 1 -> salmon), whichever class is the majority.
cardio_counts = df['cardio'].value_counts().sort_index()

plt.figure(figsize=(8, 6))
plt.bar(cardio_counts.index, cardio_counts.values, color=['skyblue', 'salmon'])
plt.xlabel('Cardiovascular Disease', fontsize=12)
plt.ylabel('Count', fontsize=12)
plt.title('Distribution of Cardiovascular Disease', fontsize=14, fontweight='bold')
plt.xticks([0, 1], ['No (0)', 'Yes (1)'])

# Annotate each bar at its own x position (the class label), not at its
# position in the counts series. The gap above the bar scales with the data
# instead of assuming counts in the tens of thousands.
label_offset = 0.01 * cardio_counts.max()
for class_label, count in cardio_counts.items():
    plt.text(class_label, count + label_offset, str(count),
             ha='center', va='bottom', fontweight='bold')
plt.tight_layout()

# savefig does not create missing directories; make sure the output folder
# exists so the cell also runs on a fresh checkout.
distribution_figure_path = Path('../figures/cardio_distribution.png')
distribution_figure_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(distribution_figure_path, dpi=300, bbox_inches='tight')
plt.show()
