# Comprehensive Exploratory Data Analysis

## Overview
This notebook provides a comprehensive exploratory data analysis of the health dataset, including:
- Data overview and quality assessment
- Outlier detection and analysis
- Distribution analysis
- Relationship analysis
- Target variable analysis
- Feature engineering insights


In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import missingno as msno
import warnings
warnings.filterwarnings('ignore')

# Figure defaults used by every plot in this notebook: seaborn's grid theme
# and a wide default canvas.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (14, 8)})

# Read the raw CSV (relative path) and echo its dimensions and column names
# as a quick sanity check that the expected file was loaded.
df = pd.read_csv('../data/health_data.csv')
print(
    f"Dataset shape: {df.shape}",
    f"Columns: {df.columns.tolist()}",
    sep="\n",
)


## 1. Data Cleaning and Preprocessing


In [None]:
# Clean data
# Remove the 'Unnamed: 0' column when present (presumably a saved row index
# -- confirm against how the CSV was produced).
if 'Unnamed: 0' in df.columns:
    df = df.drop(columns='Unnamed: 0')

# Derived features. The divisors assume `age` is recorded in days and
# `height` in centimetres (with `weight` in kilograms) -- TODO confirm.
df['age_years'] = df['age'] / 365.25
height_m = df['height'] / 100
df['bmi'] = df['weight'] / height_m ** 2

# Keep only rows whose measurements fall inside the plausibility limits.
# `between` is inclusive on both ends, and rows with NaN in any checked
# column evaluate to False and are therefore dropped.
plausible = (
    df['ap_hi'].between(80, 250)
    & df['ap_lo'].between(40, 150)
    & (df['ap_hi'] >= df['ap_lo'])   # upper pressure reading must not be below the lower one
    & df['height'].between(100, 220)
    & df['weight'].between(30, 200)
    & df['bmi'].between(10, 60)
)
df = df[plausible]

# Report what is left after filtering.
print(f"Dataset shape after cleaning: {df.shape}")
print(f"\nMissing values:\n{df.isnull().sum()}")
print(f"\nData types:\n{df.dtypes}")


## 2. Data Overview


In [None]:
# Summary statistics, as reported by DataFrame.describe()
print("Summary Statistics:")
print(df.describe())

# Missing data visualization: the matrix plot is only drawn (and saved) when
# at least one cell in the frame is missing; otherwise a short note is printed.
has_missing = df.isnull().values.any()
if not has_missing:
    print("\nNo missing values found in the dataset.")
else:
    msno.matrix(df)
    plt.title('Missing Data Pattern', fontsize=14, fontweight='bold')
    plt.tight_layout()
    plt.savefig('../figures/missing_data.png', dpi=300, bbox_inches='tight')
    plt.show()


## 3. Outlier Analysis


In [None]:
# Detect outliers using IQR method
# For each numeric column: count the rows lying more than 1.5 * IQR outside
# the quartiles, print the count and share, and draw a box plot in its own panel.
numerical_cols = ['age_years', 'height', 'weight', 'ap_hi', 'ap_lo', 'bmi']
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
axes = axes.ravel()  # flatten the 2x3 grid so panels pair up with columns

for ax, col in zip(axes, numerical_cols):
    values = df[col]

    # Fences sit 1.5 * IQR below the first and above the third quartile.
    q1, q3 = values.quantile(0.25), values.quantile(0.75)
    whisker = 1.5 * (q3 - q1)
    low_fence, high_fence = q1 - whisker, q3 + whisker

    outliers = df[(values < low_fence) | (values > high_fence)]
    n_outliers = len(outliers)
    share = n_outliers / len(df) * 100
    print(f"{col.upper()}: {n_outliers} outliers ({share:.2f}%)")

    ax.boxplot(values, vert=True)
    ax.set_title(f'{col.upper()} - Box Plot', fontsize=12, fontweight='bold')
    ax.set_ylabel(col)
    ax.grid(True, alpha=0.3, axis='y')

fig.tight_layout()
fig.savefig('../figures/outlier_analysis.png', dpi=300, bbox_inches='tight')
plt.show()


## 4. Distribution Analysis


In [None]:
# Distribution plots with normal distribution overlay
# One panel per numeric column: density histogram, a normal curve fitted with
# the column's own mean and standard deviation, and a skew/kurtosis annotation.
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
axes = axes.ravel()

for ax, col in zip(axes, numerical_cols):
    series = df[col]
    ax.hist(series, bins=50, density=True, alpha=0.7, edgecolor='black', label='Data')

    # Reference curve: normal pdf over the observed range.
    # Series.std() is the sample standard deviation (ddof=1).
    mu, sigma = series.mean(), series.std()
    x = np.linspace(series.min(), series.max(), 100)
    ax.plot(x, stats.norm.pdf(x, loc=mu, scale=sigma), 'r-', linewidth=2, label='Normal')

    # Shape statistics shown in the corner box. scipy's kurtosis defaults to
    # the Fisher (excess) definition, so a normal distribution scores 0.
    skew_val = stats.skew(series)
    kurt_val = stats.kurtosis(series)
    ax.text(
        0.05, 0.95, f'Skew: {skew_val:.2f}\nKurtosis: {kurt_val:.2f}',
        transform=ax.transAxes, fontsize=10, verticalalignment='top',
        bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5),
    )

    ax.set_title(f'{col.upper()} Distribution', fontsize=12, fontweight='bold')
    ax.set_xlabel(col)
    ax.set_ylabel('Density')
    ax.legend()
    ax.grid(True, alpha=0.3)

fig.tight_layout()
fig.savefig('../figures/distribution_analysis.png', dpi=300, bbox_inches='tight')
plt.show()


## 5. Target Variable Analysis


In [None]:
# Target variable distribution
# Bar chart of how many rows fall in each `cardio` class, annotated with the
# count and its share of all rows.
plt.figure(figsize=(10, 6))

# value_counts() orders by frequency (most common first), so the first bar is
# not guaranteed to be class 0. Sort by class label so that bar order, colours
# and tick labels always refer to the same class.
cardio_counts = df['cardio'].value_counts().sort_index()

# Build tick labels from the classes actually present rather than assuming
# that position 0 is "No" and position 1 is "Yes".
class_names = {0: 'No (0)', 1: 'Yes (1)'}
tick_labels = [class_names.get(label, str(label)) for label in cardio_counts.index]
bar_positions = list(range(len(cardio_counts)))

plt.bar(bar_positions, cardio_counts.values,
        color=['skyblue', 'salmon'], edgecolor='black')
plt.xlabel('Cardiovascular Disease', fontsize=12)
plt.ylabel('Count', fontsize=12)
plt.title('Distribution of Target Variable (Cardio)', fontsize=14, fontweight='bold')
plt.xticks(bar_positions, tick_labels)

# Gap between bar top and annotation, scaled to the tallest bar so the labels
# stay readable whatever the dataset size (a fixed count offset does not).
label_offset = 0.015 * cardio_counts.max()
for i, v in enumerate(cardio_counts.values):
    plt.text(i, v + label_offset, f'{v}\n({v/len(df)*100:.2f}%)',
             ha='center', va='bottom', fontweight='bold')
plt.grid(True, alpha=0.3, axis='y')
plt.tight_layout()
plt.savefig('../figures/target_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

# Comparison by target variable
# One panel per numeric column, with a separate box for each `cardio` value.
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
axes = axes.ravel()

for ax, col in zip(axes, numerical_cols):
    df.boxplot(column=col, by='cardio', ax=ax)
    # Replace the title pandas puts on the panel with a consistent one.
    ax.set_title(f'{col.upper()} by Cardio Status', fontsize=12, fontweight='bold')
    ax.set(xlabel='Cardiovascular Disease', ylabel=col)
    ax.grid(True, alpha=0.3)

# Grouped boxplots add a figure-level title on every call; blank it so only
# the per-panel titles remain.
fig.suptitle('')
fig.tight_layout()
fig.savefig('../figures/target_comparison.png', dpi=300, bbox_inches='tight')
plt.show()
