# Comprehensive Exploratory Data Analysis
## Diabetes Binary Health Indicators - BRFSS 2021

This notebook provides a comprehensive exploratory data analysis including:
- Data Overview and Quality Assessment
- Descriptive Statistics
- Target Variable Analysis
- Feature Distributions
- Correlation Analysis
- Categorical Variable Analysis
- Outlier Detection
- Pattern Recognition


In [None]:
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
warnings.filterwarnings('ignore')

# Plot defaults shared by every figure in this notebook
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (14, 8)

# Later cells save figures into this folder; savefig does not create missing
# directories, so make sure it exists before any plotting cell runs.
Path('../../results/figures').mkdir(parents=True, exist_ok=True)

# Load the dataset analysed in the rest of the notebook
df = pd.read_csv('../../data/diabetes_binary_health_indicators_BRFSS2021.csv')
print(f"Dataset shape: {df.shape}")


## 1. Data Overview


In [None]:
# Comprehensive data overview: size, dtypes, missing values, duplicates
banner = "=" * 60
print(banner)
print("DATA OVERVIEW")
print(banner)
print(f"\nDataset Shape: {df.shape}")
print(f"Number of features: {len(df.columns)}")
print(f"Number of records: {len(df):,}")

print("\nData Types:")
print(df.dtypes)

print("\nMissing Values:")
missing = df.isnull().sum()
columns_with_missing = missing[missing > 0]
# An empty Series would print as "Series([], dtype: int64)", which reads like
# an error; say "None" explicitly when no column has missing values.
print(columns_with_missing if not columns_with_missing.empty else "None")

print(f"\nDuplicate Rows: {df.duplicated().sum()}")

# info() prints its own report, so it can run anywhere in the cell. head()
# only renders when it is the final expression of the cell, so it goes last.
df.info()
df.head()


## 2. Target Variable Analysis


In [None]:
# Target variable analysis
# value_counts() orders by frequency; sorting by label keeps the printed order
# and the positional bar colours below tied to the class value (first label ->
# skyblue, second -> salmon) regardless of which class is larger.
target_dist = df['Diabetes_binary'].value_counts().sort_index()
# Denominator is every row in df, matching the record count reported elsewhere
target_prop = target_dist / len(df)

print("Target Distribution:")
print(target_dist)
print("\nProportions:")
print(target_prop)

# Visualization: the same two bars drawn as raw counts and as proportions
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
panels = [
    (target_dist, 'Diabetes Distribution (Count)', 'Frequency'),
    (target_prop, 'Diabetes Distribution (Proportion)', 'Proportion'),
]
for ax, (series, title, ylabel) in zip(axes, panels):
    ax.bar(series.index, series.values, color=['skyblue', 'salmon'])
    # Tick only at the class labels instead of matplotlib's fractional defaults
    ax.set_xticks(series.index)
    ax.set_title(title)
    ax.set_xlabel('Diabetes (0=No, 1=Yes)')
    ax.set_ylabel(ylabel)

plt.tight_layout()
plt.savefig('../../results/figures/target_distribution_eda.png', dpi=300, bbox_inches='tight')
plt.show()


## 3. Feature Distributions and Statistics


In [None]:
# Descriptive statistics for every numeric column
numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()
desc_stats = df[numerical_cols].describe()
print(desc_stats)

# Distribution plots for a hand-picked subset of variables
key_vars = ['BMI', 'Age', 'GenHlth', 'MentHlth', 'PhysHlth']
# Whole-number columns spanning at most this many units get one bin per value
MAX_UNIT_BIN_SPAN = 50

fig, axes = plt.subplots(2, 3, figsize=(18, 10))
axes = axes.ravel()

for ax, var in zip(axes, key_vars):
    if var not in df.columns:
        # Hide the panel rather than leaving an empty frame in the figure
        ax.set_visible(False)
        continue

    values = df[var].dropna()
    mean_value = values.mean()

    # With a fixed 30 equal-width bins, whole-number data ends up with
    # neighbouring values merged into one bar or with empty gaps between bars.
    # Unit-wide bins centred on each integer avoid both.
    is_whole_number = not values.empty and (values % 1 == 0).all()
    if is_whole_number and values.max() - values.min() <= MAX_UNIT_BIN_SPAN:
        bins = np.arange(values.min() - 0.5, values.max() + 1.5)
    else:
        bins = 30

    ax.hist(values, bins=bins, color='steelblue', alpha=0.7, edgecolor='black')
    ax.set_title(f'Distribution of {var}')
    ax.set_xlabel(var)
    ax.set_ylabel('Frequency')
    ax.axvline(mean_value, color='red', linestyle='--', label=f'Mean: {mean_value:.2f}')
    ax.legend()

# The grid has more panels than variables; hide the leftovers
for ax in axes[len(key_vars):]:
    ax.set_visible(False)

plt.tight_layout()
plt.savefig('../../results/figures/feature_distributions_eda.png', dpi=300, bbox_inches='tight')
plt.show()


## 4. Correlation Analysis


In [None]:
# Pairwise correlations across all numeric columns, plus the target's own
# column ranked from most positive to most negative.
correlation_matrix = df.loc[:, numerical_cols].corr()
diabetes_corr = correlation_matrix.loc[:, 'Diabetes_binary'].sort_values(ascending=False)

print("Correlation with Diabetes_binary:")
print(diabetes_corr)

# Annotated heatmap of the full matrix, colour scale centred on zero
heatmap_options = dict(
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=0.5,
    cbar_kws={"shrink": 0.8},
)
fig, ax = plt.subplots(figsize=(14, 10))
sns.heatmap(correlation_matrix, ax=ax, **heatmap_options)
ax.set_title('Correlation Matrix Heatmap', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.savefig('../../results/figures/correlation_matrix_eda.png', dpi=300, bbox_inches='tight')
plt.show()


## 5. Categorical Variable Analysis


In [None]:
# For each categorical variable, show the share of each target class within
# every level of that variable (row-normalised crosstab, in percent).
categorical_vars = ['HighBP', 'HighChol', 'Smoker', 'PhysActivity', 'Sex', 'HeartDiseaseorAttack']

fig, axes = plt.subplots(2, 3, figsize=(18, 10))
axes = axes.ravel()

for panel, var in zip(axes, categorical_vars):
    # Variables absent from the frame are skipped; their panel stays as is
    if var not in df.columns:
        continue

    row_shares = pd.crosstab(df[var], df['Diabetes_binary'], normalize='index')
    row_percent = row_shares * 100
    row_percent.plot(kind='bar', ax=panel, color=['skyblue', 'salmon'])

    panel.set_title(f'Diabetes Prevalence by {var}')
    panel.set_xlabel(var)
    panel.set_ylabel('Percentage (%)')
    panel.legend(['No Diabetes', 'Diabetes'])
    panel.tick_params(axis='x', rotation=0)

plt.tight_layout()
plt.savefig('../../results/figures/categorical_analysis_eda.png', dpi=300, bbox_inches='tight')
plt.show()


## 6. Key Insights and Summary


In [None]:
# Headline numbers: dataset size, target rate, strongest correlates, data quality
banner = "=" * 60
print(banner)
print("KEY INSIGHTS FROM EDA")
print(banner)

print(f"\n1. Dataset: {len(df):,} records, {len(df.columns)} features")
print(f"2. Diabetes Prevalence: {df['Diabetes_binary'].mean()*100:.2f}%")
print("\n3. Top 5 Variables Correlated with Diabetes:")
# Rank by strength (absolute value) but report the signed coefficient, so an
# inverse relationship is not displayed as if it were a positive one.
feature_corr = diabetes_corr.drop('Diabetes_binary')
strongest = feature_corr.abs().sort_values(ascending=False).head(5).index
top_corr = feature_corr.loc[strongest]
for var, corr in top_corr.items():
    print(f"   - {var}: {corr:.3f}")

print("\n4. Data Quality:")
print(f"   - Missing values: {df.isnull().sum().sum()}")
print(f"   - Duplicate rows: {df.duplicated().sum()}")
