# Univariate, Bivariate, and Multivariate Analysis
## Diabetes Binary Health Indicators - BRFSS 2021


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
# Suppress every warning for the remainder of the session.
warnings.filterwarnings(action='ignore')

# Shared plotting defaults: seaborn "whitegrid" theme and a (14, 8) default figure size.
sns.set_style(style='whitegrid')
plt.rcParams.update({'figure.figsize': (14, 8)})

# Load the dataset from the project-relative data directory and report its dimensions.
data_path = '../../data/diabetes_binary_health_indicators_BRFSS2021.csv'
df = pd.read_csv(data_path)
print(f"Dataset shape: {df.shape}")


## 1. Univariate Analysis


In [None]:
# Univariate analysis for numerical variables.
# For each listed column that exists in `df`, compute descriptive statistics on
# the non-missing values, then draw one density histogram per column.
# NOTE(review): several of these columns look like ordinal survey codes rather
# than continuous measurements - confirm that mean/skewness are meaningful for them.
numerical_cols = ['BMI', 'GenHlth', 'MentHlth', 'PhysHlth', 'Age', 'Education', 'Income']

# Filter once so the summary table and the plots cover exactly the same columns.
present_cols = [col for col in numerical_cols if col in df.columns]

univariate_summary = []
for col in present_cols:
    data = df[col].dropna()
    univariate_summary.append({
        'Variable': col,
        'Mean': data.mean(),
        'Median': data.median(),
        'Std Dev': data.std(),
        'Skewness': data.skew(),
        'Kurtosis': data.kurtosis(),
        'Min': data.min(),
        'Max': data.max(),
        'IQR': data.quantile(0.75) - data.quantile(0.25)
    })

summary_df = pd.DataFrame(univariate_summary)
print(summary_df)

# Visualizations
fig, axes = plt.subplots(3, 3, figsize=(18, 15))
axes = axes.ravel()

# Never plot more columns than there are grid cells.
plot_cols = present_cols[:len(axes)]

for ax, col in zip(axes, plot_cols):
    values = df[col].dropna()
    mean_value = values.mean()  # computed once, reused for the line and its label
    ax.hist(values, bins=30, density=True, alpha=0.7,
            color='steelblue', edgecolor='black')
    ax.set_title(f'{col} - Distribution')
    ax.set_xlabel(col)
    ax.set_ylabel('Density')
    ax.axvline(mean_value, color='green', linestyle='--',
               linewidth=2, label=f'Mean: {mean_value:.2f}')
    ax.legend()

# Hide grid cells that received no histogram so the saved figure has no empty frames.
for ax in axes[len(plot_cols):]:
    ax.set_visible(False)

plt.tight_layout()
plt.savefig('../../results/figures/univariate_numerical.png', dpi=300, bbox_inches='tight')
plt.show()


## 2. Bivariate Analysis


In [None]:
# Correlation analysis.
# Pearson correlation between each listed numeric feature and the binary target,
# followed by per-feature box plots split by target value.
target_col = 'Diabetes_binary'
feature_cols = [
    col for col in numerical_cols
    if col in df.columns and col != target_col
]

correlation_results = []
for col in feature_cols:
    # Keep only rows where BOTH the feature and the target are present;
    # pearsonr needs equal-length, NaN-free inputs.
    paired = df[[col, target_col]].dropna()
    pearson_corr, pearson_p = stats.pearsonr(paired[col], paired[target_col])
    correlation_results.append({
        'Variable': col,
        'Pearson r': pearson_corr,
        'Pearson p-value': pearson_p,
        'Significant': 'Yes' if pearson_p < 0.05 else 'No'
    })

corr_df = pd.DataFrame(correlation_results)
print(corr_df)

# Box plots
fig, axes = plt.subplots(3, 3, figsize=(18, 15))
axes = axes.ravel()

# Never plot more columns than there are grid cells.
plot_cols = feature_cols[:len(axes)]

for ax, col in zip(axes, plot_cols):
    df.boxplot(column=col, by=target_col, ax=ax, grid=False)
    ax.set_title(f'{col} by Diabetes Status')
    ax.set_xlabel('Diabetes (0=No, 1=Yes)')
    ax.set_ylabel(col)

# The grouped boxplot call sets a figure-level title; clear it once for the whole figure.
fig.suptitle('')

# Hide grid cells that received no box plot so the saved figure has no empty frames.
for ax in axes[len(plot_cols):]:
    ax.set_visible(False)

plt.tight_layout()
plt.savefig('../../results/figures/bivariate_numerical_vs_target.png', dpi=300, bbox_inches='tight')
plt.show()


## 3. Multivariate Analysis


In [None]:
# Correlation heatmap over every numeric column in the frame.
numerical_vars_all = df.select_dtypes(include=[np.number]).columns.tolist()
correlation_matrix = df[numerical_vars_all].corr()

plt.figure(figsize=(16, 12))
sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm', center=0,
            square=True, linewidths=0.5, cbar_kws={"shrink": 0.8})
plt.title('Multivariate Correlation Heatmap', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.savefig('../../results/figures/multivariate_correlation_heatmap.png', dpi=300, bbox_inches='tight')
plt.show()

# Multivariate group analysis
# BMI bands are left-closed: [0, 18.5) Underweight, [18.5, 25) Normal,
# [25, 30) Overweight, [30, inf) Obese. With the default right-closed bins a BMI
# of exactly 25 or 30 would fall into the lower band, and values above the last
# finite edge would become NaN.
# NOTE(review): BMI_Category is not used again in this cell; it is kept on `df`
# in case later cells rely on it.
df['BMI_Category'] = pd.cut(df['BMI'], bins=[0, 18.5, 25, 30, np.inf], right=False,
                            labels=['Underweight', 'Normal', 'Overweight', 'Obese'])

# Diabetes rate (mean of the 0/1 target) and group size for each HighBP x HighChol combination.
multi_group = df.groupby(['HighBP', 'HighChol'])['Diabetes_binary'].agg(['mean', 'count']).reset_index()
multi_group['Diabetes_Rate'] = multi_group['mean'] * 100
print("\nDiabetes Prevalence by HighBP and HighChol:")
print(multi_group)

# Heatmap
# Reshape the already aggregated rates into a HighBP (rows) x HighChol (columns)
# grid instead of repeating the groupby.
pivot_table = multi_group.pivot(index='HighBP', columns='HighChol', values='Diabetes_Rate')
plt.figure(figsize=(10, 6))
sns.heatmap(pivot_table, annot=True, fmt='.2f', cmap='YlOrRd',
            cbar_kws={'label': 'Diabetes Prevalence (%)'})
plt.title('Diabetes Prevalence by HighBP and HighChol')
plt.xlabel('HighChol (0=No, 1=Yes)')
plt.ylabel('HighBP (0=No, 1=Yes)')
plt.tight_layout()
plt.savefig('../../results/figures/multivariate_heatmap_bp_chol.png', dpi=300, bbox_inches='tight')
plt.show()
