# Descriptive, Inferential, and Exploratory Statistical Analysis
## Diabetes Binary Health Indicators - BRFSS 2021

This notebook contains:
- Descriptive Statistics
- Inferential Statistics
- Exploratory Data Analysis


## 1. Import Libraries and Load Data


In [None]:
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from scipy.stats import chi2_contingency, ttest_ind, mannwhitneyu
# Suppress all warnings for the rest of the notebook
warnings.filterwarnings('ignore')

# Plot defaults: seaborn grid style and a wide default figure size
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

# Read the CSV into `df` and report its dimensions and column names
df = pd.read_csv('../../data/diabetes_binary_health_indicators_BRFSS2021.csv')
print(f"Dataset shape: {df.shape}")
print(f"\nColumns: {df.columns.tolist()}")


## 2. Data Overview and Basic Information


In [None]:
# Basic information: dtypes, a sample of rows, missing values and duplicates
print("=" * 50)
print("DATASET INFORMATION")
print("=" * 50)
df.info()  # info() writes its report to stdout itself

print("\n" + "=" * 50)
print("FIRST FEW ROWS")
print("=" * 50)
# A bare `df.head()` is only rendered when it is the last expression of a
# cell; in the middle of the cell its value is discarded. Print it explicitly,
# using to_string() so that no columns are elided.
print(df.head().to_string())

print("\n" + "=" * 50)
print("MISSING VALUES")
print("=" * 50)
print(df.isnull().sum())

print("\n" + "=" * 50)
print("DUPLICATE ROWS")
print("=" * 50)
print(f"Number of duplicate rows: {df.duplicated().sum()}")


## 3. Descriptive Statistics


In [None]:
# Summary statistics for every numeric column of `df`
banner = "=" * 50
print(banner)
print("DESCRIPTIVE STATISTICS - NUMERICAL VARIABLES")
print(banner)
numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()
numeric_df = df[numerical_cols]
desc_stats = numeric_df.describe()
print(desc_stats)

# Extra moments and spread measures that describe() does not report
print("\n" + banner)
print("ADDITIONAL STATISTICS")
print(banner)
col_min = numeric_df.min()
col_max = numeric_df.max()
first_quartile = numeric_df.quantile(0.25)
third_quartile = numeric_df.quantile(0.75)
additional_stats = pd.DataFrame({
    'Mean': numeric_df.mean(),
    'Median': numeric_df.median(),
    'Std Dev': numeric_df.std(),
    'Variance': numeric_df.var(),
    'Skewness': numeric_df.skew(),
    'Kurtosis': numeric_df.kurtosis(),
    'Min': col_min,
    'Max': col_max,
    'Range': col_max - col_min,
    'Q1': first_quartile,
    'Q3': third_quartile,
    'IQR': third_quartile - first_quartile
})
# Transposed so that statistics are rows and variables are columns
print(additional_stats.T)


In [None]:
# Target variable distribution: counts, proportions and a two-panel bar chart
print("=" * 50)
print("TARGET VARIABLE DISTRIBUTION (Diabetes_binary)")
print("=" * 50)
# value_counts() orders by frequency; sort_index() pins the order to the class
# labels so the bar colours always map to the same class.
target_dist = df['Diabetes_binary'].value_counts().sort_index()
print(target_dist)
print("\nProportions:")
print(target_dist / len(df))

# Visualization
target_prop = df['Diabetes_binary'].value_counts(normalize=True).sort_index()
class_colors = ['skyblue', 'salmon']
fig, (ax_count, ax_prop) = plt.subplots(1, 2, figsize=(10, 5))

target_dist.plot(kind='bar', color=class_colors, rot=0, ax=ax_count)
ax_count.set_title('Diabetes Distribution (Count)')
ax_count.set_xlabel('Diabetes (0=No, 1=Yes)')
ax_count.set_ylabel('Frequency')

target_prop.plot(kind='bar', color=class_colors, rot=0, ax=ax_prop)
ax_prop.set_title('Diabetes Distribution (Proportion)')
ax_prop.set_xlabel('Diabetes (0=No, 1=Yes)')
ax_prop.set_ylabel('Proportion')

fig.tight_layout()
# savefig() fails if the output directory is missing, so create it first
figures_dir = Path('../../results/figures')
figures_dir.mkdir(parents=True, exist_ok=True)
fig.savefig(figures_dir / 'target_distribution.png', dpi=300, bbox_inches='tight')
plt.show()


## 4. Inferential Statistics


In [None]:
# Chi-square tests of independence between each categorical column and the target
print("=" * 50)
print("CHI-SQUARE TESTS FOR CATEGORICAL VARIABLES")
print("=" * 50)

categorical_cols = ['HighBP', 'HighChol', 'CholCheck', 'Smoker', 'Stroke', 
                    'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies',
                    'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'DiffWalk', 'Sex']


def _chi2_summary(column):
    """Run chi2_contingency on `column` vs. Diabetes_binary and return one result row."""
    table = pd.crosstab(df[column], df['Diabetes_binary'])
    statistic, p_val, deg_freedom, _expected = chi2_contingency(table)
    return {
        'Variable': column,
        'Chi-square': statistic,
        'p-value': p_val,
        'Degrees of Freedom': deg_freedom,
        'Significant': 'Yes' if p_val < 0.05 else 'No'
    }


# Columns that are absent from `df` are skipped rather than raising
results_chi2 = [_chi2_summary(name) for name in categorical_cols if name in df.columns]

results_chi2_df = pd.DataFrame(results_chi2)
print(results_chi2_df)


In [None]:
# T-test and Mann-Whitney U test for numerical variables, comparing the
# Diabetes_binary == 0 group against the Diabetes_binary == 1 group
print("=" * 50)
print("T-TEST AND MANN-WHITNEY U TEST FOR NUMERICAL VARIABLES")
print("=" * 50)

numerical_vars = ['BMI', 'GenHlth', 'MentHlth', 'PhysHlth', 'Age', 'Education', 'Income']

# Group masks are the same for every variable, so build them once
is_no_diabetes = df['Diabetes_binary'] == 0
is_yes_diabetes = df['Diabetes_binary'] == 1

results_tests = []
for var in numerical_vars:
    if var not in df.columns:
        continue  # skip variables that are absent from the frame

    # Split data by diabetes status
    no_diabetes = df.loc[is_no_diabetes, var].dropna()
    yes_diabetes = df.loc[is_yes_diabetes, var].dropna()

    # Welch's t-test: equal_var=False drops the pooled-variance assumption of
    # the default Student test, which nothing here guarantees for two groups
    # that may differ in both size and spread.
    t_stat, t_pvalue = ttest_ind(no_diabetes, yes_diabetes, equal_var=False)

    # Mann-Whitney U test (non-parametric)
    u_stat, u_pvalue = mannwhitneyu(no_diabetes, yes_diabetes, alternative='two-sided')

    results_tests.append({
        'Variable': var,
        'T-statistic': t_stat,
        'T-test p-value': t_pvalue,
        'T-test Significant': 'Yes' if t_pvalue < 0.05 else 'No',
        'U-statistic': u_stat,
        'Mann-Whitney p-value': u_pvalue,
        'Mann-Whitney Significant': 'Yes' if u_pvalue < 0.05 else 'No'
    })

results_tests_df = pd.DataFrame(results_tests)
print(results_tests_df)


In [None]:
# Correlation analysis: Pearson matrix over the numeric columns, plus two figures
print("=" * 50)
print("CORRELATION ANALYSIS")
print("=" * 50)

# Pearson correlation (DataFrame.corr default); numerical_cols comes from the
# descriptive-statistics cell, so that cell must have been run first
correlation_matrix = df[numerical_cols].corr()
print("\nPearson Correlation with Diabetes_binary:")
diabetes_corr = correlation_matrix['Diabetes_binary'].sort_values(ascending=False)
print(diabetes_corr)

# savefig() fails if the output directory is missing, so create it first
figures_dir = Path('../../results/figures')
figures_dir.mkdir(parents=True, exist_ok=True)

# Visualization: full correlation heatmap
fig_heatmap, ax_heatmap = plt.subplots(figsize=(14, 10))
sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm', center=0,
            square=True, linewidths=0.5, cbar_kws={"shrink": 0.8}, ax=ax_heatmap)
ax_heatmap.set_title('Correlation Matrix Heatmap', fontsize=16, pad=20)
fig_heatmap.tight_layout()
fig_heatmap.savefig(figures_dir / 'correlation_matrix.png', dpi=300, bbox_inches='tight')
plt.show()

# Correlation with target variable (the target's self-correlation is dropped)
fig_target, ax_target = plt.subplots(figsize=(10, 8))
diabetes_corr.drop('Diabetes_binary').sort_values().plot(
    kind='barh', color='steelblue', ax=ax_target)
ax_target.set_title('Correlation with Diabetes_binary', fontsize=14)
ax_target.set_xlabel('Correlation Coefficient')
ax_target.axvline(x=0, color='red', linestyle='--', linewidth=1)
fig_target.tight_layout()
fig_target.savefig(figures_dir / 'diabetes_correlation.png', dpi=300, bbox_inches='tight')
plt.show()


## 5. Exploratory Data Analysis


In [None]:
# Distribution of key numerical variables: one histogram per variable with
# mean and median marker lines
print("=" * 50)
print("DISTRIBUTION ANALYSIS")
print("=" * 50)

key_vars = ['BMI', 'Age', 'GenHlth', 'MentHlth', 'PhysHlth']
fig, axes = plt.subplots(2, 3, figsize=(18, 10))
axes = axes.ravel()

# Only plot variables that exist in the frame, packed into consecutive panels
present_vars = [var for var in key_vars if var in df.columns]

for ax, var in zip(axes, present_vars):
    # Compute each summary value once and reuse it for the line and its label
    var_mean = df[var].mean()
    var_median = df[var].median()

    ax.hist(df[var].dropna(), bins=30, color='steelblue', alpha=0.7, edgecolor='black')
    ax.set_title(f'Distribution of {var}', fontsize=12)
    ax.set_xlabel(var)
    ax.set_ylabel('Frequency')
    ax.axvline(var_mean, color='red', linestyle='--', label=f'Mean: {var_mean:.2f}')
    ax.axvline(var_median, color='green', linestyle='--', label=f'Median: {var_median:.2f}')
    ax.legend()

# The grid has more panels than variables; hide the leftovers so the figure
# does not show empty framed axes
for unused_ax in axes[len(present_vars):]:
    unused_ax.set_visible(False)

fig.tight_layout()
# savefig() fails if the output directory is missing, so create it first
figures_dir = Path('../../results/figures')
figures_dir.mkdir(parents=True, exist_ok=True)
fig.savefig(figures_dir / 'distributions.png', dpi=300, bbox_inches='tight')
plt.show()


In [None]:
# Key Findings Summary: recaps results computed in the earlier cells
# (diabetes_corr, results_chi2_df and results_tests_df must already exist)
print("=" * 50)
print("KEY FINDINGS")
print("=" * 50)
print("\n1. Dataset Overview:")
print(f"   - Total records: {len(df):,}")
print(f"   - Features: {len(df.columns)}")
print(f"   - Missing values: {df.isnull().sum().sum()}")

print("\n2. Target Variable:")
# Mean of the target column, expressed as a percentage
diabetes_rate = df['Diabetes_binary'].mean() * 100
print(f"   - Diabetes prevalence: {diabetes_rate:.2f}%")

print("\n3. Key Associations:")
print("   - Variables with strongest correlation to diabetes:")
# Rank by magnitude but report the signed coefficient: taking abs() before
# printing would show a negative association as if it were positive.
feature_corr = diabetes_corr.drop('Diabetes_binary')
strongest = feature_corr.abs().sort_values(ascending=False).head(5).index
top_corr = feature_corr.loc[strongest]
for var, corr in top_corr.items():
    print(f"     {var}: {corr:.3f}")

print("\n4. Statistical Significance:")
n_sig_categorical = (results_chi2_df['Significant'] == 'Yes').sum()
n_sig_numerical = (results_tests_df['T-test Significant'] == 'Yes').sum()
print(f"   - Significant categorical associations: {n_sig_categorical}/{len(results_chi2_df)}")
print(f"   - Significant numerical differences: {n_sig_numerical}/{len(results_tests_df)}")
