# Data Exploration: UCI Diabetes 130-US Hospitals Dataset

**Objective**: Understand the data structure, quality, and relationships before modeling.

This notebook explores the UCI Diabetes dataset containing 100K+ patient encounters from 130 US hospitals (1999-2008) to prepare for readmission risk modeling.

In [None]:
# Cell 1: Setup and Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from pathlib import Path

# Configuration
# NOTE: this silences *every* warning for the rest of the session (including
# pandas deprecation notices). Comment it out when debugging odd results.
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)  # never truncate wide frames
pd.set_option('display.max_rows', 100)
plt.style.use('seaborn-v0_8-whitegrid')

# Set up paths (relative to the directory the notebook is run from)
DATA_DIR = Path('../data/raw')
OUTPUT_DIR = Path('../data/processed')
# parents=True: create any missing intermediate directory as well, so this
# cell does not fail with FileNotFoundError when '../data' is absent.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

print("Libraries loaded successfully")

In [None]:
# Cell 2: Load UCI Diabetes Data
# Read the raw CSV into `df`; the later cells in this notebook work off this frame.
df = pd.read_csv(DATA_DIR / 'diabetic_data.csv')

banner = "=" * 60
print(banner)
print("UCI DIABETES 130-US HOSPITALS DATASET")
print(banner)

# Report the overall size, then list every column with a 1-based position.
n_rows, n_cols = df.shape
print(f"\nDataset Shape: {n_rows:,} rows × {n_cols} columns")
print(f"\nColumn Names:")
for position, column_name in enumerate(df.columns, start=1):
    print(f"  {position:2d}. {column_name}")

In [None]:
# Cell 3: Data Types and Missing Values Analysis
print("\n" + "=" * 60)
print("DATA TYPES")
print("=" * 60)
print(df.dtypes)

print("\n" + "=" * 60)
print("MISSING VALUES ANALYSIS")
print("=" * 60)

# A value is treated as missing when it is either a real NaN or the '?'
# placeholder (UCI dataset uses '?' for missing). Both are checked for every
# column regardless of dtype: text columns can contain NaN too (read_csv
# converts several literal strings to NaN by default), and comparing a numeric
# column with '?' is simply all-False.
is_missing = df.isna() | df.eq('?')
missing_per_column = is_missing.sum()

# One row per affected column, worst first. `mean()` of a boolean mask is the
# missing fraction, so no manual division by len(df) is needed.
missing_df = (
    pd.DataFrame({
        'count': missing_per_column,
        'percent': is_missing.mean() * 100,
    })
    .loc[missing_per_column > 0]
    .sort_values('percent', ascending=False)
)

# Same {column: {'count': ..., 'percent': ...}} mapping the cell has always
# exposed, kept so any code relying on `missing_counts` continues to work.
missing_counts = missing_df.to_dict(orient='index')

if missing_counts:
    print(missing_df)
else:
    print("No missing values found.")

In [None]:
# Cell 4: Target Variable Analysis
print("\n" + "=" * 60)
print("TARGET VARIABLE: READMISSION STATUS")
print("=" * 60)

readmission_counts = df['readmitted'].value_counts()
# Derive the percentages from the very same counts so both Series are
# guaranteed to share one index and one ordering.
readmission_pcts = readmission_counts / readmission_counts.sum() * 100

print("\nReadmission Distribution:")
for val, count in readmission_counts.items():
    pct = readmission_pcts[val]
    print(f"  {val}: {count:,} ({pct:.1f}%)")

# Display label and colour are looked up by category value rather than by
# position: value_counts() orders by frequency, so positional labels would
# silently mislabel the bars if the class ranking were ever different.
status_labels = {
    'NO': 'Not Readmitted (NO)',
    '>30': 'After 30 Days (>30)',
    '<30': 'Within 30 Days (<30)',
}
status_colors = {'NO': '#2ecc71', '>30': '#f39c12', '<30': '#e74c3c'}
statuses = list(readmission_counts.index)

# Visualize
fig, ax = plt.subplots(figsize=(10, 6))
# Unknown categories fall back to their raw value and a neutral grey.
colors = [status_colors.get(status, '#95a5a6') for status in statuses]
bars = ax.bar(range(len(readmission_counts)), readmission_counts.values, color=colors)
ax.set_title('Readmission Status Distribution', fontsize=14, fontweight='bold')
ax.set_xlabel('Readmission Status')
ax.set_ylabel('Count')
ax.set_xticks(range(len(readmission_counts)))
ax.set_xticklabels([status_labels.get(status, str(status)) for status in statuses], rotation=0)

# Annotate each bar with its count and share, matched by category label.
for bar, status in zip(bars, statuses):
    count = readmission_counts[status]
    pct = readmission_pcts[status]
    ax.annotate(f'{count:,}\n({pct:.1f}%)',
                xy=(bar.get_x() + bar.get_width()/2, bar.get_height()),
                ha='center', va='bottom', fontsize=10)

plt.tight_layout()
plt.savefig(OUTPUT_DIR / 'readmission_distribution.png', dpi=150, bbox_inches='tight')
plt.show()

print(f"\n*** KEY INSIGHT: Only {readmission_pcts.get('<30', 0):.1f}% readmitted within 30 days ***")
print("*** This class imbalance MUST be addressed in modeling ***")

In [None]:
# Cell 5: Key Feature Distributions
print("\n" + "=" * 60)
print("KEY FEATURE ANALYSIS")
print("=" * 60)


def _plot_hist_with_mean(ax, values, bins, color, mean_color, title, xlabel):
    """Draw a histogram of `values` on `ax` and mark its mean with a dashed line.

    Parameters
    ----------
    ax : matplotlib Axes to draw on.
    values : pandas Series of numeric values.
    bins : number of histogram bins.
    color : fill colour of the bars.
    mean_color : colour of the vertical mean marker.
    title, xlabel : panel title and x-axis label (the y-axis is always 'Count').
    """
    mean_value = values.mean()  # computed once, used for both line and legend
    values.hist(bins=bins, ax=ax, color=color, edgecolor='white')
    ax.set_title(title, fontweight='bold')
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Count')
    ax.axvline(mean_value, color=mean_color, linestyle='--', label=f"Mean: {mean_value:.1f}")
    ax.legend()


# Age distribution
print("\nAge Distribution:")
age_counts = df['age'].value_counts().sort_index()
print(age_counts)

# 2x2 grid: age (bar chart of category counts) plus three numeric histograms
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Age
ax1 = axes[0, 0]
age_counts.plot(kind='bar', ax=ax1, color='#3498db')
ax1.set_title('Age Distribution', fontweight='bold')
ax1.set_xlabel('Age Range')
ax1.set_ylabel('Count')
ax1.tick_params(axis='x', rotation=45)

# Time in hospital
_plot_hist_with_mean(axes[0, 1], df['time_in_hospital'], bins=14,
                     color='#9b59b6', mean_color='red',
                     title='Time in Hospital (Days)', xlabel='Days')

# Number of medications
_plot_hist_with_mean(axes[1, 0], df['num_medications'], bins=30,
                     color='#e74c3c', mean_color='blue',
                     title='Number of Medications', xlabel='Number of Medications')

# Number of diagnoses
_plot_hist_with_mean(axes[1, 1], df['number_diagnoses'], bins=16,
                     color='#2ecc71', mean_color='red',
                     title='Number of Diagnoses', xlabel='Number of Diagnoses')

plt.tight_layout()
plt.savefig(OUTPUT_DIR / 'feature_distributions.png', dpi=150, bbox_inches='tight')
plt.show()

# Text summary of the same three numeric features (blank line first).
print()
for label, column in [('Time in Hospital', 'time_in_hospital'),
                      ('Number of Medications', 'num_medications'),
                      ('Number of Diagnoses', 'number_diagnoses')]:
    print(f"{label}: Mean={df[column].mean():.1f}, Median={df[column].median():.0f}")

In [None]:
# Cell 6: Feature Correlation with 30-Day Readmission
print("\n" + "=" * 60)
print("FEATURES CORRELATED WITH 30-DAY READMISSION")
print("=" * 60)

# Create binary target on a copy so the raw frame stays untouched.
# (`df_temp` and its 'readmitted_30day' column are reused by the numeric
# correlation cell, so both names must stay as they are.)
df_temp = df.copy()
df_temp['readmitted_30day'] = (df_temp['readmitted'] == '<30').astype(int)
overall_rate_pct = df_temp['readmitted_30day'].mean() * 100

# Calculate readmission rates by key features
print("\nReadmission Rate by Age Group:")
age_readmit = df_temp.groupby('age')['readmitted_30day'].mean().sort_index()
for age, rate in age_readmit.items():
    print(f"  {age}: {rate*100:.1f}%")

print("\nReadmission Rate by Number of Prior Inpatient Visits:")
inpatient_readmit = df_temp.groupby('number_inpatient')['readmitted_30day'].mean()
for visits in [0, 1, 2, 3]:
    if visits in inpatient_readmit.index:
        print(f"  {visits} visits: {inpatient_readmit[visits]*100:.1f}%")
# Single .loc lookup instead of chained boolean indexing.
four_plus_rate = df_temp.loc[df_temp['number_inpatient'] >= 4, 'readmitted_30day'].mean()
print(f"  4+ visits: {four_plus_rate*100:.1f}%")

# Visualize readmission rate by age
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

ax1 = axes[0]
age_readmit_pct = age_readmit * 100
age_readmit_pct.plot(kind='bar', ax=ax1, color='#e74c3c')
ax1.set_title('30-Day Readmission Rate by Age', fontweight='bold')
ax1.set_xlabel('Age Group')
ax1.set_ylabel('Readmission Rate (%)')
ax1.tick_params(axis='x', rotation=45)
ax1.axhline(overall_rate_pct, color='blue', linestyle='--',
           label=f"Overall: {overall_rate_pct:.1f}%")
ax1.legend()

ax2 = axes[1]
# Bin the visit counts as 0 / 1 / 2 / 3-5 / 6+ (right-closed intervals).
# - labels are attached by pd.cut itself, so a label can never be paired
#   with the wrong bin or fail on a length mismatch;
# - the last edge is open-ended (inf) so no large count is silently dropped;
# - observed=False keeps empty bins in the result, independent of the
#   pandas default for categorical groupers.
inpatient_groups = pd.cut(
    df_temp['number_inpatient'],
    bins=[-1, 0, 1, 2, 5, np.inf],
    labels=['0', '1', '2', '3-5', '6+'],
)
inpatient_bins = df_temp.groupby(inpatient_groups, observed=False)['readmitted_30day'].mean() * 100
inpatient_bins.plot(kind='bar', ax=ax2, color='#3498db')
ax2.set_title('30-Day Readmission Rate by Prior Inpatient Visits', fontweight='bold')
ax2.set_xlabel('Number of Prior Inpatient Visits (Last Year)')
ax2.set_ylabel('Readmission Rate (%)')
ax2.tick_params(axis='x', rotation=0)

plt.tight_layout()
plt.savefig(OUTPUT_DIR / 'readmission_by_features.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Cell 7: Correlation Analysis for Numeric Features
header_rule = "=" * 60
print("\n" + header_rule)
print("NUMERIC FEATURE CORRELATIONS")
print(header_rule)

numeric_cols = ['time_in_hospital', 'num_lab_procedures', 'num_procedures',
                'num_medications', 'number_outpatient', 'number_emergency',
                'number_inpatient', 'number_diagnoses']
target_col = 'readmitted_30day'

# Pairwise correlation matrix of the numeric features plus the binary target;
# keep only the target's column, minus its trivial self-correlation, and rank
# it from most positive to most negative.
corr_matrix = df_temp[[*numeric_cols, target_col]].corr()
correlations = corr_matrix[target_col].drop(target_col)
correlations = correlations.sort_values(ascending=False)

print("\nCorrelation with 30-Day Readmission:")
for feature, corr in correlations.items():
    # Arrow shows the sign; anything that is not strictly positive gets "↓".
    if corr > 0:
        direction = "↑"
    else:
        direction = "↓"
    print(f"  {feature}: {corr:.4f} {direction}")

# Visualize: red bars for positive correlations, blue for the rest
fig, ax = plt.subplots(figsize=(10, 6))
colors = np.where(correlations.values > 0, '#e74c3c', '#3498db').tolist()
correlations.plot(kind='barh', ax=ax, color=colors)
ax.set_title('Feature Correlation with 30-Day Readmission', fontweight='bold')
ax.set_xlabel('Pearson Correlation Coefficient')
ax.axvline(0, color='black', linewidth=0.5)  # zero reference line
plt.tight_layout()
plt.savefig(OUTPUT_DIR / 'feature_correlations.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Cell 8: Summary Statistics for Documentation
import json

section_rule = "=" * 60
print("\n" + section_rule)
print("SUMMARY STATISTICS FOR README/DOCUMENTATION")
print(section_rule)

# Boolean masks over the target column; their mean is the share of rows.
readmitted_within_30 = df['readmitted'] == '<30'
readmitted_at_all = df['readmitted'] != 'NO'

# Headline figures, in the order they are printed and written to JSON.
summary_stats = {}
summary_stats['total_encounters'] = len(df)
summary_stats['unique_patients'] = df['patient_nbr'].nunique()
summary_stats['readmit_30day_rate'] = readmitted_within_30.mean() * 100
summary_stats['readmit_any_rate'] = readmitted_at_all.mean() * 100
summary_stats['avg_time_in_hospital'] = df['time_in_hospital'].mean()
summary_stats['avg_num_medications'] = df['num_medications'].mean()
summary_stats['avg_num_diagnoses'] = df['number_diagnoses'].mean()
summary_stats['avg_num_lab_procedures'] = df['num_lab_procedures'].mean()

print("\nKey Statistics:")
for key, value in summary_stats.items():
    # Floats get two decimals; everything else gets thousands separators.
    number_format = ".2f" if isinstance(value, float) else ","
    print(f"  {key}: {value:{number_format}}")

print("\n*** SAVE THESE STATS FOR YOUR README ***")

# Save summary to JSON for later use
summary_path = OUTPUT_DIR / 'exploration_summary.json'
summary_path.write_text(json.dumps(summary_stats, indent=2))
print(f"\nSummary saved to: {summary_path}")

In [None]:
# Cell 9: Key Takeaways for Modeling
print("\n" + "=" * 60)
print("KEY TAKEAWAYS FOR MODELING")
print("=" * 60)

# Headline figures are recomputed from `df` instead of being typed in by hand,
# so the text below cannot drift away from the data it describes.
# The readmission rate is a share of *encounters* (rows), not of patients:
# one patient can contribute several rows (see takeaway 4).
readmit_30day_pct = (df['readmitted'] == '<30').mean() * 100
n_encounters = len(df)
n_patients = df['patient_nbr'].nunique()

# NOTE(review): takeaways 2, 3 and 5 are still hand-written interpretations of
# the earlier cells' output - re-check them if the input data changes.
takeaways = f"""
1. CLASS IMBALANCE: Only ~{readmit_30day_pct:.0f}% of encounters are followed by readmission within 30 days.
   → Must use SMOTE or class weights in modeling.

2. MISSING VALUES: 'weight', 'payer_code', 'medical_specialty' have >40% missing.
   → Consider dropping these columns or advanced imputation.

3. HIGH-RISK FEATURES:
   - Prior inpatient visits (strongest correlation)
   - Number of emergency visits
   - Time in hospital
   - Number of medications

4. PATIENT DEDUPLICATION: Many patients have multiple encounters.
   → Keep only first encounter per patient to avoid data leakage.

5. AGE PATTERN: Readmission risk generally increases with age.
   → Age is an important predictor.

6. DATASET SIZE: ~{n_encounters / 1000:.0f}K encounters from ~{n_patients / 1000:.0f}K unique patients.
   → Sufficient data for robust modeling.
"""
print(takeaways)