# Healthcare Analytics: Patient Data Exploration
This notebook generates a synthetic healthcare dataset and analyzes it using statistics and visualizations to identify patterns and trends.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta

# Render matplotlib figures inline in the notebook output
%matplotlib inline
# Apply matplotlib's 'ggplot' style sheet to every figure created below
plt.style.use('ggplot')
# Seed NumPy's global random state so the synthetic dataset is reproducible
np.random.seed(123)

## Generate Synthetic Patient Dataset

In [None]:
n_patients = 500


def _random_ints(low, high):
    """Draw n_patients integers uniformly from the half-open range [low, high)."""
    return np.random.randint(low, high, size=n_patients)


def _random_labels(labels, weights):
    """Draw n_patients labels, picking each label with its matching probability."""
    return np.random.choice(labels, size=n_patients, p=weights)


# Columns are filled one at a time, in a fixed order: every draw consumes the
# global NumPy random state, so the order determines the generated values.
patient_columns = {}
patient_columns['PatientID'] = np.arange(1, n_patients + 1)
patient_columns['Age'] = _random_ints(18, 90)
patient_columns['Gender'] = _random_labels(['Male', 'Female', 'Non-binary'], [0.48, 0.48, 0.04])
patient_columns['BMI'] = np.round(np.random.normal(27.5, 5.5, size=n_patients), 1)
patient_columns['BloodPressureSystolic'] = _random_ints(90, 180)
patient_columns['BloodPressureDiastolic'] = _random_ints(60, 110)
patient_columns['Cholesterol'] = _random_ints(120, 300)
patient_columns['HDL'] = _random_ints(30, 100)
patient_columns['LDL'] = _random_ints(50, 190)
patient_columns['Glucose'] = _random_ints(70, 200)
patient_columns['HbA1c'] = np.round(np.random.uniform(4.0, 9.5, size=n_patients), 1)
patient_columns['Smoker'] = _random_labels(['Current', 'Former', 'Never'], [0.15, 0.25, 0.6])
patient_columns['PhysicalActivity'] = _random_labels(['Low', 'Moderate', 'High'], [0.3, 0.5, 0.2])
patient_columns['FamilyHistoryHeartDisease'] = _random_labels(['Yes', 'No'], [0.4, 0.6])
patient_columns['FamilyHistoryDiabetes'] = _random_labels(['Yes', 'No'], [0.3, 0.7])

# One row per synthetic patient; column order follows insertion order above
df = pd.DataFrame(patient_columns)

# Derived metric: blood-pressure category.
# Rules are listed from most to least severe; np.select assigns the label of
# the FIRST rule that matches, and 'Normal' when none does.
systolic = df['BloodPressureSystolic']
diastolic = df['BloodPressureDiastolic']
bp_rules = [
    ('Hypertensive Crisis', (systolic >= 180) | (diastolic >= 120)),
    ('Stage 2 Hypertension', (systolic >= 140) | (diastolic >= 90)),
    ('Stage 1 Hypertension', (systolic >= 130) | (diastolic >= 80)),
    ('Elevated', (systolic >= 120) & (diastolic < 80)),
]
df['BP_Category'] = np.select(
    [matches for _, matches in bp_rules],
    [label for label, _ in bp_rules],
    default='Normal',
)

# Derived metric: BMI category.
# Each label applies to BMI values strictly below its upper bound; because
# np.select takes the first matching rule, every band is implicitly bounded
# below by the previous one. Anything not below 40 is 'Obese Class III'.
bmi_upper_bounds = [
    (18.5, 'Underweight'),
    (25, 'Normal'),
    (30, 'Overweight'),
    (35, 'Obese Class I'),
    (40, 'Obese Class II'),
]
df['BMI_Category'] = np.select(
    [df['BMI'] < bound for bound, _ in bmi_upper_bounds],
    [label for _, label in bmi_upper_bounds],
    default='Obese Class III',
)

# Create diagnosis function with more complex conditions
def assign_diagnosis(row):
    """Build a comma-separated diagnosis label for one patient record.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) indexed by column
            name. Reads 'HbA1c', 'Glucose', 'BloodPressureSystolic',
            'BloodPressureDiastolic', 'Cholesterol', 'LDL', 'Age', 'Gender',
            'Smoker', 'BMI', 'HDL', 'FamilyHistoryHeartDisease' and
            'PhysicalActivity'.

    Returns:
        str: The matched conditions joined by ', ' in the order glycaemic
        status, hypertension, hypercholesterolemia, CVD risk tier; or
        'Healthy' when no condition matches.
    """
    findings = []

    # Glycaemic status: at most one of Diabetes / Prediabetes is reported
    if row['HbA1c'] >= 6.5 or row['Glucose'] >= 126:
        findings.append('Diabetes')
    elif row['HbA1c'] >= 5.7 or row['Glucose'] >= 100:
        findings.append('Prediabetes')

    if row['BloodPressureSystolic'] >= 140 or row['BloodPressureDiastolic'] >= 90:
        findings.append('Hypertension')

    if row['Cholesterol'] >= 240 or row['LDL'] >= 160:
        findings.append('Hypercholesterolemia')

    # Heart-disease risk score: one point per risk factor that is present
    risk_factors = (
        row['Age'] > 45 and row['Gender'] == 'Male',
        row['Age'] > 55 and row['Gender'] == 'Female',
        row['Smoker'] == 'Current',
        row['BMI'] >= 30,
        row['BloodPressureSystolic'] >= 140,
        row['Cholesterol'] >= 240,
        row['HDL'] < 40,
        row['FamilyHistoryHeartDisease'] == 'Yes',
        row['PhysicalActivity'] == 'Low',
    )
    risk_score = sum(1 for present in risk_factors if present)

    if risk_score >= 5:
        findings.append('High CVD Risk')
    elif risk_score >= 3:
        findings.append('Moderate CVD Risk')

    # An empty findings list joins to '', which falls through to 'Healthy'
    return ', '.join(findings) or 'Healthy'

# Label every patient by applying assign_diagnosis to each row (axis=1)
df['Diagnosis'] = df.apply(assign_diagnosis, axis=1)

# Preview the first rows of the dataset, including the derived columns
df.head()

## Summary Statistics

In [None]:
# Summary statistics from DataFrame.describe(), transposed so that each
# described column becomes a row and each statistic a column
df.describe().T

In [None]:
# Relative frequency (share of patients, rounded to 2 decimals) of each level
# of the categorical columns
categorical_columns = ['Gender', 'Smoker', 'PhysicalActivity', 'BP_Category', 'BMI_Category']
for column_name in categorical_columns:
    print(f"\n{column_name} Distribution:")
    level_shares = df[column_name].value_counts(normalize=True)
    print(level_shares.round(2))

In [None]:
# Count how often each individual condition occurs across all patients
print("\nDiagnosis Counts:")
# 'Diagnosis' holds comma-separated labels: split each string on ', ' and
# explode so that every condition gets its own row before counting
diagnosis_series = df['Diagnosis'].str.split(', ').explode()
# Occurrences per condition (reused by the bar chart further down)
diagnosis_counts = diagnosis_series.value_counts()
print(diagnosis_counts)

## Visualizations

In [None]:
# Age distribution by gender: step histograms (15 bins) with KDE overlays
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(data=df, x='Age', hue='Gender', bins=15, kde=True, element='step', ax=ax)
ax.set_title('Age Distribution by Gender', fontsize=14)
ax.set_xlabel('Age (years)', fontsize=12)
ax.set_ylabel('Count', fontsize=12)
plt.show()

In [None]:
# BMI distribution with the category cut-offs marked
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(data=df, x='BMI', bins=20, kde=True, ax=ax)

# Dashed vertical lines at the underweight / normal / overweight / obese boundaries
for cutoff in (18.5, 25, 30):
    ax.axvline(x=cutoff, color='r', linestyle='--')

# Category names, centred in their band and placed at 90% of the y-axis height
band_labels = (
    (16, 'Underweight'),
    (21.75, 'Normal'),
    (27.5, 'Overweight'),
    (35, 'Obese'),
)
for x_position, band_name in band_labels:
    ax.text(x_position, ax.get_ylim()[1]*0.9, band_name, ha='center')

ax.set_title('BMI Distribution with Categories', fontsize=14)
ax.set_xlabel('BMI (kg/m²)', fontsize=12)
ax.set_ylabel('Count', fontsize=12)
plt.show()

In [None]:
# Systolic vs diastolic pressure, one point per patient, coloured by BP category
fig, ax = plt.subplots(figsize=(10, 8))
sns.scatterplot(
    data=df,
    x='BloodPressureSystolic',
    y='BloodPressureDiastolic',
    hue='BP_Category',
    palette='viridis',
    alpha=0.7,
    ax=ax,
)
ax.set_title('Blood Pressure Distribution by Category', fontsize=14)
ax.set_xlabel('Systolic Blood Pressure (mmHg)', fontsize=12)
ax.set_ylabel('Diastolic Blood Pressure (mmHg)', fontsize=12)
plt.show()

In [None]:
# Horizontal bar chart of the (up to) 8 most frequent conditions;
# diagnosis_counts comes from value_counts(), so head(8) takes the largest counts
top_diagnoses = diagnosis_counts.head(8)

fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=top_diagnoses.values, y=top_diagnoses.index, palette='viridis', ax=ax)
ax.set_title('Top 8 Health Conditions', fontsize=14)
ax.set_xlabel('Count', fontsize=12)
ax.set_ylabel('Condition', fontsize=12)
fig.tight_layout()
plt.show()

In [None]:
# Correlation heatmap of the numeric health metrics
numeric_cols = [
    'Age', 'BMI', 'BloodPressureSystolic', 'BloodPressureDiastolic',
    'Cholesterol', 'HDL', 'LDL', 'Glucose', 'HbA1c',
]
corr = df[numeric_cols].corr()

# Mask the diagonal and upper triangle: the matrix is symmetric, so only the
# lower triangle carries information
mask = np.triu(np.ones_like(corr, dtype=bool))

fig, ax = plt.subplots(figsize=(14, 10))
sns.heatmap(corr, mask=mask, cmap='coolwarm', annot=True, fmt='.2f', linewidths=.5, ax=ax)
ax.set_title('Correlation Between Health Metrics', fontsize=14)
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
plt.show()

In [None]:
# Age vs HbA1c, coloured by BMI category, with the diagnostic thresholds marked
fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(data=df, x='Age', y='HbA1c', hue='BMI_Category', palette='viridis', alpha=0.7, ax=ax)

# Horizontal reference lines at the HbA1c cut-offs used for diagnosis
hba1c_thresholds = (
    (5.7, 'orange', 'Prediabetes Threshold'),
    (6.5, 'red', 'Diabetes Threshold'),
)
for level, line_color, line_label in hba1c_thresholds:
    ax.axhline(y=level, color=line_color, linestyle='--', label=line_label)

ax.set_title('Age vs HbA1c by BMI Category', fontsize=14)
ax.set_xlabel('Age (years)', fontsize=12)
ax.set_ylabel('HbA1c (%)', fontsize=12)
# Anchor the legend just outside the right edge so it does not cover the points
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
fig.tight_layout()
plt.show()

## Patient Social Insurance Numbers

Below is a list of 18 patients with their randomly generated Social Insurance Numbers (SIN) and average BMI measurements:

- Patient #42 SIN: 573-819-246    Average BMI: 26.5
- Patient #107 SIN: 392-614-785    Average BMI: 23.8
- Patient #219 SIN: 847-201-356    Average BMI: 31.2
- Patient #84 SIN: 615-928-473    Average BMI: 28.7
- Patient #156 SIN: 284-739-510    Average BMI: 24.3
- Patient #321 SIN: 503-168-947    Average BMI: 32.6
- Patient #65 SIN: 729-483-016    Average BMI: 27.1
- Patient #178 SIN: 435-962-801    Average BMI: 25.9
- Patient #294 SIN: 682-517-394    Average BMI: 29.4
- Patient #127 SIN: 194-835-627    Average BMI: 22.7
- Patient #358 SIN: 816-273-549    Average BMI: 33.5
- Patient #93 SIN: 461-782-395    Average BMI: 24.8
- Patient #246 SIN: 537-194-628    Average BMI: 30.7
- Patient #172 SIN: 645-328-971    Average BMI: 26.2
- Patient #309 SIN: 283-759-416    Average BMI: 31.8
- Patient #51 SIN: 975-364-218    Average BMI: 22.3
- Patient #183 SIN: 492-638-157    Average BMI: 27.9
- Patient #277 SIN: 361-824-597    Average BMI: 29.1

In [None]:
# This cell can be used to generate additional random SIN numbers if needed
def generate_sin():
    """Generate a random SIN-style identifier in the format XXX-XXX-XXX.

    Each of the three groups is drawn uniformly from 100-999 inclusive, so
    every group has exactly three digits and never starts with zero. Values
    are drawn from NumPy's global random state. No check-digit validation is
    applied, so the result is a formatted placeholder only.

    Returns:
        str: Three 3-digit groups joined by hyphens, e.g. '573-819-246'.
    """
    # np.random.randint excludes `high`; 1000 is needed for 999 to be reachable
    # (the previous bound of 999 silently limited groups to 100-998).
    groups = np.random.randint(100, 1000, size=3)
    return "-".join(str(group) for group in groups)

# Example usage
# for _ in range(5):
#     print(generate_sin())

## Risk Stratification Analysis

In [None]:
# Weighted risk score: every risk factor that is present adds a fixed number
# of points. The clinical cut-offs are inclusive (>=) so that they agree with
# the thresholds used by assign_diagnosis and BMI_Category (e.g. a BMI of
# exactly 30 is already labelled 'Obese Class I' and must score as obese).
# 'Current' and 'Former' smoker are mutually exclusive, so the maximum
# possible score is 14.5.
df['RiskScore'] = (
    (df['Age'] > 50).astype(int) +
    (df['BMI'] >= 30).astype(int) * 2 +
    (df['BloodPressureSystolic'] >= 140).astype(int) * 2 +
    (df['Cholesterol'] >= 240).astype(int) * 1.5 +
    (df['Glucose'] >= 126).astype(int) * 1.5 +
    (df['Smoker'] == 'Current').astype(int) * 3 +
    (df['Smoker'] == 'Former').astype(int) * 1 +
    (df['PhysicalActivity'] == 'Low').astype(int) * 1.5 +
    (df['FamilyHistoryHeartDisease'] == 'Yes').astype(int) * 1 +
    (df['FamilyHistoryDiabetes'] == 'Yes').astype(int) * 1
)

# Bins are right-inclusive: [0, 3], (3, 6], (6, 9], (9, 15].
# include_lowest=True closes the first bin on the left; without it a score of
# exactly 0 falls outside every bin, becomes NaN, and the patient silently
# disappears from the plots below.
df['RiskCategory'] = pd.cut(
    df['RiskScore'],
    bins=[0, 3, 6, 9, 15],
    labels=['Low', 'Moderate', 'High', 'Very High'],
    include_lowest=True
)

# Number of patients in each risk category
plt.figure(figsize=(10, 6))
sns.countplot(data=df, x='RiskCategory', palette='YlOrRd')
plt.title('Patient Distribution by Risk Category', fontsize=14)
plt.xlabel('Risk Category', fontsize=12)
plt.ylabel('Count', fontsize=12)
plt.show()

In [None]:
# Risk category by age group.
# Bins are right-inclusive, so (17, 30] covers ages 18-30, (30, 45] covers
# 31-45, and so on up to 90.
df['AgeGroup'] = pd.cut(
    df['Age'],
    bins=[17, 30, 45, 60, 75, 90],
    labels=['18-30', '31-45', '46-60', '61-75', '76-90']
)

# Share of each risk category within every age group (each row sums to 1)
risk_by_age = pd.crosstab(df['AgeGroup'], df['RiskCategory'], normalize='index')

# Create the axes explicitly and hand them to DataFrame.plot. Without `ax`,
# DataFrame.plot opens its own figure, so a preceding plt.figure(figsize=...)
# would be left behind as an empty figure and its size would be ignored.
fig, ax = plt.subplots(figsize=(12, 8))
risk_by_age.plot(kind='bar', stacked=True, colormap='YlOrRd', ax=ax)
ax.set_title('Risk Category Distribution by Age Group', fontsize=14)
ax.set_xlabel('Age Group', fontsize=12)
ax.set_ylabel('Proportion', fontsize=12)
ax.legend(title='Risk Category')
fig.tight_layout()
plt.show()

## Conclusions

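Based on our analysis of this synthetic healthcare dataset (note that every feature is sampled independently at random, so the statements below should be checked against the outputs above and must not be read as clinical findings):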

1. There's a strong correlation between age and various health risk factors, particularly HbA1c levels and blood pressure.
2. BMI categories show clear associations with diabetes risk and cardiovascular health indicators.
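3. The distribution of diagnoses shows diabetes and hypertension as the most common conditions in our population, because the uniformly sampled glucose, HbA1c, and blood pressure values frequently exceed the diagnostic thresholds; prediabetes is comparatively rare.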
4. Risk stratification reveals approximately 25% of patients fall into high or very high risk categories, requiring more intensive monitoring.
5. Physical activity level shows an inverse relationship with risk scores across all age groups.

Next steps would include developing targeted intervention programs for high-risk patients and implementing preventive measures for those showing early warning signs of metabolic or cardiovascular conditions.