In [None]:
# Full reproducible workflow: generate dataset, compute stats, save plots, and write README
# Run in Colab or a Jupyter environment.

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# 1) Create output folders
# exist_ok=True keeps this cell safe to re-run when images/ already exists.
os.makedirs('images', exist_ok=True)

# 2) Generate synthetic dataset (n rows)
# Seeding NumPy's global RNG makes the two draws below reproducible; the
# order of the draws must not change or the generated values will differ.
np.random.seed(42)
n = 200
# HbA1c: normal draws (mean 6.0, sd 0.8) clipped into the 4.5-9.5 range.
hba1c = np.random.normal(6.0, 0.8, n).clip(4.5, 9.5)
# Systolic BP: independent normal noise (mean 120, sd 15) plus 5 units per
# HbA1c point above 6.0, which couples the two variables positively.
bp_systolic = np.random.normal(120, 15, n) + (hba1c - 6.0) * 5

df = pd.DataFrame({
    'PatientID': range(1, n + 1),
    'HbA1c': np.round(hba1c, 2),
    'BP_systolic': np.round(bp_systolic, 1),
})
df.to_csv('synthetic_biomedical_data.csv', index=False)
# Report the real row count rather than a hard-coded 200 so the message
# stays correct if n is changed.
print(f"Saved synthetic_biomedical_data.csv ({len(df)} rows)")

# 3) Descriptive statistics & correlation
# Work on the two measurement columns only (PatientID is just an identifier).
measurements = df[['HbA1c', 'BP_systolic']]
# One row per measurement, describe() statistics as columns, rounded to 2 d.p.
desc = measurements.describe().T.round(2)
# Off-diagonal entry of the 2x2 correlation matrix (pandas' default method).
corr = measurements.corr().loc['HbA1c', 'BP_systolic'].round(2)

# 4) Categorize using clinical thresholds
def categorize_hba1c(x):
    """Map a single HbA1c value to its category label.

    Thresholds: below 5.7 is 'Normal', from 5.7 up to (but excluding) 6.5 is
    'Prediabetes', and 6.5 or above is 'Diabetes'.

    Args:
        x: A scalar HbA1c value, or a missing value (None / NaN).

    Returns:
        The category label as a string, or None when ``x`` is missing.
    """
    # NaN fails every `<` comparison, so without this guard a missing reading
    # would fall through to the last branch and be labelled 'Diabetes'.
    if pd.isna(x):
        return None
    if x < 5.7:
        return 'Normal'
    if x < 6.5:
        return 'Prediabetes'
    return 'Diabetes'

def categorize_bp(x):
    """Map a single systolic blood-pressure value to its category label.

    Thresholds: below 120 is 'Normal', 120 up to (but excluding) 130 is
    'Elevated', 130 up to (but excluding) 140 is 'Hypertension Stage 1', and
    140 or above is 'Hypertension Stage 2'.

    Args:
        x: A scalar systolic BP value, or a missing value (None / NaN).

    Returns:
        The category label as a string, or None when ``x`` is missing.
    """
    # NaN fails every `<` comparison, so without this guard a missing reading
    # would fall through to the last branch and be labelled 'Hypertension Stage 2'.
    if pd.isna(x):
        return None
    if x < 120:
        return 'Normal'
    if x < 130:
        return 'Elevated'
    if x < 140:
        return 'Hypertension Stage 1'
    return 'Hypertension Stage 2'

# Severity ordering (least -> most severe) shared by every table below, so
# the counts, percentages and cross-tab all list categories the same way.
hba1c_order = ['Normal', 'Prediabetes', 'Diabetes']
bp_order = ['Normal', 'Elevated', 'Hypertension Stage 1', 'Hypertension Stage 2']

# Label every patient with the threshold functions defined above.
df['HbA1c_group'] = df['HbA1c'].apply(categorize_hba1c)
df['BP_group'] = df['BP_systolic'].apply(categorize_bp)

# Counts and percentages per category; fill_value=0 keeps a category that has
# no patients in the table as an explicit 0 instead of dropping it.
hba_counts = df['HbA1c_group'].value_counts().reindex(hba1c_order, fill_value=0).astype(int)
hba_perc = (hba_counts / len(df) * 100).round(1)
bp_counts = df['BP_group'].value_counts().reindex(bp_order, fill_value=0).astype(int)
bp_perc = (bp_counts / len(df) * 100).round(1)

# Row-normalised cross-tab: each row shows how one HbA1c group splits across
# the BP groups, in percent. crosstab sorts labels alphabetically and leaves
# out labels that never occur, so the severity ordering is re-imposed here to
# keep the printed table and the heatmap consistent with the counts above.
ct = pd.crosstab(df['HbA1c_group'], df['BP_group'], normalize='index') * 100
ct = ct.reindex(index=hba1c_order, columns=bp_order, fill_value=0).round(1)

print("\nDescriptive statistics:\n", desc)
print("\nCorrelation (HbA1c vs BP_systolic):", corr)
print("\nHbA1c counts:\n", hba_counts.to_dict())
print("\nHbA1c percentages:\n", hba_perc.to_dict())
print("\nBP counts:\n", bp_counts.to_dict())
print("\nBP percentages:\n", bp_perc.to_dict())
print("\nCross-tab (percent within HbA1c groups):\n", ct)

# 5) Create and save plots into images/
# set_theme is the current spelling of the older sns.set alias.
sns.set_theme(style='whitegrid')

# Severity ordering used for axis and legend order in the plots below.
hba1c_levels = ['Normal', 'Prediabetes', 'Diabetes']
bp_levels = ['Normal', 'Elevated', 'Hypertension Stage 1', 'Hypertension Stage 2']


def _save_figure(fig, path):
    """Tighten the layout of *fig*, write it to *path*, close it and report the file."""
    fig.tight_layout()
    fig.savefig(path, dpi=200, bbox_inches='tight')
    # Close explicitly so re-running the cell does not accumulate open figures.
    plt.close(fig)
    print(f"Saved {path}")


def _save_distribution_boxplot(column, path):
    """Save a histogram (with KDE) and a boxplot of ``df[column]`` side by side."""
    fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(12, 5))
    sns.histplot(df[column], kde=True, bins=20, ax=hist_ax)
    hist_ax.set_title(f'{column} Distribution')
    sns.boxplot(x=df[column], ax=box_ax)
    box_ax.set_title(f'{column} Boxplot')
    _save_figure(fig, path)


# HbA1c distribution + boxplot
_save_distribution_boxplot('HbA1c', 'images/HbA1c_distribution_boxplot.png')

# BP distribution + boxplot
_save_distribution_boxplot('BP_systolic', 'images/BP_systolic_distribution_boxplot.png')

# Scatterplot
# hue_order pins the legend and the coolwarm colours to severity order
# (cool = Normal ... warm = Stage 2) instead of order of first appearance.
fig, ax = plt.subplots(figsize=(7, 5))
sns.scatterplot(
    data=df, x='HbA1c', y='BP_systolic',
    hue='BP_group', hue_order=bp_levels,
    palette='coolwarm', legend='brief', ax=ax,
)
ax.set_title('HbA1c vs BP_systolic')
_save_figure(fig, 'images/hba1c_vs_bp_scatter.png')

# Category counts
fig, (hba_ax, bp_ax) = plt.subplots(1, 2, figsize=(12, 4))
sns.countplot(data=df, x='HbA1c_group', order=hba1c_levels, ax=hba_ax)
hba_ax.set_title('HbA1c Category Counts')
sns.countplot(data=df, x='BP_group', order=bp_levels, ax=bp_ax)
bp_ax.set_title('BP Category Counts')
# Only the BP panel has long labels, so only its x tick labels are rotated.
bp_ax.tick_params(axis='x', labelrotation=20)
_save_figure(fig, 'images/category_counts.png')

# Cross-tab heatmap
fig, ax = plt.subplots(figsize=(8, 5))
sns.heatmap(ct, annot=True, fmt='.1f', cmap='Blues', ax=ax)
ax.set_title('Cross-tabulation: HbA1c vs BP (%)')
_save_figure(fig, 'images/hba1c_bp_crosstab_heatmap.png')

# 6) Programmatically write README_generated.md (embedding computed numbers and image paths)
# Relative paths of the figures, as they are referenced from the README.
img_hba = 'images/HbA1c_distribution_boxplot.png'
img_bp = 'images/BP_systolic_distribution_boxplot.png'
img_scatter = 'images/hba1c_vs_bp_scatter.png'
img_counts = 'images/category_counts.png'
img_heat = 'images/hba1c_bp_crosstab_heatmap.png'

# Percentage of diabetic patients with hypertension stages 1 or 2 (for the README sentence)
# Counted with boolean masks rather than by indexing a crosstab, so a
# hypertension stage that never occurs contributes 0 instead of a KeyError.
diabetes_hypertension_pct = 0
is_diabetic = df['HbA1c_group'] == 'Diabetes'
n_diabetic = int(is_diabetic.sum())
if n_diabetic > 0:  # guard against division by zero when nobody is diabetic
    is_hypertensive = df['BP_group'].isin(['Hypertension Stage 1', 'Hypertension Stage 2'])
    n_diabetic_hypertensive = int((is_diabetic & is_hypertensive).sum())
    # round() instead of int(): truncation under-reports (e.g. 0.29 * 100 is
    # 28.999... in floating point and int() would turn it into 28).
    diabetes_hypertension_pct = round(n_diabetic_hypertensive / n_diabetic * 100)

readme = f\"\"\"# üß¨ Biomedical Data Exploration (EDA)

> Exploratory Data Analysis (EDA) of synthetic biomedical indicators — **HbA1c** and **Systolic Blood Pressure (BP)** — to uncover relationships between glucose control and cardiovascular health.

---

## 📁 Project Overview

This project demonstrates **data processing, cleaning, and visualization** techniques using a **synthetic biomedical dataset** of 200 patients.  
The analysis explores potential associations between **HbA1c** (glycated hemoglobin) and **BP_systolic** (systolic blood pressure), two key indicators of metabolic and cardiovascular health.

---

## 🎯 Objectives

- Perform **descriptive statistical analysis** on HbA1c and BP.  
- Visualize data distributions and bivariate relationships.  
- Categorize patients into **clinically meaningful groups**.  
- Extract **biomedical insights** consistent with real-world trends.

---

## 📊 Dataset Information

| Feature | Description | Units | Typical Range |
|----------|--------------|--------|----------------|
| **PatientID** | Unique patient identifier | — | 1–200 |
| **HbA1c** | Glycated hemoglobin (indicator of long-term glucose control) | % | 4.5 – 9.5 |
| **BP_systolic** | Systolic blood pressure (upper reading) | mmHg | 90 – 180 |

Synthetic data was generated with mild positive correlation between HbA1c and BP_systolic to mimic realistic biomedical variability.

---

## ⚙️ Technologies Used

- **Python 3.x**
- **NumPy** — numerical computations  
- **Pandas** — data manipulation  
- **Matplotlib & Seaborn** — visualization  
- **Jupyter Notebook / Google Colab** — analysis environment

---

## 🧩 Exploratory Data Analysis (EDA)

### 1️⃣ Descriptive Statistics

| Metric | HbA1c | BP_systolic |
|--------:|:-------:|:-------------:|
| **Mean** | {desc.loc['HbA1c','mean']} | {desc.loc['BP_systolic','mean']} |
| **Std Dev** | {desc.loc['HbA1c','std']} | {desc.loc['BP_systolic','std']} |
| **Min–Max** | {desc.loc['HbA1c','min']} – {desc.loc['HbA1c','max']} | {desc.loc['BP_systolic','min']} – {desc.loc['BP_systolic','max']} |
| **Correlation (r)** | **{corr}** | Positive correlation |

---

### 2️⃣ Distribution Plots

#### HbA1c Distribution & Boxplot

![HbA1c Distribution]({img_hba})

#### BP_systolic Distribution & Boxplot

![BP_systolic Distribution]({img_bp})

**Insights:**  
- HbA1c shows slight right skew; ~{int(hba_perc.get('Diabetes',0))}% diabetic range (≥6.5%).  
- BP_systolic roughly normal with some high-BP outliers.

---

### 3️⃣ Correlation & Relationship

#### Scatterplot: HbA1c vs BP_systolic

![HbA1c vs BP Scatterplot]({img_scatter})

**Observation:**  
Moderate positive correlation *(r ≈ {corr})* — individuals with higher HbA1c tend to have higher systolic blood pressure.

---

### 4️⃣ Biomedical Categorization

| Metric | Category Logic |
|---------|----------------|
| **HbA1c** | Normal: <5.7%, Prediabetes: 5.7–6.4%, Diabetes: ≥ 6.5% |
| **BP_systolic** | Normal: <120, Elevated: 120–129, Hypertension Stage 1: 130–139, Stage 2: ≥140 |

#### Category Counts

![Category Counts]({img_counts})

---

### 5️⃣ Cross-Tabulation Heatmap

![Cross-tab Heatmap]({img_heat})

**Interpretation:**  
- {diabetes_hypertension_pct}% of diabetic patients fall into Hypertension Stage 1–2.  
- Over half of normoglycemic individuals have normal BP.  
- Pattern aligns with metabolic comorbidity seen in clinical studies.

---

## 🧠 Key Insights

- Positive correlation between HbA1c and systolic BP.  
- Prediabetes and elevated BP commonly overlap — early metabolic risk.  
- Synthetic data mirrors population-level biomedical relationships.

---

## 🚀 Next Steps

- Add additional biomarkers (e.g., BMI, Age, Cholesterol).  
- Perform multivariate regression to predict BP.  
- Apply PCA for dimensionality reduction.  
- Build an interactive dashboard (Streamlit) for exploration.

---

## 🧾 How to Use

Clone this repository:

```bash
git clone https://github.com/yourusername/biomedical-eda.git
cd biomedical-eda
