# Synthetic Clinical Data Generator
This notebook demonstrates how to generate high-fidelity synthetic tabular data using SDV's `GaussianCopulaSynthesizer` or `CTGANSynthesizer`, and how to compare the synthetic data against the original dataset.

In [None]:
# Install required libraries
!pip install sdv pandas scikit-learn

## Upload Your Real Dataset

In [None]:
from google.colab import files
import io

import pandas as pd

# Open Colab's upload dialog. The values of the returned mapping are the raw
# bytes of each uploaded file (they are wrapped in BytesIO below).
uploaded = files.upload()

# The mapping is empty when nothing was selected; fail with a clear message
# instead of an IndexError from indexing an empty list.
if not uploaded:
    raise ValueError("No file was uploaded. Re-run this cell and select a CSV file.")

# Only the first uploaded file is used; any additional files are ignored.
uploaded_name, uploaded_bytes = next(iter(uploaded.items()))
real_df = pd.read_csv(io.BytesIO(uploaded_bytes))

# A header-only CSV would otherwise fail later, inside the synthesizer.
if real_df.empty:
    raise ValueError(f"'{uploaded_name}' contains no data rows.")

real_df.head()

## Train Synthesizer and Generate Data

In [None]:
from sdv.metadata import SingleTableMetadata
from sdv.single_table import GaussianCopulaSynthesizer, CTGANSynthesizer

method = 'gaussian'  # change to 'ctgan' if needed

# Explicit lookup of the supported synthesizers. Previously any value other
# than exactly 'gaussian' (including typos or 'Gaussian') silently trained CTGAN.
synthesizer_classes = {
    'gaussian': GaussianCopulaSynthesizer,
    'ctgan': CTGANSynthesizer,
}
method_key = method.strip().lower()
if method_key not in synthesizer_classes:
    raise ValueError(
        f"Unknown method {method!r}; expected one of {sorted(synthesizer_classes)}."
    )

# Column types are auto-detected from the uploaded data.
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(real_df)

# Fit on the real data, then sample as many synthetic rows as there are real rows.
synthesizer = synthesizer_classes[method_key](metadata)
synthesizer.fit(real_df)
synthetic_df = synthesizer.sample(num_rows=len(real_df))
synthetic_df.head()

## Download Synthetic Dataset

In [None]:
# Write the synthetic table to a CSV file (row index omitted), then hand that
# file to Colab's download helper.
output_filename = 'synthetic_output.csv'
synthetic_df.to_csv(output_filename, index=False)
files.download(output_filename)

## Visualize Original vs Synthetic Distributions

In [None]:
import matplotlib.pyplot as plt

# Select key variables
columns_to_plot = ['Age', 'CRP', 'LOS', 'Audit_Score']
n_bins = 20

for col in columns_to_plot:
    # The column names are specific to one dataset; skip any the uploaded
    # file does not have instead of stopping the whole cell with a KeyError.
    if col not in real_df.columns or col not in synthetic_df.columns:
        print(f"Skipping '{col}': column not found in both datasets.")
        continue
    if not pd.api.types.is_numeric_dtype(real_df[col]):
        print(f"Skipping '{col}': column is not numeric.")
        continue

    real_values = real_df[col].dropna()
    synth_values = synthetic_df[col].dropna()
    if real_values.empty or synth_values.empty:
        print(f"Skipping '{col}': no non-missing values to plot.")
        continue

    # Use one common range so both histograms share identical bin edges;
    # otherwise each call picks its own bins and the overlaid bars are not
    # directly comparable.
    shared_range = (
        min(real_values.min(), synth_values.min()),
        max(real_values.max(), synth_values.max()),
    )

    fig, ax = plt.subplots(figsize=(8, 4))
    ax.hist(real_values, bins=n_bins, range=shared_range, alpha=0.6,
            label='Original', density=True)
    ax.hist(synth_values, bins=n_bins, range=shared_range, alpha=0.6,
            label='Synthetic', density=True)
    ax.set_title(f'Distribution Comparison: {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Density')
    ax.legend()
    ax.grid(True)
    plt.show()

## Categorical Proportion Comparison

In [None]:
categorical_cols = ['MDR_Status', 'ICU_Transfer', 'Mortality', 'ESBL_Present']
bar_width = 0.35

for col in categorical_cols:
    # The column names are specific to one dataset; skip any the uploaded
    # file does not have instead of stopping the whole cell with a KeyError.
    if col not in real_df.columns or col not in synthetic_df.columns:
        print(f"Skipping '{col}': column not found in both datasets.")
        continue

    # Relative frequency of each category in the two datasets.
    real_counts = real_df[col].value_counts(normalize=True)
    synth_counts = synthetic_df[col].value_counts(normalize=True)

    # Union of categories, so a level present in only one dataset still gets
    # a slot (with proportion 0 on the other side).
    all_categories = set(real_counts.index).union(synth_counts.index)
    try:
        categories = sorted(all_categories)
    except TypeError:
        # Mixed-type labels (e.g. str and int) cannot be compared directly.
        categories = sorted(all_categories, key=str)

    real_vals = [real_counts.get(cat, 0) for cat in categories]
    synth_vals = [synth_counts.get(cat, 0) for cat in categories]

    positions = list(range(len(categories)))
    fig, ax = plt.subplots(figsize=(6, 4))
    ax.bar(positions, real_vals, width=bar_width, label='Original', alpha=0.6)
    ax.bar([p + bar_width for p in positions], synth_vals, width=bar_width,
           label='Synthetic', alpha=0.6)
    # Place each tick midway between the paired bars.
    ax.set_xticks([p + bar_width / 2 for p in positions])
    ax.set_xticklabels([str(cat) for cat in categories])
    ax.set_ylabel("Proportion")
    ax.set_title(f"Proportion Comparison: {col}")
    ax.legend()
    ax.grid(True)
    plt.show()

## Correlation Heatmaps (Numeric Variables)

In [None]:
import seaborn as sns

# include='number' selects every numeric dtype (int32, float32, ...), not only
# int64/float64.
numeric_cols = real_df.select_dtypes(include='number').columns

if len(numeric_cols) == 0:
    print("No numeric columns found; skipping correlation heatmaps.")
else:
    fig, (ax_real, ax_synth) = plt.subplots(1, 2, figsize=(12, 5))

    # Pin both colour scales to the full correlation range [-1, 1] so the same
    # shade means the same correlation strength in either panel.
    shared_kwargs = dict(annot=True, vmin=-1, vmax=1)

    sns.heatmap(real_df[numeric_cols].corr(), cmap='Blues', ax=ax_real, **shared_kwargs)
    ax_real.set_title("Original Data Correlation")

    sns.heatmap(synthetic_df[numeric_cols].corr(), cmap='Oranges', ax=ax_synth, **shared_kwargs)
    ax_synth.set_title("Synthetic Data Correlation")

    fig.tight_layout()
    plt.show()

## PCA and t-SNE Projection Visualization

In [None]:

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

import inspect

import numpy as np

# Prepare numeric data. The same column list (taken from the real data) is
# used for both frames so the scaler and PCA see identical features.
embed_cols = real_df.select_dtypes(include='number').columns
numeric_data_real = real_df[embed_cols].dropna()
numeric_data_synth = synthetic_df[embed_cols].dropna()

# Align shapes
min_len = min(len(numeric_data_real), len(numeric_data_synth))
if min_len < 2 or len(embed_cols) < 2:
    raise ValueError(
        "PCA/t-SNE need at least 2 complete rows per dataset and 2 numeric columns; "
        f"got {min_len} rows and {len(embed_cols)} numeric columns."
    )
numeric_data_real = numeric_data_real.sample(min_len, random_state=42)
numeric_data_synth = numeric_data_synth.sample(min_len, random_state=42)

# Standardize: fit on the real data only, then apply the same scaling to the
# synthetic data so both live in the same feature space.
scaler = StandardScaler()
real_scaled = scaler.fit_transform(numeric_data_real)
synth_scaled = scaler.transform(numeric_data_synth)

# PCA: axes are learned from the real data; synthetic points are projected
# onto those same axes.
pca = PCA(n_components=2)
real_pca = pca.fit_transform(real_scaled)
synth_pca = pca.transform(synth_scaled)

# t-SNE has no transform() for new points, and two separate fits produce
# unrelated coordinate systems. Embed real and synthetic rows together in a
# single fit, then split the result back into the two groups.
combined_scaled = np.vstack([real_scaled, synth_scaled])

# scikit-learn renamed TSNE's `n_iter` to `max_iter`; use whichever keyword
# the installed version accepts.
tsne_iter_kwarg = 'max_iter' if 'max_iter' in inspect.signature(TSNE).parameters else 'n_iter'
tsne = TSNE(
    n_components=2,
    random_state=42,
    # perplexity must be smaller than the number of samples
    perplexity=min(30, len(combined_scaled) - 1),
    **{tsne_iter_kwarg: 300},
)
combined_tsne = tsne.fit_transform(combined_scaled)
real_tsne = combined_tsne[:min_len]
synth_tsne = combined_tsne[min_len:]

fig, (ax_pca, ax_tsne) = plt.subplots(1, 2, figsize=(12, 5))

# Plot PCA
ax_pca.scatter(real_pca[:, 0], real_pca[:, 1], alpha=0.6, label='Original', c='blue')
ax_pca.scatter(synth_pca[:, 0], synth_pca[:, 1], alpha=0.6, label='Synthetic', c='orange')
ax_pca.set_title("PCA: Original vs Synthetic")
ax_pca.legend()

# Plot t-SNE
ax_tsne.scatter(real_tsne[:, 0], real_tsne[:, 1], alpha=0.6, label='Original', c='green')
ax_tsne.scatter(synth_tsne[:, 0], synth_tsne[:, 1], alpha=0.6, label='Synthetic', c='red')
ax_tsne.set_title("t-SNE: Original vs Synthetic")
ax_tsne.legend()

fig.tight_layout()
plt.show()
