In [None]:
import math
import os
from typing import Optional, Tuple, Dict, Any

import joblib
import pandas as pd
import torch
import torch.nn as nn

from sklearn.compose import ColumnTransformer

### Data Pipeline for Loading and Sampling Synthetic Data

In [None]:
# ==== Utilities: save/load, sampling, inverse-transform, export ====
import os
import math
import joblib
from typing import Optional, Tuple, Dict, Any

def save_gan(bundle_path: str,
             G: nn.Module,
             D: nn.Module,
             preprocessor: ColumnTransformer,
             meta: Optional[Dict[str, Any]] = None):
    """
    Persist a trained GAN and its preprocessor as a two-file bundle.

    Saves:
      - {bundle_path}.pt  : torch checkpoint with keys "G_state", "D_state", "meta"
      - {bundle_path}.prep: joblib dump of a dict with keys "preprocessor" and
                            "feature_names_" (None when names cannot be obtained)

    Args:
        bundle_path: Path prefix (no extension). Its parent directory is created
            if it does not exist yet.
        G: Generator module; only its state_dict is stored.
        D: Discriminator module; only its state_dict is stored.
        preprocessor: Preprocessor object, pickled as-is with joblib.
        meta: Optional extra metadata stored in the checkpoint; None becomes {}.
    """
    # Create the target directory up front so neither write fails on a missing folder.
    out_dir = os.path.dirname(bundle_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    ckpt = {
        "G_state": G.state_dict(),
        "D_state": D.state_dict(),
        "meta": (meta or {}),
    }
    torch.save(ckpt, f"{bundle_path}.pt")

    # Feature names are a convenience only: an unfitted preprocessor or a step
    # without feature-name support must not abort the save after the .pt file
    # has already been written.
    names_getter = getattr(preprocessor, "get_feature_names_out", None)
    try:
        feature_names = names_getter() if callable(names_getter) else None
    except (AttributeError, ValueError) as exc:
        print(f"[save_gan] Feature names unavailable ({exc}); storing None")
        feature_names = None

    prep_bundle = {
        "preprocessor": preprocessor,
        # Nice-to-have field; None when it could not be determined.
        "feature_names_": feature_names,
    }
    joblib.dump(prep_bundle, f"{bundle_path}.prep")
    print(f"[save_gan] Wrote: {bundle_path}.pt  and  {bundle_path}.prep")


def load_gan(bundle_path: str,
             latent_dim: int,
             in_dim: int,
             device: str = "cpu",
             G_cls=Generator,
             D_cls=Discriminator) -> Tuple[nn.Module, nn.Module, ColumnTransformer, Dict[str, Any]]:
    """
    Rebuild the generator/discriminator from a saved bundle and return them
    together with the stored preprocessor and metadata.

    Args:
        bundle_path: Path prefix; "{bundle_path}.pt" and "{bundle_path}.prep" are read.
        latent_dim: Passed to G_cls as its first constructor argument.
        in_dim: Passed to G_cls (second argument) and to D_cls.
        device: Device used for map_location and for placing both modules.
        G_cls: Generator class, instantiated as G_cls(latent_dim, in_dim).
        D_cls: Discriminator class, instantiated as D_cls(in_dim).

    Returns:
        (G, D, preprocessor, meta); both modules are switched to eval mode and
        meta is {} when the checkpoint carries no "meta" entry.
    """
    # NOTE(review): torch.load and joblib.load both unpickle data; only load
    # bundles from a trusted source.
    checkpoint = torch.load(f"{bundle_path}.pt", map_location=device)

    generator = G_cls(latent_dim, in_dim).to(device)
    discriminator = D_cls(in_dim).to(device)

    generator.load_state_dict(checkpoint["G_state"])
    discriminator.load_state_dict(checkpoint["D_state"])

    generator.eval()
    discriminator.eval()

    stored_prep = joblib.load(f"{bundle_path}.prep")
    fitted_preprocessor = stored_prep["preprocessor"]
    saved_meta = checkpoint.get("meta", {})

    print(f"[load_gan] Loaded G/D and preprocessor from {bundle_path}.*")
    return generator, discriminator, fitted_preprocessor, saved_meta


@torch.no_grad()
def sample_latent(n: int, latent_dim: int, device: str = "cpu", seed: Optional[int] = None) -> torch.Tensor:
    """
    Draw an (n, latent_dim) tensor of standard-normal noise on `device`.

    When `seed` is given, a fresh torch.Generator seeded with it is used, so the
    same seed reproduces the same tensor and the global RNG is left untouched.
    Without a seed the global RNG of `device` is used.
    """
    shape = (n, latent_dim)
    if seed is None:
        return torch.randn(shape, device=device)

    # Generator.manual_seed returns the generator itself, so it can be chained.
    rng = torch.Generator(device=device).manual_seed(seed)
    return torch.randn(shape, device=device, generator=rng)


@torch.no_grad()
def sample_synthetic(G: nn.Module,
                     n_rows: int,
                     latent_dim: int,
                     device: str = "cpu",
                     batch_size: int = 4096) -> torch.Tensor:
    """
    Samples in the GAN's feature space (i.e., AFTER preprocessing & scaling).

    Latent noise is drawn in chunks of at most `batch_size` rows, passed through
    G, moved to the CPU and stacked.

    Args:
        G: Generator module; it is switched to eval mode and left that way.
        n_rows: Number of rows to generate; 0 yields an empty tensor.
        latent_dim: Width of the latent vectors fed to G.
        device: Device on which the noise is created (G must live there too).
        batch_size: Maximum rows per forward pass; must be >= 1.

    Returns:
        CPU tensor with n_rows rows. The original docstring states the values are
        float32 in [-1, 1] because G ends in Tanh; G is defined elsewhere, so this
        cannot be confirmed here.

    Raises:
        ValueError: if n_rows is negative or batch_size is smaller than 1.
    """
    if n_rows < 0:
        raise ValueError(f"n_rows must be >= 0, got {n_rows}")
    if batch_size < 1:
        # A non-positive batch size would never reduce the remaining row count.
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    G.eval()
    chunks = []
    remaining = n_rows
    # Always run at least one (possibly empty) batch: torch.vstack rejects an
    # empty list, and an empty forward pass still yields the right column count.
    while True:
        rows = min(remaining, batch_size)
        z = sample_latent(rows, latent_dim, device=device)
        chunks.append(G(z).detach().cpu())
        remaining -= rows
        if remaining <= 0:
            break
    return torch.vstack(chunks)


def _invert_fitted_transformer(transformer, block):
    """Undo one fitted sub-transformer on its output block, unwinding pipelines step by step."""
    # Local import keeps this block self-contained; sklearn is already a dependency of the file.
    from sklearn.impute import SimpleImputer

    if transformer is None or (isinstance(transformer, str) and transformer == "passthrough"):
        return block

    steps = getattr(transformer, "steps", None)
    if steps is not None:
        # Pipeline-like object: undo the steps last-to-first.
        for _, step in reversed(steps):
            block = _invert_fitted_transformer(step, block)
        return block

    if isinstance(transformer, SimpleImputer) and not transformer.add_indicator:
        # SimpleImputer can only be inverted when fitted with add_indicator=True.
        # Imputation leaves complete rows unchanged, so it is skipped here.
        return block

    return transformer.inverse_transform(block)


def _invert_column_transformer(X_np, preprocessor) -> pd.DataFrame:
    """Invert every sub-transformer of a fitted ColumnTransformer on its own output slice."""
    frames = []
    for name, transformer, columns in preprocessor.transformers_:
        if isinstance(transformer, str) and transformer == "drop":
            continue  # dropped inputs cannot be recovered
        block = X_np[:, preprocessor.output_indices_[name]]
        if block.shape[1] == 0:
            continue  # transformer produced no output columns

        frame = pd.DataFrame(_invert_fitted_transformer(transformer, block))
        # Reuse the transformer's input column labels when they line up with the
        # restored block; otherwise fall back to "<name>__<i>" labels.
        if isinstance(columns, (list, tuple)) and len(columns) == frame.shape[1]:
            frame.columns = list(columns)
        else:
            frame.columns = [f"{name}__{i}" for i in range(frame.shape[1])]
        frames.append(frame)

    # Column order follows the transformer order, not necessarily the original frame.
    return pd.concat(frames, axis=1)


def inverse_to_dataframe(X: torch.Tensor,
                         preprocessor: ColumnTransformer,
                         original_columns: Optional[list] = None) -> pd.DataFrame:
    """
    Attempts to map generator output back to the original tabular space.

    sklearn's ColumnTransformer has no inverse_transform of its own, so when the
    preprocessor lacks that method each fitted sub-transformer is inverted on the
    slice of columns it produced (via `transformers_` / `output_indices_`) and the
    pieces are concatenated. Objects that do expose inverse_transform (e.g. a
    single scaler or a Pipeline) are inverted directly.

    Args:
        X: Tensor in model space, one row per sample.
        preprocessor: Fitted preprocessor used to build the training features.
        original_columns: Optional column labels for the result. For a
            ColumnTransformer, if they are the same set as the recovered labels
            the frame is reordered to match; otherwise they are applied by position.

    Returns:
        DataFrame in the original space, or - if inversion fails for any reason -
        the un-inverted array wrapped in a DataFrame (best-effort fallback).
    """
    X_np = X.detach().cpu().numpy()
    try:
        if hasattr(preprocessor, "inverse_transform"):
            # If original columns known, use them; else columns are simply numbered.
            if original_columns is not None:
                return pd.DataFrame(preprocessor.inverse_transform(X_np), columns=original_columns)
            return pd.DataFrame(preprocessor.inverse_transform(X_np))

        df = _invert_column_transformer(X_np, preprocessor)
        if original_columns is not None:
            wanted = list(original_columns)
            if len(wanted) == df.shape[1] and set(wanted) == set(df.columns):
                df = df[wanted]          # same labels: restore the caller's order
            else:
                df.columns = wanted      # positional rename; raises on length mismatch
        return df
    except Exception as e:
        # Fallback: you still get the transformed array for further handling
        print(f"[inverse_to_dataframe] inverse_transform failed ({e}); returning transformed array as DataFrame")
        return pd.DataFrame(X_np)


def generate_synthetic_dataset(G: nn.Module,
                               preprocessor: ColumnTransformer,
                               n_rows: int,
                               latent_dim: int,
                               device: str = "cpu",
                               batch_size: int = 4096,
                               original_columns: Optional[list] = None,
                               seed: Optional[int] = None,
                               return_tensor: bool = False) -> Tuple[pd.DataFrame, Optional[torch.Tensor]]:
    """
    1) Samples in feature space from G
    2) Inverse-transforms via the fitted preprocessor to get a DataFrame
    3) Optionally returns the raw tensor in model space

    Args:
        G: Generator module.
        preprocessor: Fitted preprocessor handed to inverse_to_dataframe.
        n_rows: Number of synthetic rows to generate.
        latent_dim: Width of the latent vectors fed to G.
        device: Device on which sampling runs.
        batch_size: Maximum rows per generator forward pass.
        original_columns: Optional column labels for the resulting DataFrame.
        seed: If given, sampling is reproducible for this seed. The CPU RNG and
            the target CUDA device's RNG are restored afterwards.
        return_tensor: When True, the model-space tensor is returned as the
            second tuple element; otherwise that element is None.

    Returns:
        (df, X_t) if return_tensor else (df, None).
    """
    if seed is None:
        X_t = sample_synthetic(G, n_rows, latent_dim, device=device, batch_size=batch_size)
    else:
        # Seed inside a forked RNG scope so the caller's random stream is not
        # disturbed. fork_rng always forks the CPU RNG; the CUDA device in use
        # has to be listed explicitly.
        target = torch.device(device)
        if target.type == "cuda":
            cuda_index = target.index if target.index is not None else torch.cuda.current_device()
            forked_devices = [cuda_index]
        else:
            forked_devices = []
        with torch.random.fork_rng(devices=forked_devices):
            # NOTE: torch.manual_seed seeds all devices; RNG state on CUDA
            # devices other than the target is not restored afterwards.
            torch.manual_seed(seed)
            X_t = sample_synthetic(G, n_rows, latent_dim, device=device, batch_size=batch_size)

    # Invert to tabular space
    df = inverse_to_dataframe(X_t, preprocessor, original_columns=original_columns)
    return (df, X_t) if return_tensor else (df, None)


def save_synthetic_csv(df: pd.DataFrame, path: str, index: bool = False):
    """
    Write `df` to a CSV file at `path`, creating the parent directory if needed.

    Args:
        df: Frame to export.
        path: Destination file; a bare filename is written to the current directory.
        index: Forwarded to DataFrame.to_csv (whether to write the index column).
    """
    parent_dir = os.path.dirname(path)
    if not parent_dir:
        # A bare filename has no directory component; use the working directory.
        parent_dir = "."
    os.makedirs(parent_dir, exist_ok=True)

    df.to_csv(path, index=index)
    print(f"[save_synthetic_csv] Wrote {len(df):,} rows to {path}")


In [None]:
# Use the GPU when one is available, otherwise run on the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# NOTE: `latent_dim` and `in_dim` are not defined in this cell; they must
# already exist in the kernel and match the values the bundle was trained with.
G_loaded, D_loaded, preproc, meta = load_gan(
    "./artifacts/loan_gan",
    latent_dim=latent_dim,
    in_dim=in_dim,
    device=DEVICE
)

# Generate 10k synthetic rows.
# The bundle may have been saved without metadata (save_gan stores {} by
# default), so fall back to the latent_dim the generator was just built with
# instead of raising KeyError.
df_synth, _ = generate_synthetic_dataset(
    G_loaded,
    preproc,
    n_rows=10_000,
    latent_dim=meta.get("latent_dim", latent_dim),
    device=DEVICE
)

# Save to CSV
save_synthetic_csv(df_synth, "./artifacts/loan_synth_10k.csv")
