# Structured Synthetic Data

# Imports

In [24]:
from pathlib import Path
import numpy as np
import pandas as pd

# Read sample CSV

In [25]:
def read_template_csv(path: str) -> pd.DataFrame:
    """Load the template CSV into a DataFrame.

    Args:
        path: Location of the CSV file; passed straight to ``pd.read_csv``.

    Returns:
        The parsed table, using pandas' default parsing options.
    """
    template = pd.read_csv(path)
    return template


# Build Spec to profile sample dataset (statistical features)

In [26]:
def build_spec(
    df: pd.DataFrame,
    id_col: str = "Customer_ID",
    numeric_threshold: float = 0.8,
    clip_q: tuple = (0.01, 0.99),
    jitter_frac: float = 0.02,
) -> dict:
    """Profile a template DataFrame into a per-column generation spec.

    Each column is classified as one of three types:

    * ``"id"`` - the column named ``id_col``; no statistics are stored.
    * ``"numeric"`` - at least ``numeric_threshold`` of the rows parse as
      numbers. Stored keys: ``obs`` (the parsed non-NaN values), ``qlo`` /
      ``qhi`` (quantiles of ``obs`` at ``clip_q``), ``intlike`` (more than
      90% of the values are close to an integer), ``jitter``
      (``jitter_frac``) and ``sd`` (sample standard deviation with
      ``ddof=1``, or ``0.0`` when there is a single value).
    * ``"categorical"`` - everything else. Stored keys: ``cats`` (distinct
      non-null values as strings) and ``probs`` (their relative
      frequencies). A column with no non-null values gets the single
      category ``"NA"`` with probability 1.

    Args:
        df: Template data to profile.
        id_col: Name of the identifier column.
        numeric_threshold: Minimum share of rows that must parse as numeric
            for a column to be treated as numeric.
        clip_q: Lower and upper quantiles stored as ``qlo`` and ``qhi``.
        jitter_frac: Value stored as ``jitter`` for numeric columns.

    Returns:
        A dict with keys ``headers`` (column names in order), ``id_col`` and
        ``cols`` (column name -> column spec as described above).
    """
    spec = {"headers": list(df.columns), "id_col": id_col, "cols": {}}

    for col in df.columns:
        if col == id_col:
            spec["cols"][col] = {"type": "id"}
            continue

        # Non-parsable entries become NaN, so the non-NaN share measures how
        # "numeric" the column is.
        s_num = pd.to_numeric(df[col], errors="coerce")
        # For a zero-row frame the mean is NaN; the comparison below is then
        # False and the column falls through to the categorical branch.
        numeric_share = float(s_num.notna().mean())

        if numeric_share >= numeric_threshold:
            x = s_num.dropna().to_numpy(dtype=float)
            if x.size == 0:
                # Only reachable when numeric_threshold <= 0. Carries the same
                # keys as a regular numeric spec (including "sd") so that
                # consumers can treat every numeric spec uniformly.
                spec["cols"][col] = {
                    "type": "numeric",
                    "obs": np.array([0.0]),
                    "qlo": 0.0,
                    "qhi": 0.0,
                    "intlike": True,
                    "jitter": 0.0,
                    "sd": 0.0,
                }
                continue

            qlo, qhi = np.quantile(x, clip_q)
            intlike = bool(np.mean(np.isclose(x, np.round(x))) > 0.9)

            spec["cols"][col] = {
                "type": "numeric",
                "obs": x,
                "qlo": float(qlo),
                "qhi": float(qhi),
                "intlike": intlike,
                "jitter": float(jitter_frac),
                # Sample standard deviation is undefined for one observation.
                "sd": float(np.std(x, ddof=1)) if x.size > 1 else 0.0,
            }

        else:
            s = df[col].dropna().astype(str)
            if s.size == 0:
                # No observed values: emit a single placeholder category.
                cats = np.array(["NA"], dtype=object)
                probs = np.array([1.0], dtype=float)
            else:
                vc = s.value_counts(normalize=True)
                cats = vc.index.to_numpy(dtype=object)
                probs = vc.to_numpy(dtype=float)

            spec["cols"][col] = {
                "type": "categorical",
                "cats": cats,
                "probs": probs,
            }

    return spec

# Column Generators (using NumPy arrays)

In [27]:
def gen_numeric(rng: np.random.Generator, n: int, colspec: dict) -> np.ndarray:
    """Generate ``n`` synthetic values for a numeric column.

    Values are resampled with replacement from ``colspec["obs"]``, optionally
    perturbed with Gaussian noise, clipped to ``[qlo, qhi]`` and, for
    integer-like columns, rounded to the nearest integer.

    Args:
        rng: NumPy random generator used for resampling and noise.
        n: Number of values to produce.
        colspec: Numeric column spec with keys ``obs``, ``qlo``, ``qhi``,
            ``intlike`` and ``jitter``; ``sd`` is optional and treated as
            ``0.0`` (no noise) when absent.

    Returns:
        A 1-D array of length ``n``: integer dtype when ``intlike`` is true,
        float otherwise.
    """
    obs = colspec["obs"]
    out = rng.choice(obs, size=n, replace=True).astype(float)

    # A spec without "sd" must not crash generation; a missing value simply
    # disables the jitter step.
    sd = colspec.get("sd", 0.0)
    jitter = colspec["jitter"]
    if jitter > 0 and sd > 0:
        # Noise scale is a fraction (jitter) of the observed spread (sd).
        out += rng.normal(0.0, sd * jitter, size=n)

    # Clip after adding noise so jitter cannot push values past the bounds.
    out = np.clip(out, colspec["qlo"], colspec["qhi"])

    if colspec["intlike"]:
        out = np.rint(out).astype(int)

    return out

def gen_categorical(rng: np.random.Generator, n: int, colspec: dict) -> np.ndarray:
    """Draw ``n`` category labels with replacement.

    Labels come from ``colspec["cats"]`` and are sampled according to the
    matching probabilities in ``colspec["probs"]``.
    """
    labels = colspec["cats"]
    weights = colspec["probs"]
    sample = rng.choice(labels, size=n, replace=True, p=weights)
    return sample

def gen_ids(n: int, prefix: str = "CUST", width: int = 7) -> np.ndarray:
    """Build ``n`` sequential identifiers such as ``CUST0000001``.

    Numbering starts at 1; each number is left-padded with zeros to ``width``
    characters and prefixed with ``prefix``. The result is an object array.
    """
    sequence = np.arange(1, n + 1)
    labels = np.empty(len(sequence), dtype=object)
    for pos, number in enumerate(sequence):
        padded = str(number).zfill(width)
        labels[pos] = f"{prefix}{padded}"
    return labels


# Assemble Synthetic DataFrame From Spec

In [28]:
def generate_from_spec(spec: dict, n_rows: int, seed: int = 42) -> pd.DataFrame:
    """Assemble a synthetic DataFrame with ``n_rows`` rows from a column spec.

    Columns are produced in ``spec["headers"]`` order, all drawing from one
    generator seeded with ``seed``. The id column gets sequential ids,
    numeric columns go through ``gen_numeric`` and every other column goes
    through ``gen_categorical``.
    """
    rng = np.random.default_rng(seed)
    columns = spec["headers"]
    id_name = spec["id_col"]

    def _make_column(name):
        # Look the column spec up lazily, one header at a time.
        column_spec = spec["cols"][name]
        kind = column_spec["type"]
        if kind == "numeric":
            return gen_numeric(rng, n_rows, column_spec)
        if kind == "id" and name == id_name:
            return gen_ids(n_rows)
        return gen_categorical(rng, n_rows, column_spec)

    generated = {name: _make_column(name) for name in columns}
    return pd.DataFrame(generated, columns=columns)


# Run the Pipeline

In [None]:
# Resolve all paths relative to the parent of the current working directory.
project_root = Path.cwd().parent
sample_path = project_root / "data" / "sample" / "dpd_sample.csv"
out_dir = project_root / "output"
# Create the output directory if needed; exist_ok avoids an error when it already exists.
out_dir.mkdir(exist_ok=True)

out_path = out_dir / "dpd_out.csv"

# Echo the resolved paths, and whether the sample file is present, before running the pipeline.
print("project_root:", project_root)
print("sample exists:", sample_path.exists(), sample_path)
print("out_path:", out_path)


PROJECT_ROOT: /drive/notebooks
Sample: /drive/notebooks/dpd_sample.csv
Output: /drive/notebooks/output/dpd_out.csv


In [30]:
# Load the template data, profile it into a spec, generate synthetic rows, and save them.
df_t = read_template_csv(sample_path)
spec = build_spec(df_t, id_col="Customer_ID", jitter_frac=0.02)
df_syn = generate_from_spec(spec, n_rows=1000, seed=42) # change row numbers here
# Write the synthetic table without the DataFrame index column.
df_syn.to_csv(out_path, index=False)
