In [None]:
import warnings
from collections import defaultdict
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List

import ctgan
import numpy as np
import pandas as pd
from ctgan import CTGAN
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler
from xgboost import XGBClassifier


In [None]:
@dataclass
class Card:
    """Evaluation record for one (dataset, encoding) pair.

    The class defines its own ``__init__``, so ``@dataclass`` does not generate
    one; the annotations below still drive the generated ``__repr__``/``__eq__``.

    Constructing a card performs I/O: it loads the CTGAN model from
    ``model_path``, reads the real data from ``dataset_path`` and samples as
    many synthetic rows as the real data has. All metric attributes start at
    neutral values (empty frame / ``0.0``).
    """

    dataset: str
    encoding: str
    target: str
    dataset_path: str
    model_path: str
    schedule_path: str
    real_data: pd.DataFrame
    synt_data: pd.DataFrame
    jensen_shannon_divergence: pd.DataFrame
    mean_jsd: float
    logistic_real_score: float
    logistic_synt_score: float
    xgb_real_score: float
    xgb_synt_score: float

    def __init__(self, dataset: str, encoding: str, target: str, dataset_path: str, model_path: str, schedule_path: str):
        # Identification and file locations, stored exactly as given.
        self.dataset, self.encoding, self.target = dataset, encoding, target
        self.dataset_path, self.model_path, self.schedule_path = dataset_path, model_path, schedule_path

        # `model` is kept on the instance but is not a declared dataclass field.
        # NOTE(review): CTGAN.load presumably deserialises a pickled model -
        # only point it at trusted files; confirm against the ctgan docs.
        self.model = ctgan.CTGAN.load(model_path)
        self.real_data = pd.read_csv(dataset_path)
        # Synthetic sample has the same number of rows as the real data.
        self.synt_data = self.model.sample(len(self.real_data))

        # Metric placeholders, overwritten later by the evaluation step.
        self.jensen_shannon_divergence = pd.DataFrame()
        for score_name in (
            "mean_jsd",
            "logistic_real_score",
            "logistic_synt_score",
            "xgb_real_score",
            "xgb_synt_score",
        ):
            setattr(self, score_name, 0.0)


In [None]:
import numpy as np
import pandas as pd


def calculate_distributions(p_series: pd.Series, q_series: pd.Series):
    """
    Build relative-frequency distributions for two pandas Series.

    Both results are indexed by the union of the unique values of the two
    inputs; a value absent from one Series gets frequency 0 in that
    Series' distribution.

    Returns:
        Tuple ``(p_dist, q_dist)`` of Series aligned on the shared index.
    """
    support = pd.Index(p_series.unique()).union(q_series.unique())

    def relative_frequencies(values: pd.Series) -> pd.Series:
        # Normalised counts, then aligned to the shared support (missing -> 0).
        frequencies = values.value_counts(normalize=True)
        return frequencies.reindex(support, fill_value=0)

    return relative_frequencies(p_series), relative_frequencies(q_series)


def calculate_metrics(p_df: pd.DataFrame, q_df: pd.DataFrame, skip_col: list[str] | None = None) -> pd.DataFrame:
    """
    Вычисляет JSD дивергенцию для каждой колонки.
    """
    skip_col = skip_col or []
    metrics = []
    epsilon = 1e-10

    for col in p_df.columns:
        if col not in q_df.columns or col in skip_col:
            continue

        p, q = calculate_distributions(p_df[col], q_df[col])
        p_nonzero = p[p > 0]
        q_nonzero = q[q > 0]

        m = 0.5 * (p + q)
        m_smooth = m + epsilon

        kl_p_m = np.sum(p_nonzero * np.log2(p_nonzero / m_smooth[p_nonzero.index]))
        kl_q_m = np.sum(q_nonzero * np.log2(q_nonzero / m_smooth[q_nonzero.index]))
        jensen_shannon_divergence = 0.5 * kl_p_m + 0.5 * kl_q_m

        metrics.append({
            "column": col,
            "jensen_shannon_divergence": jensen_shannon_divergence,
        })

    return pd.DataFrame(metrics)[["column", "jensen_shannon_divergence"]]


In [None]:
# Globally ignore ConvergenceWarning and FutureWarning for the rest of the session.
warnings.filterwarnings(action="ignore", category=ConvergenceWarning)
warnings.filterwarnings(action="ignore", category=FutureWarning)


# --- Utility metrics ---
def encode_targets(y_real: pd.Series, y_synt: pd.Series):
    """Label-encode textual targets with one encoder shared by both series.

    Encoding happens only when ``y_real`` is non-empty and either has object
    dtype or its first element is a ``str``. The encoder is fitted on the
    concatenation of both series so that identical labels get identical codes.

    Returns:
        ``(y_real, y_synt)`` unchanged when no encoding is needed, otherwise
        two new integer-coded Series that keep the original indexes.
    """
    if y_real.empty:
        return y_real, y_synt

    looks_textual = y_real.dtype == object or isinstance(y_real.iloc[0], str)
    if not looks_textual:
        return y_real, y_synt

    label_encoder = LabelEncoder().fit(pd.concat([y_real, y_synt], ignore_index=True))
    real_codes = pd.Series(label_encoder.transform(y_real), index=y_real.index)
    synt_codes = pd.Series(label_encoder.transform(y_synt), index=y_synt.index)
    return real_codes, synt_codes


def evaluate_card(card: Card) -> None:
    """Compute utility scores and per-column JSD for ``card``, in place.

    Real and synthetic data are each split 50/50 (stratified on the encoded
    target, ``random_state=42``). A logistic-regression pipeline and an XGBoost
    classifier are trained once on the real training half and once on the
    synthetic training half; every model is scored by accuracy on the same
    real hold-out half. Results are written to the card's ``*_score``
    attributes, ``jensen_shannon_divergence`` and ``mean_jsd``.
    """
    features_real = card.real_data.drop(columns=[card.target])
    labels_real = card.real_data[card.target]
    features_synt = card.synt_data.drop(columns=[card.target])
    labels_synt = card.synt_data[card.target]

    labels_real, labels_synt = encode_targets(labels_real, labels_synt)

    split_options = dict(test_size=0.5, random_state=42)
    train_X_real, holdout_X, train_y_real, holdout_y = train_test_split(
        features_real, labels_real, stratify=labels_real, **split_options
    )
    # Only the training half of the synthetic split is used.
    synt_parts = train_test_split(
        features_synt, labels_synt, stratify=labels_synt, **split_options
    )
    train_X_synt, train_y_synt = synt_parts[0], synt_parts[2]

    def holdout_accuracy(estimator, train_X, train_y):
        # Fit on the given training data, score on the real hold-out half.
        estimator.fit(train_X, train_y)
        return accuracy_score(holdout_y, estimator.predict(holdout_X))

    def new_logreg():
        return make_pipeline(
            StandardScaler(),
            LogisticRegression(solver="saga", max_iter=20000, random_state=42),
        )

    def new_xgb():
        return XGBClassifier(
            n_estimators=200,
            learning_rate=0.05,
            max_depth=6,
            subsample=0.8,
            colsample_bytree=0.8,
            eval_metric="logloss",
            random_state=42,
            n_jobs=4,
        )

    card.logistic_real_score = holdout_accuracy(new_logreg(), train_X_real, train_y_real)
    card.logistic_synt_score = holdout_accuracy(new_logreg(), train_X_synt, train_y_synt)
    card.xgb_real_score = holdout_accuracy(new_xgb(), train_X_real, train_y_real)
    card.xgb_synt_score = holdout_accuracy(new_xgb(), train_X_synt, train_y_synt)

    # Distribution similarity: the target and identifier-like columns are skipped.
    jsd_frame = calculate_metrics(
        card.real_data,
        card.synt_data,
        skip_col=[card.target, "Id", "ID", "id", "identifier"],
    )
    card.jensen_shannon_divergence = jsd_frame
    if not jsd_frame.empty:
        card.mean_jsd = float(jsd_frame["jensen_shannon_divergence"].mean())


In [None]:
# Encoding name -> suffix used in the dataset file name (`<dataset>_<suffix>.csv`).
ENCODING_FILE_SUFFIX = dict(
    original="original",
    one_hot_encoding="ohe",
    label_encoding="label",
    frequency_encoding="frequency",
)

# Encoding name -> human-readable label shown in the report.
ENCODING_LABELS = dict(
    original="Original",
    one_hot_encoding="One-hot",
    label_encoding="Label",
    frequency_encoding="Frequency",
)


def load_registry(registry_path: str) -> pd.DataFrame:
    """Read the dataset registry CSV and trim stray whitespace.

    Column names are stripped, as are the values of ``dataset_name``,
    ``target`` and ``dataset_csv`` (the latter is cast to ``str`` first).

    Returns:
        The cleaned registry frame.
    """
    registry = pd.read_csv(registry_path)
    registry.columns = registry.columns.str.strip()
    for text_column in ("dataset_name", "target"):
        registry[text_column] = registry[text_column].str.strip()
    registry["dataset_csv"] = registry["dataset_csv"].astype(str).str.strip()
    return registry


def build_cards_from_registry(registry_path: str) -> Dict[str, List[Card]]:
    """Create a ``Card`` for every (dataset, encoding) pair found on disk.

    For each registry row the directory of ``dataset_csv`` is searched for
    ``<dataset>_<suffix>.csv`` and ``models/ctgan_<dataset>_<encoding>_model.pkl``.
    Pairs for which either file is missing are skipped. The training-schedule
    image path is recorded on the card but is not required to exist.

    Returns:
        Mapping (a ``defaultdict(list)``) from dataset name to its cards.
    """
    registry = load_registry(registry_path)
    cards: Dict[str, List[Card]] = defaultdict(list)

    entries = zip(registry["dataset_name"], registry["target"], registry["dataset_csv"])
    for dataset, target, dataset_csv in entries:
        dataset_root = Path(str(dataset_csv)).parent

        for encoding, suffix in ENCODING_FILE_SUFFIX.items():
            dataset_file = dataset_root / f"{dataset}_{suffix}.csv"
            model_path = dataset_root / "models" / f"ctgan_{dataset}_{encoding}_model.pkl"
            schedule_path = dataset_root / "training_schedules" / f"ctgan_{dataset}_{encoding}_losses.png"

            # Both the encoded dataset and its trained model must be present.
            if dataset_file.exists() and model_path.exists():
                cards[dataset].append(
                    Card(
                        dataset=dataset,
                        encoding=encoding,
                        target=target,
                        dataset_path=str(dataset_file),
                        model_path=str(model_path),
                        schedule_path=str(schedule_path),
                    )
                )

    return cards


In [None]:
def cards_to_frame(cards_by_dataset: Dict[str, List[Card]]) -> pd.DataFrame:
    """Flatten evaluated cards into one summary row per (dataset, encoding).

    Each row holds the data size, the real/synthetic accuracy of both models,
    the real-minus-synthetic gap for each model and the mean JSD.
    """

    def summarize(dataset, card):
        logreg_real, logreg_synt = card.logistic_real_score, card.logistic_synt_score
        xgb_real, xgb_synt = card.xgb_real_score, card.xgb_synt_score
        n_rows, n_columns = card.real_data.shape
        return {
            "dataset": dataset,
            "encoding": card.encoding,
            "rows": n_rows,
            # Every column except the target counts as a feature.
            "features": n_columns - 1,
            "logreg_real": logreg_real,
            "logreg_synt": logreg_synt,
            "logreg_gap": logreg_real - logreg_synt,
            "xgb_real": xgb_real,
            "xgb_synt": xgb_synt,
            "xgb_gap": xgb_real - xgb_synt,
            "mean_jsd": card.mean_jsd,
        }

    return pd.DataFrame([
        summarize(dataset, card)
        for dataset, items in cards_by_dataset.items()
        for card in items
    ])


def save_cards_summary(cards_by_dataset: Dict[str, List[Card]], output_path: str = "data.csv") -> pd.DataFrame:
    """Write the card summary table to ``output_path`` as CSV (no index column).

    Returns:
        The summary frame that was written.
    """
    summary = cards_to_frame(cards_by_dataset)
    summary.to_csv(output_path, index=False)
    return summary


In [None]:

import base64
from datetime import datetime
from html import escape


def _fmt(value: float | int | None, ndigits: int = 3) -> str:
    if value is None or (isinstance(value, float) and np.isnan(value)):
        return "—"
    return f"{value:.{ndigits}f}" if isinstance(value, float) else str(value)


def _encoding_label(name: str) -> str:
    """Return the display label for an encoding, or ``name`` itself if unknown."""
    if name in ENCODING_LABELS:
        return ENCODING_LABELS[name]
    return name


def _card_anchor(card: Card) -> str:
    """Return the HTML anchor shared by a card's matrix link and its detail section.

    The anchor is ``<dataset>-<encoding>`` with spaces replaced by underscores.
    The result is HTML-escaped because callers interpolate it directly into
    ``id="..."`` and ``href="#..."`` attributes; without escaping, a dataset
    name containing quotes, ``<`` or ``&`` would break the markup. Names
    without such characters yield exactly the same anchor as before.
    """
    raw_anchor = f"{card.dataset}-{card.encoding}".replace(" ", "_")
    return escape(raw_anchor, quote=True)


def _embed_image_base64(path: str) -> str:
    if not path:
        return ""
    img_path = Path(path)
    if not img_path.exists():
        return ""
    data = img_path.read_bytes()
    mime = {
        "png": "image/png",
        "jpg": "image/jpeg",
        "jpeg": "image/jpeg",
        "gif": "image/gif",
        "svg": "image/svg+xml",
    }.get(img_path.suffix.lower().lstrip("."), "image/png")
    b64 = base64.b64encode(data).decode("ascii")
    return f"<img class='schedule-img' src='data:{mime};base64,{b64}' alt='training schedule'>"


def _render_jsd_table(card: Card, top_k: int = 15) -> str:
    """Render the card's per-column JSD values as an HTML table.

    Rows are ordered by divergence, highest first, and limited to ``top_k``
    rows when ``top_k`` is truthy. A placeholder ``<div>`` is returned when the
    card has no JSD data.
    """
    jsd = card.jensen_shannon_divergence
    if jsd is None or jsd.empty:
        return "<div class='meta'>Нет доступных метрик.</div>"

    ranked = jsd.sort_values("jensen_shannon_divergence", ascending=False)
    if top_k:
        ranked = ranked.head(top_k)

    row_markup = []
    for _, record in ranked.iterrows():
        name_cell = escape(str(record["column"]))
        value_cell = _fmt(record["jensen_shannon_divergence"])
        row_markup.append(f"<tr><td>{name_cell}</td><td class='right'>{value_cell}</td></tr>")
    body_rows = "".join(row_markup)

    return f"""
    <table class="jsd-table">
      <thead>
        <tr><th>Колонка</th><th class='right'>JSD</th></tr>
      </thead>
      <tbody>{body_rows}</tbody>
    </table>
    """


def _render_matrix(cards_by_dataset: Dict[str, List[Card]]) -> str:
    """Render the dataset x encoding overview grid as HTML.

    There is one row per dataset (in mapping order) and one cell per encoding
    present anywhere in the input (sorted by name). A cell links to the card's
    detail section, or shows a placeholder when the dataset has no card for
    that encoding. If a dataset has several cards with the same encoding, the
    first one is shown.
    """
    encodings = sorted({card.encoding for cards in cards_by_dataset.values() for card in cards})
    pill_row = "".join(f"<span class='pill'>{_encoding_label(name)}</span>" for name in encodings)

    rows_html = []
    for dataset, items in cards_by_dataset.items():
        # First card wins when an encoding appears more than once.
        first_by_encoding = {}
        for candidate in items:
            first_by_encoding.setdefault(candidate.encoding, candidate)

        cells = []
        for encoding in encodings:
            card = first_by_encoding.get(encoding)
            if card is None:
                cells.append("<div class='matrix-cell empty'>Нет данных</div>")
                continue
            cells.append(
                f"""
                <a class="matrix-cell" href="#{_card_anchor(card)}">
                  <div class="matrix-cell__head">
                    <span class="pill subtle">{_encoding_label(encoding)}</span>
                    <span class="metric chip">JSD {_fmt(card.mean_jsd)}</span>
                  </div>
                  <div class="matrix-cell__body">
                    <div class="metric">LR <span>synthetic</span> {_fmt(card.logistic_synt_score)}</div>
                    <div class="metric">LR <span>real</span> {_fmt(card.logistic_real_score)}</div>
                    <div class="metric">XGB <span>synthetic</span> {_fmt(card.xgb_synt_score)}</div>
                    <div class="metric">XGB <span>real</span> {_fmt(card.xgb_real_score)}</div>
                  </div>
                </a>
                """
            )
        rows_html.append(
            f"""
            <div class="matrix-row">
              <div class="dataset-name">{escape(dataset)}</div>
              <div class="cell-grid">{''.join(cells)}</div>
            </div>
            """
        )

    return f"""
    <div class="matrix">
      <div class="matrix-header">
        <div class="header-left">
          <p class="eyebrow">Матрица: датасет × кодировка</p>
          <h3>Быстрый обзор</h3>
          <p class="muted">Кликните на карточку, чтобы перейти к детальному отчёту.</p>
        </div>
        <div class="pill-row">{pill_row}</div>
      </div>
      <div class="matrix-grid">{''.join(rows_html)}</div>
    </div>
    """


def _render_cards(cards_by_dataset: Dict[str, List[Card]], top_k: int = 15) -> str:
    """Render the detailed HTML section for every card.

    Datasets are emitted in sorted order and, within a dataset, cards are
    sorted by encoding name. Each section shows the headline metrics, the
    embedded training-schedule image (if the file exists) and the JSD table
    limited to ``top_k`` rows.
    """
    sections = []
    for _, items in sorted(cards_by_dataset.items()):
        for card in sorted(items, key=lambda c: c.encoding):
            anchor = _card_anchor(card)
            # (label, already-formatted value) pairs, in display order.
            stats = [
                ("LogReg (реальные)", _fmt(card.logistic_real_score)),
                ("LogReg (синтетические)", _fmt(card.logistic_synt_score)),
                ("XGBoost (реальные)", _fmt(card.xgb_real_score)),
                ("XGBoost (синтетические)", _fmt(card.xgb_synt_score)),
                ("Mean JSD", _fmt(card.mean_jsd)),
                ("Строк", len(card.real_data)),
                ("Фичей", card.real_data.shape[1] - 1),
            ]
            metrics_list = "".join(
                f"<li><span>{label}</span><strong>{value}</strong></li>" for label, value in stats
            )
            sections.append(f"""
            <section class="card" id="{anchor}">
              <div class="card__header">
                <div>
                  <p class="eyebrow">{escape(card.dataset)}</p>
                  <h2>{_encoding_label(card.encoding)}</h2>
                  <div class="meta">Целевая переменная: {escape(card.target)}</div>
                </div>
                <span class="badge">JSD {_fmt(card.mean_jsd)}</span>
              </div>
              <ul class="metrics">{metrics_list}</ul>
              {_embed_image_base64(card.schedule_path)}
              <details open>
                <summary>Метрики распределений (JSD) — top {top_k}</summary>
                {_render_jsd_table(card, top_k=top_k)}
              </details>
            </section>
            """)
    return "".join(sections)


def render_report(cards_by_dataset: Dict[str, List[Card]], output_path: str = "report.html", top_k: int = 15) -> str:
    """Write a self-contained HTML report for all cards.

    The page consists of a header with the generation timestamp and the number
    of distinct encodings, the dataset x encoding overview matrix, and one
    detailed section per card (JSD tables limited to ``top_k`` rows). The file
    is written as UTF-8.

    Returns:
        ``output_path``, unchanged.
    """
    generated_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    encoding_count = len({card.encoding for cards in cards_by_dataset.values() for card in cards})
    stylesheet = """
    <style>
      @import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&display=swap');
      :root {
        --bg: #040710;
        --panel: rgba(255, 255, 255, 0.04);
        --border: rgba(255, 255, 255, 0.08);
        --card: rgba(255, 255, 255, 0.06);
        --muted: #cbd5e1;
        --accent: #a855f7;
        --accent-2: #22d3ee;
      }
      * { box-sizing: border-box; }
      body {
        font-family: 'Inter', system-ui, -apple-system, sans-serif;
        margin: 0;
        padding: 48px 16px 64px;
        background: radial-gradient(circle at 20% 20%, rgba(168, 85, 247, 0.12), transparent 35%),
                    radial-gradient(circle at 80% 0%, rgba(34, 211, 238, 0.16), transparent 28%),
                    linear-gradient(135deg, #040710 0%, #0b1224 100%);
        color: #e2e8f0;
        min-height: 100vh;
      }
      .container { max-width: 1200px; margin: 0 auto; position: relative; z-index: 2; }
      .panel {
        background: var(--panel);
        border: 1px solid var(--border);
        border-radius: 18px;
        padding: 20px 24px;
        box-shadow: 0 20px 60px rgba(0,0,0,0.35), inset 0 1px 0 rgba(255,255,255,0.08);
        backdrop-filter: blur(12px);
      }
      .hero {
        display: grid;
        gap: 16px;
        grid-template-columns: 2fr 1fr;
        align-items: center;
        margin-bottom: 24px;
      }
      .hero h1 { margin: 0; font-size: 28px; letter-spacing: -0.02em; }
      .hero .subtitle { color: var(--muted); margin: 4px 0 0; }
      .pill-row, .pill-group { display: flex; flex-wrap: wrap; gap: 8px; }
      .pill {
        padding: 6px 10px;
        border-radius: 999px;
        border: 1px solid var(--border);
        background: rgba(255,255,255,0.04);
        color: #e5e7eb;
        font-size: 13px;
      }
      .pill.subtle { background: rgba(255,255,255,0.06); color: #cbd5e1; }
      .pill.accent { border-color: rgba(168,85,247,0.4); color: #f5eafe; }
      .matrix { margin-top: 8px; display: flex; flex-direction: column; gap: 18px; }
      .matrix-header { display: flex; justify-content: space-between; gap: 16px; flex-wrap: wrap; }
      .header-left h3 { margin: 6px 0; font-size: 20px; }
      .eyebrow { font-size: 12px; letter-spacing: 0.08em; text-transform: uppercase; color: #94a3b8; margin: 0; }
      .muted { color: var(--muted); margin: 0; }
      .matrix-grid { display: flex; flex-direction: column; gap: 12px; }
      .matrix-row {
        display: grid;
        grid-template-columns: 200px 1fr;
        gap: 12px;
        align-items: stretch;
      }
      .dataset-name {
        padding: 14px 16px;
        border: 1px solid var(--border);
        border-radius: 14px;
        background: linear-gradient(120deg, rgba(168,85,247,0.12), rgba(34,211,238,0.08));
        font-weight: 600;
      }
      .cell-grid { display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 10px; }
      .matrix-cell {
        display: block;
        padding: 14px;
        border-radius: 14px;
        border: 1px solid var(--border);
        background: var(--card);
        text-decoration: none;
        color: inherit;
        position: relative;
        overflow: hidden;
        box-shadow: 0 10px 24px rgba(0,0,0,0.2);
        transition: transform 200ms ease, border-color 200ms ease, box-shadow 200ms ease;
      }
      .matrix-cell::before {
        content: "";
        position: absolute;
        inset: 0;
        background: radial-gradient(circle at 20% 20%, rgba(168,85,247,0.25), transparent 40%);
        opacity: 0;
        transition: opacity 200ms ease;
      }
      .matrix-cell:hover { transform: translateY(-4px); border-color: rgba(168,85,247,0.5); box-shadow: 0 12px 28px rgba(0,0,0,0.35); }
      .matrix-cell:hover::before { opacity: 1; }
      .matrix-cell__head { display: flex; justify-content: space-between; align-items: center; gap: 6px; margin-bottom: 8px; }
      .matrix-cell__body { display: grid; gap: 4px; font-size: 13px; color: var(--muted); }
      .metric { display: flex; justify-content: space-between; align-items: center; }
      .metric span { color: #e2e8f0; opacity: 0.85; }
      .chip {
        padding: 4px 8px;
        border-radius: 999px;
        background: rgba(34, 211, 238, 0.12);
        border: 1px solid rgba(34, 211, 238, 0.45);
        color: #99f6e4;
        font-size: 12px;
      }
      .matrix-cell.empty { background: rgba(255,255,255,0.02); color: var(--muted); text-align: center; }
      .card { margin-bottom: 16px; }
      .card__header { display: flex; justify-content: space-between; align-items: center; gap: 12px; flex-wrap: wrap; }
      .card h2 { margin: 4px 0; font-size: 22px; }
      .badge { padding: 6px 10px; border-radius: 10px; background: rgba(168,85,247,0.16); border: 1px solid rgba(168,85,247,0.45); color: #e9d5ff; font-weight: 600; }
      .meta { color: var(--muted); margin-top: 4px; }
      .metrics { list-style: none; padding: 0; margin: 16px 0; display: grid; grid-template-columns: repeat(auto-fit, minmax(160px, 1fr)); gap: 10px; }
      .metrics li { background: var(--card); border: 1px solid var(--border); border-radius: 12px; padding: 10px 12px; display: flex; justify-content: space-between; align-items: center; }
      .metrics span { color: var(--muted); font-size: 13px; }
      .metrics strong { color: #fff; font-weight: 700; }
      .schedule-img { max-width: 100%; height: auto; margin: 12px 0; display: block; border-radius: 12px; border: 1px solid var(--border); }
      details summary { cursor: pointer; font-weight: 700; margin-bottom: 8px; }
      details { margin-top: 8px; background: rgba(255,255,255,0.02); padding: 12px; border-radius: 12px; border: 1px solid var(--border); }
      .jsd-table { width: 100%; border-collapse: collapse; }
      .jsd-table th, .jsd-table td { border: 1px solid var(--border); padding: 8px 10px; }
      .jsd-table th { background: rgba(255,255,255,0.04); text-align: left; }
      .right { text-align: right; }
      @media (max-width: 900px) {
        .hero { grid-template-columns: 1fr; }
        .matrix-row { grid-template-columns: 1fr; }
        .dataset-name { order: -1; }
      }
    </style>
    """

    overview = _render_matrix(cards_by_dataset)
    details_markup = _render_cards(cards_by_dataset, top_k=top_k)

    page = f"""<!doctype html>
    <html lang='ru'>
      <meta charset='utf-8'>
      <meta name='viewport' content='width=device-width, initial-scale=1'>
      <title>Cards report</title>
      {stylesheet}
      <body>
        <div class='container'>
          <div class='panel hero'>
            <div>
              <p class='eyebrow'>CTGAN Metrics Report</p>
              <h1>Сводный отчёт по моделям</h1>
              <p class='subtitle'>Сгенерировано: {escape(generated_at)}</p>
            </div>
            <div class='pill-group'>
              <span class='pill accent'>JSD</span>
              <span class='pill'>Logistic Regression</span>
              <span class='pill'>XGBoost</span>
              <span class='pill'>Кодировки: {encoding_count}</span>
            </div>
          </div>
          <div class='panel'>
            {overview}
          </div>
          <div class='panel' style='margin-top:16px;'>
            <h3 style='margin-top:0;'>Детальные карточки</h3>
            <p class='muted'>По каждой связке датасет + кодировка.</p>
            {details_markup}
          </div>
        </div>
      </body>
    </html>"""

    Path(output_path).write_text(page, encoding="utf-8")
    return output_path


In [None]:
# Build one Card per (dataset, encoding) pair whose dataset file and model exist,
# then fill in each card's utility scores and JSD metrics in place.
cards_by_dataset = build_cards_from_registry("datasets/datasets_registry.csv")
for items in cards_by_dataset.values():
    for card in items:
        evaluate_card(card)

# Persist the summary table as CSV and write the HTML report (JSD tables capped at 20 rows).
summary_df = save_cards_summary(cards_by_dataset, output_path="data.csv")
render_report(cards_by_dataset, output_path="report.html", top_k=20)
# Last expression of the cell: the summary frame is shown as the cell output.
summary_df
