In [91]:
from narwhals import read_csv
# !pip install -r requirements.txt
!ls

Adult.csv        Iris.csv         sample.ipynb
[34mdata[m[m             requirements.txt


In [92]:

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

In [93]:
def universal_one_hot_encoder(df: pd.DataFrame, target_col: str):
    data_to_encode = df.copy()
    # Определяем категориальные признаки (все, что не числа и не целевая колонка)
    numeric_cols = df.select_dtypes(include=np.number).columns.tolist()
    categorical_features = [
        col for col in data_to_encode.columns if col not in numeric_cols or col == target_col
    ]
    if target_col in categorical_features:
        categorical_features.remove(target_col)

    # Применяем One-Hot Encoding
    data_ohe = pd.get_dummies(
        data_to_encode,
        columns=categorical_features,
        dummy_na=False,
        dtype=int
    )

    # Собираем список всех дискретных колонок для CTGAN
    # Это все колонки, которых не было в исходном списке числовых колонок
    discrete_features_ohe = [
        col for col in data_ohe.columns if col not in numeric_cols or col == target_col
    ]

    return data_ohe, discrete_features_ohe

In [94]:
pd.set_option('future.no_silent_downcasting', True)
target = 'Species'
data = pd.read_csv("Iris.csv", na_values=[r'?', '？'], skipinitialspace=True)
# data = data.sample(4000)
data.dropna(inplace=True)
# data.replace({'<=50K': 0, '>50K': 1}, inplace=True)
# data['class'] = data['class'].astype(int)
data[target] = LabelEncoder().fit_transform(data[target].astype(str))
# Определение числовых признаков
numeric_features = data.select_dtypes(include=np.number).columns.tolist()

# Вызов функции
data_ohe, discrete_features_ohe = universal_one_hot_encoder(data, target_col=target)

In [95]:
data_ohe

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,0
1,2,4.9,3.0,1.4,0.2,0
2,3,4.7,3.2,1.3,0.2,0
3,4,4.6,3.1,1.5,0.2,0
4,5,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...,...
145,146,6.7,3.0,5.2,2.3,2
146,147,6.3,2.5,5.0,1.9,2
147,148,6.5,3.0,5.2,2.0,2
148,149,6.2,3.4,5.4,2.3,2


In [104]:
from ctgan import CTGAN

ctgan = CTGAN(
    epochs=2000
)
ctgan.fit(data_ohe, discrete_features_ohe)

In [154]:
synthetic_data = ctgan.sample(len(data_ohe))

In [155]:
synthetic_data

Unnamed: 0,cap-shape=b,cap-shape=c,cap-shape=f,cap-shape=k,cap-shape=s,cap-shape=x,cap-surface=f,cap-surface=g,cap-surface=s,cap-surface=y,...,population=v,population=y,habitat=d,habitat=g,habitat=l,habitat=m,habitat=p,habitat=u,habitat=w,class
0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,1,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,1,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,1,...,1,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,1,0,0,0,1,...,1,0,0,0,1,0,0,0,0,0
4,0,0,0,1,0,0,0,0,1,1,...,1,0,1,1,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145,0,0,0,0,0,1,0,0,1,1,...,0,1,0,1,0,0,0,0,0,0
146,0,0,0,0,0,1,0,0,1,1,...,1,0,0,0,1,0,0,1,0,1
147,0,0,0,0,0,1,0,0,1,1,...,1,0,0,0,1,0,0,0,0,1
148,0,0,0,0,0,0,1,0,0,1,...,1,0,0,1,1,0,0,1,0,0


In [99]:
from scipy.stats import entropy


def kl_numeric_tables(real: pd.DataFrame, syn: pd.DataFrame, eps=1e-9):
    """
    KL(P‖Q) по каждому числовому столбцу.
    - Бины общие (по real), правило 'fd' по умолчанию.
    - eps добавляется к гистограммам для избежания бесконечностей.
    Возвращает DataFrame: column | kl
    """
    cols = [c for c in real.columns if c in syn.columns]
    rows = []
    for c in cols:
        r = real[c].values
        s = syn[c].values
        hist_r, edges = np.histogram(r, bins="fd")
        hist_s, _ = np.histogram(s, bins=edges)
        kl = float(entropy(hist_r + eps, hist_s + eps))  # KL(P‖Q)
        rows.append((c, kl))
    out = pd.DataFrame(rows, columns=["column", "kl"]).sort_values("kl", ascending=False).reset_index(drop=True)

    vals = out["kl"]  #.replace([np.inf, -np.inf], np.nan).dropna()
    print(
        f"KL summary — mean: {vals.mean():.4f}, median: {vals.median():.4f}, min: {vals.min():.4f}, max: {vals.max():.4f}")
    return out

In [100]:
kl_numeric_tables(data_ohe, synthetic_data)

KL summary — mean: 0.2061, median: 0.0282, min: 0.0061, max: 0.9468


Unnamed: 0,column,kl
0,SepalLengthCm,0.946813
1,SepalWidthCm,0.203539
2,Id,0.029251
3,PetalWidthCm,0.027115
4,PetalLengthCm,0.023724
5,Species,0.006114


In [115]:

from inspect import signature
import numpy as np
from sklearn.preprocessing import label_binarize
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, f1_score, roc_auc_score, confusion_matrix,
    precision_score, recall_score, average_precision_score
)
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb

seed = 42


def tabular_model_test(x_train, y_train, x_test, y_test,
                       model_name='logistic_regression',
                       sample_weight=None, model_params=None):
    model_params = model_params or {}

    # Конструктор модели (параметры -> конструктор)
    if model_name == 'xgb':
        xgb_params = {'objective': 'binary:logistic', 'seed': seed}
        xgb_params.update(model_params)
        model = xgb.XGBClassifier(**xgb_params)
    elif model_name == 'logistic_regression':
        model = LogisticRegression(random_state=seed, **model_params)
    elif model_name == 'rf':
        model = RandomForestClassifier(random_state=seed, **model_params)
    elif model_name == 'decision_tree':
        model = DecisionTreeClassifier(random_state=seed, **model_params)
    elif model_name == 'adaboost':
        model = AdaBoostClassifier(random_state=seed, **model_params)
    elif model_name == "knn":
        model = KNeighborsClassifier(**model_params)
    elif model_name == 'mlp':
        model = MLPClassifier(random_state=seed, early_stopping=True, hidden_layer_sizes=128, **model_params)
    elif model_name == 'svm':
        model = svm.SVC(gamma='scale', C=1.0, decision_function_shape='ovr', kernel='rbf', probability=True)
    else:
        model = LogisticRegression(random_state=seed, **model_params)

    # Fit с sample_weight только если поддерживается
    fit_kwargs = {}
    if sample_weight is not None and 'sample_weight' in signature(model.fit).parameters:
        fit_kwargs['sample_weight'] = sample_weight
    model.fit(x_train, y_train, **fit_kwargs)

    # Получаем вероятности / скор для AUC/AP
    probs_binary = None
    probs_multiclass = None
    if hasattr(model, "predict_proba"):
        proba = model.predict_proba(x_test)
        if proba.ndim == 2 and proba.shape[1] == 2:
            probs_binary = proba[:, 1]
        else:
            probs_multiclass = proba
    elif hasattr(model, "decision_function"):
        scores = model.decision_function(x_test)
        if scores.ndim == 1:
            probs_binary = 1.0 / (1.0 + np.exp(-scores))
        else:
            probs_multiclass = 1.0 / (1.0 + np.exp(-scores))

    # Предсказанные метки
    if probs_multiclass is not None:
        test_pred_label = np.argmax(probs_multiclass, axis=1)
    elif probs_binary is not None:
        test_pred_label = (probs_binary >= 0.5).astype(int)
    else:
        test_pred_label = model.predict(x_test)

    # Определяем тип задачи и average для метрик
    n_classes = len(np.unique(y_train))
    is_multiclass = n_classes > 2
    avg = 'binary' if not is_multiclass else 'macro'

    # Вычисляем метрики
    test_accuracy = accuracy_score(y_test, test_pred_label)
    test_precision = precision_score(y_test, test_pred_label, average=avg, zero_division=0)
    test_f1 = f1_score(y_test, test_pred_label, average=avg, zero_division=0)
    test_recall = recall_score(y_test, test_pred_label, average=avg, zero_division=0)
    test_confusion_matrix = confusion_matrix(y_test, test_pred_label)

    test_auc = None
    test_ap = None
    try:
        if probs_multiclass is not None:
            classes = np.unique(y_train)
            y_bin = label_binarize(y_test, classes=classes)
            test_auc = roc_auc_score(y_bin, probs_multiclass, average='macro', multi_class='ovr')
            test_ap = average_precision_score(y_bin, probs_multiclass, average='macro')
        elif probs_binary is not None:
            test_auc = roc_auc_score(y_test, probs_binary)
            test_ap = average_precision_score(y_test, probs_binary)
    except Exception:
        test_auc = None
        test_ap = None

    print(f'Test Accuracy: {test_accuracy}')
    print(f'Test precision: {test_precision}')
    print(f'Test F1: {test_f1}')
    print(f'Test AUC: {test_auc}')
    print(f'Test AP: {test_ap}')
    print('Test Confusion Matrix:')
    print(test_confusion_matrix)

    return {
        'accuracy': test_accuracy,
        'precision': test_precision,
        'f1': test_f1,
        'recall': test_recall,
        'auc': test_auc,
        'ap': test_ap,
        'confusion_matrix': test_confusion_matrix
    }

In [120]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    data_ohe.drop(columns=[target]),
    data_ohe[target],
    test_size=0.5,
    random_state=42
)
X_train_synt, _, y_train_synt, _ = train_test_split(
    synthetic_data.drop(columns=[target]),
    synthetic_data[target],
    test_size=0.5,
    random_state=42
)


In [182]:
test_real = tabular_model_test(X_train, y_train, X_test, y_test, model_name='logistic_regression',
                               model_params={'max_iter': 100, 'C': 0.5})

Test Accuracy: 0.9997538158542589
Test precision: 0.9995206136145733
Test F1: 0.9997602493406856
Test AUC: 1.0
Test AP: 1.0
Test Confusion Matrix:
[[1976    1]
 [   0 2085]]


In [183]:
test_synt = tabular_model_test(X_train_synt, y_train_synt, X_test, y_test, model_name='logistic_regression',
                               model_params={'max_iter': 100, 'C': 0.5})

Test Accuracy: 0.9054652880354506
Test precision: 0.8836265223274695
Test F1: 0.9107391910739191
Test AUC: 0.9565402609627017
Test AP: 0.9434365189461298
Test Confusion Matrix:
[[1719  258]
 [ 126 1959]]


In [126]:
data = pd.read_csv("mushroom.csv")
data['class'] = data['class=e']
data = data.drop(columns=['class=e', 'class=p'])
target = 'class'

In [161]:
ctgan = CTGAN(
    epochs=2000
)
ctgan.fit(data, data.columns)


In [191]:
ctgan._generator_lr

0.0002

In [178]:
synthetic_data = ctgan.sample(len(data_ohe))
kl_numeric_tables(data, synthetic_data)

KL summary — mean: 0.0000, median: 0.0000, min: 0.0000, max: 0.0000


Unnamed: 0,column,kl
0,cap-shape=b,0.0
1,stalk-color-below-ring=c,0.0
2,veil-color=y,0.0
3,veil-color=w,0.0
4,veil-color=o,0.0
...,...,...
113,gill-size=b,0.0
114,gill-spacing=w,0.0
115,gill-spacing=c,0.0
116,gill-attachment=f,0.0


In [164]:
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=[target]),
    data[target],
    test_size=0.5,
    random_state=42
)
X_train_synt, _, y_train_synt, _ = train_test_split(
    synthetic_data.drop(columns=[target]),
    synthetic_data[target],
    test_size=0.5,
    random_state=42
)

In [188]:
test_real = tabular_model_test(X_train, y_train, X_test, y_test, model_name='logistic_regression',
                               model_params={'max_iter': 100, 'C': 0.5})

Test Accuracy: 0.9997538158542589
Test precision: 0.9995206136145733
Test F1: 0.9997602493406856
Test AUC: 1.0
Test AP: 1.0
Test Confusion Matrix:
[[1976    1]
 [   0 2085]]


In [170]:
test_synt = tabular_model_test(X_train_synt, y_train_synt, X_test, y_test, model_name='logistic_regression',
                               model_params={'max_iter': 100, 'C': 0.5})

Test Accuracy: 0.9054652880354506
Test precision: 0.8836265223274695
Test F1: 0.9107391910739191
Test AUC: 0.9565402609627017
Test AP: 0.9434365189461298
Test Confusion Matrix:
[[1719  258]
 [ 126 1959]]
