In [1]:

import numpy as np
import hashlib
from midst_models.single_table_TabDDPM.lib import Dataset, TaskType, transform_dataset
from typing import Any, Dict, List, Literal, Optional, Tuple, Union, cast
from pathlib import Path
from dataclasses import astuple, dataclass, replace
from midst_models.single_table_TabDDPM.lib import (
    Transformations,
    prepare_fast_dataloader,
    round_columns,
)
import json

from midst_models.single_table_TabDDPM.complex_pipeline import (
    clava_clustering,
    clava_load_pretrained_customized,
    load_configs,
)
from midst_models.single_table_TabDDPM.pipeline_utils import load_multi_table_customized
import pickle

from midst_models.single_table_TabDDPM.complex_pipeline import CustomUnpickler
import os


# Sentinel strings for categorical preprocessing. Neither constant is
# referenced by the code visible in this notebook.
CAT_MISSING_VALUE = "__nan__"
CAT_RARE_VALUE = "__rare__"
# Literal aliases used as field annotations by the `Transformations`
# dataclass defined further down in this cell.
Normalization = Literal["standard", "quantile", "minmax"]
NumNanPolicy = Literal["drop-rows", "mean"]
CatNanPolicy = Literal["most_frequent"]
CatEncoding = Literal["one-hot", "counter"]
YPolicy = Literal["default"]
# Mapping from a key (this notebook uses split names such as "train" and
# "test") to a numpy array.
ArrayDict = Dict[str, np.ndarray]

def raise_unknown(unknown_what: str, unknown_value: Any):
    """Raise a ``ValueError`` describing an unrecognised option.

    Args:
        unknown_what: Human-readable name of the option kind (e.g. "policy").
        unknown_value: The offending value; it is formatted into the message.

    Raises:
        ValueError: always, with the message ``"Unknown <what>: <value>"``.
    """
    message = "Unknown {}: {}".format(unknown_what, unknown_value)
    raise ValueError(message)


def get_table_info(df, domain_dict, y_col):
    """Split a table's columns into categorical and numerical feature lists.

    Only columns that appear in ``domain_dict`` and are not ``y_col`` are
    considered. A column whose domain ``"type"`` is ``"discrete"`` is treated
    as categorical; every other listed column is treated as numerical.

    Args:
        df: Object exposing a ``columns`` iterable (a DataFrame in this notebook).
        domain_dict: Mapping ``column name -> {"type": ...}``.
        y_col: Name of the target column, excluded from both feature lists.

    Returns:
        Dict with keys ``cat_cols``, ``num_cols``, ``y_col``, ``n_classes``
        (always 0) and ``task_type`` (always ``"multiclass"``).
    """
    feature_cols = [
        name for name in df.columns if name in domain_dict and name != y_col
    ]
    categorical = []
    numerical = []
    for name in feature_cols:
        is_discrete = domain_dict[name]["type"] == "discrete"
        (categorical if is_discrete else numerical).append(name)

    return {
        "cat_cols": categorical,
        "num_cols": numerical,
        "y_col": y_col,
        "n_classes": 0,
        "task_type": "multiclass",
    }

def get_T_dict():
    """Return the default keyword arguments used to build ``Transformations``.

    A fresh dict is created on every call, so callers may mutate the result.
    Quantile normalization and the ``"default"`` target policy are selected;
    every other option is left as ``None``.
    """
    t_dict = dict(seed=0, normalization="quantile")
    for disabled_option in (
        "num_nan_policy",
        "cat_nan_policy",
        "cat_min_frequency",
        "cat_encoding",
    ):
        t_dict[disabled_option] = None
    t_dict["y_policy"] = "default"
    return t_dict

def get_model_params(rtdl_params=None):
    """Assemble the model parameter dict for a table's diffusion model.

    Args:
        rtdl_params: Optional dict of network hyper-parameters. When ``None``
            a default six-layer configuration with zero dropout is used.

    Returns:
        Dict with ``num_classes`` (0), ``is_y_cond`` (the string ``"none"``)
        and ``rtdl_params``.
    """
    if rtdl_params is None:
        # Built inside the function so each call gets its own default object.
        rtdl_params = {
            "d_layers": [512, 1024, 1024, 1024, 1024, 512],
            "dropout": 0.0,
        }
    model_params = {"num_classes": 0, "is_y_cond": "none"}
    model_params["rtdl_params"] = rtdl_params
    return model_params

def build_target(
    y: ArrayDict, policy: Optional[YPolicy], task_type: TaskType
) -> Tuple[ArrayDict, Dict[str, Any]]:
    """Apply the target policy to ``y`` and report what was done.

    With ``policy=None`` the targets are returned untouched. With
    ``policy="default"`` and a regression task, every split is standardised
    with the mean and standard deviation of ``y["train"]``; for other task
    types the targets are returned untouched. Any other policy is rejected
    through ``raise_unknown``.

    Note: a zero standard deviation is not guarded against.

    Returns:
        ``(y, info)`` where ``info`` always holds ``"policy"`` and, when
        standardisation was applied, ``"mean"`` and ``"std"``.
    """
    info: Dict[str, Any] = {"policy": policy}
    if policy is not None:
        if policy != "default":
            raise_unknown("policy", policy)
        elif task_type == TaskType.REGRESSION:
            train_targets = y["train"]
            mean = float(train_targets.mean())
            std = float(train_targets.std())
            y = {split: (values - mean) / std for split, values in y.items()}
            info["mean"] = mean
            info["std"] = std
    return y, info

@dataclass(frozen=True)
class Transformations:
    """Immutable bundle of preprocessing options consumed by ``transform_dataset``.

    NOTE(review): the name ``Transformations`` is also imported from
    ``midst_models.single_table_TabDDPM.lib`` in this file's import block; this
    definition rebinds it, so code below uses this local class.

    All fields take part in the cache key built by ``transform_dataset``
    (via ``str()`` and ``astuple()``).
    """

    # Apart from the cache key, not read by the `transform_dataset` in this file.
    seed: int = 0
    # Apart from the cache key, not read by the `transform_dataset` in this file.
    normalization: Optional[Normalization] = None
    # Apart from the cache key, not read by the `transform_dataset` in this file.
    num_nan_policy: Optional[NumNanPolicy] = None
    # Forwarded to `cat_process_nans`; must be None when the dataset has no X_cat.
    cat_nan_policy: Optional[CatNanPolicy] = None
    # When not None, forwarded to `cat_drop_rare`; must be None when there is no X_cat.
    cat_min_frequency: Optional[float] = None
    # Apart from the cache key, not read by the `transform_dataset` in this file.
    cat_encoding: Optional[CatEncoding] = None
    # Forwarded to `build_target` as its `policy` argument.
    y_policy: Optional[YPolicy] = "default"


def transform_dataset(
    dataset: Dataset,
    transformations: Transformations,
    cache_dir: Optional[Path],
    transform_cols_num: int = 0,
    normalizer=None,
    cat_transform=None,
    num_transform=None,
) -> Dataset:
    """Apply already-fitted feature transforms to ``dataset`` and build its target.

    Unlike a fitting pipeline, this function never fits anything: the caller
    supplies ``num_transform`` / ``cat_transform`` objects exposing a
    ``transform`` method (e.g. loaded from a checkpoint).

    Args:
        dataset: Source dataset; it is not modified, a copy is returned.
        transformations: Options; only ``cat_nan_policy``,
            ``cat_min_frequency`` and ``y_policy`` are acted upon here, the
            rest only contribute to the cache key.
        cache_dir: When not ``None``, a previously pickled result is looked up
            there and returned on a hit. This function never writes the cache.
        transform_cols_num: Accepted for interface compatibility; unused.
        normalizer: Accepted for interface compatibility; unused.
        cat_transform: Fitted transform for categorical features; required
            when ``dataset.X_cat`` is not ``None``.
        num_transform: Fitted transform for numerical features; required when
            ``dataset.X_num`` contains at least one split.

    Returns:
        A new ``Dataset`` whose ``X_num`` holds the transformed numerical
        features (with transformed categorical features stacked to the right
        when present), ``X_cat`` set to ``None``, and the supplied transforms
        attached as ``num_transform`` / ``cat_transform`` attributes.

    Raises:
        RuntimeError: a cache file exists but was produced by different
            transformations.
        ValueError: a required transform was not supplied.
    """
    # WARNING: the order of transformations matters. Moreover, the current
    # implementation is not ideal in that sense.
    if cache_dir is not None:
        transformations_md5 = hashlib.md5(
            str(transformations).encode("utf-8")
        ).hexdigest()
        transformations_str = "__".join(map(str, astuple(transformations)))
        cache_path = (
            cache_dir / f"cache__{transformations_str}__{transformations_md5}.pickle"
        )
        if cache_path.exists():
            # SECURITY: unpickling can execute arbitrary code; only point
            # cache_dir at directories whose contents you trust.
            cache_transformations, value = pickle.loads(cache_path.read_bytes())
            if transformations == cache_transformations:
                print(
                    f"Using cached features: {cache_dir.name + '/' + cache_path.name}"
                )
                return value
            raise RuntimeError(f"Hash collision for {cache_path}")

    X_num = dataset.X_num
    if X_num is not None:
        # An empty dict needs no transform, so only insist on one when there
        # is actually something to transform.
        if num_transform is None and len(X_num) > 0:
            raise ValueError(
                "num_transform must be provided to transform numerical features"
            )
        X_num = {k: num_transform.transform(v) for k, v in X_num.items()}

    if dataset.X_cat is None:
        assert transformations.cat_nan_policy is None
        assert transformations.cat_min_frequency is None
        # assert transformations.cat_encoding is None
        X_cat = None
    else:
        # NOTE(review): cat_process_nans / cat_drop_rare are neither defined
        # nor imported in the visible part of this file - confirm they are in
        # scope before relying on this branch.
        X_cat = cat_process_nans(dataset.X_cat, transformations.cat_nan_policy)
        if transformations.cat_min_frequency is not None:
            X_cat = cat_drop_rare(X_cat, transformations.cat_min_frequency)

        if cat_transform is None:
            raise ValueError("See why no cat_tramsform")
        X_cat = {
            k: cat_transform.transform(v).astype("float32") for k, v in X_cat.items()
        }
        # Encoded categoricals are appended to the right of the numerical block.
        X_num = (
            X_cat
            if X_num is None
            else {x: np.hstack([X_num[x], X_cat[x]]) for x in X_num}
        )
        X_cat = None

    y, y_info = build_target(dataset.y, transformations.y_policy, dataset.task_type)

    dataset = replace(dataset, X_num=X_num, X_cat=X_cat, y=y, y_info=y_info)
    # Keep the transforms on the dataset so callers can reuse them later.
    dataset.num_transform = num_transform
    dataset.cat_transform = cat_transform

    return dataset
def make_dataset_from_df_with_loaded(df, T, is_y_cond, ratios=[0.7, 0.2, 0.1], df_info=None, std=0, label_encoders=None, num_transform=None):
    """Build a train-only ``Dataset`` from ``df`` using pre-fitted encoders.

    Categorical columns are encoded with the supplied ``label_encoders`` (one
    per categorical column, in column order) and appended to the right of the
    numerical columns. The result is then passed through ``transform_dataset``
    together with ``num_transform``.

    Args:
        df: Source DataFrame.
        T: ``Transformations`` instance forwarded to ``transform_dataset``.
        is_y_cond: When equal to ``"concat"`` the target column is prepended
            to the categorical columns (if ``n_classes > 0``) or to the
            numerical columns (otherwise).
        ratios: Unused; kept for interface compatibility.
        df_info: Dict with ``cat_cols``, ``num_cols``, ``y_col``,
            ``n_classes`` and ``task_type``. Required despite the default.
        std: When ``> 0``, Gaussian noise with this standard deviation is
            added to each encoded categorical column.
        label_encoders: Sequence of fitted encoders exposing ``transform``;
            required when there are categorical columns.
        num_transform: Fitted numerical transform forwarded to
            ``transform_dataset``.

    Returns:
        ``(dataset, label_encoders, column_orders)`` where ``column_orders``
        lists the column names in the order numerical-then-categorical.

    Raises:
        ValueError: ``df_info`` is missing, or categorical columns are present
            but ``label_encoders`` is ``None``.
    """
    if df_info is None:
        raise ValueError("df_info is required")

    cat_cols = df_info["cat_cols"]
    num_cols = df_info["num_cols"]
    y_col = df_info["y_col"]
    concat_y = is_y_cond == "concat"

    index_to_column = list(df.columns)
    column_to_index = {col: i for i, col in enumerate(index_to_column)}

    # Working copies so None column lists behave like "no columns" below.
    cat_names = list(cat_cols) if cat_cols is not None else []
    num_names = list(num_cols) if num_cols is not None else []
    y = {}

    if df_info["n_classes"] > 0:
        # Target is categorical: when concatenated it joins the categoricals.
        X_cat = {} if cat_cols is not None or concat_y else None
        X_num = {} if num_cols is not None else None
        if concat_y:
            cat_names = [y_col] + cat_names

        if len(cat_names) > 0:
            X_cat["train"] = df[cat_names].to_numpy(dtype=np.str_)

        y["train"] = df[y_col].values.astype(np.float32)

        if num_cols is not None:
            X_num["train"] = df[num_names].values.astype(np.float32)
    else:
        # Target is numerical: when concatenated it joins the numericals.
        X_cat = {} if cat_cols is not None else None
        X_num = {} if num_cols is not None or concat_y else None
        if concat_y:
            num_names = [y_col] + num_names

        if len(num_names) > 0:
            X_num["train"] = df[num_names].values.astype(np.float32)

        y["train"] = df[y_col].values.astype(np.float32)

        if cat_cols is not None:
            X_cat["train"] = df[cat_names].to_numpy(dtype=np.str_)

    num_column_orders = [column_to_index[col] for col in num_names]
    cat_column_orders = [column_to_index[col] for col in cat_names]
    column_orders = [
        index_to_column[index] for index in num_column_orders + cat_column_orders
    ]

    if X_cat is not None and cat_cols is not None and len(cat_cols) > 0:
        if label_encoders is None:
            raise ValueError('Should be loaded: label_encoder')

        X_cat_all = X_cat["train"]
        encoded_columns = []
        for col_index in range(X_cat_all.shape[1]):
            print('label_encoders loaded')
            encoded = (
                label_encoders[col_index].transform(X_cat_all[:, col_index]).astype(float)
            )
            if std > 0:
                # add noise
                encoded += np.random.normal(0, std, encoded.shape)
            encoded_columns.append(encoded)

        # Stack per-column vectors back into a (rows, columns) matrix.
        X_cat["train"] = np.vstack(encoded_columns).T

        if X_num is not None and len(X_num) > 0:
            X_num["train"] = np.concatenate((X_num["train"], X_cat["train"]), axis=1)
        else:
            # No numerical block: the encoded categoricals become the features.
            X_num = X_cat
            X_cat = None

    D = Dataset(
        X_num,
        None,
        y,
        y_info={},
        task_type=TaskType(df_info["task_type"]),
        n_classes=df_info["n_classes"],
    )

    return transform_dataset(D, T, None, num_transform=num_transform), label_encoders, column_orders

    

# Default value passed as `meta_dir` to `load_multi_table_customized` when the
# caller of `get_dataset` does not override it.
DEFAULT_META_DIR = "/work4/xiaoyuwu/MIDSTModelsMIA/midst_models/single_table_TabDDPM/configs"


def get_dataset(config_path =None, save_dir_tmp=None, train_name="train_with_id.csv", meta_dir=DEFAULT_META_DIR):
    """Build one data loader per (parent, child) relation from saved checkpoints.

    For every relation the child table is stripped of its ``*_id*`` columns,
    encoded with the label encoders and numerical transform stored in
    ``<save_dir_tmp>/<parent>_<child>_ckpt.pkl``, and wrapped in a loader with
    batch size 1 over a "test" split that aliases the "train" split.

    Args:
        config_path: Path of the config file handed to ``load_configs``.
        save_dir_tmp: Directory holding the table data and the
            ``*_ckpt.pkl`` checkpoints.
        train_name: File name of the table to load.
        meta_dir: Directory forwarded as ``meta_dir`` to
            ``load_multi_table_customized``.

    Returns:
        List of loaders, one per entry of the relation order.
    """
    configs, save_dir = load_configs(config_path)
    print(configs["general"]["data_dir"])

    # Load the multi-table dataset.
    tables, relation_order, dataset_meta = load_multi_table_customized(
        save_dir_tmp, meta_dir=meta_dir, train_name=train_name
    )
    print("")

    # Tables is a dictionary of the multi-table dataset
    print(
        "{} We show the keys of the tables dictionary below {}".format("=" * 20, "=" * 20)
    )
    print(list(tables.keys()))

    # Display important clustering parameters
    params_clustering = configs["clustering"]
    print("{} We show the clustering parameters below {}".format("=" * 20, "=" * 20))
    for key, val in params_clustering.items():
        print(f"{key}: {val}")
    print("")

    # Clustering on the multi-table dataset
    tables, all_group_lengths_prob_dicts = clava_clustering(
        tables, relation_order, save_dir, configs
    )

    # Only `is_y_cond` is consumed below; it does not depend on the relation.
    child_model_params = get_model_params(
        {
            "d_layers": configs["diffusion"]["d_layers"],
            "dropout": configs["diffusion"]["dropout"],
        }
    )

    train_loader_list = []
    for parent, child in relation_order:
        print(f"Getting {parent} -> {child} model from scratch")
        df_with_cluster = tables[child]["df"]

        # `drop` returns a new frame, so the table in `tables` is left intact.
        id_cols = [col for col in df_with_cluster.columns if "_id" in col]
        child_df_with_cluster = df_with_cluster.drop(columns=id_cols)

        if parent is None:
            # Root tables have no cluster column; use a row counter as target.
            y_col = "placeholder"
            child_df_with_cluster["placeholder"] = list(range(len(child_df_with_cluster)))
        else:
            y_col = f"{parent}_{child}_cluster"
        child_info = get_table_info(child_df_with_cluster, tables[child]["domain"], y_col)

        # SECURITY: unpickling can execute arbitrary code; only load
        # checkpoints from a trusted source.
        file_path = os.path.join(save_dir_tmp, f"{parent}_{child}_ckpt.pkl")
        with open(file_path, "rb") as f:
            model = CustomUnpickler(f).load()

        T = Transformations(**get_T_dict())
        dataset, label_encoders, column_orders = make_dataset_from_df_with_loaded(
            child_df_with_cluster,
            T,
            is_y_cond=child_model_params["is_y_cond"],
            ratios=[0.99, 0.005, 0.005],
            df_info=child_info,
            std=0,
            label_encoders=model['label_encoders'],
            num_transform=model['dataset'].num_transform,
        )

        # Expose the single "train" split under the name "test" as well, since
        # the loader below is built over split="test".
        dataset.X_num['test'] = dataset.X_num['train']
        if dataset.X_cat is not None:
            dataset.X_cat['test'] = dataset.X_cat['train']
        dataset.y['test'] = dataset.y['train']
        train_loader = prepare_fast_dataloader(
            dataset, split="test", batch_size=1, y_type="long"
        )
        train_loader_list.append(train_loader)
    return train_loader_list



In [2]:

# Build the loaders for the challenge table and preview the first record of each.

config_path = "configs/trans_demo.json"
save_dir_tmp = '/work4/xiaoyuwu/MIDSTModelsMIA/starter_kits/tabddpm_white_box/train/tabddpm_1'
train_loader_list = get_dataset(
    config_path,
    save_dir_tmp,
    train_name='challenge_with_id.csv',
)
device = "cuda"

for train_loader in train_loader_list:
    # Each loader yields (features, target); move both to the device.
    x, target = next(train_loader)
    x = x.to(device)
    out_dict = {"y": target.long().to(device)}

    print(x, out_dict)

/work4/xiaoyuwu/MIDSTModelsMIA/starter_kits/tabddpm_white_box/train/tabddpm_1
Table name: trans, Total dataframe shape: (200, 8), Numerical data shape: (200, 4), Categorical data shape: (200, 4)

['trans']
parent_scale: 1.0
num_clusters: 50
clustering_method: both

Clustering checkpoint found, loading...
Getting None -> trans model from scratch
label_encoders loaded
label_encoders loaded
label_encoders loaded
label_encoders loaded
tensor([[-0.3191, -0.1172, -1.5632,  0.9795,  5.1993, -0.6125,  0.2794,  1.5003]],
       device='cuda:0') {'y': tensor([0], device='cuda:0')}


In [3]:
""

''