In [1]:
import os
from pathlib import Path
# Make Hugging Face `datasets` use only the local cache (no Hub access).
os.environ["HF_DATASETS_OFFLINE"] = "1"

# Remember the directory the kernel was in the first time this cell ran, then
# always move to the directory three levels above *that*. Without the anchor,
# re-running this cell would climb three more levels on every execution.
if "NOTEBOOK_START_DIR" not in globals():
    NOTEBOOK_START_DIR = Path.cwd()
os.chdir(NOTEBOOK_START_DIR.parent.parent.parent)

In [2]:
import json
import random
from typing import List, Dict, Any, Optional, Tuple

from benchmark_src.dataset_creation.target.collect_all_target_datasets import get_target_dataset_by_name
import pandas as pd

In [3]:
def convert_array_to_markdown(table_array: List[List[Any]], max_rows: int = -1) -> str:
    """
    Serialize a header-first list of lists as a Markdown pipe table.

    Every cell is converted with ``str``. So that cell content cannot break
    the table layout, a literal pipe character inside a cell is
    backslash-escaped and line breaks inside a cell are replaced by a single
    space.

    Args:
        table_array: A list of lists whose first element is the header row.
                     Example: [["col1", "col2"], ["data1", "data2"]]
        max_rows: The maximum number of data rows to include. Any negative
                  value (including the default -1) means "no limit"; 0 keeps
                  only the header and delimiter rows.

    Returns:
        The Markdown table terminated by a newline, or an empty string when
        ``table_array`` is empty or its header row is empty (an error message
        is printed in that case).
    """
    if not table_array or not table_array[0]:
        print("ERROR: Empty table array provided.") #TODO: change the whole file to use logging
        return ""

    def render_cell(value: Any) -> str:
        # Keep each cell on one physical line and keep '|' from being read as
        # a column separator.
        text = str(value)
        for line_break in ("\r\n", "\n", "\r"):
            text = text.replace(line_break, " ")
        return text.replace("|", "\\|")

    def render_row(cells: List[Any]) -> str:
        return "| " + " | ".join(render_cell(cell) for cell in cells) + " |"

    header_row = table_array[0]
    lines: List[str] = [
        render_row(header_row),
        "| " + " | ".join(["---"] * len(header_row)) + " |",
    ]

    data_rows = table_array[1:]
    # Only a non-negative limit truncates; slicing with a negative bound would
    # silently drop rows from the end instead of meaning "unlimited".
    if max_rows >= 0:
        data_rows = data_rows[:max_rows]

    lines.extend(render_row(row) for row in data_rows)

    # Join all lines with a newline and add a final newline
    return "\n".join(lines) + "\n"

In [4]:
def levenshtein_distance(a: str, b: str) -> int:
    """
    Return the Levenshtein edit distance between ``a`` and ``b``.

    The distance is the minimum number of single-character insertions,
    deletions and substitutions needed to turn ``a`` into ``b``. Only two
    rows of the dynamic-programming table are held at any time.
    """
    # Trivial cases: identical inputs, or one side empty.
    if a == b:
        return 0
    if not a:
        return len(b)
    if not b:
        return len(a)

    width = len(b)
    previous_row = list(range(width + 1))
    for row_no, char_a in enumerate(a, start=1):
        # Column 0 is the cost of deleting the first `row_no` characters of `a`.
        current_row = [row_no]
        for col_no, char_b in enumerate(b, start=1):
            deletion = previous_row[col_no] + 1
            insertion = current_row[col_no - 1] + 1
            substitution = previous_row[col_no - 1] + (char_a != char_b)
            current_row.append(min(deletion, insertion, substitution))
        previous_row = current_row
    return previous_row[width]

# -----------------------------
# Textual-change metric: normalized Levenshtein
# -----------------------------
def normalized_levenshtein(a: str, b: str) -> float:
    """
    Levenshtein distance between ``a`` and ``b`` divided by the longer length.

    Returns 0.0 when both strings are empty (nothing to compare, and it avoids
    a division by zero).
    """
    longest = max(len(a), len(b))
    return levenshtein_distance(a, b) / longest if longest else 0.0

In [5]:
def shuffle_rows(table: List[List[Optional[str]]], pos_strength: float
                ) -> Tuple[List[List[Optional[str]]], List[int], List[int]]:
    """
    Permute a random subset of the data rows among themselves.

    Row 0 (the header) never moves. ``round(pos_strength * number_of_data_rows)``
    data rows are sampled and their contents are redistributed over the same
    set of positions in a shuffled order.

    Returns:
        A tuple ``(new_table, forward_map, selected_rows)`` where
        ``forward_map[i]`` is the position in ``new_table`` that received
        original row ``i`` and ``selected_rows`` lists the sampled row indices.
        When the table has at most two rows, ``pos_strength`` is not positive,
        or fewer than two rows would be selected, the result is a copy of the
        table, the identity map and an empty list.
    """
    total = len(table)
    # Only data rows (index 1 to N) are candidates.
    body_positions = list(range(1, total))
    n_selected = int(round(pos_strength * len(body_positions)))
    forward = list(range(total))

    if total <= 2 or pos_strength <= 0.0 or n_selected <= 1:
        return [row[:] for row in table], forward, []

    picked = random.sample(body_positions, n_selected)
    targets = picked[:]
    random.shuffle(targets)

    shuffled = [row[:] for row in table]
    for origin, target in zip(picked, targets):
        forward[origin] = target
        shuffled[target] = table[origin][:]

    return shuffled, forward, picked


def shuffle_columns(table: List[List[Optional[str]]], pos_strength: float
                    ) -> Tuple[List[List[Optional[str]]], List[int], List[str]]:
    """
    Permute a random subset of columns (header cell included) among themselves.

    ``round(pos_strength * number_of_columns)`` columns are sampled and their
    contents are redistributed over the same set of column positions in a
    shuffled order, identically for every row.

    Returns:
        A tuple ``(new_table, forward_map, selected_names)`` where
        ``forward_map[c]`` is the column position that received original
        column ``c`` and ``selected_names`` holds the header values of the
        sampled columns. When there are no columns, ``pos_strength`` is not
        positive, or fewer than two columns would be selected, the result is
        a copy of the table, the identity map and an empty list.
    """
    width = len(table[0]) if len(table) > 0 else 0
    n_selected = max(1, int(round(pos_strength * width)))
    forward = list(range(width))

    if width == 0 or pos_strength <= 0.0 or n_selected <= 1:
        return [row[:] for row in table], forward, []

    picked = random.sample(list(range(width)), n_selected)
    targets = picked[:]
    random.shuffle(targets)
    moves = list(zip(picked, targets))

    def _permute(row):
        # Read from the untouched input row, write into a copy.
        rearranged = row[:]
        for origin, target in moves:
            rearranged[target] = row[origin]
        return rearranged

    shuffled = [_permute(row) for row in table]

    for origin, target in moves:
        forward[origin] = target

    return shuffled, forward, [table[0][col] for col in picked]


def shuffle_within_columns(table: List[List[Optional[str]]], chosen_cols: List[int], neg_degree: float
                           ) -> Tuple[List[List[Optional[str]]], Dict[str, List[int]], List[str]]:
    """
    Independently scramble cell values inside each chosen column.

    For every column index in ``chosen_cols``, ``round(neg_degree *
    number_of_data_rows)`` data rows are sampled and that column's values are
    redistributed over the sampled rows in a shuffled order. The header row
    and all other columns are left as they are.

    Returns:
        A tuple ``(new_table, forward_maps, column_names)``.
        ``forward_maps`` is keyed by the column's header value and maps each
        original row index to the row that received its value; because the
        key is the header value, columns sharing a header share one entry
        (the last processed wins). ``column_names`` lists the header values
        of the processed columns in order. When the table has at most two
        rows, no columns are chosen, ``neg_degree`` is not positive, or fewer
        than two rows would be selected, the result is a copy of the table,
        an empty dict and an empty list.
    """
    total = len(table)
    body_positions = list(range(1, total))
    n_selected = int(round(neg_degree * len(body_positions)))

    scrambled = [row[:] for row in table]

    if total <= 2 or not chosen_cols or neg_degree <= 0.0 or n_selected <= 1:
        return scrambled, {}, []

    forward_maps: Dict[str, List[int]] = {}
    processed_names = []

    for col in chosen_cols:
        header = table[0][col]
        processed_names.append(header)

        # Fresh sample and shuffle for every column.
        picked = random.sample(body_positions, n_selected)
        targets = picked[:]
        random.shuffle(targets)

        forward = list(range(total))
        for origin, target in zip(picked, targets):
            scrambled[target][col] = table[origin][col]
            forward[origin] = target

        forward_maps[header] = forward

    return scrambled, forward_maps, processed_names


def generate_positive(table: List[List[Optional[str]]], pos_type: str, pos_strength: float):
    """
    Build a positive variant of ``table`` by reordering rows and/or columns.

    Args:
        table: Header-first list of rows; it is copied, not modified.
        pos_type: ``"row_reorder"``, ``"col_reorder"`` or ``"both"``. With
            ``"both"`` the rows are reordered first, then the columns.
        pos_strength: Value in [0, 1] forwarded to ``shuffle_rows`` /
            ``shuffle_columns``.

    Returns:
        ``(new_table, pos_meta)``. ``pos_meta`` always has ``"pos_type"`` and,
        for each reordering applied, the permutation map plus the affected
        row indices / column names returned by the shuffle helper.

    Raises:
        AssertionError: if ``pos_strength`` is outside [0, 1] or ``pos_type``
            is not one of the accepted values.
    """
    assert 0.0 <= pos_strength <= 1.0
    assert pos_type in {"row_reorder", "col_reorder", "both"}

    reorder_rows = pos_type in ("row_reorder", "both")
    reorder_cols = pos_type in ("col_reorder", "both")

    current = [row[:] for row in table]
    pos_meta: Dict[str, Any] = {"pos_type": pos_type}

    if reorder_rows:
        current, row_map, moved_rows = shuffle_rows(current, pos_strength)
        pos_meta["row_permutation_index"] = row_map
        pos_meta["affected_row_indices"] = moved_rows

    if reorder_cols:
        current, col_map, moved_cols = shuffle_columns(current, pos_strength)
        pos_meta["col_permutation_index"] = col_map
        pos_meta["affected_col_names"] = moved_cols

    return current, pos_meta


def generate_negative(table: List[List[Optional[str]]], neg_columns_frac: float, neg_degree: float):
    """
    Build a negative variant of ``table`` by scrambling values inside columns.

    ``round(neg_columns_frac * number_of_columns)`` columns are sampled and
    handed to ``shuffle_within_columns`` together with ``neg_degree``.

    Args:
        table: Header-first list of rows; it is copied, not modified.
        neg_columns_frac: Fraction in [0, 1] of columns to scramble.
        neg_degree: Value in [0, 1] controlling how many rows of each chosen
            column take part in the scramble.

    Returns:
        ``(new_table, neg_meta)``. ``neg_meta`` always contains the keys
        ``"neg_columns_frac"``, ``"neg_degree"``, ``"col_permutation_indexes"``
        and ``"affected_col_names"``; the last two are an empty dict / empty
        list whenever nothing was scrambled, so every code path yields the
        same metadata schema.

    Raises:
        AssertionError: if either fraction is outside [0, 1].
    """
    assert 0.0 <= neg_columns_frac <= 1.0
    assert 0.0 <= neg_degree <= 1.0

    n_rows = len(table)
    n_cols = len(table[0]) if n_rows > 0 else 0
    T = [row[:] for row in table]

    # Permutation keys start out empty so the early returns below expose the
    # same keys as the path that goes through shuffle_within_columns.
    neg_meta: Dict[str, Any] = {
        "neg_columns_frac": neg_columns_frac,
        "neg_degree": neg_degree,
        "col_permutation_indexes": {},
        "affected_col_names": [],
    }

    # Need at least one column and at least two data rows to scramble anything.
    if n_cols == 0 or n_rows <= 2:
        return T, neg_meta

    num_cols_to_perm = int(round(neg_columns_frac * n_cols))
    if num_cols_to_perm <= 0:
        return T, neg_meta

    chosen_cols = random.sample(list(range(n_cols)), num_cols_to_perm)

    T, col_forward_map, affected_names = shuffle_within_columns(T, chosen_cols, neg_degree)

    neg_meta["col_permutation_indexes"] = col_forward_map
    neg_meta["affected_col_names"] = affected_names

    return T, neg_meta

In [6]:
# -----------------------------
# Triplet generation
# -----------------------------
def generate_triplets_from_dataset(
    dataset: List[Dict[str, Any]],
    triplets_per_anchor: int,
    pos_params: Dict[str, Any],
    neg_params: Dict[str, Any],
):
    """
    Create (anchor, positive, negative) table triplets for every record.

    Args:
        dataset: Records providing ``"table"``, ``"database_id"`` and
            ``"table_id"``.
        triplets_per_anchor: Number of triplets generated per record.
        pos_params: Provides ``"pos_type"`` and ``"pos_strength"`` for
            ``generate_positive``.
        neg_params: Provides ``"neg_columns_frac"`` and ``"neg_degree"`` for
            ``generate_negative``.

    Returns:
        ``(triplets, avg_delta_pos, avg_delta_neg)``. Each triplet dict holds
        the ids, the three tables, both generators' metadata and the
        normalized Levenshtein distance between the Markdown serialization of
        the anchor and of the positive / negative table. The averages are 0.0
        when no triplet was produced.
    """
    def _average(values):
        return sum(values) / len(values) if values else 0.0

    triplets = []

    for record in dataset:
        anchor = record["table"]
        # Serialize the anchor once; it is compared against every variant.
        anchor_markdown = convert_array_to_markdown(anchor)

        for _ in range(triplets_per_anchor):
            # Positive first, then negative: the order fixes how the shared
            # random state is consumed.
            positive, pos_meta = generate_positive(
                anchor, pos_params["pos_type"], pos_params["pos_strength"]
            )
            negative, neg_meta = generate_negative(
                anchor, neg_params["neg_columns_frac"], neg_params["neg_degree"]
            )

            delta_pos = normalized_levenshtein(anchor_markdown, convert_array_to_markdown(positive))
            delta_neg = normalized_levenshtein(anchor_markdown, convert_array_to_markdown(negative))

            triplets.append({
                "database_id": record["database_id"],
                "table_id": record["table_id"],
                "anchor_table": anchor,
                "pos_table": positive,
                "neg_table": negative,
                "delta_pos": delta_pos,
                "delta_neg": delta_neg,
                "pos_meta": pos_meta,
                "neg_meta": neg_meta,
            })

    avg_delta_pos = _average([entry["delta_pos"] for entry in triplets])
    avg_delta_neg = _average([entry["delta_neg"] for entry in triplets])

    return triplets, avg_delta_pos, avg_delta_neg

In [7]:
config = {
    "dataset_name": "fetaqa",
    "random_seed": 42,
    # Number of leading corpus tables used as anchors (formerly a bare 10 in
    # the select() call below).
    "num_anchor_tables": 10,
    # NOTE(review): no cell visible in this notebook reads the "dataset"
    # section - confirm whether it is consumed elsewhere or can be removed.
    "dataset": {
        "num_databases": 1,
        "tables_per_db": 20,
        "min_rows": 10,
        "max_rows": 50,
        "min_cols": 3,
        "max_cols": 10
    },
    "triplets_per_anchor": 1,
    "pos_params": {
        "pos_type": "both",   # one of "row_reorder", "col_reorder", "both"
        "pos_strength": 0.5          # fraction [0..1] of rows/cols to shuffle
    },
    "neg_params": {
        "neg_columns_frac": 0.3,     # fraction [0..1] of columns to permute
        "neg_degree": 0.8            # [0..1] how shuffled each chosen column is
    }
}

# Seed the global `random` generator used by the shuffle helpers.
random.seed(config["random_seed"])

dataset_new = get_target_dataset_by_name(config["dataset_name"]).corpus
# Keep only the first `num_anchor_tables` tables (or all of them if fewer exist).
corpus = dataset_new.select(range(min(config["num_anchor_tables"], len(dataset_new))))

2026-01-27 19:30:08,979 - benchmark_src.dataset_creation.target.collect_all_target_datasets - INFO - Loading queries for fetaqa using datasets library...
Using the latest cached version of the dataset since target-benchmark/fetaqa-queries couldn't be found on the Hugging Face Hub (offline mode is enabled).
Found the latest cached dataset configuration 'default' at /home/ayeen/uni/research/liane/table-representation-evals/cache/datasets/target-benchmark___fetaqa-queries/default/0.0.0/d878c7bb22159fbeb704ab6acb316edbab2884e2 (last modified on Mon Oct 27 12:40:53 2025).
2026-01-27 19:30:09,043 - benchmark_src.dataset_creation.target.collect_all_target_datasets - INFO - Loading corpus for fetaqa using datasets library...
Using the latest cached version of the dataset since target-benchmark/fetaqa-corpus couldn't be found on the Hugging Face Hub (offline mode is enabled).
Found the latest cached dataset configuration 'default' at /home/ayeen/uni/research/liane/table-representation-evals/cac

In [8]:
# Show the corpus with the "pandas" output format requested. The result is not
# assigned, so the name `corpus` stays bound to the same object as before.
# NOTE(review): assumes with_format returns a new object instead of changing
# `corpus` in place - confirm against the datasets API.
corpus.with_format("pandas")

Dataset({
    features: ['database_id', 'table_id', 'table', 'context'],
    num_rows: 10
})

In [9]:
def table_2d_to_df(table_2d):
    """
    Turn a header-first 2-D list into a pandas DataFrame.

    ``table_2d[0]`` supplies the column labels and the remaining rows supply
    the data. Rows in which every value is missing are dropped. ``None`` or
    an empty input yields an empty DataFrame.
    """
    if table_2d is None:
        return pd.DataFrame()
    if len(table_2d) == 0:
        return pd.DataFrame()

    column_labels = list(table_2d[0])
    records = table_2d[1:]

    return pd.DataFrame(records, columns=column_labels).dropna(how="all")

In [10]:
# Generate triplets for every table in `corpus` using the settings in `config`,
# then report how many were produced and the average textual change of the
# positive and negative variants.
print("Generating triplets...")
triplets, avg_delta_pos, avg_delta_neg = generate_triplets_from_dataset(
    corpus.to_list(),
    config["triplets_per_anchor"],
    config["pos_params"],
    config["neg_params"],
)

for report_line in (
    f"Generated {len(triplets)} triplets",
    f"Average normalized textual-change Δ_text (pos): {avg_delta_pos:.4f}",
    f"Average normalized textual-change Δ_text (neg): {avg_delta_neg:.4f}",
):
    print(report_line)

Generating triplets...
Generated 10 triplets
Average normalized textual-change Δ_text (pos): 0.3325
Average normalized textual-change Δ_text (neg): 0.1091


In [11]:
# Display the anchor table of the triplet at index 8 as a DataFrame.
table_2d_to_df(triplets[8]['anchor_table'])

Unnamed: 0,Isotope,Half-life,Decay mode,Discovery year,Reaction
0,253Rf,48 μs,"α, SF",1994,"204Pb(50Ti,n)"
1,254Rf,23 μs,SF,1994,"206Pb(50Ti,2n)"
2,255Rf,2.3 s,"ε?, α, SF",1974,"207Pb(50Ti,2n)"
3,256Rf,6.4 ms,"α, SF",1974,"208Pb(50Ti,2n)"
4,257Rf,4.7 s,"ε, α, SF",1969,"249Cf(12C,4n)"
5,257mRf,4.1 s,"ε, α, SF",1969,"249Cf(12C,4n)"
6,258Rf,14.7 ms,"α, SF",1969,"249Cf(13C,4n)"
7,259Rf,3.2 s,"α, SF",1969,"249Cf(13C,3n)"
8,259mRf,2.5 s,ε,1969,"249Cf(13C,3n)"
9,260Rf,21 ms,"α, SF",1969,"248Cm(16O,4n)"


In [12]:
# Display the positive (row/column-reordered) table of the triplet at index 8.
table_2d_to_df(triplets[8]['pos_table'])

Unnamed: 0,Decay mode,Half-life,Isotope,Discovery year,Reaction
0,"α, SF",48 μs,253Rf,1994,"204Pb(50Ti,n)"
1,SF,23 μs,254Rf,1994,"206Pb(50Ti,2n)"
2,"ε?, α, SF",2.3 s,255Rf,1974,"207Pb(50Ti,2n)"
3,"α, SF",6.4 ms,256Rf,1974,"208Pb(50Ti,2n)"
4,"ε, α, SF",4.7 s,257Rf,1969,"249Cf(12C,4n)"
5,"α, SF",78 s,261Rf,1970,"248Cm(18O,5n)"
6,"α, SF",14.7 ms,258Rf,1969,"249Cf(13C,4n)"
7,SF,23 s?,266Rf,2007?,"266Db( e−, ν e)?"
8,SF,1.4 s?,268Rf,2004?,"268Db( e−, ν e)?"
9,"α, SF",21 ms,260Rf,1969,"248Cm(16O,4n)"


In [13]:
# Display the negative (within-column shuffled) table of the triplet at index 8.
table_2d_to_df(triplets[8]['neg_table'])

Unnamed: 0,Isotope,Half-life,Decay mode,Discovery year,Reaction
0,253Rf,8 s,"α, SF",1994,"269Sg(—,α)"
1,254Rf,23 μs,SF,1994,"244Pu(22Ne,4n)"
2,255Rf,2.5 s,"ε?, α, SF",1974,"268Db( e−, ν e)?"
3,256Rf,2.3 s,"α, SF",1974,"249Cf(13C,3n)"
4,257Rf,1.3 h,"ε, α, SF",1969,"249Cf(12C,4n)"
5,257mRf,48 μs,"ε, α, SF",1969,"204Pb(50Ti,n)"
6,258Rf,15 min,"α, SF",1969,"248Cm(16O,4n)"
7,259Rf,3.2 s,"α, SF",1969,"207Pb(50Ti,2n)"
8,259mRf,1.4 s?,ε,1969,"249Cf(12C,4n)"
9,260Rf,14.7 ms,"α, SF",1969,"249Cf(13C,3n)"


In [15]:
def _triplet_summary_record(triplet_id, triplet):
    """Build the JSON-serializable summary dict for one triplet."""
    pos_meta = triplet.get("pos_meta", {})
    neg_meta = triplet.get("neg_meta", {})

    positive = {
        "type": pos_meta.get("pos_type"),
        # Row reorder data
        "row_reorder": {
            "permutation_index": pos_meta.get("row_permutation_index", []),
            "affected_indices": pos_meta.get("affected_row_indices", []),
        },
        # Column reorder data
        "column_reorder": {
            "permutation_index": pos_meta.get("col_permutation_index", []),
            "affected_indices": pos_meta.get("affected_col_names", []),
        },
    }

    negative = {
        # Intra-column shuffle data
        "intra_column_shuffle": {
            "permutation_indexes": neg_meta.get("col_permutation_indexes", {}),
            "affected_indices": neg_meta.get("affected_col_names", []),
        },
        "neg_degree": neg_meta.get("neg_degree"),
        "neg_columns_frac": neg_meta.get("neg_columns_frac"),
    }

    return {
        "triplet_id": triplet_id,
        "database_id": triplet.get("database_id"),
        "table_id": triplet.get("table_id"),
        "delta_pos": triplet.get("delta_pos"),
        "delta_neg": triplet.get("delta_neg"),
        "positive": positive,
        "negative": negative,
    }


def summarize_triplets(triplets, out_path):
    """
    Write one JSON line per triplet to ``out_path`` (UTF-8, overwriting).

    Each line carries the triplet's position in ``triplets``, its ids, both
    delta values and the positive / negative perturbation metadata. Missing
    fields fall back to ``None`` or an empty list / dict. The tables
    themselves are not written.
    """
    with open(out_path, "w", encoding="utf-8") as sink:
        for triplet_id, triplet in enumerate(triplets):
            record = _triplet_summary_record(triplet_id, triplet)
            sink.write(json.dumps(record, ensure_ascii=False) + "\n")

# Write the per-triplet metadata summary (one JSON object per line) and report
# how many records went where.
summary_path = "triplet_generation_summary.jsonl"
summarize_triplets(triplets, summary_path)
print(f"Summarized {len(triplets)} triplets to {summary_path}")

Summerized 
