In [69]:
import sys
import os

# Navigate up to the project root (adjust path as needed)
project_root = os.path.abspath(os.path.join(os.getcwd(), "../../"))
sys.path.append(project_root)

In [70]:
from pathlib import Path
from tether.dataset.source import DataSource, Column
from tether.dataset.repository import DataRepository

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.cluster import HDBSCAN

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset as TorchDataset

from tqdm import tqdm

In [71]:
data_dir = Path("../../data")
package_dir = "packages"
resource_dir = "resources"

data_source = DataSource(
    data_dir=data_dir, package_dir=package_dir, resource_dir=resource_dir
)

data_repository = DataRepository(data_source=data_source)
data_repository.load_all_metadata(max_datasets=100)

Loading packages:  74%|███████▍  | 99/134 [00:11<00:04,  8.39it/s]


In [72]:
def create_items_from_repository(repository, package_names=None):
    datasets = []
    for package in repository.list_packages():
        if package_names is None or package.name in package_names:
            datasets.extend(repository.list_package_datasets(package.name))

    items = []
    colnames = []
    for dataset in tqdm(datasets, desc="Loading datasets"):
        df = dataset.load()
        df = df.head(200)
        str_cols = df.select_dtypes(include=["object"]).columns

        for col in str_cols:
            items_col = df[col].dropna().tolist()
            if items_col:
                items.extend(items_col)
                colnames.extend([Column(name=col, dataset=dataset)] * len(items_col))

    return items, colnames

In [73]:
def process_ascii(ascii_items, max_length=100):
    one_hot_ascii = np.zeros((len(ascii_items), max_length, 256), dtype=np.float32)
    for i, item in enumerate(tqdm(ascii_items, desc="Processing ASCII items")):
        for j, char in enumerate(item):
            if j >= max_length:
                break
            if ord(char) < 255:  # Ensure character is within ASCII range
                one_hot_ascii[i, j, ord(char) + 1] = 1.0
            else:
                one_hot_ascii[i, j, 0] = 1.0

    if max_length > one_hot_ascii.shape[1]:
        padding = np.eye(256, dtype=np.float32)[0:1, :].reshape(1, 1, 256)
        one_hot_ascii = np.concatenate((one_hot_ascii, padding), axis=1)

    return one_hot_ascii


def process_items(items, columns, max_length=100):
    one_hot_items = process_ascii(items, max_length=max_length)

    column_datasets = [column.dataset.id for column in columns]
    column_names = [column.name for column in columns]
    column_ids = np.array(
        [f"{dataset}_{name}" for dataset, name in zip(column_datasets, column_names)]
    )

    return one_hot_items, column_ids

In [74]:
all_package_names = [pkg.name for pkg in data_repository.list_packages()]
train_package_names, test_package_names = train_test_split(
    all_package_names, test_size=0.2, random_state=42
)

train_items, train_colnames = create_items_from_repository(
    data_repository, package_names=train_package_names
)
test_items, test_colnames = create_items_from_repository(
    data_repository, package_names=test_package_names
)
train_items, train_colnames = process_items(train_items, train_colnames)
test_items, test_colnames = process_items(test_items, test_colnames)

Loading datasets: 100%|██████████| 80/80 [00:11<00:00,  6.70it/s]
Loading datasets: 100%|██████████| 20/20 [00:00<00:00, 30.70it/s]
Processing ASCII items: 100%|██████████| 113853/113853 [00:08<00:00, 13582.77it/s]
Processing ASCII items: 100%|██████████| 31404/31404 [00:02<00:00, 14453.80it/s]


In [75]:
class ContrastiveItemDataset(TorchDataset):
    def __init__(self, items, colnames):
        items_by_col = {}
        for item, colname in zip(items, colnames):
            if colname not in items_by_col:
                items_by_col[colname] = []
            items_by_col[colname].append(item)
        self.items_by_col = items_by_col
        self.colnames = list(items_by_col.keys())

    def __len__(self):
        return sum(len(items) for items in self.items_by_col.values())

    def __getitem__(self, idx):
        random_colname = np.random.choice(self.colnames)
        items = self.items_by_col[random_colname]

        random_idx = np.random.randint(len(items))
        anchor = items[random_idx]
        positive = items[random_idx]

        negative_colname = np.random.choice(
            [col for col in self.colnames if col != random_colname]
        )
        negative_items = self.items_by_col[negative_colname]
        negative_idx = np.random.randint(len(negative_items))
        negative = negative_items[negative_idx]

        return anchor, positive, negative


train_dataset = ContrastiveItemDataset(train_items, train_colnames)
test_dataset = ContrastiveItemDataset(test_items, test_colnames)
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False)

In [76]:
class ItemAutoencoder(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim, input_size=100):
        super(ItemAutoencoder, self).__init__()

        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.input_size = input_size

        self.input_linear = nn.Linear(input_dim, 64)
        self.encoder_lstm = nn.LSTM(
            input_size=64,
            hidden_size=hidden_dim,
            num_layers=2,
            batch_first=True,
            dropout=0.1,
        )
        self.decoder_lstm = nn.LSTM(
            input_size=hidden_dim,
            hidden_size=64,
            num_layers=2,
            batch_first=True,
            dropout=0.1,
        )
        self.output_linear = nn.Linear(64, input_dim)

    def encoder(self, x):
        x = F.relu(self.input_linear(x))
        _, (h, _) = self.encoder_lstm(x)
        return h[-1]

    def decoder(self, x):
        x = x.unsqueeze(1).repeat(1, self.input_size, 1)
        x, _ = self.decoder_lstm(x)
        x = self.output_linear(x)
        return x

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)

        return decoded
    

In [None]:
def triplet_loss(anchor, positive, negative, margin=0.2, distance="cosine"):
    if distance == "cosine":
        d_ap = 1 - F.cosine_similarity(anchor, positive)
        d_an = 1 - F.cosine_similarity(anchor, negative)
    elif distance == "euclidean":
        d_ap = F.pairwise_distance(anchor, positive)
        d_an = F.pairwise_distance(anchor, negative)
    else:
        raise ValueError("Unsupported distance metric")

    loss = F.relu(d_ap - d_an + margin)
    return loss.mean()


def reconstruction_loss(original, reconstructed):
    logits_flat = reconstructed.view(-1, reconstructed.size(-1))
    targets_flat = original.argmax(dim=-1).view(-1)

    loss = F.cross_entropy(logits_flat, targets_flat, reduction="mean")
    return loss


def train_model(model, train_loader, optimizer, device):
    model.train()
    total_rec_loss = 0.0
    total_triplet_loss = 0.0
    total_loss = 0.0

    for batch in tqdm(train_loader, desc="Training"):
        anchor, positive, negative = batch
        anchor = anchor.to(device)
        positive = positive.to(device)
        negative = negative.to(device)

        optimizer.zero_grad()

        anchor_encoded = model.encoder(anchor)
        positive_encoded = model.encoder(positive)
        negative_encoded = model.encoder(negative)

        output = model.decoder(anchor_encoded)

        rec_loss_value = reconstruction_loss(anchor, output)
        triplet_loss_value = triplet_loss(
            anchor_encoded, positive_encoded, negative_encoded
        )

        total_rec_loss += rec_loss_value.item()
        total_triplet_loss += triplet_loss_value.item()
        loss = 0.5 * rec_loss_value + 0.5 * triplet_loss_value
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    rec_loss_avg = total_rec_loss / len(train_loader)
    triplet_loss_avg = total_triplet_loss / len(train_loader)

    return rec_loss_avg, triplet_loss_avg


def evaluate_model(model, test_loader, device):
    model.eval()
    total_rec_loss = 0.0
    total_triplet_loss = 0.0
    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Evaluating"):
            anchor, positive, negative = batch
            anchor = anchor.to(device)
            positive = positive.to(device)
            negative = negative.to(device)

            output = model(anchor)
            rec_loss_value = reconstruction_loss(anchor, output)
            triplet_loss_value = triplet_loss(
                model.encoder(anchor), model.encoder(positive), model.encoder(negative)
            )
            total_rec_loss += rec_loss_value.item()
            total_triplet_loss += triplet_loss_value.item()
    rec_loss_avg = total_rec_loss / len(test_loader)
    triplet_loss_avg = total_triplet_loss / len(test_loader)
    return rec_loss_avg, triplet_loss_avg


autoencoder = ItemAutoencoder(input_dim=256, hidden_dim=64, input_size=100)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
autoencoder.to(device)
optimizer = torch.optim.Adam(autoencoder.parameters(), lr=0.001)
num_epochs = 30

In [78]:
for epoch in range(num_epochs):
    rec_loss, triplet_loss_val = train_model(autoencoder, train_loader, optimizer, device)
    print(
        f"Epoch {epoch + 1}/{num_epochs}, Rec Loss: {rec_loss:.4f}, Triplet Loss: {triplet_loss_val:.4f}",
    )
    rec_loss, triplet_loss_val = evaluate_model(autoencoder, test_loader, device)
    print(
        f"Test Rec Loss: {rec_loss:.4f}, Test Triplet Loss: {triplet_loss_val:.4f}",
    )

Training: 100%|██████████| 890/890 [00:52<00:00, 17.05it/s]


Epoch 1/50, Rec Loss: 1.3664, Triplet Loss: 0.0896


Evaluating: 100%|██████████| 246/246 [00:06<00:00, 38.01it/s]


Test Rec Loss: 0.9296, Test Triplet Loss: 0.0196


Training: 100%|██████████| 890/890 [00:51<00:00, 17.33it/s]


Epoch 2/50, Rec Loss: 1.0015, Triplet Loss: 0.0178


Evaluating: 100%|██████████| 246/246 [00:06<00:00, 38.25it/s]


Test Rec Loss: 0.9308, Test Triplet Loss: 0.0203


Training: 100%|██████████| 890/890 [00:52<00:00, 17.09it/s]


Epoch 3/50, Rec Loss: 0.9284, Triplet Loss: 0.0124


Evaluating: 100%|██████████| 246/246 [00:06<00:00, 36.99it/s]


Test Rec Loss: 0.8048, Test Triplet Loss: 0.0118


Training: 100%|██████████| 890/890 [00:52<00:00, 16.82it/s]


Epoch 4/50, Rec Loss: 0.8734, Triplet Loss: 0.0093


Evaluating: 100%|██████████| 246/246 [00:06<00:00, 37.00it/s]


Test Rec Loss: 0.7696, Test Triplet Loss: 0.0097


Training: 100%|██████████| 890/890 [00:52<00:00, 16.87it/s]


Epoch 5/50, Rec Loss: 0.8471, Triplet Loss: 0.0083


Evaluating: 100%|██████████| 246/246 [00:06<00:00, 37.29it/s]


Test Rec Loss: 0.7460, Test Triplet Loss: 0.0088


Training: 100%|██████████| 890/890 [00:52<00:00, 16.90it/s]


Epoch 6/50, Rec Loss: 0.8180, Triplet Loss: 0.0068


Evaluating: 100%|██████████| 246/246 [00:06<00:00, 37.42it/s]


Test Rec Loss: 0.7369, Test Triplet Loss: 0.0076


Training: 100%|██████████| 890/890 [00:52<00:00, 16.96it/s]


Epoch 7/50, Rec Loss: 0.7938, Triplet Loss: 0.0060


Evaluating: 100%|██████████| 246/246 [00:06<00:00, 37.01it/s]


Test Rec Loss: 0.7031, Test Triplet Loss: 0.0071


Training: 100%|██████████| 890/890 [00:52<00:00, 16.89it/s]


Epoch 8/50, Rec Loss: 0.7703, Triplet Loss: 0.0052


Evaluating: 100%|██████████| 246/246 [00:06<00:00, 37.03it/s]


Test Rec Loss: 0.6940, Test Triplet Loss: 0.0070


Training: 100%|██████████| 890/890 [00:52<00:00, 16.93it/s]


Epoch 9/50, Rec Loss: 0.7472, Triplet Loss: 0.0047


Evaluating: 100%|██████████| 246/246 [00:06<00:00, 37.03it/s]


Test Rec Loss: 0.6771, Test Triplet Loss: 0.0062


Training: 100%|██████████| 890/890 [00:52<00:00, 16.91it/s]


Epoch 10/50, Rec Loss: 0.7299, Triplet Loss: 0.0042


Evaluating: 100%|██████████| 246/246 [00:06<00:00, 37.53it/s]


Test Rec Loss: 0.6489, Test Triplet Loss: 0.0056


Training: 100%|██████████| 890/890 [00:52<00:00, 16.86it/s]


Epoch 11/50, Rec Loss: 0.7124, Triplet Loss: 0.0039


Evaluating: 100%|██████████| 246/246 [00:06<00:00, 37.32it/s]


Test Rec Loss: 0.6328, Test Triplet Loss: 0.0057


Training: 100%|██████████| 890/890 [00:52<00:00, 16.97it/s]


Epoch 12/50, Rec Loss: 0.6945, Triplet Loss: 0.0037


Evaluating: 100%|██████████| 246/246 [00:06<00:00, 36.95it/s]


Test Rec Loss: 0.6218, Test Triplet Loss: 0.0057


Training: 100%|██████████| 890/890 [00:52<00:00, 16.81it/s]


Epoch 13/50, Rec Loss: 0.6869, Triplet Loss: 0.0036


Evaluating: 100%|██████████| 246/246 [00:06<00:00, 37.32it/s]


Test Rec Loss: 0.6250, Test Triplet Loss: 0.0055


Training: 100%|██████████| 890/890 [00:52<00:00, 16.86it/s]


Epoch 14/50, Rec Loss: 0.6719, Triplet Loss: 0.0034


Evaluating: 100%|██████████| 246/246 [00:06<00:00, 35.85it/s]


Test Rec Loss: 0.6115, Test Triplet Loss: 0.0050


Training: 100%|██████████| 890/890 [00:52<00:00, 16.85it/s]


Epoch 15/50, Rec Loss: 0.6594, Triplet Loss: 0.0033


Evaluating: 100%|██████████| 246/246 [00:06<00:00, 37.38it/s]


Test Rec Loss: 0.6226, Test Triplet Loss: 0.0050


Training: 100%|██████████| 890/890 [00:52<00:00, 16.90it/s]


Epoch 16/50, Rec Loss: 0.6551, Triplet Loss: 0.0033


Evaluating: 100%|██████████| 246/246 [00:06<00:00, 36.97it/s]


Test Rec Loss: 0.6074, Test Triplet Loss: 0.0051


Training: 100%|██████████| 890/890 [00:52<00:00, 16.86it/s]


Epoch 17/50, Rec Loss: 0.6446, Triplet Loss: 0.0031


Evaluating: 100%|██████████| 246/246 [00:06<00:00, 36.93it/s]


Test Rec Loss: 0.5953, Test Triplet Loss: 0.0052


Training: 100%|██████████| 890/890 [00:52<00:00, 16.89it/s]


Epoch 18/50, Rec Loss: 0.6424, Triplet Loss: 0.0030


Evaluating: 100%|██████████| 246/246 [00:06<00:00, 36.91it/s]


Test Rec Loss: 0.5822, Test Triplet Loss: 0.0051


Training: 100%|██████████| 890/890 [00:52<00:00, 16.89it/s]


Epoch 19/50, Rec Loss: 0.6454, Triplet Loss: 0.0030


Evaluating: 100%|██████████| 246/246 [00:06<00:00, 37.27it/s]


Test Rec Loss: 0.6398, Test Triplet Loss: 0.0054


Training: 100%|██████████| 890/890 [00:52<00:00, 16.83it/s]


Epoch 20/50, Rec Loss: 0.6463, Triplet Loss: 0.0032


Evaluating: 100%|██████████| 246/246 [00:06<00:00, 36.73it/s]


Test Rec Loss: 0.6073, Test Triplet Loss: 0.0049


Training: 100%|██████████| 890/890 [00:52<00:00, 16.89it/s]


Epoch 21/50, Rec Loss: 0.6348, Triplet Loss: 0.0031


Evaluating: 100%|██████████| 246/246 [00:06<00:00, 37.19it/s]


Test Rec Loss: 0.6039, Test Triplet Loss: 0.0050


Training: 100%|██████████| 890/890 [00:52<00:00, 16.88it/s]


Epoch 22/50, Rec Loss: 0.6307, Triplet Loss: 0.0029


Evaluating: 100%|██████████| 246/246 [00:06<00:00, 36.68it/s]


Test Rec Loss: 0.5928, Test Triplet Loss: 0.0051


Training: 100%|██████████| 890/890 [00:52<00:00, 16.90it/s]


Epoch 23/50, Rec Loss: 0.6171, Triplet Loss: 0.0028


Evaluating: 100%|██████████| 246/246 [00:06<00:00, 36.95it/s]


Test Rec Loss: 0.5949, Test Triplet Loss: 0.0049


Training: 100%|██████████| 890/890 [00:52<00:00, 16.90it/s]


Epoch 24/50, Rec Loss: 0.6165, Triplet Loss: 0.0028


Evaluating: 100%|██████████| 246/246 [00:06<00:00, 36.99it/s]


Test Rec Loss: 0.6145, Test Triplet Loss: 0.0051


Training: 100%|██████████| 890/890 [00:52<00:00, 16.93it/s]


Epoch 25/50, Rec Loss: 0.6127, Triplet Loss: 0.0028


Evaluating: 100%|██████████| 246/246 [00:06<00:00, 37.25it/s]


Test Rec Loss: 0.5843, Test Triplet Loss: 0.0050


Training: 100%|██████████| 890/890 [00:52<00:00, 16.95it/s]


Epoch 26/50, Rec Loss: 0.6023, Triplet Loss: 0.0028


Evaluating: 100%|██████████| 246/246 [00:06<00:00, 37.21it/s]


Test Rec Loss: 0.6114, Test Triplet Loss: 0.0048


Training: 100%|██████████| 890/890 [00:52<00:00, 16.90it/s]


Epoch 27/50, Rec Loss: 0.6053, Triplet Loss: 0.0027


Evaluating: 100%|██████████| 246/246 [00:06<00:00, 36.69it/s]


Test Rec Loss: 0.5849, Test Triplet Loss: 0.0050


Training: 100%|██████████| 890/890 [00:52<00:00, 16.82it/s]


Epoch 28/50, Rec Loss: 0.6004, Triplet Loss: 0.0028


Evaluating: 100%|██████████| 246/246 [00:06<00:00, 37.30it/s]


Test Rec Loss: 0.5710, Test Triplet Loss: 0.0046


Training: 100%|██████████| 890/890 [00:52<00:00, 16.87it/s]


Epoch 29/50, Rec Loss: 0.5906, Triplet Loss: 0.0028


Evaluating: 100%|██████████| 246/246 [00:06<00:00, 36.83it/s]


Test Rec Loss: 0.5661, Test Triplet Loss: 0.0044


Training: 100%|██████████| 890/890 [00:52<00:00, 16.94it/s]


Epoch 30/50, Rec Loss: 0.5846, Triplet Loss: 0.0027


Evaluating: 100%|██████████| 246/246 [00:06<00:00, 37.53it/s]


Test Rec Loss: 0.5652, Test Triplet Loss: 0.0047


Training: 100%|██████████| 890/890 [00:52<00:00, 16.88it/s]


Epoch 31/50, Rec Loss: 0.5850, Triplet Loss: 0.0026


Evaluating: 100%|██████████| 246/246 [00:06<00:00, 37.30it/s]


Test Rec Loss: 0.5606, Test Triplet Loss: 0.0046


Training: 100%|██████████| 890/890 [00:52<00:00, 16.84it/s]


Epoch 32/50, Rec Loss: 0.5789, Triplet Loss: 0.0026


Evaluating: 100%|██████████| 246/246 [00:06<00:00, 37.14it/s]


Test Rec Loss: 0.5516, Test Triplet Loss: 0.0045


Training: 100%|██████████| 890/890 [00:52<00:00, 16.82it/s]


Epoch 33/50, Rec Loss: 0.5762, Triplet Loss: 0.0025


Evaluating: 100%|██████████| 246/246 [00:06<00:00, 36.98it/s]


Test Rec Loss: 0.5735, Test Triplet Loss: 0.0045


Training: 100%|██████████| 890/890 [00:52<00:00, 17.06it/s]


Epoch 34/50, Rec Loss: 0.5748, Triplet Loss: 0.0024


Evaluating: 100%|██████████| 246/246 [00:06<00:00, 37.21it/s]


Test Rec Loss: 0.5663, Test Triplet Loss: 0.0045


Training: 100%|██████████| 890/890 [00:52<00:00, 16.89it/s]


Epoch 35/50, Rec Loss: 0.5718, Triplet Loss: 0.0024


Evaluating: 100%|██████████| 246/246 [00:06<00:00, 36.90it/s]


Test Rec Loss: 0.5535, Test Triplet Loss: 0.0043


Training: 100%|██████████| 890/890 [00:52<00:00, 16.90it/s]


Epoch 36/50, Rec Loss: 0.5709, Triplet Loss: 0.0024


Evaluating: 100%|██████████| 246/246 [00:07<00:00, 35.09it/s]


Test Rec Loss: 0.5595, Test Triplet Loss: 0.0045


Training: 100%|██████████| 890/890 [00:52<00:00, 16.89it/s]


Epoch 37/50, Rec Loss: 0.5647, Triplet Loss: 0.0024


Evaluating: 100%|██████████| 246/246 [00:06<00:00, 37.21it/s]


Test Rec Loss: 0.5492, Test Triplet Loss: 0.0043


Training: 100%|██████████| 890/890 [00:52<00:00, 16.86it/s]


Epoch 38/50, Rec Loss: 0.5603, Triplet Loss: 0.0025


Evaluating: 100%|██████████| 246/246 [00:06<00:00, 36.88it/s]


Test Rec Loss: 0.5617, Test Triplet Loss: 0.0042


Training:  34%|███▎      | 299/890 [00:18<00:35, 16.48it/s]


KeyboardInterrupt: 

In [79]:
def predict(model, items, device):
    model.eval()
    with torch.no_grad():
        items_tensor = torch.tensor(items, dtype=torch.float32).to(device)
        encoded_items = model.encoder(items_tensor)
        decoded_items = model.decoder(encoded_items)
    return decoded_items.cpu().numpy()


test_indices = np.random.choice(len(test_items), size=5, replace=False)
test_sample = test_items[test_indices]

predicted_items = predict(autoencoder, test_sample, device)
for i in range(len(test_sample)):
    original_chars = []
    predicted_chars = []

    for j in range(100):
        if test_sample[i, j].argmax() > 0:
            original_chars.append(chr(np.argmax(test_sample[i, j]) - 1))
        else:
            original_chars.append(" ")

    for j in range(100):
        if predicted_items[i, j].argmax() > 0:
            predicted_chars.append(chr(np.argmax(predicted_items[i, j]) - 1))
        else:
            predicted_chars.append(" ")

    print(f"Original: {''.join(original_chars)}")
    print(f"Predicted: {''.join(predicted_chars)}")

Original: April                                                                                               
Predicted: Stril                                                                                               
Original:                                                                                                     
Predicted:                                                                                                     
Original: No                                                                                                  
Predicted: No                                                                                                  
Original: GO-20141273911                                                                                      
Predicted: GO-20141290881                                                                                      
Original: 2014-01-01                                                                                        

In [80]:
torch.save(autoencoder.state_dict(), "../checkpoints/item_autoencoder.pth")

In [81]:
def get_column_embeddings(model, items_by_column, device):
    model.eval()
    embeddings = []
    column_ids = []

    for colname, items in tqdm(
        list(items_by_column.items()), desc="Clustering columns"
    ):
        items_tensor = torch.tensor(np.array(items), dtype=torch.float32).to(device)
        with torch.no_grad():
            embedding = model.encoder(items_tensor).cpu().numpy()
            embeddings.append(embedding.mean(axis=0))
            column_ids.append(colname)

    embeddings = np.array(embeddings)
    return embeddings, column_ids


def cluster_columns(embeddings):
    clusterer = HDBSCAN(
        min_cluster_size=3,
        metric="euclidean",
        cluster_selection_method="eom",
    )
    cluster_labels = clusterer.fit_predict(embeddings)
    return cluster_labels


embeddings, column_ids = get_column_embeddings(autoencoder, train_dataset.items_by_col, device)
cluster_labels = cluster_columns(embeddings)

Clustering columns: 100%|██████████| 855/855 [00:04<00:00, 177.13it/s]


In [82]:
for cluster_id in np.unique(cluster_labels):
    if cluster_id == -1:
        continue  # Skip noise points
    cluster_columns = [
        column_ids[i]
        for i in range(len(cluster_labels))
        if cluster_labels[i] == cluster_id
    ]
    print(f"Cluster {cluster_id}:")
    for col in cluster_columns:
        print(f"  {col}")

Cluster 0:
  e9a267e7-85be-41f2-8fc9-e5c0ed3ab05b_monClose
  e9a267e7-85be-41f2-8fc9-e5c0ed3ab05b_tueClose
  e9a267e7-85be-41f2-8fc9-e5c0ed3ab05b_thuClose
  e9a267e7-85be-41f2-8fc9-e5c0ed3ab05b_friClose
Cluster 1:
  e9a267e7-85be-41f2-8fc9-e5c0ed3ab05b_wedOpen
  e9a267e7-85be-41f2-8fc9-e5c0ed3ab05b_wedClose
  e9a267e7-85be-41f2-8fc9-e5c0ed3ab05b_satOpen
  e9a267e7-85be-41f2-8fc9-e5c0ed3ab05b_satClose
  e9a267e7-85be-41f2-8fc9-e5c0ed3ab05b_sunOpen
  e9a267e7-85be-41f2-8fc9-e5c0ed3ab05b_sunClose
Cluster 2:
  e9a267e7-85be-41f2-8fc9-e5c0ed3ab05b_monOpen
  e9a267e7-85be-41f2-8fc9-e5c0ed3ab05b_tueOpen
  e9a267e7-85be-41f2-8fc9-e5c0ed3ab05b_thuOpen
  e9a267e7-85be-41f2-8fc9-e5c0ed3ab05b_friOpen
Cluster 3:
  de539a35-7340-430b-8c75-0135161b82f1_CITY
  7337adce-e574-4db6-92bd-31c956381eac_CITY
  34c11c63-26b9-4f1c-837d-0d87d196ae5a_CITY
  0a0b0be6-27a2-4b95-80d7-a0fa0aac5aef_CITY
  f2c12e37-f533-4ecd-ac93-3166022fadec_CITY
Cluster 4:
  c5dcb036-9e50-45ad-ac78-6eaede8ed56e_DATE_EFFECTIVE
  43d0