In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import torch
from torch import Tensor
from torch import nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from scipy.stats import kendalltau
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F
import gc

torch.manual_seed(42)
device = 'cuda:1' if torch.cuda.is_available() else 'cpu'
print(device)

cuda:1


In [2]:
def get_unique_list(row, configs):
    c = len(row["node_config_feat"])
    unique_list = []
    for i in range(c):
        if (len(unique_list) == configs):
            break
        for j in range(i + 1, c):
            conf_i = row["node_config_feat"][i]
            conf_j = row["node_config_feat"][j]
            if (np.array_equal(conf_i, conf_j)):
                break
        else:
            unique_list.append(i)
            
    return unique_list

In [3]:
def compress_layout_df(directory):
    MAX_CONFIGS = 1000
    splits = ["train","valid", "test"]

    target_directory = directory.split("/")
    target_directory[2] = "working"
    target_directory = "/".join(target_directory)
    
    for split in splits:
        path = os.path.join(directory, split)
        files = os.listdir(path)
        os.makedirs(os.path.join(target_directory, split))
        
        for file in files:
            d = dict(np.load(os.path.join(path,file), allow_pickle = True))
            # Only keep configurable nodes
            d["node_feat"] = d["node_feat"][d["node_config_ids"]]
            d["node_opcode"] = d["node_opcode"][d["node_config_ids"]]
            
            d["edge_index"] = d["edge_index"][np.isin(d["edge_index"], d["node_config_ids"]).all(1)]
            if d["edge_index"].size > 0: 
                value_to_idx = {val: idx for idx, val in enumerate(d["node_config_ids"])}
                convert = np.vectorize(value_to_idx.get)
                d["edge_index"] = convert(d["edge_index"])

            # if split == "train":
            #     unique_list = get_unique_list(d, MAX_CONFIGS)
            #     d["node_config_feat"] = d["node_config_feat"][unique_list]
            #     d["config_runtime"] = d["config_runtime"][unique_list]

            if split != "test":
                c = min(len(d['config_runtime']), MAX_CONFIGS)
                d["node_config_feat"] = d["node_config_feat"][:c]
                d["config_runtime"] = d["config_runtime"][:c]
            d['filename'] = file
            np.savez_compressed(os.path.join(target_directory, split, file), **d)
    print(f"target directory {target_directory} done")

In [4]:
layout_path_list = [
     "/kaggle/input/predict-ai-model-runtime/npz_all/npz/layout/nlp/random/",
     "/kaggle/input/predict-ai-model-runtime/npz_all/npz/layout/nlp/default/",
     "/kaggle/input/predict-ai-model-runtime/npz_all/npz/layout/xla/random/",
     "/kaggle/input/predict-ai-model-runtime/npz_all/npz/layout/xla/default/",
    ]
layout_path_list

['/kaggle/input/predict-ai-model-runtime/npz_all/npz/layout/nlp/random/',
 '/kaggle/input/predict-ai-model-runtime/npz_all/npz/layout/nlp/default/',
 '/kaggle/input/predict-ai-model-runtime/npz_all/npz/layout/xla/random/',
 '/kaggle/input/predict-ai-model-runtime/npz_all/npz/layout/xla/default/']

In [5]:
# ! rm -rf /kaggle/working/predict-ai-model-runtime

In [6]:
for path in layout_path_list:
    compress_layout_df(path)

target directory /kaggle/working/predict-ai-model-runtime/npz_all/npz/layout/nlp/random/ done
target directory /kaggle/working/predict-ai-model-runtime/npz_all/npz/layout/nlp/default/ done
target directory /kaggle/working/predict-ai-model-runtime/npz_all/npz/layout/xla/random/ done
target directory /kaggle/working/predict-ai-model-runtime/npz_all/npz/layout/xla/default/ done


In [7]:
def load_saved(directory):
    directory = directory.split("/")
    directory[2] = "working"
    directory = "/".join(directory)
    splits = ["train","valid", "test"]
    dfs = dict()
    
    for split in splits:
        path = os.path.join(directory, split)
        files = os.listdir(path)
        list_df = []
        
        for file in files:
            d = dict(np.load(os.path.join(path,file), allow_pickle = True))
            list_df.append(d)
        dfs[split] = pd.DataFrame.from_dict(list_df)
    return dfs

In [8]:
class LayoutDataset(Dataset):
    def __init__(self, df):
        self.df = df
        
    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        node_config_feat = torch.tensor(row['node_config_feat'].astype(np.float32))
        node_feat = torch.tensor(row['node_feat'].astype(np.float32))
        node_opcode = torch.tensor(row['node_opcode'].astype(np.int32))
        edge_index = torch.tensor(np.swapaxes(row['edge_index'],0,1).astype(np.int32))
        target = torch.tensor(row['config_runtime'].astype(np.float32))
        
        
        # CRITICAL FIX: Normalize features to same scale
        node_feat = (node_feat - node_feat.mean()) / (node_feat.std() + 1e-8)
        node_config_feat = (node_config_feat - node_config_feat.mean()) / (node_config_feat.std() + 1e-8)
        
        target = (target - min(target)) / (max(target) - min(target))
        
        return node_config_feat, node_feat, node_opcode, edge_index, target

In [9]:
class LayoutModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        
        self.embedding = torch.nn.Embedding(120, 16)

        # Process each node with config features
        self.node_encoder = torch.nn.Sequential(
            nn.Linear(174, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
        )
        
        # Aggregate nodes into config representation
        self.config_encoder = torch.nn.Sequential(
            nn.Linear(128*3, 64),  # mean, max, std = 32*3 = 96
            nn.ReLU(),
            nn.Linear(64, 16),
            nn.ReLU(),
        )
        
        # Final prediction
        self.predictor = nn.Linear(16, 1)

    def forward(self, x_cfg: Tensor, x_feat: Tensor, x_op: Tensor, edge_index: Tensor) -> Tensor:
        x_op = self.embedding(x_op)
        # x_cfg: (c, nc, 18)
        # x_feat: (nc, 140)
        # x_op: (nc, 16)
        x_feat = x_feat.unsqueeze(0).expand(x_cfg.shape[0], -1, -1)
        x_op = x_op.unsqueeze(0).expand(x_cfg.shape[0], -1, -1)
        
        # Concatenate all features
        x = torch.concat([x_feat, x_op, x_cfg], axis=2)  # (c, nc, 174)
        
        # Process each node
        x = self.node_encoder(x)  # (c, nc, 32)
        
        # Aggregate with multiple statistics to preserve distinguishability
        x_mean = x.mean(1)  # (c, 32)
        x_max = x.max(1).values  # (c, 32)
        x_std = x.std(1)  # (c, 32)
        x = torch.cat([x_mean, x_max, x_std], dim=1)  # (c, 96)
        
        # Encode aggregated config representation
        x = self.config_encoder(x)  # (c, 32)
        
        # Predict runtime
        x = self.predictor(x)  # (c, 1)
        x = torch.flatten(x)
        
        return x

In [10]:
class RuntimeRankNetLoss(nn.Module):
    """
    RankNet Loss for runtime optimization where LOWER runtime is better.
    Processes one row/query at a time.
    
    Args:
        reduction (str): 'none' | 'mean' | 'sum'
        margin (float): Ignore pairs with runtime differences smaller than this
        temperature (float): Temperature for sigmoid function
    """
    
    def __init__(self, reduction='mean', margin=0.0, temperature=1.0):
        super(RuntimeRankNetLoss, self).__init__()
        self.reduction = reduction
        self.margin = margin
        self.temperature = temperature
        
    def forward(self, predicted_runtimes, true_runtimes, mask=None):
        """
        Forward pass for single row of runtime data.
        
        Args:
            predicted_runtimes (torch.Tensor): Predicted runtimes, shape (n_items,)
            true_runtimes (torch.Tensor): True runtimes, shape (n_items,)
            mask (torch.Tensor, optional): Mask for valid items, shape (n_items,)
            
        Returns:
            torch.Tensor: Computed RankNet loss
        """
        # Ensure inputs are 1D
        predicted_runtimes = predicted_runtimes.squeeze()
        true_runtimes = true_runtimes.squeeze()
        
        n_items = predicted_runtimes.shape[0]
        device = predicted_runtimes.device
        
        # Create mask if not provided
        if mask is None:
            mask = torch.ones(n_items, dtype=torch.bool, device=device)
        
        # Get indices of valid items
        valid_indices = torch.where(mask)[0]
        if len(valid_indices) < 2:
            return torch.tensor(0.0, device=device)
        
        # Extract valid runtimes
        pred_valid = predicted_runtimes[valid_indices]
        true_valid = true_runtimes[valid_indices]
        n_valid = len(pred_valid)
        
        # Expand to compute all pairwise comparisons
        pred_i = pred_valid.unsqueeze(1).expand(n_valid, n_valid)
        pred_j = pred_valid.unsqueeze(0).expand(n_valid, n_valid)
        
        true_i = true_valid.unsqueeze(1).expand(n_valid, n_valid)
        true_j = true_valid.unsqueeze(0).expand(n_valid, n_valid)
        
        # For runtimes: LOWER is BETTER
        # So if true_i < true_j, then i is better than j (P_ij = 1)
        P_ij = (true_i < true_j).float()
        
        # Apply margin to ignore small differences
        if self.margin > 0:
            significant_diff = (torch.abs(true_i - true_j) > self.margin).float()
            P_ij = P_ij * significant_diff
        
        # Compute predicted probability that i is better than j
        # For runtimes: if pred_j - pred_i is large positive, then i is much better than j
        pred_diff = (pred_j - pred_i) / self.temperature  # positive when i is better
        
        # RankNet loss calculation
        # We want to maximize sigmoid(pred_diff) when P_ij = 1
        # and minimize it when P_ij = 0
        losses = F.binary_cross_entropy_with_logits(
            pred_diff, P_ij, reduction='none'
        )
        
        # Use only upper triangle to avoid duplicate pairs (i < j)
        triu_mask = torch.triu(torch.ones(n_valid, n_valid, device=device), diagonal=1)
        losses = losses * triu_mask
        valid_pairs = triu_mask.sum()
        
        if valid_pairs == 0:
            return torch.tensor(0.0, device=device)
        
        if self.reduction == 'mean':
            return losses.sum() / valid_pairs
        elif self.reduction == 'sum':
            return losses.sum()
        else:  # 'none'
            return losses

In [11]:
sub = pd.read_csv('/kaggle/input/predict-ai-model-runtime/sample_submission.csv')

In [12]:
def compute_score(out, target):
    return kendalltau(np.argsort(out.detach().cpu().numpy()), np.argsort(target.detach().cpu().numpy())).correlation

In [13]:
class EarlyStopper:
    def __init__(self, patience=3, min_delta=0):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.min_validation_loss = np.inf

    def early_stop(self, validation_loss):
        if validation_loss < self.min_validation_loss:
            self.min_validation_loss = validation_loss
            self.counter = 0
        elif validation_loss > (self.min_validation_loss + self.min_delta):
            self.counter += 1
            if self.counter >= self.patience:
                return True
        return False

In [14]:
# Ideas: 

In [15]:
for path in layout_path_list:
    EPOCHS = 200  # Increased for better training
    model = LayoutModel().to(device)
    dfs = load_saved(path)
    train_dataset = LayoutDataset(dfs["train"])
    valid_dataset = LayoutDataset(dfs["valid"])
    test_dataset = LayoutDataset(dfs["test"])
    criterion = RuntimeRankNetLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-4)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, EPOCHS)
    # early_stopper = EarlyStopper(patience = 5, min_delta = -0.01)
    
    best_tau = -float('inf')
    
    for epoch in range(EPOCHS):
        model.train()
        pbar = tqdm(range(len(train_dataset)))
        loss_sum = 0
        n = 0
        
        for i in pbar:
            node_config_feat, node_feat, node_opcode, edge_index, target = train_dataset[i]
            node_config_feat, node_feat, node_opcode, edge_index, target = (
                node_config_feat.to(device), node_feat.to(device), 
                node_opcode.to(device), edge_index.to(device), target.to(device)
            )
            
            # DEBUG INFO (first iteration only)
            if i == 0 and epoch == 0:
                print(f'\n=== DEBUG INFO ===')
                print(f'node_config_feat: {node_config_feat.shape}, variance: {node_config_feat.var():.6f}')
                print(f'node_feat: {node_feat.shape}, variance: {node_feat.var():.6f}')
                print(f'target range: [{target.min():.3f}, {target.max():.3f}]')
                
                out_test = model(node_config_feat, node_feat, node_opcode, edge_index)
                print(f'output: mean={out_test.mean():.6f}, std={out_test.std():.6f}')
                print(f'output range: [{out_test.min():.6f}, {out_test.max():.6f}]')
                print('==================\n')
            
            out = model(node_config_feat, node_feat, node_opcode, edge_index)
            loss = criterion(out, target)
            
            optimizer.zero_grad()
            loss.backward()
            
            # Check gradient norm on first iteration
            if i == 0:
                grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), float('inf'))
                # print(f'Epoch {epoch}: Gradient norm = {grad_norm:.4f}')
            
            # Gradient clipping (reasonable value)
            torch.nn.utils.clip_grad_norm_(model.parameters(), 5.0)
            optimizer.step()
            
            loss_sum += loss.item()
            n += 1
            tau = compute_score(out, target)
            pbar.set_description(f'[Training] Epoch: {epoch}, running loss: {(loss_sum/n):.3f}, current loss: {(loss.item()):.3f}, tau: {tau:.3f}')
        
        # Validation
        model.eval()
        pbar = tqdm(range(len(valid_dataset)))
        loss_sum = 0
        n = 0
        with torch.no_grad():
            for i in pbar:
                node_config_feat, node_feat, node_opcode, edge_index, target = valid_dataset[i]
                node_config_feat, node_feat, node_opcode, edge_index, target = (
                    node_config_feat.to(device), node_feat.to(device), 
                    node_opcode.to(device), edge_index.to(device), target.to(device)
                )
            
                out = model(node_config_feat, node_feat, node_opcode, edge_index)
                loss = criterion(out, target)
                loss_sum += loss.item()
                n += 1
                tau = compute_score(out, target)
                
                pbar.set_description(f'[Validation] Epoch: {epoch}, running loss: {(loss_sum/n):.3f}, current loss: {(loss.item()):.3f} tau: {tau:.3f}')
        
        scheduler.step()
    
    # Inference on test set
    predictions = []
    model.eval()
    pbar = tqdm(range(len(test_dataset)))
    with torch.no_grad():
        for i in pbar:
            node_config_feat, node_feat, node_opcode, edge_index, target = test_dataset[i]
            node_config_feat, node_feat, node_opcode, edge_index, target = (
                node_config_feat.to(device), node_feat.to(device), 
                node_opcode.to(device), edge_index.to(device), target.to(device)
            )
                
            out = model(node_config_feat, node_feat, node_opcode, edge_index)
            predictions.append(np.argsort(out.detach().cpu().numpy()))
    
    collection = path.split("/")[-3:-1]
    for i, filename in enumerate(dfs["test"]['filename'].values):
        id = f"layout:{collection[0]}:{collection[1]}:{filename.item()[:-4]}"
        sub.loc[sub.ID == id, 'TopConfigs'] = ';'.join(predictions[i].astype(str))
        # print(f"wrote id: {id}")

  0%|          | 0/207 [00:00<?, ?it/s]


=== DEBUG INFO ===
node_config_feat: torch.Size([1000, 346, 18]), variance: 1.000000
node_feat: torch.Size([346, 140]), variance: 1.000000
target range: [0.000, 1.000]
output: mean=0.224122, std=0.000909
output range: [0.221472, 0.226499]



[Training] Epoch: 0, running loss: 0.564, current loss: 0.455, tau: -0.005: 100%|██████████| 207/207 [00:16<00:00, 12.19it/s]
[Validation] Epoch: 0, running loss: 0.437, current loss: 0.412 tau: 0.022: 100%|██████████| 20/20 [00:00<00:00, 20.64it/s]
[Training] Epoch: 1, running loss: 0.444, current loss: 0.459, tau: -0.010: 100%|██████████| 207/207 [00:13<00:00, 14.86it/s]
[Validation] Epoch: 1, running loss: 0.435, current loss: 0.372 tau: 0.018: 100%|██████████| 20/20 [00:00<00:00, 21.66it/s]
[Training] Epoch: 2, running loss: 0.425, current loss: 0.401, tau: 0.023: 100%|██████████| 207/207 [00:14<00:00, 14.77it/s]
[Validation] Epoch: 2, running loss: 0.412, current loss: 0.311 tau: -0.003: 100%|██████████| 20/20 [00:00<00:00, 21.67it/s]
[Training] Epoch: 3, running loss: 0.407, current loss: 0.359, tau: -0.027: 100%|██████████| 207/207 [00:14<00:00, 14.75it/s]
[Validation] Epoch: 3, running loss: 0.383, current loss: 0.278 tau: 0.038: 100%|██████████| 20/20 [00:00<00:00, 22.44it/s]



=== DEBUG INFO ===
node_config_feat: torch.Size([1000, 346, 18]), variance: 1.000000
node_feat: torch.Size([346, 140]), variance: 1.000000
target range: [0.000, 1.000]
output: mean=0.206794, std=0.000621
output range: [0.205293, 0.209588]



[Training] Epoch: 0, running loss: 0.658, current loss: 0.633, tau: 0.007: 100%|██████████| 198/198 [00:12<00:00, 15.29it/s]
[Validation] Epoch: 0, running loss: 0.625, current loss: 0.603 tau: 0.019: 100%|██████████| 20/20 [00:00<00:00, 25.61it/s]
[Training] Epoch: 1, running loss: 0.606, current loss: 0.590, tau: -0.024: 100%|██████████| 198/198 [00:13<00:00, 15.21it/s]
[Validation] Epoch: 1, running loss: 0.599, current loss: 0.560 tau: 0.016: 100%|██████████| 20/20 [00:00<00:00, 25.43it/s]
[Training] Epoch: 2, running loss: 0.592, current loss: 0.586, tau: -0.010: 100%|██████████| 198/198 [00:13<00:00, 14.89it/s]
[Validation] Epoch: 2, running loss: 0.586, current loss: 0.537 tau: 0.037: 100%|██████████| 20/20 [00:00<00:00, 25.38it/s]
[Training] Epoch: 3, running loss: 0.587, current loss: 0.582, tau: -0.003: 100%|██████████| 198/198 [00:12<00:00, 15.27it/s]
[Validation] Epoch: 3, running loss: 0.578, current loss: 0.528 tau: 0.003: 100%|██████████| 20/20 [00:00<00:00, 25.39it/s]
[


=== DEBUG INFO ===
node_config_feat: torch.Size([1000, 382, 18]), variance: 1.000000
node_feat: torch.Size([382, 140]), variance: 1.000000
target range: [0.000, 1.000]
output: mean=-0.091125, std=0.001715
output range: [-0.096728, -0.086236]



[Training] Epoch: 0, running loss: 0.668, current loss: 0.696, tau: 0.009: 100%|██████████| 69/69 [00:10<00:00,  6.68it/s]
[Validation] Epoch: 0, running loss: 0.702, current loss: 0.691 tau: -0.015: 100%|██████████| 7/7 [00:00<00:00, 10.53it/s]
[Training] Epoch: 1, running loss: 0.625, current loss: 0.700, tau: -0.026: 100%|██████████| 69/69 [00:10<00:00,  6.80it/s]
[Validation] Epoch: 1, running loss: 0.724, current loss: 0.694 tau: 0.028: 100%|██████████| 7/7 [00:00<00:00,  9.92it/s]
[Training] Epoch: 2, running loss: 0.616, current loss: 0.704, tau: 0.031: 100%|██████████| 69/69 [00:10<00:00,  6.53it/s]
[Validation] Epoch: 2, running loss: 0.735, current loss: 0.697 tau: -0.025: 100%|██████████| 7/7 [00:00<00:00, 10.22it/s]
[Training] Epoch: 3, running loss: 0.613, current loss: 0.696, tau: 0.019: 100%|██████████| 69/69 [00:10<00:00,  6.81it/s]
[Validation] Epoch: 3, running loss: 0.733, current loss: 0.695 tau: -0.012: 100%|██████████| 7/7 [00:00<00:00, 10.72it/s]
[Training] Epoch


=== DEBUG INFO ===
node_config_feat: torch.Size([1000, 382, 18]), variance: 1.000000
node_feat: torch.Size([382, 140]), variance: 1.000000
target range: [0.000, 1.000]
output: mean=0.039096, std=0.001003
output range: [0.036129, 0.041858]



[Training] Epoch: 0, running loss: 0.691, current loss: 0.694, tau: 0.031: 100%|██████████| 61/61 [00:10<00:00,  5.76it/s]
[Validation] Epoch: 0, running loss: 0.694, current loss: 0.695 tau: 0.035: 100%|██████████| 7/7 [00:00<00:00,  9.54it/s]
[Training] Epoch: 1, running loss: 0.666, current loss: 0.693, tau: 0.051: 100%|██████████| 61/61 [00:11<00:00,  5.52it/s]
[Validation] Epoch: 1, running loss: 0.707, current loss: 0.710 tau: 0.013: 100%|██████████| 7/7 [00:00<00:00,  7.61it/s]
[Training] Epoch: 2, running loss: 0.654, current loss: 0.693, tau: 0.056: 100%|██████████| 61/61 [00:11<00:00,  5.52it/s]
[Validation] Epoch: 2, running loss: 0.705, current loss: 0.695 tau: -0.051: 100%|██████████| 7/7 [00:00<00:00,  9.43it/s]
[Training] Epoch: 3, running loss: 0.649, current loss: 0.687, tau: 0.063: 100%|██████████| 61/61 [00:10<00:00,  5.62it/s]
[Validation] Epoch: 3, running loss: 0.704, current loss: 0.695 tau: -0.045: 100%|██████████| 7/7 [00:00<00:00,  9.50it/s]
[Training] Epoch: 

In [16]:
sub

Unnamed: 0,ID,TopConfigs
0,tile:xla:d6f5f54247bd1e58a10b9e7062c636ab,0;1;2;3;4
1,tile:xla:e3a655daa38e34ec240df959b650ac16,0;1;2;3;4
2,tile:xla:f8c2c1a1098b2a361c26df668b286c87,0;1;2;3;4
3,tile:xla:4dd1716853ed46ee4e7d09ede1732de8,0;1;2;3;4
4,tile:xla:d0a69155b6340748c36724e4bfc34be3,0;1;2;3;4
...,...,...
889,layout:nlp:random:60880ed76de53f4d7a1b960b24f2...,836;645;623;270;356;219;457;647;742;320;112;79...
890,layout:nlp:random:23559853d9702baaaacbb0c83fd3...,126;350;703;215;924;204;377;214;51;616;827;906...
891,layout:nlp:random:f6c146fc5cf10be4f3accbaca989...,634;846;158;726;667;911;334;749;487;827;352;89...
892,layout:nlp:random:32531d07a084b319dce484f53a4c...,26;691;22;853;977;748;466;364;907;189;818;325;...


In [17]:
sub.to_csv('submission.csv',index=False)