In [1]:
import os
import sys
import re
import time

project_root = "/root/work/tenset"
os.environ["TVM_HOME"] = f"{project_root}"
os.environ["TVM_LIBRARY_PATH"] = f"{project_root}/build"
if f"{project_root}/python" not in sys.path:
    sys.path.insert(0, f"{project_root}/python")
    

sys.path = [p for p in sys.path if not p.startswith(f"{project_root}/build")]
sys.path.append(f"{project_root}/build")
os.environ["LD_LIBRARY_PATH"] = f"{project_root}/build:" + os.environ.get("LD_LIBRARY_PATH", "")

In [2]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import numpy as np

class NpzRegressionDataset(Dataset):
    def __init__(self, X, y):
        if isinstance(X, np.ndarray):
            self.X = torch.from_numpy(X).float()
        else:
            self.X = X
        self.y = torch.from_numpy(y).float()
        # y shape이 (N,)이면 (N,1)로 바꿔주는 게 편할 때가 많음
        if self.y.ndim == 1:
            self.y = self.y.unsqueeze(1)

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]


In [3]:
import numpy as np
from tvm import auto_scheduler


# record_index
# vector_index
# diff_indices
# diff_values
# cost

json_diffs = np.load("../i_vectors_diffs.npz")
raw_input = json_diffs["diff_values"]

# input_data = json_diffs["diff_values"]
costs = -np.log(json_diffs["cost"])

In [20]:
import torch
from sklearn.model_selection import train_test_split

def transform_schedule(x_int, mean=None, std=None, eps=1e-6):
    """
    스케줄 파라미터 int 텐서를 log-scale + zero-flag로 변환

    Args:
        x_int: (B, D) int 텐서. 예: 0, 1, 2, 4, 8, ..., 1024
        mean, std: (1, D) 형태의 텐서. None이면 입력에서 계산해서 반환.
        eps: 분산 0 방지용 작은 값.

    Returns:
        x_cont: (B, 2*D) float 텐서. [v_norm, is_zero] concat
        mean, std: (1, D) float 텐서. 나중에 validation/test에도 reuse
    """
    # int → float
    x_int = torch.tensor(x_int)
    x = x_int.to(torch.float32)

    # zero flag
    is_zero = (x_int == 0).to(torch.float32)  # (B, D)

    # log2 변환 (0은 일단 0으로 두고 mask)
    v = torch.zeros_like(x, dtype=torch.float32)  # (B, D)
    mask = (x_int > 0)
    v[mask] = torch.log2(x[mask])

    # mean / std 없으면 전체 batch 기준으로 계산 (보통은 train 전체로 미리 계산)
    if mean is None or std is None:
        mean = v.mean(dim=0, keepdim=True)           # (1, D)
        std = v.std(dim=0, keepdim=True) + eps       # (1, D)

    # 정규화
    v_norm = (v - mean) / std   # (B, D)

    # v_norm과 is_zero concat → (B, 2D)
    x_cont = torch.cat([v_norm, is_zero], dim=-1)

    return x_cont, mean, std



In [21]:
import torch
import torch.nn as nn
import torch.nn.functional as F


class VAE(nn.Module):
    def __init__(self, input_dim, latent_dim=16, hidden_dim=128):
        """
        input_dim: 2 * D (v_norm + is_zero concat한 차원)
        latent_dim: latent space 차원
        hidden_dim: MLP hidden 크기
        """
        super().__init__()

        # Encoder
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
        )
        self.fc_mu = nn.Linear(hidden_dim, latent_dim)
        self.fc_logvar = nn.Linear(hidden_dim, latent_dim)

        # Decoder
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, input_dim),
            
            # 출력은 연속값이니까 activation 없이 그대로
        )

    def encode(self, x):
        h = self.encoder(x)
        mu = self.fc_mu(h)
        logvar = self.fc_logvar(h)
        return mu, logvar

    def reparameterize(self, mu, logvar):
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return mu + eps * std

    def decode(self, z):
        return self.decoder(z)

    def forward(self, x):
        mu, logvar = self.encode(x)
        z = self.reparameterize(mu, logvar)
        x_recon = self.decode(z)
        return x_recon, mu, logvar, z

def vae_loss(x_recon, x, mu, logvar, beta=1.0):
    """
    x, x_recon: (B, input_dim)
    mu, logvar: (B, latent_dim)

    beta: KL 가중치 (β-VAE 스타일로 조절)
    """
    # reconstruction loss: MSE
    recon_loss = F.mse_loss(x_recon, x, reduction="mean")

    # KL divergence: D_KL(q(z|x) || N(0, I))
    kl = -0.5 * torch.mean(1 + logvar - mu.pow(2) - logvar.exp())

    loss = recon_loss + beta * kl
    return loss, recon_loss, kl




In [22]:
from sklearn.preprocessing import StandardScaler


input_data = np.log(json_diffs["diff_values"]+1e-8)

scaler = StandardScaler()
input_data_scaled = scaler.fit_transform(input_data)

X_train, X_val, y_train, y_val = train_test_split(
    input_data_scaled, costs, test_size=0.2, random_state=42
)


In [23]:
from sklearn.metrics import r2_score
import torch
# 예시 세팅

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


train_dataset = NpzRegressionDataset(X_train, y_train)
val_dataset   = NpzRegressionDataset(X_val,   y_val)

train_loader = DataLoader(train_dataset, batch_size=512, shuffle=True)
val_loader   = DataLoader(val_dataset,   batch_size=512, shuffle=False)



input_dim = X_train.shape[-1]
latent_dim = 64
hidden_dim = 256




vae = VAE(input_dim=input_dim, latent_dim=latent_dim, hidden_dim=hidden_dim).to(device)


# train_data: (N, D) int 텐서라고 가정
# train_data = torch.randint(low=0, high=1025, size=(1024, D))  # 예시용 dummy

# 전처리용 mean/std 미리 계산
# with torch.no_grad():
#     _x_cont, mean, std = transform_schedule(X_train.to(device))

beta = 0.05
optimizer = torch.optim.Adam(vae.parameters(), lr=1e-3)
# 간단한 에폭 루프 예시

for epoch in range(501):
    # 여기서는 그냥 전체를 한 번에 돌린다고 가정 (실제로는 DataLoader로 배치 쪼개기)
    vae.train()
    for x_batch in train_loader:
        x_batch = x_batch[0].to(device)  # (N, D)
    
        # x_cont, _, _ = transform_schedule(x_batch, mean=mean, std=std)  # (N, 2D)

        x_recon, mu, logvar, z = vae(x_batch)

        loss, recon_loss, kl = vae_loss(x_recon, x_batch, mu, logvar, beta=beta)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    vae.eval()
    for x_batch in val_loader:
        x_batch = x_batch[0].to(device)
        x_recon, mu, logvar, z = vae(x_batch)
        val_loss, val_recon_loss, val_kl = vae_loss(x_recon, x_batch, mu, logvar, beta=beta)
        val_r2 = r2_score(x_batch.detach().cpu().numpy(), x_recon.detach().cpu().numpy())

    if epoch % 20 == 0:
        print(f"epoch {epoch}: loss={loss.item():.4f}, recon={recon_loss.item():.4f}, kl={kl.item():.4f}")
        print(f"epoch {epoch}: val loss={val_loss.item():.4f}, val recon={val_recon_loss.item():.4f}, val kl={val_kl.item():.4f}")
        
        print(val_r2)


epoch 0: loss=0.9582, recon=0.9580, kl=0.0040
epoch 0: val loss=1.0142, val recon=1.0139, val kl=0.0052
-0.0026533503296284344
epoch 20: loss=0.2727, recon=0.2264, kl=0.9275
epoch 20: val loss=0.2838, val recon=0.2339, val kl=0.9979
0.7673657920635055
epoch 40: loss=0.1521, recon=0.0979, kl=1.0838
epoch 40: val loss=0.1686, val recon=0.1143, val kl=1.0868
0.8867139025535845
epoch 60: loss=0.1230, recon=0.0689, kl=1.0828
epoch 60: val loss=0.1418, val recon=0.0874, val kl=1.0893
0.9131667091352129
epoch 80: loss=0.1074, recon=0.0560, kl=1.0288
epoch 80: val loss=0.1207, val recon=0.0689, val kl=1.0362
0.9313943473540682
epoch 100: loss=0.0919, recon=0.0426, kl=0.9841
epoch 100: val loss=0.1102, val recon=0.0605, val kl=0.9941
0.9395658845274228
epoch 120: loss=0.0838, recon=0.0352, kl=0.9704
epoch 120: val loss=0.1045, val recon=0.0568, val kl=0.9531
0.9435181589507591
epoch 140: loss=0.0783, recon=0.0328, kl=0.9101
epoch 140: val loss=0.0970, val recon=0.0512, val kl=0.9157
0.948774803

In [24]:
def seed_everything(seed):
    import random
    random.seed(seed)
    torch.manual_seed(seed)
    np.random.seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    

In [25]:
class VAECostPredictor(nn.Module):
    """
    VAE 기반 Cost Regression 모델
    
    구조:
    - input → segment_encoder → segment_sum → VAE encoder → z → cost_predictor → cost
    
    특징:
    - Pretrained VAE encoder를 finetune (작은 learning rate)
    - Cost predictor는 더 큰 learning rate로 학습
    - 전체 forward 경로가 완전히 미분 가능 (detach, stop_grad 없음)
    """
    
    def __init__(self, input_dim, feature_dim=None, hidden_dim=256, latent_dim=64, 
                 predictor_hidden=256, predictor_layers=2, dropout=0.1, use_feature=False):
        super(VAECostPredictor, self).__init__()
        
        self.input_dim = input_dim
        self.feature_dim = feature_dim
        self.hidden_dim = hidden_dim
        self.latent_dim = latent_dim
        
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
        )
        self.fc_mu = nn.Linear(hidden_dim, latent_dim)
        self.fc_logvar = nn.Linear(hidden_dim, latent_dim)
        
        # ========== Cost Predictor (새로 학습) ==========
        predictor_modules = []
        current_dim = latent_dim
        for i in range(predictor_layers):
            predictor_modules.extend([
                nn.Linear(current_dim, predictor_hidden),
                nn.ReLU(),
                nn.Dropout(dropout) if i < predictor_layers - 1 else nn.Identity(),
            ])
            current_dim = predictor_hidden
        predictor_modules.append(nn.Linear(predictor_hidden, 1))
        
        self.cost_predictor = nn.Sequential(*predictor_modules)

        self.use_feature = use_feature
        if self.use_feature:
            pass
            self.feature_predictor = nn.Sequential(
                nn.Linear(latent_dim, hidden_dim),
                nn.ReLU(),
                nn.Linear(hidden_dim, hidden_dim),
                nn.ReLU(),
                nn.Linear(hidden_dim, feature_dim),  # feature_dim는 feature 차원
            )
        
    
    def encode(self, input_data):
        """
        Full encoding path: features → z
        완전히 미분 가능
        """
                
        # VAE Encoder
        h = self.encoder(input_data)
        
        mean = self.fc_mu(h)
        logvar = self.fc_logvar(h)
        
        return mean, logvar
    
    def reparameterize(self, mean, logvar):
        """Reparameterization trick - 미분 가능"""
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return mean + eps * std
    
    def predict_cost(self, z):
        """z → cost prediction - 완전히 미분 가능"""
        return self.cost_predictor(z).squeeze(-1)
    
    def predict_feature(self, z):
        return self.feature_predictor(z)
    
    def forward(self, input_data, use_mean=True):
        """
        Forward pass: input → z → cost
        
        Args:
            use_mean: True면 reparameterize 대신 mean 사용 (inference용)
        
        Returns:
            cost_pred: 예측된 cost
            mean: latent mean
            logvar: latent log-variance
            z: sampled/mean latent vector
        """
        mean, logvar = self.encode(input_data)
        
        if use_mean:
            z = mean  # Inference시 deterministic
        else:
            z = self.reparameterize(mean, logvar)  # Training시 stochastic
        
        cost_pred = self.predict_cost(z)
        
        return cost_pred, mean, logvar, z
    
    def get_encoder_params(self):
        """Encoder 파라미터 (작은 lr)"""
        encoder_params = []
        encoder_params.extend(self.encoder.parameters())
        encoder_params.extend(self.fc_mu.parameters())
        encoder_params.extend(self.fc_logvar.parameters())
        return encoder_params
    
    def get_cost_predictor_params(self):
        """Predictor 파라미터 (큰 lr)"""
        return self.cost_predictor.parameters()
    
    def get_feature_predictor_params(self):
        """Feature Predictor 파라미터"""
        return self.feature_predictor.parameters()

    def load_pretrained_encoder(self, checkpoint):
        """Pretrained VAE encoder 가중치 로드"""
        

        vae_state = checkpoint
        
        # 매칭되는 키만 로드
        encoder_keys = ['encoder', 'fc_mu', 'fc_logvar']
        own_state = self.state_dict()
        
        loaded_keys = []
        for name, param in vae_state.items():
            if any(name.startswith(k) for k in encoder_keys):
                if name in own_state and own_state[name].shape == param.shape:
                    own_state[name].copy_(param)
                    loaded_keys.append(name)
        
        # print(f"Loaded {len(loaded_keys)} parameters from pretrained VAE")
        # return loaded_keys

    def _enable_dropout(self):
        """모든 Dropout 모듈을 train 모드로 강제 활성화"""
        for module in self.modules():
            if isinstance(module, nn.Dropout):
                module.train()

    def mc_predict(self, input_tensor, T=20):
        """
        MC Dropout 기반 불확실성 추정
        
        Args:
            input_tensor: 입력 텐서 (shape [N, input_dim])
            T: MC 샘플 수
        
        Returns:
            mean: epistemic 평균 cost (shape [N])
            var: epistemic 분산 (shape [N])
        """

        self.eval()  # 전체 모델을 eval 모드로
        self._enable_dropout()  # Dropout만 train 모드로 활성화
        
        
        with torch.no_grad():
            predictions = []
            
            for _ in range(T):
                # Encode
                z, logvar = self.encode(input_tensor)
                cost_pred = self.predict_cost(z)
                predictions.append(cost_pred)
            
            predictions = torch.stack(predictions, dim=0)
            
            # epistemic mean & variance
            mc_mean = predictions.mean(dim=0)
            mc_var = predictions.var(dim=0)

        return mc_mean, mc_var


In [26]:
def reg_loss_fn(cost_pred, cost_true, loss_type='mse'):
    """
    기본 회귀 손실 (MSE 또는 MAE)
    """
    if loss_type == 'mse':
        return F.mse_loss(cost_pred, cost_true)
    else:  # mae
        return F.l1_loss(cost_pred, cost_true)


def pair_loss_fn(cost_pred, cost_true, margin=0.1):
    """
    Pairwise ranking loss: 실제 cost 순서를 예측이 유지하도록.
    cost_true[i] < cost_true[j] 이면 cost_pred[i] < cost_pred[j] + margin
    """
    batch_size = cost_pred.size(0)
    if batch_size < 2:
        return torch.tensor(0.0, device=cost_pred.device)
    
    # 모든 쌍에 대해 ranking loss 계산
    idx = torch.arange(batch_size, device=cost_pred.device)
    i_idx, j_idx = torch.meshgrid(idx, idx, indexing='ij')
    mask = i_idx < j_idx  # upper triangular only
    
    pred_i = cost_pred[i_idx[mask]]
    pred_j = cost_pred[j_idx[mask]]
    true_i = cost_true[i_idx[mask]]
    true_j = cost_true[j_idx[mask]]
    
    # label: 1 if true_i < true_j, -1 otherwise
    labels = torch.sign(true_j - true_i).float()
    
    # Margin ranking loss
    loss = F.margin_ranking_loss(pred_j.view(-1), pred_i.view(-1), labels.view(-1), margin=margin)
    return loss


def smooth_loss_fn(model, z, noise_std=0.1):
    """
    Smoothness loss: z에 작은 노이즈를 더했을 때 예측이 크게 변하지 않도록.
    """
    model.eval()
    with torch.no_grad():
        z_noisy = z + noise_std * torch.randn_like(z)
    
    cost_original = model.predict_cost(z)
    cost_noisy = model.predict_cost(z_noisy)
    
    smooth_loss = F.mse_loss(cost_original, cost_noisy)
    return smooth_loss


def kld_loss_fn(mean, logvar):
    """
    KL Divergence: q(z|x) || N(0, I)
    """
    kld = -0.5 * torch.mean(1 + logvar - mean.pow(2) - logvar.exp())
    return kld

def feature_loss_fn(use_feature, feature_pred, feature_true, coef=0.1):
    """
    Feature 예측 손실 (MSE)
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if not use_feature:
        return torch.tensor(0.0, device=device)
    return F.mse_loss(feature_pred, feature_true) * coef


def compute_total_loss(model, cost_pred, mean, logvar, z, labels, feature, config, return_components=True):
    """
    Total loss 계산 (Segment 기반 데이터용).
    total_loss = reg_loss + λ_pair * pair_loss + γ * smooth_loss + β * kld_loss
    """
    
    # Individual losses
    reg = reg_loss_fn(cost_pred, labels, loss_type=config.get('loss_type', 'mse'))
    pair = pair_loss_fn(cost_pred.view(-1), labels.view(-1), margin=config.get('margin', 0.1))
    smooth = smooth_loss_fn(model, z, noise_std=config.get('noise_std', 0.1))
    kld = kld_loss_fn(mean, logvar)
    feature_loss = feature_loss_fn(model.use_feature, None, feature, coef=0)
    
    # Weighted sum
    total = config['lambda_reg'] * reg + config['lambda_pair'] * pair + config['gamma'] * smooth + config['beta'] * kld + feature_loss
    
    if return_components:
        return total, {
            'reg_loss': reg.item(),
            'pair_loss': pair.item(),
            'smooth_loss': smooth.item(),
            'kld_loss': kld.item(),
            'feature_loss': feature_loss.item(),
        }
    return total

In [27]:
def pair_accuracy(cost_pred, labels, rng=np.random.default_rng(42)):
    """
    cost_pred, labels: (B,) 텐서
    """
    n_samples = min(2000, len(cost_pred))
    sample_indices = rng.choice(len(cost_pred), n_samples, replace=False)

    correct = 0
    total = 0
    for i in range(n_samples):
        for j in range(i + 1, n_samples):
            idx_i = sample_indices[i]
            idx_j = sample_indices[j]
            pred_diff = cost_pred[idx_i] - cost_pred[idx_j]
            true_diff = labels[idx_i] - labels[idx_j]
            if (pred_diff * true_diff) > 0:
                correct += 1
            total += 1
    accuracy = correct / total if total > 0 else 0.0
    return accuracy

def recall_at_k(pred, labels, k=1):
    true_best_idx = torch.argmax(labels)
    topk_pred_idx = torch.topk(pred, k=k, largest=True).indices

    return int((topk_pred_idx == true_best_idx).any())


In [28]:
def xgb_select_indices(xgb_all_preds, train_indices, test_indices, topk_size, eps_greedy_size, rng):
    """
    랜덤으로 2개, xgb 모델로 상위 62개 선택
    """
    # 남은 인덱스 중에서 무작위로 random_select_size개 선택

    remaining_indices = set(test_indices)

    if topk_size + eps_greedy_size > test_indices.shape[0]:
        remaining_indices.update(train_indices.tolist())
        train_indices = np.array(list(remaining_indices), dtype=np.int64)
        return train_indices, np.array([], dtype=np.int64)


    top_indices, remaining_indices = select_topk_cost(xgb_all_preds, remaining_indices, topk_size)
    random_indices, remaining_indices = random_select_indices(remaining_indices, eps_greedy_size, rng=rng)
    test_indices = np.array(list(remaining_indices), dtype=np.int64)

    selected_indices = np.concatenate([top_indices, random_indices])

    train_indices = np.concatenate([train_indices, selected_indices])

    return train_indices, test_indices



def random_select_indices(remaining_indices, select_size, rng=np.random.default_rng(42)):
    if select_size == 0:
        return np.array([], dtype=np.int64), remaining_indices
    
    random_indices = rng.choice(list(remaining_indices), size=select_size, replace=False)

    remaining_indices = util_update_remaining_indices(remaining_indices, random_indices)

    return random_indices, remaining_indices



def util_update_remaining_indices(remaining_indices, selected_indices):
    """
    남은 인덱스 집합 업데이트
    util_update_remaining_indices에서 selected_indices 제거
    """
    selected_indices = set(selected_indices)
    remaining_indices.difference_update(selected_indices)

    return remaining_indices



def util_select_topk(predictions, remaining_indices, num_select):
    """
    예측값 기반 다음 측정할 샘플 선택
    
    Args:
        predictions: 전체 예측값 리스트 ([N, ] 형태)
        remaining_indices: 아직 측정되지 않은 인덱스 집합 (set)
        num_select: 선택할 샘플 수
    
    Returns:
        selected_indices: 선택된 샘플의 인덱스 numpy 배열
        remaining_indices: 업데이트된 남은 인덱스 집합 (set)
    """
    
    prediction = np.asarray(predictions)  # [N]

    remaining_np = np.array(list(remaining_indices), dtype=np.int64)
    remaining_pred = prediction[remaining_np]

    k = min(num_select, len(remaining_np))

    topk_local = np.argsort(remaining_pred)[-k:]
    selected_indices = remaining_np[topk_local]

    # remaining 업데이트
    remaining_indices.difference_update(selected_indices.tolist())

    return selected_indices, remaining_indices






def select_topk_cost(cost_pred, remaining_indices, num_select):
    """
    예측된 cost 기반 다음 측정할 샘플 선택
    
    Args:
        model: VAECostPredictor 모델
        input_data_scaled: 전체 input 리스트 ([N, input_dim] 형태)
        remaining_indices: 아직 측정되지 않은 인덱스 집합 (set)
        num_select: 선택할 샘플 수
    
    """
    if num_select == 0:
        return np.array([], dtype=np.int64), remaining_indices

    if isinstance(cost_pred, torch.Tensor):
        cost_pred = cost_pred.detach().cpu().numpy()  # [N]

    topk_cost_indices, remaining_indices = util_select_topk(cost_pred, remaining_indices, num_select)
    

    return topk_cost_indices, remaining_indices


def select_topk_z_grad(z, cost_pred, remaining_indices, num_select):
    """
    z에 대한 cost gradient 기반 다음 측정할 샘플 선택
    
    Args:
        model: VAECostPredictor 모델
        input_tensor: 전체 input numpy 배열 ([N, input_dim] 형태)
        remaining_indices: 아직 측정되지 않은 인덱스 집합 (set)
        num_select: 선택할 샘플 수
    
    """
    if num_select == 0:
        return np.array([], dtype=np.int64), remaining_indices

    candidate_indices = np.array(list(remaining_indices), dtype=np.int64)

    # z-gradient 계산
    z_grad = torch.autograd.grad(
        outputs=cost_pred.sum(),
        inputs=z,
        retain_graph=False,
        create_graph=False
    )[0]  # [N, latent_dim]

    z_grad_norm = torch.norm(z_grad, dim=1).detach().cpu().numpy()  # [N]

    # 후보 중 grad-norm top-k
    candidate_grad = z_grad_norm[candidate_indices]
    k = min(num_select, len(candidate_indices))

    topk_local = np.argsort(candidate_grad)[-k:]
    selected_indices = candidate_indices[topk_local]

    # remaining 업데이트
    remaining_indices = set(remaining_indices)
    remaining_indices.difference_update(selected_indices.tolist())

    return selected_indices, remaining_indices


def select_topk_uncertainty(model, input_tensor, remaining_indices, num_select, T_mc=10):
    """
    MC Dropout 기반 불확실성 추정으로 다음 측정할 샘플 선택
    
    Args:
        model: VAECostPredictor 모델
        input_data_scaled: 전체 input 리스트 ([N, input_dim] 형태)
        remaining_indices: 아직 측정되지 않은 인덱스 집합 (set)
        num_select: 선택할 샘플 수
        T_mc: MC Dropout 샘플 수
    
    Returns:
        selected_indices: 선택된 샘플의 인덱스 리스트
    """
    if num_select == 0:
        return np.array([], dtype=np.int64), remaining_indices


    was_training = model.training
    model.train()

    with torch.no_grad():
        _, mc_var = model.mc_predict(input_tensor, T=T_mc)

    if not was_training:
        model.eval()  # 원복

    var_np = mc_var.detach().cpu().numpy()  # [N]

    topk_uncertainty_indices, remaining_indices = util_select_topk(var_np, remaining_indices, num_select)

    return topk_uncertainty_indices, remaining_indices


def select_topk_latent_diversity(z, candidate_indices, used_indices, select_n_div, chunk_size=1024, eps=1e-12):
    """
    먼저 candidates 320개를 뽑았다고 치자.
    이후 앞에서 topk_cost, topk_z_grad로 40개 정도를 뽑았다고 치자.
    latent diversity는 40개 + used_indices로부터 가장 멀리 떨어진 24개를 280개에서 뽑는다.

    z를 L2 정규화한 뒤, k-center greedy(farthest-first)로 diversity 선택.
    초기 센터는 used_indices (이미 측정된 점들).
    매 스텝마다 "센터 집합까지의 최소거리"가 최대인 candidate를 하나씩 추가.
    
    Args:
        z: torch.Tensor [N, latent_dim]
        candidate_indices: set(int)
        used_indices: set(int)
        select_n_div: int
        chunk_size: int
    Returns:
        diverse_indices: np.ndarray (int64)
        candidate_indices: set (선택된 인덱스 제거된 상태)
    """
    if select_n_div == 0 or len(candidate_indices) == 0:
        return np.array([], dtype=np.int64), candidate_indices


    device = z.device

    # 1) L2 normalize z  (각 벡터를 단위벡터로)
    with torch.no_grad():
        z_norm = z / (z.norm(dim=1, keepdim=True) + eps)

    cand = np.array(list(candidate_indices), dtype=np.int64)
    k = min(select_n_div, len(cand))

    cand_t = torch.from_numpy(cand).to(device=device)
    z_cand = z_norm[cand_t]  # [M, D], M=len(cand)

    # 초기 센터: used_indices (비어있을 수도 있음)
    used = np.array(list(used_indices), dtype=np.int64)
    selected = []

    # 2) 각 candidate의 "현재 센터 집합까지 최소거리" 벡터 init
    #    used가 비어있으면 +inf로 시작해서 임의 첫 점을 뽑게(가장 큰 값) 만들기
    if len(used) > 0:
        used_t = torch.from_numpy(used).to(device=device)
        z_used = z_norm[used_t]  # [U, D]

        # min_dists[j] = min_{u in used} ||z_cand[j] - z_used[u]||
        min_dists = torch.empty(len(cand), device=device, dtype=torch.float32)

        with torch.no_grad():
            for s in range(0, len(cand), chunk_size):
                e = min(s + chunk_size, len(cand))
                d = torch.cdist(z_cand[s:e], z_used, p=2)  # [B, U]
                min_dists[s:e] = d.min(dim=1).values
    else:
        # 센터가 없으면 모두 동일하게 시작 → 첫 선택은 아래 argmax가 0번째로 갈 수 있음
        # 다양성 목적이면 랜덤/최대 norm 등으로 첫 점을 정할 수도 있지만,
        # 여기서는 "가장 큰 min_dists"를 위해 +inf로 둔다.
        min_dists = torch.full((len(cand),), float("inf"), device=device, dtype=torch.float32)

    # 3) k-center greedy 반복
    #    매번 argmax(min_dists) 하나 선택 -> 그 점을 센터에 추가 -> min_dists 갱신
    with torch.no_grad():
        for _ in range(k):
            j = torch.argmax(min_dists).item()     # cand 내부 위치
            sel_idx = cand[j]                      # 원본 인덱스
            selected.append(sel_idx)

            # 선택된 점을 "센터"로 추가: 모든 candidate에 대해 dist_to_new_center 계산 후 min 갱신
            new_center = z_cand[j:j+1]  # [1, D]

            # 방금 뽑은 점은 다시 뽑히지 않게 min_dists를 -inf로
            min_dists[j] = -float("inf")

            # 나머지 후보들의 min 거리 업데이트
            for s in range(0, len(cand), chunk_size):
                e = min(s + chunk_size, len(cand))
                d_new = torch.cdist(z_cand[s:e], new_center, p=2).squeeze(1)  # [B]
                min_dists[s:e] = torch.minimum(min_dists[s:e], d_new)

    diverse_indices = np.array(selected, dtype=np.int64)

    candidate_indices = set(candidate_indices)
    candidate_indices.difference_update(diverse_indices.tolist())

    return diverse_indices, candidate_indices






def select_programs(model, input_data_scaled, used_indices, remaining_indices, num_select=64, T_mc=10, uncertainty_topk=128,
                    w_cost=0.5, w_unc=0.3, w_div=0.2, grad_num=2, rand_num=0, rng=np.random.default_rng(42), device=torch.device("cuda" if torch.cuda.is_available() else "cpu"), topk_factor=5):
    """
    Active learning 기반 다음 측정할 샘플 선택
    
    Args:
        model: VAECostPredictor 모델
        input_data_scaled: 전체 input 리스트 ([N, input_dim] 형태)
        used_indices: 이미 측정된 인덱스 집합(set)
        remaining_indices: 아직 측정되지 않은 인덱스 집합 (set)
        num_select: 선택할 샘플 수
        T_mc: MC Dropout 샘플 수
        w_cost: 예측값이 큰 샘플 가중치
        w_unc: epistemic 불확실성이 높은 샘플 가중치
        w_div: latent 다양성이 높은 샘플 가중치
        grad_num: z에 대한 cost의 gradient가 큰 샘플 수
        rand_num: 무작위로 선택할 샘플 수
    
    Returns:
        selected_indices: 선택된 샘플의 인덱스 리스트
    """

    # 합쳐서 64개 선택
    total = num_select
    budget = total - grad_num - rand_num

    # 랜덤 선택만 할 경우
    if num_select == 0 and rand_num > 0:
        rand_indices, remaining_indices = random_select_indices(remaining_indices, rand_num, rng=rng)
        return rand_indices, remaining_indices
    

    select_n_cost = int(budget * w_cost)
    select_n_unc  = int(budget * w_unc)
    select_n_div  = int(budget * w_div)
    select_n_grad = grad_num
    s = select_n_cost + select_n_unc + select_n_div
    if s < budget:
        select_n_cost += budget - s

    input_tensor = torch.tensor(input_data_scaled, dtype=torch.float32, device=device)
    

    model.eval()
    with torch.no_grad():
        z, _ = model.encode(input_tensor)
    z = z.detach().requires_grad_(True)
    model.zero_grad(set_to_none=True)
    cost_pred = model.predict_cost(z)
    cost_pred = cost_pred.view(-1)
    cost_np = cost_pred.detach().cpu().numpy()

    remaining_np = np.array(list(remaining_indices), dtype=np.int64)
    remaining_cost = cost_np[remaining_np]

    k_pref = min(len(remaining_np), total * topk_factor)
    top_local = np.argsort(remaining_cost)[-k_pref:]
    candidate_indices = set(remaining_np[top_local].tolist())  # 작업용 remaining

    # print(f"Candidate pool size: {len(candidate_indices)}")


    # 중복 방지용
    currently_used = set()
    topk_cost_indices, candidate_indices = select_topk_cost(cost_pred, candidate_indices, select_n_cost)
    currently_used.update(topk_cost_indices.tolist())
    z_grad_indices, candidate_indices = select_topk_z_grad(z, cost_pred, candidate_indices, select_n_grad)
    currently_used.update(z_grad_indices.tolist())

    # if len(used_indices) / len(input_data_scaled) >= 0.1:
    if len(used_indices) >= uncertainty_topk:
        uncertainty_indices, candidate_indices = select_topk_uncertainty(model, input_tensor, candidate_indices, select_n_unc, T_mc=T_mc)
    else:
        pool_for_uncertainty = set(remaining_indices)
        pool_for_uncertainty.difference_update(currently_used)
        uncertainty_indices, _ = select_topk_uncertainty(model, input_tensor, pool_for_uncertainty, select_n_unc, T_mc=T_mc)
        candidate_indices.difference_update(uncertainty_indices.tolist())


    currently_used.update(uncertainty_indices.tolist())
    used_local = set(used_indices)
    used_local.update(currently_used)

    diverse_indices, _ = select_topk_latent_diversity(z, candidate_indices, used_local, select_n_div)
    currently_used.update(diverse_indices.tolist())


    remaining_indices.difference_update(currently_used)


    rand_indices, remaining_indices = random_select_indices(remaining_indices, rand_num, rng=rng)
    currently_used.update(rand_indices.tolist())

    

    all_selected_indices = np.array(sorted(currently_used), dtype=np.int64)



    return all_selected_indices, remaining_indices

In [29]:
def make_vae_reg_dataloaders(input_data_scaled, costs, used_indices, remaining_indices):

    
    train_indices = np.array(list(used_indices), dtype=np.int64)
    val_indices = np.array(list(remaining_indices), dtype=np.int64)

    X_train = input_data_scaled[train_indices]
    X_val = input_data_scaled[val_indices]
    y_train = costs[train_indices]
    y_val = costs[val_indices]

    train_dataset = NpzRegressionDataset(X_train, y_train)
    val_dataset   = NpzRegressionDataset(X_val,   y_val)

    train_loader = DataLoader(train_dataset, batch_size=512, shuffle=False)
    val_loader   = DataLoader(val_dataset,   batch_size=512, shuffle=False)



    y_mean = y_train.mean()
    y_std = y_train.std() + 1e-8  # 0 나누기 방지용 작은 값 추가
    print(f"y_train mean: {y_mean}, std: {y_std}")

    
    return train_loader, val_loader, y_mean, y_std


def make_xgb_datasets(inputs, results):
    f_inputs = []
    f_results = []
    r_costs = []
    for inp, res in zip(inputs, results):
        cost = np.mean([c.value for c in res.costs])
        if cost < 1e10:
            f_inputs.append(inp)
            f_results.append(res)
            r_costs.append(cost)
    r_costs = np.array(r_costs, dtype=np.float32)
    
    dataset = auto_scheduler.dataset.Dataset()
    dataset.update_from_measure_pairs(f_inputs, f_results)
    return dataset


def split_xgb_datasets(dataset, train_indices, test_indices):

    raw_features = list(dataset.features.values())[0]
    raw_throughputs = list(dataset.throughputs.values())[0]

    
    train_set, test_set = dataset.random_split_within_task(train_set_ratio=0, 
                                                        train_idxs=train_indices.tolist(), 
                                                        test_idxs=test_indices.tolist())
    return train_set, test_set, raw_throughputs

In [30]:




train_dataset = NpzRegressionDataset(X_train, y_train)
val_dataset   = NpzRegressionDataset(X_val,   y_val)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=False)
val_loader   = DataLoader(val_dataset,   batch_size=256, shuffle=False)


input_data = np.log(json_diffs["diff_values"]+1e-8)
costs = -np.log(json_diffs["cost"])

train_size = 64

np.random.seed(3000)
random_indices = np.random.permutation(len(input_data))

scaler = StandardScaler()
input_data_scaled = scaler.fit_transform(input_data)
costs_scaled = (costs - costs.mean()) / (costs.std() + 1e-8)

X_train = input_data_scaled[random_indices[:train_size]]
X_val = input_data_scaled[random_indices[train_size:]]
y_train = costs[random_indices[:train_size]]
y_val = costs[random_indices[train_size:]]

train_dataset = NpzRegressionDataset(X_train, y_train)
val_dataset   = NpzRegressionDataset(X_val,   y_val)

train_loader = DataLoader(train_dataset, batch_size=128, shuffle=False)
val_loader   = DataLoader(val_dataset,   batch_size=512, shuffle=False)

y_mean = y_train.mean()
y_std = y_train.std() + 1e-8  # 0 나누기 방지용 작은 값 추가
print(f"y_train mean: {y_mean}, std: {y_std}")


y_train mean: 6.446558870745085, std: 1.4566711846140028


In [31]:
import itertools
def make_vae_reg_model(vae, hyperparameter, input_dim, latent_dim, hidden_dim, y_std, verbose=True):


    cnt = 0
    for vals in itertools.product(*hyperparameter.values()):
        (lambda_reg, lambda_pair, margin_scale, gamma, beta, noise_std, 
        encoder_lr, feature_predictor_lr, cost_predictor_lr,  epochs) = vals
        cnt += 1
        if verbose:
            print(f"Experiment {cnt}/{len(list(itertools.product(*hyperparameter.values())))}")
            print(f"lambda_reg={lambda_reg}, lambda_pair={lambda_pair}, margin_scale={margin_scale}, \
              gamma={gamma}, beta={beta}, noise_std={noise_std}\nencoder_lr={encoder_lr}, cost_predictor_lr={cost_predictor_lr}, epochs={epochs}")
        config = {
                    'encoder_lr': encoder_lr,
                    'feature_predictor_lr': feature_predictor_lr,
                    'cost_predictor_lr': cost_predictor_lr,
                    'lambda_reg' : lambda_reg,
                    'lambda_pair': lambda_pair,
                    'gamma': gamma,
                    'beta': beta,
                    'margin': margin_scale * y_std,
                    'noise_std': noise_std,
                    'loss_type': 'mse',
                    'epochs': epochs,
                }

        vae_cost_model = VAECostPredictor(input_dim=input_dim, 
                                    latent_dim=latent_dim, 
                                    hidden_dim=hidden_dim, 
                                    predictor_layers=2,
                                    dropout=0.1, use_feature=False).to(device)
        vae_cost_model.load_pretrained_encoder(vae.state_dict())
        optimizer = torch.optim.AdamW([
            {'params': vae_cost_model.get_encoder_params(), 'lr': config['encoder_lr']},
            {'params': vae_cost_model.get_cost_predictor_params(), 'lr': config['cost_predictor_lr']}
        ], weight_decay=1e-5)
    return vae_cost_model, optimizer, config

In [32]:
def train_regression(vae_cost_model, optimizer, train_loader, val_loader, input_data_scaled, costs, config, top_k=10, use_rank=True):

    print("Train size :", len(train_loader.dataset))

    # all_reg_results = []


    

    for epoch in range(1, config['epochs']+1):
        vae_cost_model.train()
        for x_batch, labels in train_loader:
            x_batch = x_batch.to(device)
            labels = labels.to(device).squeeze(-1)
            
        
            cost_pred, mean, logvar, z = vae_cost_model(x_batch, use_mean=True)

            train_loss, train_components = compute_total_loss(vae_cost_model, 
                                                    cost_pred, mean, logvar, z, labels, None, config)

            optimizer.zero_grad()
            train_loss.backward()
            torch.nn.utils.clip_grad_norm_(vae_cost_model.parameters(), max_norm=1.0)
            optimizer.step()
            
        

        if epoch % config['epochs'] == 0:
            vae_cost_model.eval()
            with torch.no_grad():
                all_preds = []
                all_labels = []
                for x_batch, labels in val_loader:
                    x_batch = x_batch.to(device)
                    labels = labels.to(device).squeeze(-1)

                    cost_pred, mean, logvar, z = vae_cost_model(x_batch, use_mean=True)
                    all_preds.append(cost_pred)
                    all_labels.append(labels)

                    val_loss, val_components = compute_total_loss(vae_cost_model, cost_pred, mean, logvar, z, labels, None, config)
                val_reg_r2 = r2_score(torch.cat(all_labels).detach().cpu().numpy(), torch.cat(all_preds).detach().cpu().numpy())
                val_reg_r2 = round(val_reg_r2, 4)
                
                print(f"Train loss epoch {epoch} : reg={train_components['reg_loss']: .4f} rank={train_components['pair_loss']: .4f} kl={train_components['kld_loss']: .4f}")
                print(f"Val loss epoch {epoch}: reg={val_components['reg_loss']: .4f} rank={val_components['pair_loss']: .4f} kl={val_components['kld_loss']: .4f}")
                
                print(f"Regression R2 : {val_reg_r2:.4f}, ")
        
        # rank r2 계산
        vae_cost_model.eval()
        with torch.no_grad():
            if epoch % config['epochs'] == 0:
                input_data_tensor = torch.from_numpy(input_data_scaled).float().to(device)
                all_preds = vae_cost_model(input_data_tensor, use_mean=True)[0].detach().cpu().numpy()
                if use_rank:
                    val_rank_r2 = pair_accuracy(all_preds, costs)
                    val_rank_r2 = round(val_rank_r2, 4)
                    print(f"Rank R2 : {val_rank_r2:.4f}")
                else:
                    val_rank_r2 = None
                recall_top_k = recall_at_k(torch.tensor(all_preds), torch.from_numpy(costs), k=top_k)
                
                print(f"Recall@{top_k} : {recall_top_k}")
                if recall_top_k:
                    break_signal = True
                else:
                    break_signal = False

    # print("=============================================")
    # all_reg_results.append({
    #     "lambda_reg": lambda_reg,
    #     "lambda_pair": lambda_pair,
    #     "margin_scale": margin_scale,
    #     "gamma": gamma,
    #     "beta": beta,
    #     "noise_std": noise_std,
    #     "encoder_lr": encoder_lr,
    #     "feature_predictor_lr": feature_predictor_lr,
    #     "cost_predictor_lr": cost_predictor_lr,
    #     "seed": seed,
    #     "reg_r2": val_reg_r2,
    #     "rank_r2": val_rank_r2,
    #     "recall@64": recall_top_k
    # })
    return vae_cost_model, break_signal, val_reg_r2, val_rank_r2

In [33]:
def generate_weight_grid(step=0.1):
    m = int(round(1.0 / step))  # step=0.1 -> 10
    weights = []
    for i in range(m + 1):
        for j in range(m + 1):
            k = m - i - j
            if k < 0:
                continue
            weights.append((i/m, j/m, k/m))
    return weights
weights = generate_weight_grid(step=0.1)



In [34]:
f_weights = []
for w in weights:
    w_cost, w_unc, w_div = w
    if w_cost < 0.3:
        continue
    # if w_unc == 0.0 and w_cost > 0.0 and w_div > 0.0:
    #     f_weights.append(w)
    #     continue
    # if w_div == 0.0 and w_cost > 0.0 and w_unc > 0.0:
    #     f_weights.append(w)
        # continue
    f_weights.append(w)

In [36]:
import pandas as pd
import datetime

# 데이터셋 길이만큼의 인덱스 numpy 배열 생성
all_indices = np.arange(len(input_data_scaled))
costs = -np.log(json_diffs["cost"])

real_optimum_index = np.argmax(costs)

top_k = 1

train_seed = 2023


sampling_hyper = {
    "measure_size": [64],
    "weight" : [
            # (1.0, 0.0, 0.0),
            # (0.7, 0.0, 0.3),
            # (0.7, 0.3, 0.0),
            # (0.6, 0.1, 0.3),
            # (0.3, 0.4, 0.3),
            (0.4, 0.3, 0.3),
            # (0.3, 0.3, 0.4),
            # (0.5, 0.2, 0.3),
            ],
    "uncertainty_topk": [64],
    # "weight" : f_weights,
    "grad_num": [4],
    "rand_num": [0],
    
    "T_mc": [20],
    "seed" : range(2000, 2010),
    # "seed" : [2023,2025],
}

random_indices_list = []
all_results = []

cnt = 0

now = datetime.datetime.now().strftime("%m%d_%H%M")
filename = f"sch_result/dataset_pkl/vae_sch_{now}.csv"

for params in itertools.product(*sampling_hyper.values()):

    cnt += 1
    print(f"########## 실험 {cnt}/{len(list(itertools.product(*sampling_hyper.values())))} ##########")

    tic = time.time()
    # used_indices : 이미 측정된 인덱스 집합. train_indices와 동일
    # remaining_indices : 아직 측정되지 않은 인덱스 집합. val_indices와 동일
    used_indices = set()
    remaining_indices = set(all_indices)
    
    measure_size, weight, uncertainty_topk, grad_num, rand_num, T_mc, sampling_seed = params
    w_cost, w_unc, w_div = weight
    print(f"weights: {weight}")
    print(f"measure_size: {measure_size}, T_mc: {T_mc}, sampling_seed: {sampling_seed}")

    sampling_rng = np.random.default_rng(sampling_seed)

    hyperparameter = {

        'lambda_reg' : [0.01],
        'lambda_pair': [3.0],
        'margin_scale': [0.3],
        'gamma': [0.01],
        'beta': [0.01],
        'noise_std': [0.001],

        'encoder_lr': [1e-4],
        'feature_predictor_lr': [0],
        'cost_predictor_lr': [1e-2],
        'epochs': [1000],
        
    }



    
    random_indices, remaining_indices = random_select_indices(remaining_indices, select_size=sampling_hyper["measure_size"][0], rng=sampling_rng)
    print(f"초기 랜덤 선택 샘플 인덱스: {np.sort(random_indices)}")
    used_indices.update(random_indices)
    random_indices_list.append(random_indices)

    reg_history = []
    rank_history = []

    for phase in range(1, len(input_data_scaled) // measure_size + 1):

        print(f"=============== 측정 Phase {phase} ================")


        # DataLoader 갱신
        seed_everything(train_seed)
        train_loader, val_loader, y_mean, y_std = make_vae_reg_dataloaders(input_data_scaled, costs, used_indices, remaining_indices)

        
        vae_cost_model, optimizer, config = make_vae_reg_model(vae, hyperparameter, input_dim, latent_dim, hidden_dim, y_std, verbose=False)
        
        seed_everything(train_seed)
        vae_cost_model, topk_recall_signal, val_reg_r2, val_rank_r2 = train_regression(vae_cost_model, optimizer, train_loader, val_loader, input_data_scaled, costs, config, top_k=top_k, use_rank=False)

        reg_history.append(val_reg_r2)
        rank_history.append(val_rank_r2)
        

        


        # 다음 측정할 샘플 선택
        selected_indices, remaining_indices = select_programs(
            model=vae_cost_model,
            input_data_scaled=input_data_scaled,
            remaining_indices=remaining_indices,
            used_indices=used_indices,
            num_select=measure_size,
            T_mc=T_mc,
            w_cost=weight[0],
            w_unc=weight[1],
            w_div=weight[2],
            # w_cost=0.3,
            # w_unc=0.35,
            # w_div=0.35,
            uncertainty_topk=uncertainty_topk,
            grad_num=grad_num,
            rand_num=rand_num,
            device=device,
            rng=sampling_rng,
            
            topk_factor=5
        )
        # w_cost += 0.03
        # w_unc -= 0.02
        # w_div -= 0.01

        # selected_indices: numpy 배열
        used_indices.update(selected_indices.tolist())

        measured_optimum = True if real_optimum_index in used_indices else False


        use_topk = False
        

        break_signal = False
        if not use_topk and measured_optimum:
            break_signal = True
        elif use_topk and topk_recall_signal:
            break_signal = True
            filename= filename.replace("sch_result/", "sch_result_topk/")


        if break_signal:
            print("최적화 종료")
            print("학습한 데이터 수 :", len(used_indices)-measure_size)
            used_time = time.time() - tic
            print(f"총 측정 시간: {used_time:.2f} 초")
            print("=============================================")
            all_results.append({
                "measure_size": measure_size,
                "weights": weight,
                "uncertainty_topk": uncertainty_topk,
                "grad_num": grad_num,
                "rand_num": rand_num,
                "phase" : phase,
                "used_time": round(used_time, 2),
                "train_size" : len(used_indices)-measure_size,
                "val_reg_r2": reg_history,
                "val_rank_r2": rank_history,
                "sampling_seed": sampling_seed,
                
            })
            if use_topk:
                all_results[-1]["top_k"] = top_k

            df_results = pd.DataFrame(all_results)
            
            os.makedirs(os.path.dirname(filename), exist_ok=True)
            df_results.to_csv(filename, index=False)
            
            break




########## 실험 1/10 ##########
weights: (0.4, 0.3, 0.3)
measure_size: 64, T_mc: 20, sampling_seed: 2000
초기 랜덤 선택 샘플 인덱스: [   7   38   92  100  212  218  234  360  385  404  522  583  593  606
  657  689  730  800  834  878  891  900  918  919 1039 1128 1149 1260
 1400 1466 1543 1573 1593 1601 1695 1704 1707 1716 1726 1808 1829 1851
 1859 1956 1969 1975 2061 2078 2093 2206 2234 2291 2397 2544 2675 2991
 3011 3060 3076 3183 3282 3294 3387 3433]
y_train mean: 6.2098153523581425, std: 1.4098141953314174
Train size : 64
Train loss epoch 1000 : reg= 1.1155 rank= 0.0159 kl= 0.1053
Val loss epoch 1000: reg= 2.4471 rank= 0.4958 kl= 0.1300
Regression R2 : 0.0257, 
Recall@1 : 0
y_train mean: 6.997604687688236, std: 1.3824993344972187
Train size : 128
Train loss epoch 1000 : reg= 1.4346 rank= 0.0165 kl= 0.1101
Val loss epoch 1000: reg= 1.9596 rank= 0.3386 kl= 0.1332
Regression R2 : 0.0527, 
Recall@1 : 1
최적화 종료
학습한 데이터 수 : 128
총 측정 시간: 6.89 초
########## 실험 2/10 ##########
weights: (0.4, 0.3, 0.3)
me

In [37]:
group_cols = [
    "measure_size",
    "weights",
    "uncertainty_topk",
    "grad_num",
    "rand_num",
]

agg_dict = {
    "phase": "mean",
    "train_size": "mean",
    "used_time": "mean",
    "val_reg_r2": "first",
    "val_rank_r2": "first",
}

df_avg = (
    df_results
    .groupby(group_cols, as_index=False)
    .agg(agg_dict)
)
df_avg

Unnamed: 0,measure_size,weights,uncertainty_topk,grad_num,rand_num,phase,train_size,used_time,val_reg_r2,val_rank_r2
0,64,"(0.4, 0.3, 0.3)",64,4,0,2.3,147.2,8.039,"[0.0257, 0.0527]","[None, None]"


## XGB test

In [None]:
import warnings

warnings.filterwarnings(
    "ignore",
    category=UserWarning,
    message=".*Old style callback is deprecated.*"
)

from tvm.auto_scheduler.cost_model.xgb_model import XGBModelInternal

json_file=
inputs, results = auto_scheduler.RecordReader(json_file).read_lines()


In [25]:


topk_size = int(measure_size * 0.95)
eps_greedy_size = measure_size - topk_size


seeds = sampling_hyper["seed"]
random_indices = random_indices_list[:len(seeds)]

xgb_results = []

now = datetime.datetime.now().strftime("%m%d_%H%M")
xgb_filename = f"result_xgb/{os.path.basename(json_file)}/xgb_search_{now}.csv"

for i, seed in enumerate(seeds):

    tic = time.time()
    sample_rng = np.random.default_rng(seed)

    
    
    tenset_model = XGBModelInternal(use_workload_embedding=False, seed=train_seed)

    seed_everything(train_seed)
    dataset = make_xgb_datasets(inputs, results)

    
    used_indices = set(random_indices[i])
    remaining_indices = set(all_indices)
    remaining_indices.difference_update(used_indices)

    train_indices = np.array(sorted(used_indices), dtype=np.int64)
    test_indices = np.array(sorted(remaining_indices), dtype=np.int64)
    print(train_indices)

    reg_history = []
    rank_history = []

    for phase in range(1,  len(input_data_scaled) // measure_size + 1):

        print(f"=============== 측정 Phase {phase} ================")

        seed_everything(train_seed)
        train_set, test_set, dataset_costs = split_xgb_datasets(dataset, train_indices, test_indices)
        real_optimum_idx = np.argmax(dataset_costs)
        seed_everything(train_seed)
        tenset_model.fit_base(train_set=train_set)
        xgb_all_preds = tenset_model.predict(dataset)
        xgb_all_preds = np.array(list(xgb_all_preds.values())[0], dtype=np.float32)
        
        
        xgb_reg_r2 = r2_score(dataset_costs, xgb_all_preds)
        reg_history.append(round(xgb_reg_r2, 4))
        print(f"XGB Reg R2 : {xgb_reg_r2:.4f}")

        # xgb_rank_r2 = pair_accuracy(xgb_all_preds, dataset_costs)
        # rank_history.append(round(xgb_rank_r2, 4))
        # print(f"XGB Rank R2 : {xgb_rank_r2:.4f}")

        recall_score = recall_at_k(torch.tensor(xgb_all_preds), torch.tensor(dataset_costs), k=10)        
        print(f"XGB Recall@{top_k} : {recall_score}")
        
        
        
        
        
        # 다음 측정할 샘플 선택
        train_indices, test_indices = xgb_select_indices(xgb_all_preds, 
                            train_indices, test_indices, topk_size=topk_size, eps_greedy_size=eps_greedy_size, rng=sample_rng)
        measured_optimum = True if real_optimum_idx in train_indices else False

        use_topk = False
        

        break_signal = False
        if not use_topk and measured_optimum:
            break_signal = True
            
        elif use_topk and recall_score:
            break_signal = True
            xgb_filename= xgb_filename.replace("result_xgb/", "result_xgb_topk/")


        if break_signal:
        # if recall_score:
            print("XGB 최적화 종료 신호 감지")
            print(f"총 측정 시간: {time.time() - tic:.2f} 초")
            print("=============================================")
            xgb_results.append({
                "measure_size": measure_size,
                "phase" : phase,
                "used_time": round(time.time() - tic, 2),
                "train_size" : len(train_indices) - measure_size,
                "val_reg_r2": reg_history,
                "val_rank_r2": rank_history,
                "sampling_seed": seed,
                
            })
            df_xgb_results = pd.DataFrame(xgb_results)
            os.makedirs(os.path.dirname(xgb_filename), exist_ok=True)
            df_xgb_results.to_csv(xgb_filename, index=False)
            # raise KeyboardInterrupt
            break
        
        if test_indices.shape[0] < measure_size:
            print("측정할 샘플이 더 이상 남아있지 않음")
            xgb_results.append({
                "measure_size": measure_size,
                "phase" : "all but not found",
                "used_time": round(time.time() - tic, 2),
                "train_size" : len(train_indices) - measure_size,
                "val_reg_r2": reg_history,
                "val_rank_r2": rank_history,
                "sampling_seed": seed,
                
            })
            df_xgb_results = pd.DataFrame(xgb_results)
            os.makedirs(os.path.dirname(xgb_filename), exist_ok=True)
            df_xgb_results.to_csv(xgb_filename, index=False)
            break
            # raise KeyboardInterrupt



[   8   41   99  108  228  236  253  389  415  435  563  629  639  654
  709  743  788  864  900  947  961  971  990  991 1121 1217 1239 1359
 1511 1581 1665 1696 1719 1727 1828 1838 1841 1851 1863 1949 1974 1998
 2006 2111 2124 2129 2223 2241 2256 2381 2411 2471 2585 2745 2885 3225
 3248 3300 3320 3434 3542 3552 3654 3703]
Fit a xgb booster. Train size: 64
XGB Reg R2 : 0.4698
XGB Recall@1 : 0
Fit a xgb booster. Train size: 128
XGB Reg R2 : 0.5222
XGB Recall@1 : 0
Fit a xgb booster. Train size: 192
XGB Reg R2 : 0.5323
XGB Recall@1 : 0
XGB 최적화 종료 신호 감지
총 측정 시간: 4.03 초
[  26   74   81  121  217  371  395  412  420  440  579  602  697  714
  745  748  809  811  817  892  945 1024 1104 1206 1210 1242 1474 1493
 1555 1562 1589 1603 1620 1637 1667 1746 1752 1764 1811 1827 1901 1974
 2066 2069 2082 2114 2119 2189 2210 2331 2527 2656 2713 2820 2886 3122
 3130 3151 3168 3222 3462 3538 3651 3678]
Fit a xgb booster. Train size: 64
XGB Reg R2 : 0.5438
XGB Recall@1 : 0
Fit a xgb booster. Train size

In [27]:
group_cols = [
    "measure_size",
]

agg_dict = {
    # "phase": "mean",
    "train_size": "mean",
    "used_time": "mean",
    "val_reg_r2": "first",
    "val_rank_r2": "first",
}

df_avg = (
    df_xgb_results
    .groupby(group_cols, as_index=False)
    .agg(agg_dict)
)
df_avg

Unnamed: 0,measure_size,train_size,used_time,val_reg_r2,val_rank_r2
0,64,198.4,4.182,"[0.4698, 0.5222, 0.5323]",[]


In [50]:
import xgboost as xgb
import multiprocessing

topk_size = int(measure_size * 0.95)
eps_greedy_size = measure_size - topk_size


seeds = sampling_hyper["seed"]
random_indices = random_indices_list[:len(seeds)]

xgb_results = []

# XGBModelInternal과 동일한 xgb_params 설정
xgb_params = {
    "max_depth": 6,
    "gamma": 0.003,
    "min_child_weight": 2,
    "eta": 0.2,
    "n_gpus": 0,
    "nthread": multiprocessing.cpu_count() // 2,
    "verbosity": 0,
    "seed": train_seed or 43,
    "disable_default_eval_metric": 1,
}

for i, seed in enumerate(seeds):

    tic = time.time()
    sample_rng = np.random.default_rng(seed)

    dataset = make_xgb_datasets(inputs, results)
    
    used_indices = set(random_indices[i])
    remaining_indices = set(all_indices)
    remaining_indices.difference_update(used_indices)

    train_indices = np.array(sorted(used_indices), dtype=np.int64)
    test_indices = np.array(sorted(remaining_indices), dtype=np.int64)
    print(train_indices)

    reg_history = []
    rank_history = []

    for phase in range(1,  len(input_data_scaled) // measure_size + 1):

        print(f"=============== 측정 Phase {phase} ================")

        seed_everything(train_seed)
        _, _, dataset_costs = split_xgb_datasets(dataset, train_indices, test_indices)
        input_train = input_data_scaled[train_indices]
        label_train = dataset_costs[train_indices]
        input_test = input_data_scaled[test_indices]
        label_test = dataset_costs[test_indices]
        
        real_optimum_idx = np.argmax(dataset_costs)
        
        # XGB 모델 학습 - input_train, label_train 사용
        seed_everything(train_seed)
        dtrain = xgb.DMatrix(input_train, label=label_train)
        dtest = xgb.DMatrix(input_test, label=label_test)
        
        # 학습 (XGBModelInternal과 유사하게 num_boost_round=300, early stopping 없이 단순화)
        bst = xgb.train(
            params=xgb_params,
            dtrain=dtrain,
            num_boost_round=300,
            evals=[(dtrain, "train"), (dtest, "test")],
            verbose_eval=50,
        )
        
        # input_data_scaled 전체로 predict
        dmatrix_all = xgb.DMatrix(input_data_scaled)
        xgb_all_preds = bst.predict(dmatrix_all)
        xgb_all_preds = np.array(xgb_all_preds, dtype=np.float32)
        
        
        xgb_reg_r2 = r2_score(dataset_costs, xgb_all_preds)
        reg_history.append(round(xgb_reg_r2, 4))
        print(f"XGB Reg R2 : {xgb_reg_r2:.4f}")

        # xgb_rank_r2 = pair_accuracy(xgb_all_preds, dataset_costs)
        # rank_history.append(round(xgb_rank_r2, 4))
        # print(f"XGB Rank R2 : {xgb_rank_r2:.4f}")

        recall_score = recall_at_k(torch.tensor(xgb_all_preds), torch.tensor(dataset_costs), k=10)        
        print(f"XGB Recall@{top_k} : {recall_score}")
        
        
        
        
        
        # 다음 측정할 샘플 선택
        train_indices, test_indices = xgb_select_indices(xgb_all_preds, 
                            train_indices, test_indices, topk_size=topk_size, eps_greedy_size=eps_greedy_size, rng=sample_rng)
        measured_optimum = True if real_optimum_idx in train_indices else False

        use_topk = False
        

        break_signal = False
        if not use_topk and measured_optimum:
            break_signal = True
            
        elif use_topk and recall_score:
            break_signal = True
            xgb_filename= xgb_filename.replace("result_xgb/", "result_xgb_topk/topk_")


        if break_signal:
            print("XGB 최적화 종료 신호 감지")
            print(f"총 측정 시간: {time.time() - tic:.2f} 초")
            print("=============================================")
            xgb_results.append({
                "measure_size": measure_size,
                "phase" : phase,
                "used_time": round(time.time() - tic, 2),
                "train_size" : len(train_indices) - measure_size,
                "val_reg_r2": reg_history,
                "val_rank_r2": rank_history,
                "sampling_seed": seed,
                
            })
            df_xgb_results = pd.DataFrame(xgb_results)
            # df_xgb_results.to_csv(xgb_filename, index=False)
            break
        
        if test_indices.shape[0] < measure_size:
            print("측정할 샘플이 더 이상 남아있지 않음")
            xgb_results.append({
                "measure_size": measure_size,
                "phase" : "all but not found",
                "used_time": round(time.time() - tic, 2),
                "train_size" : len(train_indices) - measure_size,
                "val_reg_r2": reg_history,
                "val_rank_r2": rank_history,
                "sampling_seed": seed,
                
            })
            df_xgb_results = pd.DataFrame(xgb_results)
            # df_xgb_results.to_csv(xgb_filename, index=False)
            break
            # raise KeyboardInterrupt

[   8   41   99  108  228  236  253  389  415  435  563  629  639  654
  709  743  788  864  900  947  961  971  990  991 1121 1217 1239 1359
 1511 1581 1665 1696 1719 1727 1828 1838 1841 1851 1863 1949 1974 1998
 2006 2111 2124 2129 2223 2241 2256 2381 2411 2471 2585 2745 2885 3225
 3248 3300 3320 3434 3542 3552 3654 3703]
XGB Reg R2 : 0.6295
XGB Recall@1 : 0
XGB 최적화 종료 신호 감지
총 측정 시간: 3.15 초
[  26   74   81  121  217  371  395  412  420  440  579  602  697  714
  745  748  809  811  817  892  945 1024 1104 1206 1210 1242 1474 1493
 1555 1562 1589 1603 1620 1637 1667 1746 1752 1764 1811 1827 1901 1974
 2066 2069 2082 2114 2119 2189 2210 2331 2527 2656 2713 2820 2886 3122
 3130 3151 3168 3222 3462 3538 3651 3678]
XGB Reg R2 : 0.5064
XGB Recall@1 : 0
XGB Reg R2 : 0.6101
XGB Recall@1 : 0
XGB Reg R2 : 0.6527
XGB Recall@1 : 0
XGB Reg R2 : 0.6519
XGB Recall@1 : 0
XGB Reg R2 : 0.6354
XGB Recall@1 : 0
XGB 최적화 종료 신호 감지
총 측정 시간: 6.34 초
[  15  135  166  294  337  392  399  406  465  490  507  528

In [124]:
df_xgb_results

Unnamed: 0,measure_size,phase,used_time,train_size,val_reg_r2,val_rank_r2,sampling_seed
0,64,2,3.16,128,"[0.455, 0.4711]",[],2000
1,64,5,4.44,320,"[0.5689, 0.5475, 0.5938, 0.5836, 0.6436]",[],2001
2,64,4,4.01,256,"[0.588, 0.5441, 0.5812, 0.6114]",[],2002
3,64,2,3.09,128,"[0.5876, 0.5477]",[],2003
4,64,2,3.24,128,"[0.4325, 0.608]",[],2004
5,64,2,3.09,128,"[0.4016, 0.5381]",[],2005
6,64,2,3.09,128,"[0.5755, 0.333]",[],2006
7,64,7,5.6,448,"[0.4894, 0.5619, 0.478, 0.5676, 0.5699, 0.5365...",[],2007
8,64,2,3.11,128,"[0.5457, 0.6573]",[],2008
9,64,4,3.92,256,"[0.3686, 0.5665, 0.5754, 0.6717]",[],2009


In [52]:
group_cols = [
    "measure_size",
]

agg_dict = {
    # "phase": "mean",
    "train_size": "mean",
    "used_time": "mean",
    "val_reg_r2": "first",
    "val_rank_r2": "first",
}

df_avg = (
    df_xgb_results
    .groupby(group_cols, as_index=False)
    .agg(agg_dict)
)
df_avg

Unnamed: 0,measure_size,train_size,used_time,val_reg_r2,val_rank_r2
0,64,176.0,4.505,[0.6295],[]


In [None]:

from tvm.auto_scheduler.cost_model.xgb_model import XGBModelInternal

for i in range(1000):

    tenset_model = XGBModelInternal()
    tenset_model.fit_base(train_set, valid_set=test_set)
    throughputs = np.array(list(test_set.throughputs.values()))

    pred = tenset_model.predict(test_set)

    true_biggest_index = np.argsort(throughputs[0])[-1]
    biggest_indices_64 = np.argsort(list(pred.values())[0])[-64:]

    # list(pred.values())[0]
    if true_biggest_index in biggest_indices_64:
        print("✓ Tenset 모델이 실제 가장 높은 throughput 정확히 예측했습니다!")
        break
    break


# pred, throughputs rank accuracy
correct_pairs = 0
total_pairs = 0
n_samples = min(2000, throughputs.shape[-1])
sample_indices = np.random.choice(throughputs.shape[-1], n_samples, replace=False)
pred_values = list(pred.values())[0]
throughput_values = throughputs.squeeze()
rank_accuracy = pair_accuracy(pred_values, throughput_values)
print(f"Tenset 모델 Rank Accuracy: {rank_accuracy:.4f}")

Fit a xgb booster. Train size: 32




Tenset 모델 Rank Accuracy: 0.8091
