In [1]:
import os
import sys
import re

project_root = "/root/work/tenset"
os.environ["TVM_HOME"] = f"{project_root}"
os.environ["TVM_LIBRARY_PATH"] = f"{project_root}/build"
if f"{project_root}/python" not in sys.path:
    sys.path.insert(0, f"{project_root}/python")

sys.path = [p for p in sys.path if not p.startswith(f"{project_root}/build")]
sys.path.append(f"{project_root}/build")
os.environ["LD_LIBRARY_PATH"] = f"{project_root}/build:" + os.environ.get("LD_LIBRARY_PATH", "")

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertModel
import numpy as np

class VAE_Transformer(nn.Module):
    def __init__(self, input_dim, hidden_dim, latent_dim, bert_model_name='bert-base-uncased', activation="relu"):
        super(VAE_Transformer, self).__init__()

        if activation == 'relu':
            self.activation = nn.ReLU()
        elif activation == 'gelu':
            self.activation = nn.GELU()
        elif activation == 'tanh':
            self.activation = nn.Tanh()
        elif activation == 'sigmoid':
            self.activation = nn.Sigmoid()
        else:
            raise ValueError(f"Unsupported activation function: {activation}")

        # Encoder (BERT -> mean, logvar)
        self.bert = BertModel.from_pretrained(bert_model_name)
        bert_output_dim = self.bert.config.hidden_size
        self.fc_mean = nn.Linear(bert_output_dim, latent_dim)
        self.fc_logvar = nn.Linear(bert_output_dim, latent_dim)

        # Decoder
        self.fc_d1 = nn.Linear(latent_dim, hidden_dim)
        self.fc_d2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc_d3 = nn.Linear(hidden_dim, input_dim)

    def encode(self, input_ids, attention_mask):
        # bert
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)

        # 2. Pooler Output 추출 ([CLS] 토큰에 해당하는 벡터, shape: [batch, hidden_size])
        # last_hidden_state[:, 0, :] 와 유사하지만, BERT는 추가적인 dense+tanh 층을 거친 pooler_output을 제공함
        h = outputs.pooler_output
        return self.fc_mean(h), self.fc_logvar(h)

    def reparameterize(self, mean, logvar):
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return mean + eps * std

    def decode(self, z):
        h = F.relu(self.fc_d1(z))
        h = F.relu(self.fc_d2(h))
        # [수정] Sigmoid 제거. 선형(linear) 출력을 반환
        return self.fc_d3(h)

    def forward(self, x):
        mean, logvar = self.encode(x)
        z = self.reparameterize(mean, logvar)
        recon_x = self.decode(z)
        return recon_x, mean, logvar


class AE(nn.Module):
    def __init__(self, input_dim, hidden_dim, latent_dim, activation="relu"):
        super(AE, self).__init__()

        if activation == 'relu':
            self.activation = nn.ReLU()
        elif activation == 'gelu':
            self.activation = nn.GELU()
        elif activation == 'tanh':
            self.activation = nn.Tanh()
        elif activation == 'sigmoid':
            self.activation = nn.Sigmoid()
        else:
            raise ValueError(f"Unsupported activation function: {activation}")

        # Encoder (동일)
        self.fc_e1 = nn.Linear(input_dim, hidden_dim)
        self.fc_e2 = nn.Linear(hidden_dim, latent_dim)

        # Decoder
        self.fc_d1 = nn.Linear(latent_dim, hidden_dim)
        self.fc_d2 = nn.Linear(hidden_dim, input_dim)

    def encode(self, x):
        h = self.activation(self.fc_e1(x))
        z = self.activation(self.fc_e2(h))
        return z

    def decode(self, z):
        h = self.activation(self.fc_d1(z))
        recon_x = self.activation(self.fc_d2(h))
        return recon_x

    def forward(self, x):
        z = self.encode(x)
        recon_x = self.decode(z)
        return recon_x

class regression(nn.Module):
    def __init__(self, input_dim, hidden_dim, latent_dim, activation="relu"):
        super(regression, self).__init__()

        if activation == 'relu':
            self.activation = nn.ReLU()
        elif activation == 'gelu':
            self.activation = nn.GELU()
        elif activation == 'tanh':
            self.activation = nn.Tanh()
        elif activation == 'sigmoid':
            self.activation = nn.Sigmoid()
        else:
            raise ValueError(f"Unsupported activation function: {activation}")

        self.fc = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            self.activation,
            nn.Linear(hidden_dim, hidden_dim),
            self.activation,
            nn.Linear(hidden_dim, latent_dim),
            nn.Linear(latent_dim, 1)
        )



    def forward(self, x):
        pred_cost = self.fc(x)
        return pred_cost


def vae_loss(recon_x, x, mean, logvar):
    # [수정] BCE 대신 MSE Loss 사용

    MSE = F.mse_loss(recon_x, x, reduction='sum')
    # MAE = F.l1_loss(recon_x, x, reduction='sum')
    KLD = -0.5 * torch.sum(1 + logvar - mean.pow(2) - logvar.exp())

    beta = 1.0 
    return MSE + beta * KLD


def ae_loss(recon_x, x):
    # [수정] BCE 대신 MSE Loss 사용

    MSE = F.mse_loss(recon_x, x, reduction='sum')
    # MAE = F.l1_loss(recon_x, x, reduction='sum')
    # KLD = -0.5 * torch.sum(1 + logvar - mean.pow(2) - logvar.exp())

    beta = 1.0 
    # return MSE + beta * KLD
    return MSE



2025-12-12 14:55:56.969810: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-12-12 14:55:57.086154: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [5]:
from tvm import auto_scheduler
import sys
sys.path.insert(0, "/root/work/tenset/scripts")
from print_programs import return_program
from tvm.auto_scheduler.feature import get_per_store_features_from_file
from make_dataset import load_and_register_tasks

json_file = "/root/work/tenset/dataset/measure_records_tenset/k80/([0bcb8746286db050cd088f375c85372d,1,64,64,128,6,6,32,128,1,64,64,32],cuda).json"

load_and_register_tasks()
inputs, results = auto_scheduler.RecordReader(json_file).read_lines()
raw_features, raw_normalized_throughputs, task_ids, min_latency = get_per_store_features_from_file(json_file, 10000)

In [6]:
import numpy as np

records = {
    "schedules": [],
    "cost_mean": [],
    "feature" : []
}

for i in range(len(inputs)):
    state, cost = return_program(inputs[i], results[i])
    # break
    if state is not None:
        cost_mean = np.mean([x.value for x in cost])
        feature = raw_features[i]
        records["feature"].append(feature)
        records["schedules"].append(state)
        records["cost_mean"].append(cost_mean)

features = np.array(records["feature"], dtype=np.float32)
costs = np.array(records["cost_mean"], dtype=np.float32)
print("Features shape:", features.shape)
# dataset = TensorDataset(torch.from_numpy(features), torch.from_numpy(costs))
# dataloader = DataLoader(dataset, batch_size=64, shuffle=True)

In [None]:
HIDDEN_DIM = 256
LATENT_DIM = 128
BATCH_SIZE = 64
EPOCHS = 100
LEARNING_RATE = 1e-3



# (전체 데이터셋 기준)
data_mean = data_log_masked.sum() / num_elements
data_std = torch.sqrt( (((data_log_masked - data_mean) * masks)**2).sum() / num_elements )

print(f"Log Mean: {data_mean:.4f}, Log Std: {data_std:.4f}")

# (data_mean, data_std는 나중에 복원을 위해 저장)

# 표준화 (패딩된 0 영역은 (0-mean)/std가 되지만, 학습 시 손실에서 제외)
normalized_data = (data_log - data_mean) / (data_std)

# --- 3. DataLoader 준비 (마스크 포함) ---
dataset = TensorDataset(normalized_data, masks) # 마스크도 함께 전달
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

# --- 4. 모델, 옵티마이저 초기화 (이전과 동일) ---
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = VAE(input_dim=MAX_SEQ_LENGTH, hidden_dim=HIDDEN_DIM, latent_dim=LATENT_DIM, activation="relu").to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

# --- 5. 학습 루프 (마스킹된 손실 계산) ---
model.train()
print("Training Start...")
for epoch in range(EPOCHS):
    epoch_loss = 0
    for (batch, mask) in dataloader: # 마스크도 함께 받음
        batch = batch.to(device)
        mask = mask.to(device)
        
        # Forward
        recon_batch, mean, logvar = model(batch * mask) # 입력 시에도 마스킹
        # recon_batch = model(batch * mask) # 입력 시에도 마스킹
        
        # [수정] 손실 계산 시 패딩 영역 제외 (마스크 곱하기)
        recon_error = F.mse_loss(recon_batch * mask, batch * mask, reduction='sum')
        KLD = -0.5 * torch.sum(1 + logvar - mean.pow(2) - logvar.exp())
        loss = recon_error + 0.5*KLD
        # loss = ae_loss(recon_batch * mask, batch * mask)
        # loss = vae_loss(recon_batch * mask, batch * mask)
        
        # Backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
    
    avg_loss = epoch_loss / len(dataset)
    if (epoch + 1) % 10 == 0:
        print(f'Epoch {epoch+1}/{EPOCHS}, Average Loss: {avg_loss:.4f}')

print("Training Complete.")

Log Mean: 1.1560, Log Std: 2.0146
Training Start...
Epoch 10/200, Average Loss: 7.3795
Epoch 20/200, Average Loss: 6.5847
Epoch 30/200, Average Loss: 6.1705
Epoch 40/200, Average Loss: 6.0396
Epoch 50/200, Average Loss: 5.8569
Epoch 60/200, Average Loss: 5.7841
Epoch 70/200, Average Loss: 5.8289
Epoch 80/200, Average Loss: 5.7564
Epoch 90/200, Average Loss: 5.7171
Epoch 100/200, Average Loss: 5.6903
Epoch 110/200, Average Loss: 5.6446
Epoch 120/200, Average Loss: 5.6297
Epoch 130/200, Average Loss: 5.5827


KeyboardInterrupt: 

In [19]:
# 패딩 (max_len로 길이 맞추기)
padded_data = np.ones((len(records["feature"]), max_len), dtype=np.float32)
masks = np.zeros_like(padded_data, dtype=np.float32)
for i, ext_list in enumerate(records["feature"]):
    length = min(len(ext_list), max_len)
    padded_data[i, :length] = ext_list[:length]
    masks[i, :length] = 1.0 # 실제 데이터가 있는 부분

data = torch.from_numpy(padded_data)
costs = torch.from_numpy(np.array(records["cost_mean"], dtype=np.float32)).unsqueeze(1)
masks = torch.from_numpy(masks)

data_log = torch.log(data + 1e-8) * masks   # (N, max_len)

normalized_data = torch.zeros_like(data_log)
for col_idx in range(data_log.shape[1]):
    col = data_log[:, col_idx]

    # 실제 값(0이 아닌 값)만 골라서 통계 계산
    valid = col != 0
    if valid.sum() == 0:
        # 이 컬럼은 전부 패딩이므로 그냥 0으로 둠
        continue

    col_valid = col[valid]
    col_mean = col_valid.mean()
    col_std = col_valid.std()

    # std가 0이면 분모를 1로 해서 폭발 막기
    if col_std == 0:
        col_std = 1.0

    col_norm = (col - col_mean) / col_std
    # 패딩 자리(0이었던 곳)는 다시 0으로
    col_norm = torch.where(valid, col_norm, torch.tensor(0.0, dtype=col.dtype))
    normalized_data[:, col_idx] = col_norm


In [None]:
from tvm.auto_scheduler.feature import get_per_store_features_from_file
from make_dataset import load_and_register_tasks

load_and_register_tasks()
raw_features, raw_normalized_throughputs, task_ids, min_latency = get_per_store_features_from_file("/root/work/tenset/dataset/measure_records/k80/([0c9a5ba46ffc5e1a9e5641018527117f,4,7,7,160,1,1,160,960,1,1,1,960,4,7,7,960],cuda).json", 10000)


raw_features = torch.tensor(raw_features.tolist(), dtype=torch.float32)
raw_normalized_throughputs = torch.tensor(raw_normalized_throughputs, dtype=torch.float32).unsqueeze(1)

masks = torch.zeros_like(raw_normalized_throughputs)

torch.Size([3734, 45])

In [None]:
# seed 고정
torch.manual_seed(42)
np.random.seed(42)

HIDDEN_DIM = 256
LATENT_DIM = 128
BATCH_SIZE = 128
EPOCHS = 1000
LEARNING_RATE = 1e-5

# --- 3. DataLoader 준비 (마스크 포함) ---
from torch.utils.data import random_split, DataLoader, TensorDataset

# 전체 데이터셋
dataset = TensorDataset(normalized_data, costs)

# train : val = 8 : 2 (2960 : 740 정도)
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size

train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

# DataLoader 준비
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

print(f"Train size: {len(train_dataset)}, Validation size: {len(val_dataset)}")

# --- 4. 모델, 옵티마이저 초기화 (이전과 동일) ---
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = regression(input_dim=45, hidden_dim=HIDDEN_DIM, latent_dim=LATENT_DIM, activation="relu").to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

# --- 5. 학습 루프 (마스킹된 손실 계산) ---
for epoch in range(EPOCHS):
    model.train()
    train_loss = 0
    for x_batch, y_batch in train_loader:
        x_batch, y_batch = x_batch.to(device), y_batch.to(device)
        pred = model(x_batch)
        loss = F.l1_loss(pred, y_batch, reduction='sum')

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    train_loss /= len(train_dataset)

    # --- Validation ---
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for x_batch, y_batch in val_loader:
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)
            pred = model(x_batch)
            loss = F.l1_loss(pred, y_batch, reduction='sum')
            val_loss += loss.item()
    val_loss /= len(val_dataset)

    if (epoch + 1) % 10 == 0:
        print(f"Epoch {epoch+1}/{EPOCHS} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")


Train size: 2987, Validation size: 747
Epoch 10/1000 | Train Loss: 0.0505 | Val Loss: 0.0432
Epoch 20/1000 | Train Loss: 0.0479 | Val Loss: 0.0408
Epoch 30/1000 | Train Loss: 0.0467 | Val Loss: 0.0397
Epoch 40/1000 | Train Loss: 0.0460 | Val Loss: 0.0391
Epoch 50/1000 | Train Loss: 0.0455 | Val Loss: 0.0386
Epoch 60/1000 | Train Loss: 0.0451 | Val Loss: 0.0383
Epoch 70/1000 | Train Loss: 0.0448 | Val Loss: 0.0381
Epoch 80/1000 | Train Loss: 0.0446 | Val Loss: 0.0378
Epoch 90/1000 | Train Loss: 0.0444 | Val Loss: 0.0377
Epoch 100/1000 | Train Loss: 0.0441 | Val Loss: 0.0375
Epoch 110/1000 | Train Loss: 0.0439 | Val Loss: 0.0374
Epoch 120/1000 | Train Loss: 0.0437 | Val Loss: 0.0372
Epoch 130/1000 | Train Loss: 0.0436 | Val Loss: 0.0371
Epoch 140/1000 | Train Loss: 0.0434 | Val Loss: 0.0370
Epoch 150/1000 | Train Loss: 0.0433 | Val Loss: 0.0369
Epoch 160/1000 | Train Loss: 0.0431 | Val Loss: 0.0368
Epoch 170/1000 | Train Loss: 0.0429 | Val Loss: 0.0366
Epoch 180/1000 | Train Loss: 0.0428

KeyboardInterrupt: 

In [29]:
def preprocess(raw_data, max_len, data_mean, data_std):
    """원시 리스트를 모델 입력 텐서로 변환"""
    # 1. 패딩
    padded = np.zeros(max_len, dtype=np.float32)
    length = min(len(raw_data), max_len)
    padded[:length] = raw_data[:length]
    
    # 2. 텐서 변환
    tensor_data = torch.from_numpy(padded)
    
    # 3. 로그 변환 (0이 없다고 가정)
    log_data = torch.log(tensor_data + 1e-8) # 0 방지
    
    # 4. 표준화 (학습 시 사용한 평균, 표준편차 사용)
    normalized_data = (log_data - data_mean) / (data_std + 1e-8)
    
    # 5. 패딩 영역 0으로 마스킹 (중요)
    mask = torch.zeros_like(tensor_data)
    mask[:length] = 1.0
    
    return normalized_data.unsqueeze(0), mask.unsqueeze(0) # 배치 차원 추가

def postprocess(normalized_tensor, data_mean, data_std):
    """모델 출력 텐서를 원본 스케일의 extent로 변환"""
    # 1. 역-표준화
    log_tensor = (normalized_tensor * (data_std + 1e-8)) + data_mean
    
    # 2. 역-로그 (exp)
    extent_tensor = torch.exp(log_tensor)
    
    # 3. 정수로 변환 (extent는 정수)
    return torch.round(extent_tensor).int()




# --- 1. 설정 (학습 시와 동일하게) ---
MAX_SEQ_LENGTH = 45
HIDDEN_DIM = 256
LATENT_DIM = 128

MODEL_PATH = 'vae_model.pth'        # 학습된 모델 경로
STATS_PATH = 'norm_stats.pth'      # 학습 시 저장한 정규화 통계 (mean, std)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# --- 2. 모델 및 통계 로드 ---
# (주의: 학습 시 아래와 같이 저장했다고 가정)
# torch.save(model, MODEL_PATH)
# torch.save({'mean': data_mean, 'std': data_std}, STATS_PATH)

try:
    # model = torch.load(MODEL_PATH, map_location=device)
    model.eval() # 테스트 모드로 설정

    # stats = torch.load(STATS_PATH)
    # data_mean = stats['mean'].to(device)
    # data_std = stats['std'].to(device)
except FileNotFoundError:
    print(f"오류: {MODEL_PATH} 또는 {STATS_PATH} 파일을 찾을 수 없습니다.")
    print("테스트 전에 모델과 통계 데이터를 먼저 학습/저장해야 합니다.")
    exit()

print(f"모델 {MODEL_PATH} 및 통계 {STATS_PATH} 로드 완료.\n")

# --- 3. 테스트 1: 재구성 (Reconstruction) ---
print("--- 1. 재구성 테스트 ---")
test_sample_raw = [28, 1, 112, 1, 1, 1, 37, 112, 1, 64, 112, 1, 16, 4, 2, 4, 2, 2, 2, 8, 2, 4]

# 전처리
test_input, mask = preprocess(test_sample_raw, MAX_SEQ_LENGTH, data_mean, data_std)
test_input = test_input.to(device)
mask = mask.to(device)

with torch.no_grad(): # 기울기 계산 비활성화
    recon_output, _, _ = model(test_input * mask)

# 후처리
recon_sample = postprocess(recon_output.squeeze(0), data_mean, data_std)

# 원본 길이만큼 잘라서 비교
original_len = len(test_sample_raw)

print(f" 원본 데이터: {test_sample_raw}")
print(f" 복원 데이터: {recon_sample[:original_len].tolist()}")

# (참고: VAE는 손실 압축이므로 원본과 완벽히 같지 않습니다)

# --- 4. 테스트 2: 생성 (Generation) ---
print("\n--- 2. 신규 생성 테스트 ---")
num_to_generate = 3

with torch.no_grad():
    # 잠재 공간(N(0,1))에서 무작위 샘플링
    z = torch.randn(num_to_generate, LATENT_DIM).to(device)
    
    # 디코더로 생성
    generated_output = model.decode(z)
    
# 후처리
generated_samples = postprocess(generated_output, data_mean, data_std)

print(f"{num_to_generate}개의 신규 루프 범위 벡터 생성:")
for i, sample in enumerate(generated_samples):
    # 0이나 음수는 1로 클리핑 (extent는 최소 1)
    cleaned_sample = torch.clamp(sample, min=1)
    
    # (편의상 앞 25개 정도만 출력)
    print(f" Sample {i+1}: {cleaned_sample[:25].tolist()}...")

모델 vae_model.pth 및 통계 norm_stats.pth 로드 완료.

--- 1. 재구성 테스트 ---
 원본 데이터: [28, 1, 112, 1, 1, 1, 37, 112, 1, 64, 112, 1, 16, 4, 2, 4, 2, 2, 2, 8, 2, 4]
 복원 데이터: [5, 4, 53, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 39, 24, 53, 1, 6]

--- 2. 신규 생성 테스트 ---
3개의 신규 루프 범위 벡터 생성:
 Sample 1: [3, 3, 272, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 74, 3, 270, 2, 1, 269, 2, 1]...
 Sample 2: [11, 4, 10, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 30, 32, 10, 3, 28, 10, 1, 1]...
 Sample 3: [91, 3, 47, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 7, 48, 47, 1, 7, 46, 2, 1]...
