In [2]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim
import torch.nn.functional as F
from abc import abstractmethod
from collections import defaultdict
from functools import lru_cache
from itertools import count
from typing import List, Dict
from typing import Tuple, Any
from sklearn import ensemble
from sklearn.metrics import mean_squared_error
from torch.nn import MSELoss, LSTM, GRU, RNN
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [3]:
from importlib import reload
from config import Config
import config
from data import MDataset, Graph, GraphNode, load_graphs, save_dataset_pkl, load_dataset_pkl, save_scalers_pkl, load_scalers_pkl
import data
from base_module import MModule, pad_np_vectors
import base_module
from executor import single_train_loop, nested_detach
import executor
from objects import ModelType
import objects
from metric import MetricUtil
import metric
from logger import init_logging, logging
import logger
import gcn
from gcn import GCNLayer
import transformer
from transformer import TransformerModel
# Reload the project modules so that edits made on disk are picked up without
# restarting the kernel.
reload(config)
reload(data)
reload(base_module)
reload(executor)
reload(objects)
reload(metric)
reload(logger)
reload(gcn)
reload(transformer)
# Re-bind EVERY name that was imported with `from ... import ...` above.
# reload() re-executes the module but does not update names that were already
# copied into this namespace, so any name not re-imported here would keep
# pointing at the object created before the reload.
from config import Config
from data import MDataset, Graph, GraphNode, load_graphs, save_dataset_pkl, load_dataset_pkl, save_scalers_pkl, load_scalers_pkl
from base_module import MModule, pad_np_vectors
from executor import single_train_loop, nested_detach
from objects import ModelType
from metric import MetricUtil
from logger import init_logging, logging
from gcn import GCNLayer
from transformer import TransformerModel
init_logging()

datasets_path: /root/guohao/repos/DLT-perf-model/datasets
configs_path: /root/guohao/repos/DLT-perf-model/notebooks/configs
datasets_path: /root/guohao/repos/DLT-perf-model/datasets
configs_path: /root/guohao/repos/DLT-perf-model/notebooks/configs


In [4]:
# Notebook-level experiment settings.
# Environment identifier passed to load_graphs and copied into every Config below.
dataset_environment_str = "T4_CPUALL"
# NOTE(review): normalizer_cls is not referenced by any other visible cell;
# get_scalers reads conf.dataset_normalizer_cls instead - confirm it is still needed.
normalizer_cls = StandardScaler # MinMaxScaler
# Forwarded to load_graphs as use_dummy.
dummy = False
# Selects which entry of train_configs becomes `conf`.
model_type = ModelType.RNN
# Passed to the save/load dataset and scaler helpers as the method identifier.
method_prefix = "SubgraphBased"

In [5]:

# NOTE(review): eval_graphs is loaded with train_or_eval="train" (same value as
# train_graphs, only with a smaller max_row). If load_graphs supports a separate
# evaluation split, this evaluates on data taken from the training split -
# confirm whether "eval" was intended.
eval_graphs = load_graphs(dataset_environment_str,
                            train_or_eval="train",
                            use_dummy=dummy,
                            max_row=200_000)
# Graphs used to build the training dataset and to fit the scalers.
train_graphs = load_graphs(dataset_environment_str,
                            train_or_eval="train",
                            use_dummy=dummy,
                            max_row=1000_000)

[2023-12-13 09:21:51,786] {data.py:441} INFO - Loading graphs train
[2023-12-13 09:21:51,786] {data.py:412} INFO - Loading merged.csv
[2023-12-13 09:21:52,304] {data.py:415} INFO - Loaded merged.csv, 200000 rows
[2023-12-13 09:21:52,670] {data.py:421} INFO - Loaded mnasnet1_3.180_7.csv, 769 rows
[2023-12-13 09:21:53,403] {data.py:421} INFO - Loaded convnext_small.96_7.csv, 1595 rows
[2023-12-13 09:21:53,987] {data.py:421} INFO - Loaded googlenet.120_7.csv, 905 rows
[2023-12-13 09:21:54,331] {data.py:421} INFO - Loaded mobilenet_v3_small.249_7.csv, 732 rows
[2023-12-13 09:21:54,474] {data.py:421} INFO - Loaded squeezenet1_0.173_7.csv, 280 rows
[2023-12-13 09:21:54,825] {data.py:421} INFO - Loaded rand_4000.251_7.csv, 719 rows
[2023-12-13 09:21:56,018] {data.py:421} INFO - Loaded densenet161.132_7.csv, 2507 rows
[2023-12-13 09:21:57,227] {data.py:421} INFO - Loaded densenet169.127_7.csv, 2633 rows
[2023-12-13 09:21:57,394] {data.py:421} INFO - Loaded vgg19_bn.119_7.csv, 333 rows
[2023-12

In [6]:
# One Config per supported model type, keyed by ModelType name.
# All entries share the same optimizer settings and "meta_configs" values; they
# differ in "model", "model_params" and a few dataset keys.
# NOTE(review): none of the entries sets a subgraph step, while init_dataset reads
# conf.dataset_subgraph_step - presumably Config supplies a default; confirm.
train_configs = {
    # NOTE(review): this entry has no "dataset_subgraph_node_size" and no
    # "model_params", unlike most other entries - presumably Config defaults apply.
    ModelType.MLPTestSubgraph.name: Config.from_dict({
        "model": "MLPTestSubgraph",
        "all_seed": 42,
        "dataset_environment_str": dataset_environment_str,
        "dataset_params": {
            "duration_summed": False,
        },
        "dataset_dummy": False,
        "batch_size": 16,
        "eval_steps": 5000,
        "learning_rate": 1e-3,
        "epochs": 20,
        "optimizer": "Adam",
        "meta_configs": {
            "learning_rate": 0.005,
            "meta_learning_rate": 0.001,
            "meta_train_steps": 1000,
            "meta_task_per_step": 8,
            "meta_fast_adaption_step": 5,
            "meta_dataset_train_environment_strs": [dataset_environment_str],
            "meta_dataset_eval_environment_strs": [dataset_environment_str],
        },
    }),
    ModelType.LSTM.name: Config.from_dict({
        "model": "LSTM",
        "dataset_environment_str": dataset_environment_str,
        "meta_dataset_environment_strs": [dataset_environment_str],
        "dataset_subgraph_node_size": 10,
        "all_seed": 42,
        "dataset_params": {
            "duration_summed": False,
        },
        # Read by init_LSTM_model (keys: num_layers, bidirectional).
        "model_params": {
            "num_layers": 4,
            "bidirectional": True
        },
        "dataset_dummy": True,
        "batch_size": 16,
        "eval_steps": 5000,
        "learning_rate": 1e-3,
        "epochs": 20,
        "optimizer": "Adam",
        "meta_configs": {
            "learning_rate": 0.005,
            "meta_learning_rate": 0.001,
            "meta_train_steps": 1000,
            "meta_task_per_step": 8,
            "meta_fast_adaption_step": 5,
            "meta_dataset_train_environment_strs": [dataset_environment_str],
            "meta_dataset_eval_environment_strs": [dataset_environment_str],
        },
    }),
    ModelType.RNN.name: Config.from_dict({
        "model": "RNN",
        "dataset_environment_str": dataset_environment_str,
        "meta_dataset_environment_strs": [dataset_environment_str],
        "dataset_subgraph_node_size": 10,
        "all_seed": 42,
        "dataset_params": {
            "duration_summed": False,
        },
        # Read by init_RNN_model (keys: num_layers, hidden_size, bidirectional).
        "model_params": {
            "num_layers": 5,
            "bidirectional": True,
            "hidden_size": 64
        },
        "dataset_dummy": True,
        "batch_size": 16,
        "eval_steps": 5000,
        "learning_rate": 1e-3,
        "epochs": 20,
        "optimizer": "Adam",
        "meta_configs": {
            "learning_rate": 0.005,
            "meta_learning_rate": 0.001,
            "meta_train_steps": 1000,
            "meta_task_per_step": 8,
            "meta_fast_adaption_step": 5,
            "meta_dataset_train_environment_strs": [dataset_environment_str],
            "meta_dataset_eval_environment_strs": [dataset_environment_str],
        },
    }),
    ModelType.GRU.name: Config.from_dict({
        "model": "GRU",
        "dataset_environment_str": dataset_environment_str,
        "meta_dataset_environment_strs": [dataset_environment_str],
        "dataset_subgraph_node_size": 10,
        "all_seed": 42,
        "dataset_params": {
            "duration_summed": False,
        },
        # Read by init_GRU_model (keys: num_layers, bidirectional).
        "model_params": {
            "num_layers": 5,
            "bidirectional": True
        },
        "dataset_dummy": True,
        "batch_size": 16,
        "eval_steps": 5000,
        "learning_rate": 1e-3,
        "epochs": 20,
        "optimizer": "Adam",
        "meta_configs": {
            "learning_rate": 0.005,
            "meta_learning_rate": 0.001,
            "meta_train_steps": 1000,
            "meta_task_per_step": 8,
            "meta_fast_adaption_step": 5,
            "meta_dataset_train_environment_strs": [dataset_environment_str],
            "meta_dataset_eval_environment_strs": [dataset_environment_str],
        },
    }),
    # NOTE(review): the key is ModelType.GCNSubgraph but "model" is "GCNGrouping" -
    # confirm this mismatch is intentional.
    ModelType.GCNSubgraph.name: Config.from_dict({
        "model": "GCNGrouping",
        "dataset_environment_str": dataset_environment_str,
        "dataset_subgraph_node_size": 10,
        "all_seed": 42,
        "dataset_params": {
            "duration_summed": False,
        },
        "dataset_dummy": True,
        "batch_size": 16,
        "eval_steps": 5000,
        "learning_rate": 1e-3,
        "epochs": 20,
        "optimizer": "Adam",
        "meta_configs": {
            "learning_rate": 0.005,
            "meta_learning_rate": 0.001,
            "meta_train_steps": 1000,
            "meta_task_per_step": 8,
            "meta_fast_adaption_step": 5,
            "meta_dataset_train_environment_strs": [dataset_environment_str],
            "meta_dataset_eval_environment_strs": [dataset_environment_str],
        },
    }),
    ModelType.Transformer.name: Config.from_dict({
        "model": "Transformer",
        "dataset_environment_str": dataset_environment_str,
        "dataset_subgraph_node_size": 10,
        "all_seed": 42,
        "dataset_params": {
            "duration_summed": False,
        },
        # Read by init_Transformer_model (keys: nhead, d_hid, nlayers, dropout).
        "model_params": {
            "nlayers": 6,
            "d_hid": 64,
            "dropout": 0.0
        },
        "dataset_dummy": True,
        "batch_size": 16,
        "eval_steps": 5000,
        "learning_rate": 1e-3,
        "epochs": 20,
        "optimizer": "Adam",
        "meta_configs": {
            "learning_rate": 0.005,
            "meta_learning_rate": 0.001,
            "meta_train_steps": 1000,
            "meta_task_per_step": 8,
            "meta_fast_adaption_step": 5,
            "meta_dataset_train_environment_strs": [dataset_environment_str],
            "meta_dataset_eval_environment_strs": [dataset_environment_str],
        },
    }),
}

# Active configuration: read as a module-level global by init_dataset,
# get_scalers and every init_*_model factory below.
conf: Config = train_configs[model_type.name]

In [7]:
def subgraph_features(graph: Graph, subgraph_node_size: int = 10, step: int = 5, dataset_params: Dict = None) -> \
        Tuple[List[Dict], List[Dict]]:
    """Build per-subgraph feature and label dicts for one graph.

    The graph is split with ``graph.subgraphs(subgraph_node_size=..., step=...)``
    (only the first returned value is used) and every subgraph is converted
    independently.

    Args:
        graph: Graph to split.
        subgraph_node_size: Forwarded to ``graph.subgraphs``.
        step: Forwarded to ``graph.subgraphs``.
        dataset_params: Optional settings; only the key ``"mode"`` is read
            (default ``"complex"``) and passed to ``node.op.to_feature_array``.
            ``None`` is treated as an empty dict.

    Returns:
        ``(X, Y)``: two lists of equal length. Each ``X`` item holds
        ``x_graph_id``, ``x_node_ids`` (node ids joined with ``"|"``),
        ``x_subgraph_feature`` and ``x_adj_matrix``. Each ``Y`` item holds
        ``y_graph_id``, ``y_nodes_durations`` (list of ``(duration, gap)``
        tuples) and ``y_subgraph_durations`` (1-tuple with the summed
        ``duration + gap`` of the subgraph).
    """
    # Avoid sharing one mutable default dict between calls.
    params = {} if dataset_params is None else dataset_params
    feature_mode = params.get("mode", "complex")

    subgraphs, _ = graph.subgraphs(subgraph_node_size=subgraph_node_size, step=step)

    def subgraph_feature(nodes: List[GraphNode]):
        """Convert one subgraph (a list of nodes) into a (feature, label) pair."""
        # x: one feature vector per node, passed through pad_np_vectors
        # (presumably so all rows share the same width).
        feature_matrix = [np.array(node.op.to_feature_array(mode=feature_mode)) for node in nodes]
        feature_matrix = np.array(pad_np_vectors(feature_matrix))

        # The subgraph is treated as a chain: node i is connected to node i + 1.
        node_count = len(nodes)
        adj_matrix = [[0.] * node_count for _ in range(node_count)]
        for curr_idx in range(node_count - 1):
            adj_matrix[curr_idx][curr_idx + 1] = 1.
        adj_matrix = np.array(adj_matrix)

        feature = {
            "x_graph_id": graph.ID,
            "x_node_ids": "|".join([str(node.node_id) for node in nodes]),
            "x_subgraph_feature": feature_matrix,
            "x_adj_matrix": adj_matrix
        }

        # y: computed from `nodes` (the argument), not from the enclosing loop
        # variable, so the helper does not depend on how it is called.
        nodes_durations = [(node.duration, node.gap) for node in nodes]
        subgraph_duration = sum(node.duration + node.gap for node in nodes)

        label = {
            "y_graph_id": graph.ID,
            "y_nodes_durations": nodes_durations,
            "y_subgraph_durations": (subgraph_duration,)
        }

        return feature, label

    X, Y = list(), list()
    for subgraph in subgraphs:
        x, y = subgraph_feature(subgraph)
        X.append(x)
        Y.append(y)

    return X, Y


def init_dataset(graphs: List[Graph]) -> MDataset:
    """Turn a list of graphs into an (unnormalized) MDataset of subgraph samples.

    Subgraph size, step and dataset params are read from the module-level
    ``conf``. After all samples are collected, every sample's
    ``x_subgraph_feature`` is re-padded with ``pad_np_vectors`` using the
    largest first-row width seen across all samples as ``maxsize``.
    """
    all_features = []
    all_labels = []
    widest_feature = 0

    for current_graph in graphs:
        graph_features, graph_labels = subgraph_features(
            graph=current_graph,
            subgraph_node_size=conf.dataset_subgraph_node_size,
            step=conf.dataset_subgraph_step,
            dataset_params=conf.dataset_params,
        )
        # Track the widest per-node feature vector (measured on the first row).
        for sample in graph_features:
            first_row_width = len(sample["x_subgraph_feature"][0])
            if first_row_width > widest_feature:
                widest_feature = first_row_width

        all_features += graph_features
        all_labels += graph_labels

    # Second pass: bring every sample to the common width (in place).
    for sample in all_features:
        sample["x_subgraph_feature"] = pad_np_vectors(sample["x_subgraph_feature"], maxsize=widest_feature)

    return MDataset(all_features, all_labels)

# Raw (not yet normalized) subgraph datasets. Each call pads features to the
# maximum width found within its own graph list.
# NOTE(review): train and eval widths are computed independently - confirm they match.
train_ds = init_dataset(train_graphs)
eval_ds = init_dataset(eval_graphs)

In [8]:
def get_scalers(raw_train_ds: MDataset):
    """Fit one scaler each for node features, per-node durations and subgraph durations.

    The scaler class is taken from the module-level ``conf.dataset_normalizer_cls``.

    Args:
        raw_train_ds: Dataset yielding ``(feature, label)`` pairs whose
            ``x_subgraph_feature`` and ``y_nodes_durations`` entries are lists.

    Returns:
        Tuple ``(x_subgraph_feature_scaler, y_nodes_durations_scaler,
        y_subgraph_durations_scaler)`` of fitted scalers.
    """
    scaler_cls = conf.dataset_normalizer_cls

    # Stack the rows of every sample so each scaler is fitted over the whole dataset.
    feature_rows = []
    node_duration_rows = []
    subgraph_duration_rows = []
    for feature, label in raw_train_ds:
        sample_feature_rows = feature["x_subgraph_feature"]
        assert isinstance(sample_feature_rows, list)
        feature_rows.extend(sample_feature_rows)

        sample_node_durations = label["y_nodes_durations"]
        assert isinstance(sample_node_durations, list)
        node_duration_rows.extend(sample_node_durations)

        subgraph_duration_rows.append(label["y_subgraph_durations"])

    feature_array = np.array(feature_rows)
    node_duration_array = np.array(node_duration_rows)
    subgraph_duration_array = np.array(subgraph_duration_rows)

    def _fitted(samples):
        """Create a fresh scaler and fit it on `samples`."""
        scaler = scaler_cls()
        scaler.fit(samples)
        return scaler

    x_subgraph_feature_scaler = _fitted(feature_array)
    y_nodes_durations_scaler = _fitted(node_duration_array)
    y_subgraph_durations_scaler = _fitted(subgraph_duration_array)

    return x_subgraph_feature_scaler, y_nodes_durations_scaler, y_subgraph_durations_scaler

# Fit the scalers on the training dataset only; the same fitted scalers are
# later applied to both the train and eval datasets by preprocess_dataset.
scalers = get_scalers(train_ds)
# `scalers` is also read as a module-level global by preprocess_dataset and
# compute_evaluate_metrics.
x_subgraph_feature_scaler, y_nodes_durations_scaler, y_subgraph_durations_scaler = scalers


In [9]:

def preprocess_dataset(ds: MDataset) -> MDataset:
    """Normalize a raw dataset with the module-level ``scalers`` and convert it to tensors.

    Node features, per-node durations and subgraph durations are transformed
    with their respective scaler; the adjacency matrix is only cast to float32.
    Graph ids and node ids are copied through unchanged.

    Args:
        ds: Dataset yielding ``(feature, label)`` dict pairs as produced by ``init_dataset``.

    Returns:
        A new ``MDataset`` whose numeric entries are ``torch.Tensor`` objects.
    """
    feature_scaler, node_duration_scaler, subgraph_duration_scaler = scalers

    def _float32(values):
        """Convert `values` to a float32 numpy array."""
        return np.array(values).astype(np.float32)

    tensor_features = []
    tensor_labels = []

    for raw_feature, raw_label in ds:
        subgraph_feature = raw_feature["x_subgraph_feature"]
        assert isinstance(subgraph_feature, list)
        scaled_subgraph_feature = feature_scaler.transform(_float32(subgraph_feature))

        adjacency = _float32(raw_feature["x_adj_matrix"])

        nodes_durations = raw_label["y_nodes_durations"]
        assert isinstance(nodes_durations, list)
        scaled_nodes_durations = node_duration_scaler.transform(_float32(nodes_durations))

        # The scaler is given a single-row batch; unwrap that row afterwards.
        single_row = (raw_label["y_subgraph_durations"],)
        scaled_subgraph_durations = subgraph_duration_scaler.transform(single_row)[0]

        tensor_features.append({
            "x_graph_id": raw_feature["x_graph_id"],
            "x_node_ids": raw_feature["x_node_ids"],
            "x_subgraph_feature": torch.Tensor(scaled_subgraph_feature),
            "x_adj_matrix": torch.Tensor(adjacency)
        })
        tensor_labels.append({
            "y_graph_id": raw_label["y_graph_id"],
            "y_nodes_durations": torch.Tensor(scaled_nodes_durations),
            "y_subgraph_durations": torch.Tensor(scaled_subgraph_durations)
        })

    return MDataset(tensor_features, tensor_labels)


# Normalized, tensor-valued datasets; both use the scalers fitted on train_ds.
preprocessed_train_ds = preprocess_dataset(train_ds)
preprocessed_eval_ds = preprocess_dataset(eval_ds)


In [10]:

# Persist the preprocessed datasets and the fitted scalers so the expensive
# preprocessing cells above do not have to be re-run. The entries are identified by
# environment, method prefix, split name ('train' / 'eval') and normalization.
save_dataset_pkl(preprocessed_train_ds, conf.dataset_environment, method_prefix, 'train',
                         conf.dataset_normalization)
save_dataset_pkl(preprocessed_eval_ds, conf.dataset_environment, method_prefix, 'eval',
                         conf.dataset_normalization)
save_scalers_pkl(scalers, conf.dataset_environment, method_prefix, 'train',
                         conf.dataset_normalization)

In [11]:
# Reload the datasets and scalers saved by the previous cell, using the same
# identifiers. This rebinds preprocessed_train_ds, preprocessed_eval_ds and scalers.
# NOTE(review): the helper names suggest pickle-backed storage - only load files
# produced by a trusted run.
preprocessed_train_ds = load_dataset_pkl(conf.dataset_environment, method_prefix, 'train', 
                                         conf.dataset_normalization)
preprocessed_eval_ds = load_dataset_pkl(conf.dataset_environment, method_prefix, 'eval',
                                        conf.dataset_normalization)
scalers = load_scalers_pkl(conf.dataset_environment, method_prefix, 'train',
                           conf.dataset_normalization)
x_subgraph_feature_scaler, y_nodes_durations_scaler, y_subgraph_durations_scaler = scalers

Loading dataset T4_CPU-1 SubgraphBased train Standard
Loading dataset T4_CPU-1 SubgraphBased eval Standard
Loading scalers T4_CPU-1 SubgraphBased train, Standard


In [12]:
def compute_evaluate_metrics(input_batches, output_batches, eval_loss) -> Dict[str, float]:
    """Aggregate per-node predictions into per-graph durations and compute metrics.

    For every batch, samples are grouped by graph id. Each sample's prediction
    is inverse-transformed with the node-duration scaler from the module-level
    ``scalers`` and summed per node (duration + gap). A node that appears in
    several subgraphs of the same batch is averaged within the batch, and these
    per-batch values are then averaged again across batches. The predicted
    duration of a graph is the sum of its per-node values.

    Args:
        input_batches: Iterable of feature dicts with ``x_graph_id`` and
            ``x_node_ids`` (``"|"``-joined node ids per sample).
        output_batches: Iterable of model outputs aligned with ``input_batches``.
        eval_loss: Loss value copied into the result.

    Returns:
        Dict with ``"eval_loss"`` plus whatever
        ``MetricUtil.compute_duration_metrics`` returns for ``eval_graphs``.
    """

    def compute_graph_nodes_durations(outputs_, node_ids_str_):
        """Average the predicted duration of every node over the given samples."""
        _, y_nodes_durations_scaler, _ = scalers
        node_to_durations = defaultdict(list)
        for output_, node_ids in zip(outputs_, node_ids_str_):
            node_ids_ = node_ids.split("|")
            assert len(output_) == len(node_ids_)
            transformed: np.ndarray = y_nodes_durations_scaler.inverse_transform(output_)
            # A distinct name is used for the inner iteration so it cannot
            # clobber the outer sample being processed.
            for node_id, node_prediction in zip(node_ids_, transformed):
                node_to_durations[node_id].append(np.sum(node_prediction))
        return {k: np.average(v) for k, v in node_to_durations.items()}

    graph_id_to_node_to_duration = defaultdict(lambda: defaultdict(list))
    for inputs, outputs in zip(input_batches, output_batches):
        outputs = nested_detach(outputs)
        outputs = outputs.cpu().numpy()
        batch_node_ids = inputs["x_node_ids"]

        # Positions of the batch samples that belong to each graph.
        graph_groups = defaultdict(list)
        for sample_idx, graph_id in enumerate(inputs["x_graph_id"]):
            graph_groups[graph_id].append(sample_idx)

        for graph_id, indices in graph_groups.items():
            # Index directly instead of filtering with `i in indices`, which is
            # quadratic in the batch size.
            group_x_node_ids = [batch_node_ids[idx] for idx in indices]
            group_outputs = [outputs[idx] for idx in indices]
            node_to_durations = compute_graph_nodes_durations(group_outputs, group_x_node_ids)
            for node, duration in node_to_durations.items():
                graph_id_to_node_to_duration[graph_id][node].append(duration)

    graph_id_to_duration_pred = dict()
    # TODO check this!!! (per-batch node averages are averaged again here, so
    # batches contribute equally regardless of how many samples they held)
    for graph_id, node_to_duration in graph_id_to_node_to_duration.items():
        duration_pred = 0
        for duration_preds in node_to_duration.values():
            duration_pred += np.average(duration_preds)
        graph_id_to_duration_pred[graph_id] = duration_pred
    duration_metrics = MetricUtil.compute_duration_metrics(eval_graphs, graph_id_to_duration_pred)
    return {"eval_loss": eval_loss, **duration_metrics}


In [13]:

def to_device(conf: Config, features, labels):
    """Move the tensor entries of a batch to ``conf.device``.

    Both dicts are updated in place and returned; non-tensor entries such as
    graph ids and node ids are left untouched.

    Args:
        conf: Object exposing the target ``device``.
        features: Dict holding ``x_subgraph_feature`` and ``x_adj_matrix`` tensors.
        labels: Dict holding ``y_nodes_durations`` and ``y_subgraph_durations`` tensors.

    Returns:
        The same ``(features, labels)`` objects.
    """
    for tensor_key in ('x_subgraph_feature', 'x_adj_matrix'):
        features[tensor_key] = features[tensor_key].to(conf.device)
    for tensor_key in ('y_nodes_durations', 'y_subgraph_durations'):
        labels[tensor_key] = labels[tensor_key].to(conf.device)
    return features, labels

In [14]:

class MLPTest_SubgraphModel(MModule):
    """Three-layer MLP that predicts per-node durations from a flattened subgraph.

    The subgraph feature matrix is flattened to a single vector, passed through
    two hidden layers (64 and 32 units, ReLU) and projected to
    ``y_nodes_duration_count * y_nodes_duration_size`` values, which are
    reshaped to ``(-1, y_nodes_duration_count, y_nodes_duration_size)``.
    """

    def __init__(self, x_node_feature_count, x_node_feature_size, y_nodes_duration_count, y_nodes_duration_size,
                 **kwargs):
        super().__init__(**kwargs)
        self.x_node_feature_count = x_node_feature_count
        self.x_node_feature_size = x_node_feature_size
        self.y_nodes_duration_count = y_nodes_duration_count
        self.y_nodes_duration_size = y_nodes_duration_size

        flattened_inputs = self.x_node_feature_count * self.x_node_feature_size
        flattened_outputs = self.y_nodes_duration_count * self.y_nodes_duration_size

        self.flatten = torch.nn.Flatten()
        self.linear1 = torch.nn.Linear(in_features=flattened_inputs, out_features=64)
        self.relu1 = torch.nn.ReLU()
        self.linear2 = torch.nn.Linear(in_features=64, out_features=32)
        self.relu2 = torch.nn.ReLU()
        self.output = torch.nn.Linear(32, flattened_outputs)
        self.loss_fn = MSELoss()

    def forward(self, X):
        """Return predictions for the ``x_subgraph_feature`` entry of ``X``."""
        hidden = self.flatten(X["x_subgraph_feature"])
        hidden = self.relu1(self.linear1(hidden))
        hidden = self.relu2(self.linear2(hidden))
        flat_prediction = self.output(hidden)
        return flat_prediction.reshape(-1, self.y_nodes_duration_count, self.y_nodes_duration_size)

    def compute_loss(self, outputs, Y):
        """MSE between ``outputs`` and ``Y["y_nodes_durations"]``."""
        return self.loss_fn(outputs, Y["y_nodes_durations"])

def init_MLPTestSubgraph_model() -> MModule | Any:
    """Build an MLPTest_SubgraphModel sized from the first sample of ``preprocessed_train_ds``."""
    first_feature_matrix = preprocessed_train_ds.features[0]["x_subgraph_feature"]
    first_nodes_durations = preprocessed_train_ds.labels[0]["y_nodes_durations"]
    # Row count and row width of the first sample define the model's input and output sizes.
    return MLPTest_SubgraphModel(
        len(first_feature_matrix),
        len(first_feature_matrix[0]),
        len(first_nodes_durations),
        len(first_nodes_durations[0]),
    )


In [15]:

class LSTMModel(MModule):
    """LSTM over the node sequence of a subgraph with a per-node linear head.

    The LSTM uses ``hidden_size == feature_size`` and ``batch_first=True``; its
    output at every sequence position is projected to ``nodes_durations_len``
    values.

    Args:
        feature_size: Width of one node feature vector (also the hidden size).
        nodes_durations_len: Number of values predicted per node.
        num_layers: Number of stacked LSTM layers.
        bidirectional: Whether the LSTM is bidirectional.
        **kwargs: Forwarded to ``MModule.__init__``.
    """

    def __init__(self, feature_size, nodes_durations_len, num_layers, bidirectional, **kwargs):
        super().__init__(**kwargs)
        self.lstm = LSTM(input_size=feature_size, hidden_size=feature_size, num_layers=num_layers, batch_first=True,
                         bidirectional=bidirectional)
        # A bidirectional LSTM concatenates both directions on the feature axis.
        num_directions = 2 if bidirectional else 1
        self.project = torch.nn.Linear(in_features=feature_size * num_directions, out_features=nodes_durations_len)
        self.loss_fn = MSELoss()

    def forward(self, X):
        """Return per-node predictions for ``X["x_subgraph_feature"]``."""
        X = X["x_subgraph_feature"]
        out, _ = self.lstm(X)
        Y = self.project(out)
        return Y

    def compute_loss(self, outputs, Y):
        """MSE between ``outputs`` and ``Y["y_nodes_durations"]``."""
        node_durations = Y["y_nodes_durations"]
        loss = self.loss_fn(outputs, node_durations)
        return loss

def init_LSTM_model() -> MModule | Any:
    """Build an LSTMModel sized from the first sample of ``preprocessed_train_ds``.

    ``num_layers`` and ``bidirectional`` default to 4 / True and can be
    overridden through ``conf.model_params``; other keys found there are ignored.
    """
    defaults: Dict[str, Any] = {
        "num_layers": 4,
        "bidirectional": True,
    }

    first_feature_matrix = preprocessed_train_ds.features[0]["x_subgraph_feature"]
    first_nodes_durations = preprocessed_train_ds.labels[0]["y_nodes_durations"]
    feature_width = len(first_feature_matrix[0])
    durations_width = len(first_nodes_durations[0])

    overrides = conf.model_params
    final_params = {name: overrides.get(name, fallback) for name, fallback in defaults.items()}
    print(final_params)

    return LSTMModel(feature_size=feature_width, nodes_durations_len=durations_width, **final_params)


In [16]:


class GRUModel(MModule):
    """GRU over the node sequence of a subgraph followed by a per-node linear head.

    The GRU uses ``hidden_size == feature_size`` and ``batch_first=True``; the
    head maps every sequence position to ``nodes_durations_len`` values.
    """

    def __init__(self, feature_size, nodes_durations_len, num_layers, bidirectional, **kwargs):
        super().__init__(**kwargs)
        self.gru = GRU(
            input_size=feature_size,
            hidden_size=feature_size,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=bidirectional,
        )
        # Both directions are concatenated on the feature axis when bidirectional.
        projection_inputs = feature_size * (2 if bidirectional else 1)
        self.project = torch.nn.Linear(in_features=projection_inputs, out_features=nodes_durations_len)
        self.loss_fn = MSELoss()

    def forward(self, X):
        """Return per-node predictions for ``X["x_subgraph_feature"]``."""
        sequence_output, _ = self.gru(X["x_subgraph_feature"])
        return self.project(sequence_output)

    def compute_loss(self, outputs, Y):
        """MSE between ``outputs`` and ``Y["y_nodes_durations"]``."""
        return self.loss_fn(outputs, Y["y_nodes_durations"])


def init_GRU_model() -> MModule | Any:
    """Build a GRUModel sized from the first sample of ``preprocessed_train_ds``.

    ``num_layers`` and ``bidirectional`` default to 4 / True and can be
    overridden through ``conf.model_params``; other keys found there are ignored.
    """
    defaults: Dict[str, Any] = {
        "num_layers": 4,
        "bidirectional": True,
    }

    first_feature_matrix = preprocessed_train_ds.features[0]["x_subgraph_feature"]
    first_nodes_durations = preprocessed_train_ds.labels[0]["y_nodes_durations"]
    feature_width = len(first_feature_matrix[0])
    durations_width = len(first_nodes_durations[0])

    overrides = conf.model_params
    final_params = {name: overrides.get(name, fallback) for name, fallback in defaults.items()}

    return GRUModel(feature_size=feature_width, nodes_durations_len=durations_width, **final_params)


In [17]:


class GCNSubgraphModel(MModule):
    """Stack of GCNLayer modules applied to a subgraph's adjacency and features.

    The stack consists of one input layer (``dim_feats -> dim_h``, ReLU, dropout
    argument 0), ``n_layers - 1`` hidden layers (``dim_h -> dim_h``, ReLU) and
    one output layer (``dim_h -> dim_out``, no activation).
    """

    def __init__(self, dim_feats, dim_h, dim_out, n_layers, dropout):
        super().__init__()
        stack = [GCNLayer(dim_feats, dim_h, F.relu, 0)]  # input layer
        stack.extend(GCNLayer(dim_h, dim_h, F.relu, dropout) for _ in range(n_layers - 1))  # hidden layers
        stack.append(GCNLayer(dim_h, dim_out, None, dropout))  # output layer
        self.layers = nn.ModuleList(stack)
        self.loss_fn = MSELoss()

    def forward(self, X):
        """Run every layer in order, feeding the adjacency matrix to each one."""
        adjacency = X["x_adj_matrix"]
        hidden = X["x_subgraph_feature"]
        for gcn_layer in self.layers:
            hidden = gcn_layer(adjacency, hidden)
        return hidden

    def compute_loss(self, outputs, Y) -> torch.Tensor:
        """MSE between ``outputs`` and ``Y["y_nodes_durations"]``."""
        return self.loss_fn(outputs, Y["y_nodes_durations"])


def init_GCNSubgraph_model() -> MModule | Any:
    """Build a GCNSubgraphModel sized from the first sample of ``preprocessed_train_ds``.

    ``dim_h``, ``n_layers`` and ``dropout`` default to None / 2 / 0.1 and can be
    overridden through ``conf.model_params``. A ``dim_h`` of None falls back to
    the node feature width.
    """
    defaults: Dict[str, Any] = {
        "dim_h": None,
        "n_layers": 2,
        "dropout": 0.1,
    }

    first_feature_matrix = preprocessed_train_ds.features[0]["x_subgraph_feature"]
    first_nodes_durations = preprocessed_train_ds.labels[0]["y_nodes_durations"]
    feature_width = len(first_feature_matrix[0])
    durations_width = len(first_nodes_durations[0])

    overrides = conf.model_params
    final_params = {name: overrides.get(name, fallback) for name, fallback in defaults.items()}
    if final_params["dim_h"] is None:
        final_params["dim_h"] = feature_width

    return GCNSubgraphModel(dim_feats=feature_width, dim_out=durations_width, **final_params)

In [18]:
def init_Transformer_model() -> MModule | Any:
    """Build a TransformerModel sized from the first sample of ``preprocessed_train_ds``.

    ``nhead``, ``d_hid``, ``nlayers`` and ``dropout`` default to 8 / 512 / 6 / 0.5
    and can be overridden through ``conf.model_params``. ``nhead`` is lowered
    until it divides the node feature width; when that changes the value, the
    new value is logged and written back to ``conf.model_params``.
    """
    defaults: Dict[str, Any] = {
        "nhead": 8,
        "d_hid": 512,
        "nlayers": 6,
        "dropout": 0.5,
    }

    first_feature_matrix = preprocessed_train_ds.features[0]["x_subgraph_feature"]
    first_nodes_durations = preprocessed_train_ds.labels[0]["y_nodes_durations"]
    feature_width = len(first_feature_matrix[0])
    durations_width = len(first_nodes_durations[0])

    overrides = conf.model_params
    final_params = {name: overrides.get(name, fallback) for name, fallback in defaults.items()}

    # Find the largest head count not above the requested one that divides the model width.
    requested_nhead = final_params["nhead"]
    nhead = requested_nhead
    while feature_width % nhead != 0:
        nhead -= 1
    if nhead != requested_nhead:
        final_params["nhead"] = nhead
        logging.info(f"Transformer nhead set to {nhead}.")
        conf.model_params["nhead"] = nhead

    return TransformerModel(d_model=feature_width, output_d=durations_width, **final_params)

In [19]:
class RNNModel(MModule):
    """Vanilla RNN over the node sequence of a subgraph with a per-node linear head.

    Args:
        feature_size: Width of one node feature vector (RNN input size).
        nodes_durations_len: Number of values predicted per node.
        hidden_size: RNN hidden size.
        num_layers: Number of stacked RNN layers.
        bidirectional: Whether the RNN is bidirectional.
        **kwargs: Forwarded to ``MModule.__init__``.
    """

    def __init__(self, feature_size, nodes_durations_len,hidden_size, num_layers, bidirectional, **kwargs):
        print(f'feature_size: {feature_size}, nodes_durations_len: {nodes_durations_len}, num_layers: {num_layers}, bidirectional: {bidirectional}')
        super().__init__(**kwargs)
        self.hidden_size = hidden_size
        self.input_size = feature_size
        self.num_layers = num_layers
        self.num_directions = 2 if bidirectional else 1
        # Keep the value actually used for the projection instead of a hard-coded 2.
        self.node_durations_len = nodes_durations_len
        self.rnn = RNN(input_size=feature_size,
                       hidden_size=hidden_size,
                       num_layers=num_layers,
                       batch_first=True,
                       bidirectional=bidirectional)

        # A bidirectional RNN concatenates both directions on the feature axis.
        self.project = torch.nn.Linear(in_features=self.hidden_size * self.num_directions, out_features=nodes_durations_len)

        self.loss_fn = MSELoss()

    def forward(self, X):
        """Return per-node predictions for ``X["x_subgraph_feature"]``."""
        X = X["x_subgraph_feature"]  # (batch_size, subgraph_size(seq_len), input_size)
        batch_size = X.size(0)
        # Allocate the initial state on the input's device/dtype so the model
        # also runs on CPU or on a non-default GPU (no hard-coded .cuda()).
        hidden = self.init_hidden(batch_size, device=X.device, dtype=X.dtype)
        out, _ = self.rnn(X, hidden)  # batch_first: (batch_size, seq_len, num_directions * hidden_size)
        Y = self.project(out)  # (batch_size, seq_len, nodes_durations_len)
        return Y

    def init_hidden(self, batch_size, device=None, dtype=None):
        """Return an all-zero initial hidden state.

        Args:
            batch_size: Number of sequences in the batch.
            device: Optional device for the tensor (torch default when None).
            dtype: Optional dtype for the tensor (torch default when None).

        Returns:
            Tensor of shape ``(num_layers * num_directions, batch_size, hidden_size)``.
        """
        hidden = torch.zeros(self.num_layers*self.num_directions, batch_size, self.hidden_size,
                             device=device, dtype=dtype)
        return hidden

    def compute_loss(self, outputs, Y):
        """MSE between ``outputs`` and ``Y["y_nodes_durations"]``."""
        node_durations = Y["y_nodes_durations"]
        loss = self.loss_fn(outputs, node_durations)
        return loss

def init_RNN_model() -> MModule | Any:
    """Build an ``RNNModel`` sized from the first sample of the training set.

    Reads the notebook globals ``preprocessed_train_ds`` (for the input and
    label widths) and ``conf.model_params`` (for hyper-parameter overrides).
    Only the keys listed in the defaults below are taken from
    ``conf.model_params``; anything else there is ignored.

    Returns:
        A freshly constructed ``RNNModel``.
    """
    # Fallback hyper-parameters used when conf.model_params lacks a key.
    defaults: Dict[str, Any] = {
        "num_layers": 4,
        "hidden_size": 64,
        "bidirectional": True,
    }

    # The first sample fixes the per-step input and label widths.
    first_x = preprocessed_train_ds.features[0]
    first_y = preprocessed_train_ds.labels[0]
    feature_width = len(first_x["x_subgraph_feature"][0])
    durations_width = len(first_y["y_nodes_durations"][0])

    overrides = conf.model_params
    final_params = {name: overrides.get(name, fallback) for name, fallback in defaults.items()}
    print(final_params)

    return RNNModel(
        feature_size=feature_width,
        nodes_durations_len=durations_width,
        **final_params
    )


In [20]:
# Registry mapping each ModelType name to the factory that builds that model.
init_model_funcs = {
    ModelType.Transformer.name: init_Transformer_model,
    ModelType.GCNSubgraph.name: init_GCNSubgraph_model,
    ModelType.GRU.name: init_GRU_model,
    ModelType.LSTM.name: init_LSTM_model,
    ModelType.MLPTestSubgraph.name: init_MLPTestSubgraph_model,
    ModelType.RNN.name: init_RNN_model,
}

# Model types to train, in execution order. Edit this list to run a different
# subset instead of copy-pasting the training stanza once per model.
model_types_to_train = [
    ModelType.RNN,
    ModelType.LSTM,
    ModelType.GRU,
    ModelType.MLPTestSubgraph,
]

# The loop variables are deliberately the same module-level names the cell
# used before (model_type, init_model, model), so after the cell finishes they
# still refer to the last model that was trained.
for model_type in model_types_to_train:
    init_model = init_model_funcs[model_type.name]

    model = init_model()
    model = model.to(conf.device)
    single_train_loop(model_type, conf, preprocessed_train_ds, preprocessed_eval_ds, model, compute_evaluate_metrics, to_device)



{'num_layers': 5, 'hidden_size': 64, 'bidirectional': True}
feature_size: 66, nodes_durations_len: 2, num_layers: 5, bidirectional: True


[2023-12-13 09:35:42,091] {executor.py:120} INFO - ModelType.RNN start single training.
[2023-12-13 09:35:42,091] {executor.py:122} INFO - ModelType.RNN training epoch 0


  0%|          | 0/12479 [00:00<?, ?it/s]

[2023-12-13 09:35:42,469] {executor.py:139} INFO - ModelType.RNN trained for 0.37886704 seconds.
[2023-12-13 09:35:42,470] {executor.py:140} INFO - ModelType.RNN eval at step 0.
[2023-12-13 09:35:52,251] {executor.py:144} INFO - ModelType.RNN train loss: 1.522997260093689, eval metrics: {'eval_loss': 0.6059628767888614, 'MRE': 2.417379619393378, 'MAE': 1.3833079841999312, 'RMSE': 1029.99431445302}
[2023-12-13 09:35:52,253] {executor.py:174} INFO - Saving model at step 0 with loss 1.522997260093689,save path: /root/guohao/repos/DLT-perf-model/notebooks/ckpts/RNN/single_train2023-12-13_09-35-42


 40%|███▉      | 4984/12479 [00:41<00:46, 161.99it/s]

[2023-12-13 09:36:23,547] {executor.py:139} INFO - ModelType.RNN trained for 41.456463222 seconds.
[2023-12-13 09:36:23,548] {executor.py:140} INFO - ModelType.RNN eval at step 5000.
[2023-12-13 09:36:33,180] {executor.py:144} INFO - ModelType.RNN train loss: 0.020022425800561905, eval metrics: {'eval_loss': 0.3603524173169064, 'MRE': 0.34208161475674, 'MAE': 0.300849411227036, 'RMSE': 346.8737881058838}
[2023-12-13 09:36:33,181] {executor.py:174} INFO - Saving model at step 5000 with loss 0.020022425800561905,save path: /root/guohao/repos/DLT-perf-model/notebooks/ckpts/RNN/single_train2023-12-13_09-35-42


 80%|████████  | 9991/12479 [01:20<00:14, 168.85it/s]

[2023-12-13 09:37:03,064] {executor.py:139} INFO - ModelType.RNN trained for 80.973818055 seconds.
[2023-12-13 09:37:03,065] {executor.py:140} INFO - ModelType.RNN eval at step 10000.
[2023-12-13 09:37:12,866] {executor.py:144} INFO - ModelType.RNN train loss: 0.02907049097120762, eval metrics: {'eval_loss': 0.32469584891152453, 'MRE': 0.6683707536777103, 'MAE': 0.40480302359147646, 'RMSE': 298.3462667225354}
[2023-12-13 09:37:12,867] {executor.py:174} INFO - Saving model at step 10000 with loss 0.02907049097120762,save path: /root/guohao/repos/DLT-perf-model/notebooks/ckpts/RNN/single_train2023-12-13_09-35-42


100%|██████████| 12479/12479 [01:45<00:00, 118.24it/s]

[2023-12-13 09:37:27,631] {executor.py:122} INFO - ModelType.RNN training epoch 1



 20%|██        | 2515/12479 [00:14<00:58, 169.78it/s]

[2023-12-13 09:37:42,510] {executor.py:139} INFO - ModelType.RNN trained for 120.419303585 seconds.
[2023-12-13 09:37:42,511] {executor.py:140} INFO - ModelType.RNN eval at step 15000.
[2023-12-13 09:37:52,378] {executor.py:144} INFO - ModelType.RNN train loss: 0.02535936050117016, eval metrics: {'eval_loss': 0.3306924883428833, 'MRE': 0.5058184386976602, 'MAE': 0.4183782700177087, 'RMSE': 469.5062264797037}
[2023-12-13 09:37:52,379] {executor.py:174} INFO - Saving model at step 15000 with loss 0.02535936050117016,save path: /root/guohao/repos/DLT-perf-model/notebooks/ckpts/RNN/single_train2023-12-13_09-35-42


 60%|██████    | 7507/12479 [00:54<00:29, 167.39it/s]

[2023-12-13 09:38:22,055] {executor.py:139} INFO - ModelType.RNN trained for 159.964053476 seconds.
[2023-12-13 09:38:22,055] {executor.py:140} INFO - ModelType.RNN eval at step 20000.
[2023-12-13 09:38:31,763] {executor.py:144} INFO - ModelType.RNN train loss: 0.047554850578308105, eval metrics: {'eval_loss': 0.21065442453273603, 'MRE': 0.49869468781756005, 'MAE': 0.29803931315890136, 'RMSE': 211.57890057897183}
[2023-12-13 09:38:31,765] {executor.py:174} INFO - Saving model at step 20000 with loss 0.047554850578308105,save path: /root/guohao/repos/DLT-perf-model/notebooks/ckpts/RNN/single_train2023-12-13_09-35-42


100%|██████████| 12479/12479 [01:33<00:00, 133.46it/s]

[2023-12-13 09:39:01,135] {executor.py:122} INFO - ModelType.RNN training epoch 2



  0%|          | 26/12479 [00:00<01:38, 125.91it/s]

[2023-12-13 09:39:01,450] {executor.py:139} INFO - ModelType.RNN trained for 199.359295802 seconds.
[2023-12-13 09:39:01,450] {executor.py:140} INFO - ModelType.RNN eval at step 25000.
[2023-12-13 09:39:11,207] {executor.py:144} INFO - ModelType.RNN train loss: 0.01356277521699667, eval metrics: {'eval_loss': 0.19657311136609515, 'MRE': 0.3212146346941857, 'MAE': 0.2571877529326001, 'RMSE': 263.51758483683676}
[2023-12-13 09:39:11,209] {executor.py:174} INFO - Saving model at step 25000 with loss 0.01356277521699667,save path: /root/guohao/repos/DLT-perf-model/notebooks/ckpts/RNN/single_train2023-12-13_09-35-42


 40%|████      | 5025/12479 [00:39<00:42, 176.82it/s]

[2023-12-13 09:39:40,367] {executor.py:139} INFO - ModelType.RNN trained for 238.276522213 seconds.
[2023-12-13 09:39:40,368] {executor.py:140} INFO - ModelType.RNN eval at step 30000.
[2023-12-13 09:39:50,210] {executor.py:144} INFO - ModelType.RNN train loss: 0.09832141548395157, eval metrics: {'eval_loss': 0.18145326581305618, 'MRE': 0.530329447931071, 'MAE': 0.35744606400749895, 'RMSE': 288.99605849262156}
[2023-12-13 09:39:50,210] {executor.py:174} INFO - Saving model at step 30000 with loss 0.09832141548395157,save path: /root/guohao/repos/DLT-perf-model/notebooks/ckpts/RNN/single_train2023-12-13_09-35-42


 80%|████████  | 10038/12479 [01:19<00:14, 167.71it/s]

[2023-12-13 09:40:20,363] {executor.py:139} INFO - ModelType.RNN trained for 278.271951511 seconds.
[2023-12-13 09:40:20,363] {executor.py:140} INFO - ModelType.RNN eval at step 35000.
[2023-12-13 09:40:30,313] {executor.py:144} INFO - ModelType.RNN train loss: 0.02369546703994274, eval metrics: {'eval_loss': 0.28446841908571047, 'MRE': 0.24456087819601557, 'MAE': 0.27476882607734066, 'RMSE': 303.91031848819574}
[2023-12-13 09:40:30,314] {executor.py:174} INFO - Saving model at step 35000 with loss 0.02369546703994274,save path: /root/guohao/repos/DLT-perf-model/notebooks/ckpts/RNN/single_train2023-12-13_09-35-42


100%|██████████| 12479/12479 [01:43<00:00, 120.31it/s]

[2023-12-13 09:40:44,864] {executor.py:122} INFO - ModelType.RNN training epoch 3



 20%|██        | 2553/12479 [00:15<00:58, 170.10it/s]

[2023-12-13 09:41:00,085] {executor.py:139} INFO - ModelType.RNN trained for 317.99448331 seconds.
[2023-12-13 09:41:00,086] {executor.py:140} INFO - ModelType.RNN eval at step 40000.
[2023-12-13 09:41:09,760] {executor.py:144} INFO - ModelType.RNN train loss: 2.9941625595092773, eval metrics: {'eval_loss': 0.3594483902415959, 'MRE': 1.7549973178665792, 'MAE': 1.0166000763411087, 'RMSE': 700.0495498470169}
[2023-12-13 09:41:09,761] {executor.py:174} INFO - Saving model at step 40000 with loss 2.9941625595092773,save path: /root/guohao/repos/DLT-perf-model/notebooks/ckpts/RNN/single_train2023-12-13_09-35-42


 61%|██████    | 7554/12479 [00:54<00:29, 166.89it/s]

[2023-12-13 09:41:39,144] {executor.py:139} INFO - ModelType.RNN trained for 357.053690257 seconds.
[2023-12-13 09:41:39,145] {executor.py:140} INFO - ModelType.RNN eval at step 45000.
[2023-12-13 09:41:48,867] {executor.py:144} INFO - ModelType.RNN train loss: 0.024705780670046806, eval metrics: {'eval_loss': 0.26102575620890817, 'MRE': 1.5371166432489203, 'MAE': 0.7996904885945477, 'RMSE': 552.7280412681787}
[2023-12-13 09:41:48,868] {executor.py:174} INFO - Saving model at step 45000 with loss 0.024705780670046806,save path: /root/guohao/repos/DLT-perf-model/notebooks/ckpts/RNN/single_train2023-12-13_09-35-42


100%|██████████| 12479/12479 [01:32<00:00, 134.56it/s]

[2023-12-13 09:42:17,609] {executor.py:122} INFO - ModelType.RNN training epoch 4



  1%|          | 80/12479 [00:00<01:14, 167.06it/s]

[2023-12-13 09:42:18,152] {executor.py:139} INFO - ModelType.RNN trained for 396.061324165 seconds.
[2023-12-13 09:42:18,152] {executor.py:140} INFO - ModelType.RNN eval at step 50000.
[2023-12-13 09:42:32,053] {executor.py:144} INFO - ModelType.RNN train loss: 0.24536852538585663, eval metrics: {'eval_loss': 0.1620832942403817, 'MRE': 0.8657671129262904, 'MAE': 0.4590074368420701, 'RMSE': 329.4132057529774}
[2023-12-13 09:42:32,054] {executor.py:174} INFO - Saving model at step 50000 with loss 0.24536852538585663,save path: /root/guohao/repos/DLT-perf-model/notebooks/ckpts/RNN/single_train2023-12-13_09-35-42


 41%|████      | 5073/12479 [00:43<00:43, 171.99it/s]

[2023-12-13 09:43:01,590] {executor.py:139} INFO - ModelType.RNN trained for 439.499885933 seconds.
[2023-12-13 09:43:01,591] {executor.py:140} INFO - ModelType.RNN eval at step 55000.
[2023-12-13 09:43:11,223] {executor.py:144} INFO - ModelType.RNN train loss: 0.040661271661520004, eval metrics: {'eval_loss': 0.15109567669237434, 'MRE': 0.8463470669614983, 'MAE': 0.6262718485537556, 'RMSE': 446.34539911179695}
[2023-12-13 09:43:11,224] {executor.py:174} INFO - Saving model at step 55000 with loss 0.040661271661520004,save path: /root/guohao/repos/DLT-perf-model/notebooks/ckpts/RNN/single_train2023-12-13_09-35-42


 81%|████████  | 10068/12479 [01:22<00:14, 170.54it/s]

[2023-12-13 09:43:40,218] {executor.py:139} INFO - ModelType.RNN trained for 478.1270238 seconds.
[2023-12-13 09:43:40,218] {executor.py:140} INFO - ModelType.RNN eval at step 60000.
[2023-12-13 09:43:49,981] {executor.py:144} INFO - ModelType.RNN train loss: 0.044416576623916626, eval metrics: {'eval_loss': 0.16815653110834552, 'MRE': 1.103761748258861, 'MAE': 0.5694832063256446, 'RMSE': 402.54781095230607}
[2023-12-13 09:43:49,982] {executor.py:174} INFO - Saving model at step 60000 with loss 0.044416576623916626,save path: /root/guohao/repos/DLT-perf-model/notebooks/ckpts/RNN/single_train2023-12-13_09-35-42


100%|██████████| 12479/12479 [01:46<00:00, 117.27it/s]

[2023-12-13 09:44:04,023] {executor.py:122} INFO - ModelType.RNN training epoch 5



 21%|██        | 2592/12479 [00:15<00:56, 176.14it/s]

[2023-12-13 09:44:19,236] {executor.py:139} INFO - ModelType.RNN trained for 517.145416026 seconds.
[2023-12-13 09:44:19,237] {executor.py:140} INFO - ModelType.RNN eval at step 65000.


 21%|██        | 2605/12479 [00:22<01:24, 117.50it/s]


KeyboardInterrupt: 