# Data Cleaning and Preprocessing

In [1]:
import pickle
from pathlib import Path

import numpy as np
import pandas as pd

from utils.eda import *
from metadata import SPLIT

LAYOUT_ROOT = Path("./data/raw/npz_all/npz/layout/")
LAYOUT_PROC_ROOT = Path("./data/processed/layout/")
pd.options.display.max_columns = None

In [2]:
coll = "nlp-random"
src, search = coll.split("-")
data_root = LAYOUT_PROC_ROOT/f"{src}"/f"{search}"
data_root

PosixPath('data/processed/layout/nlp/random')

## Takeaways
1. `node_config_feat` is too large to handle at once.
    * ~ 1G for an entire feature matrix.

In [75]:
with open(data_root/"train.pkl", "rb") as f:
    df_tr = pickle.load(f)
with open(data_root/"valid.pkl", "rb") as f:
    df_val = pickle.load(f)
with open(data_root/"test.pkl", "rb") as f:
    df_test = pickle.load(f)

In [3]:
df = []
for split in SPLIT:
    with open(data_root/f"{split}.pkl", "rb") as f:
        df_split = pickle.load(f)
        df.append(df_split)
df = pd.concat(df, ignore_index=True)
summarize(df, coll, 3)

=====Summary of nlp-random=====


Unnamed: 0,edge_index,node_feat,node_opcode,node_config_ids,node_splits,config_runtime,file,split
0,"[[2, 0], [2, 1], [5, 3], [5, 4], [8, 6], [8, 7...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[63, 63, 57, 63, 63, 2, 63, 63, 2, 63, 63, 2, ...","[1133, 1142, 1158, 1160, 1190, 1193, 1304, 130...","[[0, 5400]]","[36991241, 36990478, 43789919, 46781980, 46033...",albert_en_base_batch_size_32_train.npz,train
1,"[[2, 0], [2, 1], [5, 3], [5, 4], [8, 6], [8, 7...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[63, 63, 57, 63, 63, 2, 63, 63, 2, 63, 63, 2, ...","[357, 367, 369, 375, 380, 390, 393, 397, 398, ...","[[0, 1756]]","[16557932, 16544884, 17129934, 23457130, 19650...",small_bert_bert_en_uncased_L-10_H-768_A-12_bat...,train
2,"[[2, 0], [2, 1], [5, 3], [5, 4], [8, 6], [8, 7...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[63, 63, 57, 63, 63, 2, 63, 63, 2, 63, 63, 2, ...","[2495, 2504, 2520, 2522, 2552, 2555, 2666, 266...","[[0, 11779]]","[130405751, 130381694, 199657824, 158424992, 2...",talking-heads_base_batch_size_64_train.npz,train


Shape: (244, 8)
NaN ratio:


Unnamed: 0,edge_index,node_feat,node_opcode,node_config_ids,node_splits,config_runtime,file,split
NaN Ratio,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Zero ratio:


In [4]:
df.groupby("split").file.count()

split
test      17
train    207
valid     20
Name: file, dtype: int64

In [5]:
node_op = np.concatenate(df["node_opcode"])
np.min(node_op), np.max(node_op)

(2, 100)

In [6]:
np.unique(node_op), len(np.unique(node_op))

(array([  2,   5,  12,  13,  19,  20,  22,  24,  25,  31,  32,  34,  37,
         41,  42,  45,  48,  50,  52,  54,  55,  57,  58,  59,  60,  62,
         63,  66,  70,  75,  77,  79,  81,  82,  83,  87,  88,  89,  92,
         94,  95,  96,  98, 100], dtype=uint8),
 44)

In [7]:
node_feat = []
for split, gp in df.groupby("split"):
    node_feat_split = pd.DataFrame(np.concatenate(gp["node_feat"].values))
    node_feat_split["split"] = split
    node_feat.append(node_feat_split)
node_feat = pd.concat(node_feat, ignore_index=True)
node_feat.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,split
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,test
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,test
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,test
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,test
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,test


In [13]:
# node_feat_chunks = {
#     "is_root": (0, 1),
#     "element_size_in_bits": (1, 2),
#     "shape_element_type_is_X": (2, 21),
#     "shape_dimensions_X": (21, 29),
#     "shape_tuple_shapes_size": (29, 30),
#     "parameter_number": (30, 31),
#     "dimensions_X": (31, 37),
#     "window_size_X": (37, 45),
#     "window_stride_X": (45, 53),
#     "window_padding_low_X": (53, 61),
#     "window_padding_high_X": (61, 69),
#     "window_window_dilation_X": (69, 77),
#     "window_base_dilation_X": (77, 85),
#     "window_window_reversal_X": (85, 93),
#     "convolution_dim_numbers_input_batch_dim": (93, 94),
#     "convolution_dim_numbers_input_feature_dim": (94, 95),
#     "convolution_dim_numbers_input_spatial_dims_X": (95, 99),
#     "convolution_dim_numbers_kernel_input_feature_dim": (99, 100),
#     "convolution_dim_numbers_kernel_output_feature_dim": (100, 101),
#     "convolution_dim_numbers_kernel_spatial_dims_X": (101, 105),
#     "convolution_dim_numbers_output_batch_dim": (105, 106),
#     "convolution_dim_numbers_output_feature_dim": (106, 107),
#     "feature_group_count": (107, 108),
#     "batch_group_count": (108, 109),
#     "slice_dims_start_X": (109, 113),
#     "slice_dims_stride_X": (113, 117),
#     "slice_dims_limit_X": (117, 121),
#     "dynamic_slice_sizes_X": (121, 125),
#     "padding_config_edge_padding_low_X": (125, 129),
#     "padding_config_edge_padding_high_X": (129, 133),
#     "is_stable": (133, 134),
#     "layout_minor_to_major_X": (134, 140)
# }
# with open("./data/processed/node_feat_chunks.pkl", "wb") as f:
#     pickle.dump(node_feat_chunks, f)

In [17]:
# node_feat_idx2name = {
#     0: "is_root",  #  - whether this node is the output
#     1: "element_size_in_bits",  # - deprecated, always 0
#     # // 2–20: One hot vector of shape_element_type.
#     2: "shape_element_type_is_invalid_type",
#     3: "shape_element_type_is_pred",
#     4: "shape_element_type_is_s8",
#     5: "shape_element_type_is_s16",
#     6: "shape_element_type_is_s32",
#     7: "shape_element_type_is_s64",
#     8: "shape_element_type_is_u8",
#     9: "shape_element_type_is_u16",
#     10: "shape_element_type_is_u32",
#     11: "shape_element_type_is_u64",
#     12: "shape_element_type_is_f16",
#     13: "shape_element_type_is_f32",
#     14: "shape_element_type_is_f64",
#     15: "shape_element_type_is_bf16",
#     16: "shape_element_type_is_c64",
#     17: "shape_element_type_is_c128",
#     18: "shape_element_type_is_tuple",
#     19: "shape_element_type_is_opaque_type",
#     20: "shape_element_type_is_token",
#     # // 21–28: Size (number of elements) for each dimension, or an upper bound on the size if the dimension is dynamic.  In XLA, dimensions are numbered from 0 to N-1 for an N-dimensional array. The first element of 'shape_dimensions' is the size of dimension 0, the second element is the size of dimension 1, and so forth.  Empty list indicates a scalar.
#     21: "shape_dimensions_0",
#     22: "shape_dimensions_1",
#     23: "shape_dimensions_2",
#     24: "shape_dimensions_3",
#     25: "shape_dimensions_4",
#     26: "shape_dimensions_5",
#     27: "shape_dimensions_sum",
#     28: "shape_dimensions_product",
#     29: "shape_tuple_shapes_size",  # - for tuples only, the shapes of constituent shapes in the tuple sequence
#     30: "parameter_number",   # = K - indicating that is is the Kth parameter to the computation, only for Parameter operation
#     # // 31–36: Dimensions present for some operations that require reshaping or broadcasting, including Reshape, Reduce, ReduceWindow, and Reverse.
#     31: "dimensions_0",
#     32: "dimensions_1",
#     33: "dimensions_2",
#     34: "dimensions_3",
#     35: "dimensions_4",
#     36: "dimensions_5",
#     # // 37–92: Windowing information in an operation such as convolution. The window is moved across a base area and for each position of the window a computation is performed.
#     37: "window_size_0",
#     38: "window_size_1",
#     39: "window_size_2",
#     40: "window_size_3",
#     41: "window_size_4",
#     42: "window_size_5",
#     43: "window_size_sum",
#     44: "window_size_product",
#     45: "window_stride_0",
#     46: "window_stride_1",
#     47: "window_stride_2",
#     48: "window_stride_3",
#     49: "window_stride_4",
#     50: "window_stride_5",
#     51: "window_stride_sum",
#     52: "window_stride_product",
#     53: "window_padding_low_0",
#     54: "window_padding_low_1",
#     55: "window_padding_low_2",
#     56: "window_padding_low_3",
#     57: "window_padding_low_4",
#     58: "window_padding_low_5",
#     59: "window_padding_low_sum",
#     60: "window_padding_low_product",
#     61: "window_padding_high_0",
#     62: "window_padding_high_1",
#     63: "window_padding_high_2",
#     64: "window_padding_high_3",
#     65: "window_padding_high_4",
#     66: "window_padding_high_5",
#     67: "window_padding_high_sum",
#     68: "window_padding_high_product",
#     # // 69–76: Dilation factor of the sliding window. A dilation factor of 1 means no dilation. window_dilation - 1 no-op entries ("holes") are implicitly placed between each kernel element.
#     69: "window_window_dilation_0",
#     70: "window_window_dilation_1",
#     71: "window_window_dilation_2",
#     72: "window_window_dilation_3",
#     73: "window_window_dilation_4",
#     74: "window_window_dilation_5",
#     75: "window_window_dilation_sum",
#     76: "window_window_dilation_product",
#     # // 77-84: Dilation factor of the base area. A dilation factor of 1 means no dilation. base_dilation - 1 no-op entries ("holes") are implicitly placed between each base area element.
#     77: "window_base_dilation_0",
#     78: "window_base_dilation_1",
#     79: "window_base_dilation_2",
#     80: "window_base_dilation_3",
#     81: "window_base_dilation_4",
#     82: "window_base_dilation_5",
#     83: "window_base_dilation_sum",
#     84: "window_base_dilation_product",
#     # // 85-92: Window reversal means that this dimension was logically reversed before the operation.
#     85: "window_window_reversal_0",
#     86: "window_window_reversal_1",
#     87: "window_window_reversal_2",
#     88: "window_window_reversal_3",
#     89: "window_window_reversal_4",
#     90: "window_window_reversal_5",
#     91: "window_window_reversal_true_count",
#     92: "window_window_reversal_false_count",
#     # // 93–106: The dimension numbers used for a convolution.
#     93: "convolution_dim_numbers_input_batch_dim",  # - the dimension number that represents batch in the input
#     94: "convolution_dim_numbers_input_feature_dim",  # - the dimension number that represents features in the input
#     # // 95–98: Dimension numbers for the spatial dimensions that the window moves through in the input.
#     95: "convolution_dim_numbers_input_spatial_dims_0",
#     96: "convolution_dim_numbers_input_spatial_dims_1",
#     97: "convolution_dim_numbers_input_spatial_dims_2",
#     98: "convolution_dim_numbers_input_spatial_dims_3",
#     99: "convolution_dim_numbers_kernel_input_feature_dim",   # - the dimension number that represents input features in the convolutional kernel (rhs)
#     100: "convolution_dim_numbers_kernel_output_feature_dim",   # - the dimension number that represents output features in the convolutional kernel (rhs)
#     # // 101-104: Dimension numbers for the spatial dimensions that the window moves through in the kernel (rhs). window.strides(0) is the stride in the kernel_spatial_dimensions(0) dimension.
#     101: "convolution_dim_numbers_kernel_spatial_dims_0",
#     102: "convolution_dim_numbers_kernel_spatial_dims_1",
#     103: "convolution_dim_numbers_kernel_spatial_dims_2",
#     104: "convolution_dim_numbers_kernel_spatial_dims_3",
#     105: "convolution_dim_numbers_output_batch_dim",  # - the dimension number that represents batch in the output
#     106: "convolution_dim_numbers_output_feature_dim",  # - the dimension number that represents features in the output
#     107: "feature_group_count",  # - the number of feature groups, used for a convolution. Must be a divisor of the input feature dimension and output feature dimension. If not specified, it will use a default value of 1.
#     108: "batch_group_count",  # - the number of batch groups, used for a convolution.
#     # // 109–120: [begin/start, end/limit) index range and stride for a slice operation.
#     109: "slice_dims_start_0",
#     110: "slice_dims_start_1",
#     111: "slice_dims_start_sum",
#     112: "slice_dims_start_product",
#     113: "slice_dims_stride_0",
#     114: "slice_dims_stride_1",
#     115: "slice_dims_stride_sum",
#     116: "slice_dims_stride_product",
#     117: "slice_dims_limit_0",
#     118: "slice_dims_limit_1",
#     119: "slice_dims_limit_sum",
#     120: "slice_dims_limit_product",
#     # // 121 - 124: [start, start + size) range size for a dynamic slice ('start' is specified dynamically in the second operand of the operation).
#     121: "dynamic_slice_sizes_0",
#     122: "dynamic_slice_sizes_1",
#     123: "dynamic_slice_sizes_sum",
#     124: "dynamic_slice_sizes_product",
#     # // 125–132: Padding configuration that describes the edge padding of a pad operation.
#     125: "padding_config_edge_padding_low_0",
#     126: "padding_config_edge_padding_low_1",
#     127: "padding_config_edge_padding_low_sum",
#     128: "padding_config_edge_padding_low_product",
#     129: "padding_config_edge_padding_high_0",
#     130: "padding_config_edge_padding_high_1",
#     131: "padding_config_edge_padding_high_sum",
#     132: "padding_config_edge_padding_high_product",
#     133: "is_stable", # - whether this Sort operation should be stable
#     # // 134–139: Physical layout used to pack the tensor shape.
#     134: "layout_minor_to_major_0",
#     135: "layout_minor_to_major_1",
#     136: "layout_minor_to_major_2",
#     137: "layout_minor_to_major_3",
#     138: "layout_minor_to_major_4",
#     139: "layout_minor_to_major_5",
# }
# with open("./data/processed/node_feat_idx2name.pkl", "wb") as f:
#     pickle.dump(node_feat_idx2name, f)
# node_feat_idx2name[0], node_feat_idx2name[1]

In [8]:
with open("./data/processed/node_feat_idx2name.pkl", "rb") as f:
    node_feat_idx2name = pickle.load(f)

In [8]:
with open("./data/processed/node_feat_chunks.pkl", "rb") as f:
    node_feat_chunks = pickle.load(f)

{'is_root': (0, 1),
 'element_size_in_bits': (1, 2),
 'shape_element_type_is_X': (2, 21),
 'shape_dimensions_X': (21, 29),
 'shape_tuple_shapes_size': (29, 30),
 'parameter_number': (30, 31),
 'dimensions_X': (31, 37),
 'window_size_X': (37, 45),
 'window_stride_X': (45, 53),
 'window_padding_low_X': (53, 61),
 'window_padding_high_X': (61, 69),
 'window_window_dilation_X': (69, 77),
 'window_base_dilation_X': (77, 85),
 'window_window_reversal_X': (85, 93),
 'convolution_dim_numbers_input_batch_dim': (93, 94),
 'convolution_dim_numbers_input_feature_dim': (94, 95),
 'convolution_dim_numbers_input_spatial_dims_X': (95, 99),
 'convolution_dim_numbers_kernel_input_feature_dim': (99, 100),
 'convolution_dim_numbers_kernel_output_feature_dim': (100, 101),
 'convolution_dim_numbers_kernel_spatial_dims_X': (101, 105),
 'convolution_dim_numbers_output_batch_dim': (105, 106),
 'convolution_dim_numbers_output_feature_dim': (106, 107),
 'feature_group_count': (107, 108),
 'batch_group_count': (1

## 1. Drop Constant Columns
Because `train`, `valid` and `test` splits are concatenated to analyze, we can ignore constant features. That is, the columns are useless.

In [23]:
def _get_const_cols(node_feat: pd.DataFrame, splits: Optional[List[str]] = None) -> List[str]:
    """Return constant node features."""
    if splits is not None:
        node_feat = node_feat[node_feat["split"].isin(splits)].reset_index(drop=True)
    n_uniqs = node_feat.nunique()
    const_cols = n_uniqs[n_uniqs == 1].index.tolist()
    if "split" in const_cols:
        const_cols.remove("split")
    
    # Quick sanity check
    for c in const_cols:
        assert node_feat[c].sum() / len(node_feat) == node_feat[c][0]
    
    return const_cols

In [10]:
const_cols = _get_const_cols(node_feat)
node_feat.drop(const_cols, axis=1, inplace=True)
node_feat.head()

Unnamed: 0,0,3,6,7,10,13,18,21,22,23,24,27,28,29,30,31,32,33,34,107,108,109,111,112,113,114,115,117,118,119,120,128,129,130,131,132,134,135,136,137,split
0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,test
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,test
2,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,test
3,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,test
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,test


## 2. Drop Constant Columns Present in Valid/Test or Only in Test (Optional)
<div class="alert alert-block alert-warning">
    <h5>Do these features actually help modeling?</h5>
</div>

In [24]:
drop_const_cols_in_eval_set = True

In [25]:
if drop_const_cols_in_eval_set:
    const_cols_val_test = _get_const_cols(node_feat, ["valid", "test"])
    const_cols_test = _get_const_cols(node_feat, ["test"])
    cols_drop = list(set(const_cols_val_test + const_cols_val_test))
    node_feat.drop(cols_drop, axis=1, inplace=True)
node_feat.head(3)

Unnamed: 0,0,3,6,7,10,13,18,21,22,23,24,27,28,29,30,31,32,33,34,107,108,109,111,112,113,114,115,117,118,119,120,128,129,130,131,132,134,135,136,split
0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,test
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,test
2,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,test


## 3. Label Encode `shape_element_type`
Train a dense embedding for this feature chunk.

In [28]:
chunk_bound = node_feat_chunks["shape_element_type_is_X"]
ch, ct = chunk_bound

In [29]:
cols = [c for c in node_feat.columns if c in range(ch, ct)]
cols

[3, 6, 7, 10, 13, 18]

In [37]:
node_feat["shape_element_type"] = np.where(node_feat[cols])[1]
node_feat.drop(cols, axis=1, inplace=True)

## 4. Determine Feature Range to Apply Further Transformations

In [39]:
def _show_range(node_feat: pd.DataFrame, cols: Optional[List[str]] = None) -> None:
    if cols is not None:
        node_feat = node_feat[cols + ["split"]]
    for chunk_name, chunk_bound in node_feat_chunks.items():
        ch, ct = chunk_bound
        all_chunk_cols = list(range(ch, ct))
        chunk_cols = [c for c in node_feat.columns if c in all_chunk_cols]
        if len(chunk_cols) == 0: continue

        print(f"=== Chunk {chunk_name} ===")
        df_chunk = node_feat[chunk_cols + ["split"]]
        for col in chunk_cols:
            n_uniqs = df_chunk.groupby("split")[col].nunique()
            uniq_msg = ""
            for split in SPLIT:
                uniq_msg += f"{split} {n_uniqs[split]} "
            val_cnts = df_chunk.groupby("split")[col].value_counts()
            print(f"--- Col {col} {uniq_msg} ---")
            display(val_cnts)
            print(f"Min: {df_chunk[col].min()} | Max: {df_chunk[col].max()}")
        print("\n")

In [44]:
def _abs_log1p(arr: pd.Series) -> np.ndarray:
    arr_ = np.where(arr >= 0, np.log1p(arr), -np.log1p(arr.abs()))
    
    return arr_

In [45]:
transform = "log1p"

In [40]:
leg_cols = [0, 30, 31, 32, 33, 34, 107, 108, 109, 111, 112, 113, 114, 115,
            128, 132, 134, 135, 136]
_show_range(node_feat, leg_cols)

=== Chunk is_root ===
--- Col 0 train 2 valid 2 test 2  ---


split  0  
train  0.0    1116344
valid  0.0     105155
test   0.0      72131
train  1.0      75522
valid  1.0       7046
test   1.0       4588
Name: count, dtype: int64

Min: 0.0 | Max: 1.0


=== Chunk parameter_number ===
--- Col 30 train 4 valid 4 test 4  ---


split  30 
test   0.0      72004
       1.0       4571
       2.0         76
       3.0         68
train  0.0    1114789
       1.0      75315
       2.0        934
       3.0        828
valid  0.0     105003
       1.0       7026
       2.0         92
       3.0         80
Name: count, dtype: int64

Min: 0.0 | Max: 3.0


=== Chunk dimensions_X ===
--- Col 31 train 4 valid 4 test 4  ---


split  31 
test   0.0      73023
       2.0       2902
       1.0        420
       3.0        374
train  0.0    1136210
       2.0      43587
       3.0       6303
       1.0       5766
valid  0.0     107504
       2.0       3635
       1.0        568
       3.0        494
Name: count, dtype: int64

Min: 0.0 | Max: 3.0
--- Col 32 train 4 valid 4 test 4  ---


split  32 
test   0.0      73006
       1.0       2567
       3.0        691
       2.0        455
train  0.0    1131708
       1.0      44063
       3.0       8271
       2.0       7824
valid  0.0     107022
       1.0       3766
       3.0        735
       2.0        678
Name: count, dtype: int64

Min: 0.0 | Max: 3.0
--- Col 33 train 4 valid 4 test 4  ---


split  33 
test   0.0      75323
       1.0        858
       2.0        382
       3.0        156
train  0.0    1169794
       1.0      13140
       2.0       6409
       3.0       2523
valid  0.0     110257
       1.0       1244
       2.0        506
       3.0        194
Name: count, dtype: int64

Min: 0.0 | Max: 3.0
--- Col 34 train 4 valid 3 test 3  ---


split  34 
test   0.0      76077
       2.0        362
       3.0        280
train  0.0    1182611
       3.0       5052
       2.0       4188
       1.0         15
valid  0.0     111399
       3.0        406
       2.0        396
Name: count, dtype: int64

Min: 0.0 | Max: 3.0


=== Chunk feature_group_count ===
--- Col 107 train 2 valid 2 test 2  ---


split  107
test   0.0      76586
       1.0        133
train  0.0    1190195
       1.0       1671
valid  0.0     112029
       1.0        172
Name: count, dtype: int64

Min: 0.0 | Max: 1.0


=== Chunk batch_group_count ===
--- Col 108 train 2 valid 2 test 2  ---


split  108
test   0.0      76586
       1.0        133
train  0.0    1190195
       1.0       1671
valid  0.0     112029
       1.0        172
Name: count, dtype: int64

Min: 0.0 | Max: 1.0


=== Chunk slice_dims_start_X ===
--- Col 109 train 3 valid 3 test 3  ---


split  109
test   0.0      76679
       1.0         24
       2.0         16
train  0.0    1191336
       1.0        318
       2.0        212
valid  0.0     112141
       1.0         36
       2.0         24
Name: count, dtype: int64

Min: 0.0 | Max: 2.0
--- Col 111 train 3 valid 3 test 3  ---


split  111
test   0.0      76679
       1.0         24
       2.0         16
train  0.0    1191336
       1.0        318
       2.0        212
valid  0.0     112141
       1.0         36
       2.0         24
Name: count, dtype: int64

Min: 0.0 | Max: 2.0
--- Col 112 train 3 valid 3 test 3  ---


split  112
test   1.0      76485
       0.0        218
       2.0         16
train  1.0    1188948
       0.0       2706
       2.0        212
valid  1.0     111905
       0.0        272
       2.0         24
Name: count, dtype: int64

Min: 0.0 | Max: 2.0


=== Chunk slice_dims_stride_X ===
--- Col 113 train 2 valid 2 test 2  ---


split  113
test   0.0      76461
       1.0        258
train  0.0    1188630
       1.0       3236
valid  0.0     111869
       1.0        332
Name: count, dtype: int64

Min: 0.0 | Max: 1.0
--- Col 114 train 2 valid 2 test 2  ---


split  114
test   0.0      76567
       1.0        152
train  0.0    1189998
       1.0       1868
valid  0.0     112017
       1.0        184
Name: count, dtype: int64

Min: 0.0 | Max: 1.0
--- Col 115 train 4 valid 4 test 4  ---


split  115
test   0.0      76461
       2.0        118
       1.0        106
       3.0         34
train  0.0    1188630
       2.0       1454
       1.0       1368
       3.0        414
valid  0.0     111869
       1.0        148
       2.0        144
       3.0         40
Name: count, dtype: int64

Min: 0.0 | Max: 3.0


=== Chunk padding_config_edge_padding_low_X ===
--- Col 128 train 2 valid 2 test 2  ---


split  128
test   1.0      76687
       0.0         32
train  1.0    1191442
       0.0        424
valid  1.0     112153
       0.0         48
Name: count, dtype: int64

Min: 0.0 | Max: 1.0


=== Chunk padding_config_edge_padding_high_X ===
--- Col 132 train 2 valid 2 test 2  ---


split  132
test   1.0      76703
       0.0         16
train  1.0    1191654
       0.0        212
valid  1.0     112177
       0.0         24
Name: count, dtype: int64

Min: 0.0 | Max: 1.0


=== Chunk layout_minor_to_major_X ===
--- Col 134 train 4 valid 4 test 4  ---


split  134
test   0.0     39244
       2.0     17016
       1.0     15717
       3.0      4742
train  0.0    597932
       2.0    291215
       1.0    226519
       3.0     76200
valid  0.0     58400
       2.0     25037
       1.0     22435
       3.0      6329
Name: count, dtype: int64

Min: 0.0 | Max: 3.0
--- Col 135 train 4 valid 4 test 4  ---


split  135
test   0.0     52544
       1.0     17677
       2.0      6208
       3.0       290
train  0.0    787547
       1.0    300450
       2.0     99885
       3.0      3984
valid  0.0     77246
       1.0     25969
       2.0      8638
       3.0       348
Name: count, dtype: int64

Min: 0.0 | Max: 3.0
--- Col 136 train 4 valid 3 test 3  ---


split  136
test   0.0      69917
       1.0       6316
       2.0        486
train  0.0    1082795
       1.0     102249
       2.0       6732
       3.0         90
valid  0.0     102785
       1.0       8808
       2.0        608
Name: count, dtype: int64

Min: 0.0 | Max: 3.0




In [None]:
illeg_cols = [c for c in node_feat.columns if c not in leg_cols and str(c).isdigit()]
_show_range(node_feat, illeg_cols)

In [46]:
for c in illeg_cols:
    node_feat[f"{c}_log1p"] = _abs_log1p(node_feat[c])
summarize(node_feat, "Proc Df", 2)

=====Summary of Proc Df=====


Unnamed: 0,0,21,22,23,24,27,28,29,30,31,32,33,34,107,108,109,111,112,113,114,115,117,118,119,120,128,129,130,131,132,134,135,136,split,shape_element_type,21_log1p,22_log1p,23_log1p,24_log1p,27_log1p,28_log1p,29_log1p,117_log1p,118_log1p,119_log1p,120_log1p,129_log1p,130_log1p,131_log1p
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,test,4,0.0,0.0,0.0,0.0,0.0,0.693147,0.0,0.0,0.0,0.0,0.693147,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,test,4,0.0,0.0,0.0,0.0,0.0,0.693147,0.0,0.0,0.0,0.0,0.693147,0.0,0.0,0.0


Shape: (1380786, 49)
NaN ratio:


Unnamed: 0,0,128,130,131,132,134,135,136,split,shape_element_type,21_log1p,22_log1p,23_log1p,24_log1p,27_log1p,28_log1p,29_log1p,117_log1p,118_log1p,119_log1p,120_log1p,129_log1p,130_log1p,129,120,21,119,22,23,24,27,28,29,30,31,32,33,34,107,108,109,111,112,113,114,115,117,118,131_log1p
NaN Ratio,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Zero ratio:


In [47]:
selected_cols = []
for c in range(140):
    if c not in node_feat: 
        continue
    elif f"{c}_log1p" in node_feat:
        selected_cols.append(f"{c}_log1p")
    else:
        selected_cols.append(c)
selected_cols.append("shape_element_type")
selected_cols

[0,
 '21_log1p',
 '22_log1p',
 '23_log1p',
 '24_log1p',
 '27_log1p',
 '28_log1p',
 '29_log1p',
 30,
 31,
 32,
 33,
 34,
 107,
 108,
 109,
 111,
 112,
 113,
 114,
 115,
 '117_log1p',
 '118_log1p',
 '119_log1p',
 '120_log1p',
 128,
 '129_log1p',
 '130_log1p',
 '131_log1p',
 132,
 134,
 135,
 136,
 'shape_element_type']

In [None]:
# Just the naive four-stage cleaning
node_feat_clean = node_feat[selected_cols]
node_feat_clean.min().min(), node_feat_clean.max().max()

(0.0, 21.48756217956543)

In [55]:
node_feat_clean_tr

In [56]:
node_feat_clean_tr = node_feat_clean[node_feat_clean["split"] == "train"].drop("split", axis=1)
node_feat_clean_valid = node_feat_clean[node_feat_clean["split"] == "valid"].drop("split", axis=1)
node_feat_clean_test = node_feat_clean[node_feat_clean["split"] == "test"].drop("split", axis=1)
node_feat_clean_tr.shape, node_feat_clean_valid.shape, node_feat_clean_test.shape

((1191866, 34), (112201, 34), (76719, 34))

In [76]:
df_tr["node_feat_raw"] = df_tr["node_feat"]
df_tr["n_nodes"] = df_tr["node_feat_raw"].apply(lambda x: len(x))
node_feat_clean_tr_split = np.split(node_feat_clean_tr.values, df_tr["n_nodes"].cumsum()[:-1])
df_tr["node_feat"] = node_feat_clean_tr_split
assert (df_tr["n_nodes"] == df_tr["node_feat"].apply(lambda x: len(x))).all()

In [77]:
df_val["node_feat_raw"] = df_val["node_feat"]
df_val["n_nodes"] = df_val["node_feat_raw"].apply(lambda x: len(x))
node_feat_clean_val_split = np.split(node_feat_clean_valid.values, df_val["n_nodes"].cumsum()[:-1])
df_val["node_feat"] = node_feat_clean_val_split
assert (df_val["n_nodes"] == df_val["node_feat"].apply(lambda x: len(x))).all()

In [78]:
df_test["node_feat_raw"] = df_test["node_feat"]
df_test["n_nodes"] = df_test["node_feat_raw"].apply(lambda x: len(x))
node_feat_clean_test_split = np.split(node_feat_clean_test.values, df_test["n_nodes"].cumsum()[:-1])
df_test["node_feat"] = node_feat_clean_test_split
assert (df_test["n_nodes"] == df_test["node_feat"].apply(lambda x: len(x))).all()

In [None]:
node_feat.columns = [node_feat_idx2name[c] for c in node_feat.columns[:-1]] + ["split"]
node_feat.head()

Unnamed: 0,is_root,shape_element_type_is_pred,shape_element_type_is_s32,shape_element_type_is_s64,shape_element_type_is_u32,shape_element_type_is_f32,shape_element_type_is_tuple,shape_dimensions_0,shape_dimensions_1,shape_dimensions_2,shape_dimensions_3,shape_dimensions_sum,shape_dimensions_product,shape_tuple_shapes_size,parameter_number,dimensions_0,dimensions_1,dimensions_2,dimensions_3,feature_group_count,batch_group_count,slice_dims_start_0,slice_dims_start_sum,slice_dims_start_product,slice_dims_stride_0,slice_dims_stride_1,slice_dims_stride_sum,slice_dims_limit_0,slice_dims_limit_1,slice_dims_limit_sum,slice_dims_limit_product,padding_config_edge_padding_low_product,padding_config_edge_padding_high_0,padding_config_edge_padding_high_1,padding_config_edge_padding_high_sum,padding_config_edge_padding_high_product,layout_minor_to_major_0,layout_minor_to_major_1,layout_minor_to_major_2,layout_minor_to_major_3,split
0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,test
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,test
2,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,test
3,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,test
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,test


In [82]:
df_tr.to_pickle("./data/processed/layout/nlp/random/train.pkl")
df_val.to_pickle("./data/processed/layout/nlp/random/valid.pkl")
df_test.to_pickle("./data/processed/layout/nlp/random/test.pkl")

## Dump `node_config_feat` Separately

In [26]:
from tqdm.notebook import tqdm

def _dump_node_config_feat(src: str, search: str) -> None:
    data_root = LAYOUT_ROOT / f"{src}/{search}"
    for split in SPLIT:
        print(split)
        split_root = data_root / split
        dump_root = Path(f"./data/processed/layout/{src}/{search}/node_config_feat/{split}")

        for data_file in tqdm(sorted(split_root.glob("*.npz"))):
            file_name = str(data_file).split("/")[-1]
            data_tmp = dict(np.load(data_file))
            node_config_feat_tmp = data_tmp["node_config_feat"]
            np.savez_compressed(dump_root/file_name, node_config_feat=node_config_feat_tmp)

In [27]:


for src in ["nlp", "xla"]:
    for search in ["random", "default"]:
        print(f"== {src} - {search} ==")
        _dump_node_config_feat(src, search)

== nlp - random ==
train


  0%|          | 0/207 [00:00<?, ?it/s]

valid


  0%|          | 0/20 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [29]:
len(os.listdir("./data/processed/layout/nlp/random/node_config_feat/train"))

207

In [1]:
import pandas as pd
df_val = pd.read_pickle("./data/processed/layout/nlp/random/valid.pkl")
df_val.head()

Unnamed: 0,edge_index,node_feat,node_opcode,node_config_ids,node_splits,config_runtime,file,split,node_feat_raw,n_nodes
0,"[[2, 0], [2, 1], [5, 3], [5, 4], [8, 6], [8, 7...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.693147182464...","[63, 63, 57, 63, 63, 2, 63, 63, 2, 63, 63, 2, ...","[357, 367, 369, 375, 380, 390, 393, 397, 398, ...","[[0, 1756]]","[1039041, 1038348, 898675, 1142171, 992545, 11...",small_bert_bert_en_uncased_L-10_H-128_A-2_batc...,valid,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",1756
1,"[[2, 0], [2, 1], [5, 3], [5, 4], [8, 6], [8, 7...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.693147182464...","[63, 63, 57, 63, 63, 2, 63, 63, 2, 63, 63, 2, ...","[899, 908, 924, 926, 956, 959, 1070, 1072, 109...","[[0, 4460]]","[4418038, 4424309, 4959535, 5935049, 5333162, ...",small_bert_bert_en_uncased_L-4_H-256_A-4_batch...,valid,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",4460
2,"[[2, 0], [2, 1], [5, 3], [5, 4], [8, 6], [8, 7...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.693147182464...","[63, 63, 57, 63, 63, 2, 63, 63, 2, 63, 63, 2, ...","[1217, 1226, 1242, 1244, 1274, 1277, 1388, 139...","[[0, 5876]]","[9611554, 9610622, 11808725, 12076011, 1111423...",small_bert_bert_en_uncased_L-6_H-256_A-4_batch...,valid,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",5876
3,"[[2, 0], [2, 1], [5, 3], [5, 4], [8, 6], [8, 7...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.693147182464...","[63, 63, 57, 63, 63, 2, 63, 63, 2, 63, 63, 2, ...","[285, 295, 297, 303, 308, 318, 321, 325, 326, ...","[[0, 1316]]","[2940005, 2943195, 3349407, 3335482, 3306370, ...",small_bert_bert_en_uncased_L-6_H-768_A-12_batc...,valid,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",1316
4,"[[2, 0], [2, 1], [5, 3], [5, 4], [8, 6], [8, 7...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.693147182464...","[63, 63, 57, 63, 63, 2, 63, 63, 2, 63, 63, 2, ...","[2171, 2180, 2196, 2198, 2228, 2231, 2342, 234...","[[0, 10124]]","[75697907, 75703198, 104099328, 100644894, 105...",small_bert_bert_en_uncased_L-12_H-768_A-12_bat...,valid,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",10124


In [3]:
len(df_val)

20

In [5]:
import numpy as np
a = np.load("./data/processed/layout/nlp/random/node_config_feat/valid/albert_en_xlarge_batch_size_16_test.npz")

In [6]:
a["node_config_feat"]

BadZipFile: Bad CRC-32 for file 'node_config_feat.npy'

In [7]:
a = dict(np.load("./data/raw/npz_all/npz/layout/nlp/random/valid/albert_en_xlarge_batch_size_16_test.npz"))["node_config_feat"]
a.shape

(56032, 217, 18)

In [8]:
np.savez_compressed("./data/processed/layout/nlp/random/node_config_feat/valid/albert_en_xlarge_batch_size_16_test.npz", node_config_feat=a)

In [10]:
# b = np.load("./data/processed/layout/nlp/random/node_config_feat/valid/albert_en_xlarge_batch_size_16_test.npz")
np.max(b["node_config_feat"])

3.0