In [11]:
import numpy as np
import torch
import pandas as pd
import random
import os

In [12]:
# Largest graph size in the combined dataset; every row is laid out for this size.
MAX_NUMBER_NODES = 9
# n * (n - 1) / 2 for n = MAX_NUMBER_NODES, i.e. the number of unordered node pairs.
MAX_FEATURES_AMOUNT = MAX_NUMBER_NODES * (MAX_NUMBER_NODES - 1) // 2
# Graph sizes whose CSV files are merged into the train/val/test data.
NUMBER_NODES_IN_DATASET = [MAX_NUMBER_NODES, 7]

# Row width of the final dataset: MAX_FEATURES_AMOUNT + MAX_NUMBER_NODES + 1,
# i.e. features + labels + optimal band (skipped in the models when loading the data).

In [13]:
# Fix the seeds of the Python, PyTorch and NumPy global random generators.
# np.random.seed is kept as the last statement: it returns None, so the cell
# produces no output.
random.seed(42)
torch.manual_seed(42)
np.random.seed(42)

In [14]:
def get_dataset_path(number_nodes):
  """Return the relative path of the CSV for graphs with `number_nodes` nodes.

  The path climbs three directories and then descends into
  datasets/examples; the file name embeds `number_nodes`.
  """
  file_name = f'opt_band_{number_nodes}_nodes_graph.csv'
  path_parts = ('..', '..', '..', 'datasets', 'examples', file_name)
  return os.path.join(*path_parts)

def save_dataset(df, description):
  """Write `df` to ./dataset_<description>.csv without the index column.

  Args:
    df: DataFrame to export.
    description: Text interpolated into the output file name.
  """
  # `lineterminator` is the current spelling of this keyword: pandas 1.5
  # deprecated `line_terminator` and pandas 2.0 removed it, so the old name
  # raises TypeError there. Forcing '\n' keeps the file the same on every OS.
  df.to_csv(f'./dataset_{description}.csv', index=False, lineterminator='\n')

def split_and_shuffle_datasets(df):
    """Randomly split `df` into train (80%), validation (10%) and test (10%).

    The rows are drawn with `DataFrame.sample`, so each returned frame is
    already in random order. Sampling uses the global NumPy random state.

    Args:
        df: DataFrame to split. Its index labels may contain duplicates.

    Returns:
        Tuple `(train_dataset_df, val_dataset_df, test_dataset_df)`. The three
        frames are disjoint and together contain every row of `df` exactly
        once. Their index holds the row's position in `df`, not the original
        label.
    """
    # `drop(index=...)` removes rows by label. If labels repeat (e.g. the input
    # was built with pd.concat without ignore_index), every row sharing a
    # sampled label would be dropped too and silently vanish from all splits.
    # Renumbering the rows first makes each label identify exactly one row.
    df = df.reset_index(drop=True)

    train_dataset_df = df.sample(frac=0.8)
    df_remaining = df.drop(index=train_dataset_df.index)
    # Half of the remaining 20% goes to validation, the other half to test.
    val_dataset_df = df_remaining.sample(frac=0.5)
    test_dataset_df = df_remaining.drop(index=val_dataset_df.index)

    return train_dataset_df, val_dataset_df, test_dataset_df

def shuffle_dataset(df):
  """Return every row of `df` in random order (a full-size sample)."""
  shuffled_df = df.sample(frac=1.0)
  return shuffled_df

def append_dataset(df, number_nodes):
  """Append the `number_nodes`-node dataset to `df` and shuffle the result.

  The extra data is read through `load_data`; when `number_nodes` is below
  MAX_NUMBER_NODES the padding widths needed to reach the maximum layout are
  passed along, otherwise both widths are zero.

  Args:
    df: DataFrame to extend.
    number_nodes: Graph size of the dataset to load and append.

  Returns:
    A new DataFrame holding the rows of `df` plus the loaded rows, shuffled.
  """
  features_amount = (number_nodes * number_nodes - number_nodes) // 2
  needs_padding = number_nodes < MAX_NUMBER_NODES
  mask_features_amount = MAX_FEATURES_AMOUNT - features_amount if needs_padding else 0
  mask_labels_amount = MAX_NUMBER_NODES - number_nodes if needs_padding else 0

  extra_df = load_data(number_nodes, features_amount, mask_features_amount, mask_labels_amount)
  combined_df = pd.concat((df, extra_df))
  return shuffle_dataset(combined_df)

In [15]:
# Column names of the padded layout: MAX_FEATURES_AMOUNT feature columns,
# then the opt_band column, then MAX_NUMBER_NODES label columns.
columns = (
  [f'xDigit_{i}' for i in range(MAX_FEATURES_AMOUNT)]
  + ["opt_band"]
  + [f"yLabel_{j}" for j in range(MAX_NUMBER_NODES)]
)

def load_data(number_nodes, featuresAmount, mask_features_amount, mask_labels_amount,
              feature_mask_value=2, label_mask_value=35):
  """Read the CSV for `number_nodes`-node graphs and pad it when requested.

  When `mask_features_amount` is non-zero, each row is rebuilt as
  [first `featuresAmount` values | feature padding | remaining values |
  label padding] and relabelled with the module-level `columns` list.
  Otherwise the frame is returned exactly as read from disk, keeping the
  CSV's own header.

  Args:
    number_nodes: Graph size; selects the CSV via `get_dataset_path`.
    featuresAmount: Number of leading columns treated as features. The
      remaining columns are presumably opt_band followed by the labels --
      confirm against the CSV files.
    mask_features_amount: Number of padding values inserted after the features.
    mask_labels_amount: Number of padding values appended at the end of a row.
    feature_mask_value: Fill value for the feature padding (default 2).
    label_mask_value: Fill value for the label padding (default 35).

  Returns:
    The loaded (and possibly padded) DataFrame. Padded frames are floating
    point because the padding blocks are created as floats.
  """
  df = pd.read_csv(get_dataset_path(number_nodes))
  print(df.shape)
  if mask_features_amount != 0:
    values = df.to_numpy()
    row_count = values.shape[0]
    # Build both padding blocks once for the whole table instead of
    # concatenating row by row in Python. dtype=float matches the float
    # padding the row-wise version produced with np.ones(...) * value.
    feature_padding = np.full((row_count, mask_features_amount), feature_mask_value, dtype=float)
    label_padding = np.full((row_count, mask_labels_amount), label_mask_value, dtype=float)
    padded = np.hstack((
      values[:, :featuresAmount],
      feature_padding,
      values[:, featuresAmount:],
      label_padding,
    ))
    df = pd.DataFrame(padded, columns=columns)
  print(df.shape)
  return df
  
# Load every configured graph size (padded to the MAX_NUMBER_NODES layout when
# smaller) and collect the frames; they are concatenated once after the loop
# instead of growing a DataFrame inside it.
frames = []
for number_nodes in NUMBER_NODES_IN_DATASET:
  featuresAmount = ((number_nodes * number_nodes - number_nodes) // 2 )
  mask_features_amount = 0
  mask_labels_amount = 0

  if number_nodes < MAX_NUMBER_NODES:
    mask_features_amount = MAX_FEATURES_AMOUNT - featuresAmount
    mask_labels_amount = MAX_NUMBER_NODES - number_nodes

  df = load_data(number_nodes, featuresAmount, mask_features_amount, mask_labels_amount)
  frames.append(df)

# ignore_index=True gives every row a unique label. The split below drops rows
# by label, so repeated labels (each CSV starts again at 0) would make it
# discard rows that were never sampled.
masked_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame(columns=columns)

train_dataset_df, val_dataset_df, test_dataset_df = split_and_shuffle_datasets(masked_df)

# The three splits must partition the data: no row lost, none duplicated.
assert len(train_dataset_df) + len(val_dataset_df) + len(test_dataset_df) == len(masked_df)

# The 5-node graphs are added to the test split only.
test_dataset_df = append_dataset(test_dataset_df, 5)

save_dataset(train_dataset_df, '7_9_train(2and35_as_mask)')
save_dataset(val_dataset_df, '7_9_val(2and35_as_mask)')
save_dataset(test_dataset_df, '5_7_9_test(2and35_as_mask)')

(274668, 46)
(274668, 46)
(1044, 29)
(1044, 46)
(34, 16)
(34, 46)
