In [1]:
%reload_ext autoreload
%autoreload 2
from common import *
from scipy import stats
import plotnine as p9
import umap

In [2]:
# Experiment configuration. `method` and `dimensions` are only defined here;
# nothing in this cell reads them.
data_dir = "../data/"
input_properties_type = "tabular"
system = "gcc"
method = "embed"
dimensions = 8

# load_data is not defined in this notebook (presumably it comes from the
# star-import of `common`); it returns six objects that are unpacked in order.
(
    perf_matrix,
    input_features,
    config_features,
    all_performances,
    input_preprocessor,
    config_preprocessor,
) = load_data(
    data_dir=data_dir,
    system=system,
    input_properties_type=input_properties_type,
)

# The first listed performance metric is the one analysed below.
performance = all_performances[0]

# Report what was loaded: one line per table, printed in a single call.
print(
    "\n".join(
        [
            f"Loaded data for `{system}`",
            f"perf_matrix:{perf_matrix.shape}",
            f"input_features(before preprocessing):{input_features.shape}",
            f"config_features(before preprocessing):{config_features.shape}",
        ]
    )
)

_PERF_KEYS = ["inputname", "configurationID"]


def _performance_lookup(frame):
    """Return `performance` indexed by (inputname, configurationID), sorted by that key."""
    return frame[_PERF_KEYS + [performance]].sort_values(_PERF_KEYS).set_index(_PERF_KEYS)


def _min_rank(values):
    """Rank `values` in ascending order; tied values share the lowest rank of their group."""
    return stats.rankdata(values, method="min")


def _min_max_regret(values):
    """Min-max scale `values` within one group: 0 at the group minimum, 1 at the maximum.

    If every value in the group is identical the span is zero and the plain
    formula produces 0/0 = NaN. All entries are then equally good, so their
    regret is set to 0. Entries that were NaN in the input stay NaN, so missing
    measurements are not silently turned into "zero regret".
    """
    lowest = values.min()
    regret = (values - lowest) / (values.max() - lowest)
    # Only patch NaNs that the division created, not NaNs that were already there.
    return regret.mask(regret.isna() & values.notna(), 0.0)


# This covers both train and test data
input_config_map_all = _performance_lookup(perf_matrix)

# Per input: regret and rank of each configuration
regret_map_all = input_config_map_all.groupby("inputname").transform(_min_max_regret)
rank_map_all = input_config_map_all.groupby("inputname").transform(_min_rank)

data_split = split_data(perf_matrix)

train_inp = data_split["train_inp"]
train_cfg = data_split["train_cfg"]
test_inp = data_split["test_inp"]
test_cfg = data_split["test_cfg"]
train_data = data_split["train_data"]

# This is a look up for performance measurements from inputname + configurationID
# It only covers the training data
input_config_map = _performance_lookup(train_data)

rank_map = input_config_map.groupby("inputname").transform(_min_rank)
regret_map = input_config_map.groupby("inputname").transform(_min_max_regret)

# We create the rank of inputs for a configuration by ranking their (input-internal) regret
cfg_rank_map = regret_map.groupby("configurationID").transform(_min_rank)


Loaded data for `gcc`
perf_matrix:(2080, 15)
input_features(before preprocessing):(26, 5)
config_features(before preprocessing):(80, 5)
Training data: 61.54%
Both new: 4.62%
Config new: 15.38%
Input new: 18.46%


In [3]:
# Arrange the training measurements as an (input, configuration, metric) cube.
# input_config_map is sorted by inputname, then configurationID, so a row-major
# reshape puts inputs on axis 0 and configurations on axis 1. This assumes every
# (input, configuration) pair is present in the training data.
measurements = input_config_map.to_numpy().reshape(
    len(train_inp), len(train_cfg), 1
)

# TODO: with more than one performance metric, the level in the Pareto front
# could serve as the rank. With a single metric that reduces to plain ranking.
# NOTE: rankdata is called with its default tie handling here, whereas the rank
# maps built earlier pass method="min".
ic_dist_mat = stats.rankdata(measurements, axis=1)  # rank configurations within each input
ci_dist_mat = stats.rankdata(measurements, axis=0).transpose(1, 0, 2)  # rank inputs within each configuration

# Spearman-style rank distance (helper defined outside this notebook):
# input-vs-input first, then configuration-vs-configuration on the swapped cube.
ii_dist_mat = pearson_rank_distance_matrix(measurements)
cc_dist_mat = pearson_rank_distance_matrix(measurements.transpose(1, 0, 2))

In [4]:
# Sanity check: shape and value range of each rank / distance matrix.
for _label, _matrix in (
    ("ic_dist_mat", ic_dist_mat),
    ("ci_dist_mat", ci_dist_mat),
    ("ii_dist_mat", ii_dist_mat),
    ("cc_dist_mat", cc_dist_mat),
):
    print(_label, _matrix.shape, _matrix.min(), _matrix.max())

ic_dist_mat (20, 64, 1) 5.0 62.5
ci_dist_mat (64, 20, 1) 1.0 20.0
ii_dist_mat (20, 20, 1) 0.0 0.9475274725274725
cc_dist_mat (64, 64, 1) 0.0 0.9954887218045113


In [6]:
# Training regrets in wide form: one row per input, one column per configuration.
_train_regret_wide = (
    regret_map.loc[(train_inp, train_cfg), :]
    .reset_index()
    .pivot_table(values=performance, index="inputname", columns="configurationID")
)

# torch.from_numpy wraps the NumPy array of the pivoted table as a tensor.
regret_arr = torch.from_numpy(_train_regret_wide.to_numpy())


In [19]:
# Shape check: configuration-vs-configuration rank distance computed on the
# training regrets (configurations as rows, inputs as columns). The trailing
# `[..., None]` appends the length-1 metric axis.
pearson_rank_distance_matrix(
    regret_map.loc[(train_inp, train_cfg), :]
    .reset_index()
    .pivot_table(values=performance, index="configurationID", columns="inputname")
    .to_numpy()[..., None]
).shape

(64, 64, 1)

In [21]:
import numpy as np
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
%matplotlib inline


In [22]:
# Plot defaults for the rest of the notebook: white background, large figures.
sns.set(style="white", context="notebook", rc={"figure.figsize": (14, 10)})

# Palmer penguins example table, pinned to a fixed commit of the upstream repo.
# NOTE(review): nothing in the visible cells uses `penguins` afterwards; this
# looks like a leftover from a tutorial. Confirm before relying on it.
PENGUINS_CSV_URL = "https://raw.githubusercontent.com/allisonhorst/palmerpenguins/c19a904462482430170bfe2c718775ddb7dbb885/inst/extdata/penguins.csv"
penguins = pd.read_csv(PENGUINS_CSV_URL)
penguins.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male,2007
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female,2007
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female,2007
3,Adelie,Torgersen,,,,,,2007
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,female,2007


In [25]:
# Preview the first five rows of the training split.
train_data.head(n=5)

Unnamed: 0,configurationID,optim,-floop-interchange,-fprefetch-loop-arrays,-ffloat-store,-fno-asm,size,ctime,exec,inputname,csize,#LOCs,number_literal,for_statement,if_statement
1760,0,-O2,1,1,0,0,17552,0.148,20.408121,2mm,3732,156,25,16,1
1772,2,-Og,0,1,0,0,17720,0.087,15.330144,2mm,3732,156,25,16,1
1783,3,-O0,1,0,1,0,17752,0.06,23.697489,2mm,3732,156,25,16,1
1816,6,-Ofast,1,0,0,0,23208,0.169,13.578675,2mm,3732,156,25,16,1
1827,7,-O0,0,0,1,1,17752,0.073,25.434111,2mm,3732,156,25,16,1


In [26]:
# Pairwise relationships between the numeric training columns, coloured by
# performance.
#
# The performance column is a continuous measurement. Passing it directly as
# `hue` gives seaborn one hue level per distinct value, and the original run of
# this cell had to be interrupted. Colouring by quartile bins keeps the
# "colour by performance" intent with at most four hue levels, and leaves the
# raw performance column in the grid as an ordinary variable.
_perf_bin = f"{performance} quartile"
sns.pairplot(
    train_data.drop(columns="configurationID").assign(
        # duplicates="drop" avoids an error when several quantile edges coincide
        **{_perf_bin: pd.qcut(train_data[performance], q=4, duplicates="drop")}
    ),
    hue=_perf_bin,
);

KeyboardInterrupt: 