The aim of this notebook is to generate a CSV file with the results of the metrics applied to synthetic drifted data.

In [None]:
import sys, os
# Add three project folders to the module search path.
# os.path.abspath resolves each name against the current working directory,
# so the notebook has to be started from the directory that contains them.
# NOTE(review): presumably the utils_* modules imported below live in 'Utils' — confirm.
sys.path.append(os.path.abspath('Utils'))
sys.path.append(os.path.abspath('data'))
sys.path.append(os.path.abspath('thresholds_and_results'))

from utils_generateTests import split_data, reduced_on_drift_kdim, test_on_reduced_kdim
from utils_dimRedDef import find_dimensions_number, initialize_DimReduction, scale_dataset, init_scaler
from utils_resNet import df_from_folder, init_resnet

In [None]:
# Seeds used throughout the notebook.
#   seed_split   -> passed to init_resnet and split_data
#   seed_drift   -> only carried inside info_dataset in this notebook
#   seed_metrics -> passed to initialize_DimReduction and test_on_reduced_kdim
# Alternative value left in the original cell: seed_split = 1
seed_split, seed_drift, seed_metrics = 2, 10, 100

# The three seeds bundled in the order [split, drift, metrics].
info_dataset = [seed_split, seed_drift, seed_metrics]


In [None]:
# CSV file the results are saved to.
# NOTE(review): the path says "2dim"/"2d" while k below is 6 — confirm this is
# the intended output file for a 6-dimensional run.
resultFile = 'thresholds_and_results/2dim/devResults_2d.csv'

# Number of dimensions kept by the dimensionality reduction.
k = 6


In [None]:
# Initialise the ResNet (seeded with seed_split). It is used as a pre-processing
# step on every image so that the rest of the pipeline works on extracted
# features instead of raw pixels.
model = init_resnet(seed_split)

### Defining folders

In [None]:
# Folder directories (created in drift_and_thresholds.ipynb).

# Source data: the train / test / validation sets are read from here.
input_path = 'data/synthetic_data/black/'

# Drifted data: Gaussian drift, one folder per variant (1, 10, 100) ...
gaussian_path_1 = 'data/synthetic_data/drift_gaussian_1/'
gaussian_path_10 = 'data/synthetic_data/drift_gaussian_10/'
gaussian_path_100 = 'data/synthetic_data/drift_gaussian_100/'
# ... and intensity drift.
intensity_path = 'data/synthetic_data/drift_intensity/'

In [None]:
# Split the source data at input_path into train, test and validation sets.
# The three results are lists (not dataframes); seed_split seeds the split.
train_list, test_list, val_list = split_data(input_path, seed_split)

In [None]:
# Build the dataframes needed as input for the governance pipeline.

# Source data: one dataframe per split, all read from input_path.
train, test, val = (
    df_from_folder(input_path, model, split)
    for split in (train_list, test_list, val_list)
)

# Drifted data: the validation list read from each drift folder.
val_intensity, val_gaussian_1, val_gaussian_10, val_gaussian_100 = (
    df_from_folder(folder, model, val_list)
    for folder in (intensity_path, gaussian_path_1, gaussian_path_10, gaussian_path_100)
)

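Fit a PCA on the test set and take the number of components that explain 80% of the variance, as done in the paper 'Failing Loudly' by Rabanser et al.; this is the number of components `k` to which the dataframes are reduced. (In the cell below this computation is commented out and `k` is fixed to 6.)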

In [None]:
# Number of components is fixed to 6 here.
# Data-driven alternative (disabled): k = find_dimensions_number(test)
k = 6

Initialize dimensionality reducers

In [None]:
# Quick sanity check of the test dataframe: show only the first rows
# instead of dumping the whole frame into the notebook output.
test.head()

In [None]:
# Initialise the scaler from the training dataframe.
# Per the original note, the scaled data is meant for the autoencoder.
standard_scaler = init_scaler(train)


In [None]:
# Scale every dataframe with the scaler initialised from `train`.

# Source datasets.
train_scaled, test_scaled, val_scaled = (
    scale_dataset(frame, standard_scaler)
    for frame in (train, test, val)
)

# Drifted datasets.
val_intensity_scaled, val_gaussian_1_scaled, val_gaussian_10_scaled, val_gaussian_100_scaled = (
    scale_dataset(frame, standard_scaler)
    for frame in (val_intensity, val_gaussian_1, val_gaussian_10, val_gaussian_100)
)
# NOTE(review): in the visible cells only train_scaled and test_scaled are used
# afterwards; confirm whether the other scaled frames are still needed.

In [None]:
# Initialise the dimensionality reducers: PCA, UMAP and the two encoder layers.
# k is passed explicitly (the original note suggests the callee falls back to
# k=2 when it is omitted — confirm in utils_dimRedDef).
reducer_pca, reducer_umap, U_encoder_layer, T_encoder_layer = initialize_DimReduction(
    seed_metrics,
    test,
    train_scaled,
    test_scaled,
    k=k,
)

In [None]:
# Apply dimensionality reduction.
# Every call receives the same four reducers, so bundle them once and unpack
# them positionally in each call.
_reducers = (reducer_pca, reducer_umap, U_encoder_layer, T_encoder_layer)

# Source data (no drift arguments given).
test_dim_red, info_drift = reduced_on_drift_kdim(test, info_dataset, *_reducers)
val_dim_red, info_drift_val = reduced_on_drift_kdim(val, info_dataset, *_reducers)

# Drifted data: sigma and drift are passed alongside each drifted set.
gau1_dim_red, info_drift_g1 = reduced_on_drift_kdim(
    val_gaussian_1, info_dataset, *_reducers, sigma=1, drift='Gaussian'
)
gau10_dim_red, info_drift_g10 = reduced_on_drift_kdim(
    val_gaussian_10, info_dataset, *_reducers, sigma=10, drift='Gaussian'
)
gau100_dim_red, info_drift_g100 = reduced_on_drift_kdim(
    val_gaussian_100, info_dataset, *_reducers, sigma=100, drift='Gaussian'
)
intensity_dim_red, info_drift_i = reduced_on_drift_kdim(
    val_intensity, info_dataset, *_reducers, sigma=42, drift='intensity'
)


Apply tests

In [None]:
# Run the tests for each validation set against the test set.
# Each call is given resultFile, the CSV path for the results.
# NOTE(review): the first call passes info_drift / k by keyword, the others
# positionally; this is kept as in the original — confirm both forms bind to
# the same parameters of test_on_reduced_kdim.

# Non-drifted validation set.
test_on_reduced_kdim(
    val, test,
    val_dim_red, test_dim_red,
    seed_metrics, resultFile,
    info_drift=info_drift_val, k=k,
)

# Gaussian drift variants (1, 10, 100).
test_on_reduced_kdim(
    val_gaussian_1, test,
    gau1_dim_red, test_dim_red,
    seed_metrics, resultFile,
    info_drift_g1, k,
)
test_on_reduced_kdim(
    val_gaussian_10, test,
    gau10_dim_red, test_dim_red,
    seed_metrics, resultFile,
    info_drift_g10, k,
)
test_on_reduced_kdim(
    val_gaussian_100, test,
    gau100_dim_red, test_dim_red,
    seed_metrics, resultFile,
    info_drift_g100, k,
)

# Intensity drift.
test_on_reduced_kdim(
    val_intensity, test,
    intensity_dim_red, test_dim_red,
    seed_metrics, resultFile,
    info_drift_i, k,
)
