The aim of this notebook is to apply the metrics to production data. We consider two datasets: one from a 'normal' day and one from a 'drifted' day, i.e. a day on which drift occurred in production.

In [1]:
import sys, os
# Extend the module search path with three project folders so that the local
# utils_* modules imported below can be resolved. os.path.abspath resolves each
# name against the current working directory, so the notebook has to be run
# from the folder that contains 'Utils', 'data' and 'thresholds_and_results'.
sys.path.append(os.path.abspath('Utils'))
sys.path.append(os.path.abspath('data'))
sys.path.append(os.path.abspath('thresholds_and_results'))

import numpy as np

# Project-local helpers (presumably located in 'Utils' — TODO confirm).
# NOTE(review): create_black_folder is only referenced by the commented-out,
# run-once black-filter step in the preprocessing cell.
from utils_driftSimulating import  create_black_folder
from utils_resNet import init_resnet, df_from_folder
from utils_dimRedDef import init_scaler, scale_dataset, initialize_DimReduction
from utils_generateTests import reduced_on_drift_kdim, test_on_reduced_kdim






  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Seeds used across the notebook:
#   seed_split   -> ResNet initialisation and the shuffle that builds source/train lists
#   seed_drift   -> only carried inside info_dataset in this notebook
#   seed_metrics -> dimensionality-reduction initialisation and the tests
seed_split, seed_drift, seed_metrics = 1, 10, 100

# The three seeds bundled in the order (split, drift, metrics); this list is
# what gets handed to reduced_on_drift_kdim.
info_dataset = [seed_split, seed_drift, seed_metrics]



In [3]:
# number of dimensions for dimensionality reduction
k = 6

# CSV file where the test results are saved.
# The path is built from k so that the folder ('<k>dim') and the file name
# ('..<k>d..') cannot get out of sync with the number of dimensions in use.
# This points at the trial ("prova") file. An earlier assignment to
# 'thresholds_and_results/6dim/prodResults_6d.csv' was dead code (it was
# overwritten on the very next statement) and has been dropped; to write to
# that file instead, use f'thresholds_and_results/{k}dim/prodResults_{k}d.csv'.
resultFile = f'thresholds_and_results/{k}dim/prod{k}d_prova.csv'

### Preprocess

In [14]:
# Folders holding the original production images: ('normal' day, 'drifted' day).
normal_path, drifted_path = 'data/original_data/normal/', 'data/original_data/drifted/'

# Folders where the black-filtered production images are stored, same order.
normal_b_path, drifted_b_path = (
    'data/synthetic_data/normal_black/',
    'data/synthetic_data/drifted_black/',
)

# One-off preparation (kept disabled): the black filter has to be applied to the
# production images only once. To regenerate the filtered folders, run:
#     create_black_folder(normal_path, normal_b_path)
#     create_black_folder(drifted_path, drifted_b_path)

# Folder with the source (development) images.
source_path = 'data/synthetic_data/black/'


In [15]:
# Initialise the ResNet used to pre-process every image, so that the rest of the
# notebook works on extracted features rather than directly on raw pixels.
# seed_split is the only argument passed (presumably it seeds the model
# initialisation — TODO confirm in utils_resNet).
model = init_resnet(seed_split)



In [16]:
# Production data: build one dataframe per black-filtered folder with the ResNet
# model; the 'normal' day is processed first, then the 'drifted' day.
normal_df, drifted_df = (
    df_from_folder(folder, model)
    for folder in (normal_b_path, drifted_b_path)
)

In [17]:
# Build the reference ("source") set and the training set from the development images.

# Use as many images as the smaller of the two production sets, so that the
# source data is balanced with the production data (training data >>> production data).
len_dataframe = min(len(normal_df), len(drifted_df))

# List all source images. os.listdir returns entries in arbitrary order, so the
# listing is sorted before the seeded shuffle; otherwise the same seed could
# produce a different split on another machine or filesystem.
imagesList = sorted(os.listdir(source_path))
rs = np.random.RandomState(seed_split)
rs.shuffle(imagesList)  # in-place shuffle of the list

# Two disjoint, contiguous, equally sized slices of the shuffled list.
# The training slice starts exactly where the source slice ends: starting at
# len_dataframe + 1 would drop one image and leave the training set one
# element shorter than the source set.
source_list = imagesList[:len_dataframe]                  # images used for the comparison
train_list = imagesList[len_dataframe:2 * len_dataframe]  # images used for training

# Source dataframe (reference for the production data) and training dataframe
# (used to initialise the scaler and to train the autoencoder).
source_df = df_from_folder(source_path, model, source_list)
train_df = df_from_folder(source_path, model, train_list)

Initialize scaler and dimensionality reducers

In [18]:
# Initialise the standard scaler from the training dataframe.
standard_scaler = init_scaler(train_df)

# Scale every dataframe with that same scaler (the scaled versions are the ones
# meant for the autoencoders). Order: train, source, normal, drifted.
train_scaled, source_scaled, normal_scaled, drifted_scaled = (
    scale_dataset(frame, standard_scaler)
    for frame in (train_df, source_df, normal_df, drifted_df)
)


In [19]:
# Initialise dimensionality reduction with k target dimensions.
# Inputs: the metrics seed, the unscaled source dataframe, and the scaled
# train/source dataframes.
dim_reducers = initialize_DimReduction(
    seed_metrics,
    source_df,
    train_scaled,
    source_scaled,
    k,
)
# Unpack in the order returned: PCA reducer, UMAP reducer, then the two encoder layers.
reducer_pca, reducer_umap, U_encoder_layer, T_encoder_layer = dim_reducers

  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")




Epoch 1/15


Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [20]:
# Apply dimensionality reduction.
# The four reducers/encoders are always passed positionally in this order.
reducer_args = (reducer_pca, reducer_umap, U_encoder_layer, T_encoder_layer)

# Source dataframe: only the reduced data is kept, the second return value is discarded.
source_dim_red, _ = reduced_on_drift_kdim(source_df, info_dataset, *reducer_args)

# Production data, 'normal' day.
normal_dim_red, info_drift_normal = reduced_on_drift_kdim(
    normal_df, info_dataset, *reducer_args, sigma=42, drift='normal'
)

# Production data, 'drifted' day.
drifted_dim_red, info_drift_drifted = reduced_on_drift_kdim(
    drifted_df, info_dataset, *reducer_args, sigma=42, drift='production_drift'
)



### Apply tests

In [21]:
# Run the tests against the source data, first for the 'normal' day and then for
# the 'drifted' day; both runs use the same seed, result file and k.
for prod_df, prod_dim_red, prod_info in (
    (normal_df, normal_dim_red, info_drift_normal),
    (drifted_df, drifted_dim_red, info_drift_drifted),
):
    test_on_reduced_kdim(prod_df, source_df, prod_dim_red, source_dim_red, seed_metrics, resultFile, prod_info, k)