In [None]:
from datetime import date, datetime, timedelta
import pandas as pd

import numpy as np
import math
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample
from sklearn import metrics, preprocessing
from sklearn.model_selection import KFold

from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA

from scipy import stats
import seaborn as sns
import os
from tqdm import tqdm

import matplotlib.pyplot as plt

from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import RandomizedSearchCV
import time

In [None]:
# Seeds for the repeated experiments: every experiment loop below runs once per seed (0..29).
random_seeds = list(np.arange(30))
random_seeds

In [None]:
# Display how many seeds (i.e. repetitions per experiment) are configured.
len(random_seeds)

In [None]:
def obtain_intervals(dataset):
    '''
    Generate the interval terminals used to split a trace into consecutive periods.

    A sample belongs to interval i when:
        (timestamp >= terminal_i) and (timestamp < terminal_{i+1})

    Args:
        dataset (str): 'g' for the Google dataset or 'b' for the Backblaze dataset.

    Returns:
        list[int]: Ascending terminals. N + 1 terminals delimit N intervals
        (29 terminals for 'g', 13 terminals for 'b').

    Raises:
        ValueError: If `dataset` is neither 'g' nor 'b'.
    '''
    if dataset == 'g':
        # One day per period, 28 periods.
        # NOTE(review): the previous comment called the Google time unit "millisecond",
        # but 24 * 60 * 60 * 1000 * 1000 is one day in microseconds -- confirm the unit.
        start_time = 604046279
        unit_period = 24 * 60 * 60 * 1000 * 1000
        end_time = start_time + 28 * unit_period
    elif dataset == 'b':
        # Backblaze: period index starting at 1, 12 periods of width 1.
        start_time = 1
        unit_period = 1
        end_time = start_time + 12 * unit_period
    else:
        # Previously this fell through and failed later with an UnboundLocalError.
        raise ValueError("Unknown dataset %r: expected 'g' (Google) or 'b' (Backblaze)" % (dataset,))

    # range() has an open upper end, so extend it by one unit to include end_time itself
    return list(range(start_time, end_time + unit_period, unit_period))

In [None]:
def obtain_natural_chunks(features, labels, terminals):
    '''
    Split features and labels into one chunk per interval.

    Column 0 of `features` is compared against consecutive pairs of `terminals`
    (lower bound inclusive, upper bound exclusive); that column is dropped from
    the returned feature chunks.

    Args:
        features: 2-D array whose first column holds the value compared with the terminals.
        labels: array indexed like the rows of `features`.
        terminals: ascending interval boundaries; N + 1 terminals give N chunks.

    Returns:
        (feature_list, label_list): two lists with one array per interval.
    '''
    timestamps = features[:, 0]
    feature_list, label_list = [], []
    for lower, upper in zip(terminals[:-1], terminals[1:]):
        in_interval = np.logical_and(timestamps >= lower, timestamps < upper)
        # keep the selected rows, without the leading timestamp column
        feature_list.append(features[in_interval, 1:])
        label_list.append(labels[in_interval])
    return feature_list, label_list


In [None]:
def downsampling(training_features, training_labels, ratio=10, random_state=None):
    '''
    Downsample the negative class so that there are at most `ratio` negatives per positive.

    All positive samples are kept; negatives are drawn without replacement.
    The original row order is preserved in the result.

    Args:
        training_features: array of samples, indexed like `training_labels`.
        training_labels: boolean array (True = positive class).
        ratio (int): number of negatives to keep per positive sample.
        random_state: seed passed to `resample`. When None, the notebook-level
            `random_seed` (the loop variable of the experiment loops) is used,
            which is what this function always did before the parameter existed.

    Returns:
        (resampled_features, resampled_labels)
    '''
    if random_state is None:
        # backward-compatible fallback to the global set by the experiment loops
        random_state = random_seed

    # element-wise comparisons on numpy arrays (intentionally not `is True`)
    idx_true = np.where(training_labels == True)[0]
    idx_false = np.where(training_labels == False)[0]

    # resample(replace=False) raises when asked for more samples than exist,
    # so never request more negatives than are available
    n_false = min(len(idx_true) * ratio, len(idx_false))
    idx_false_resampled = resample(idx_false, n_samples=n_false, replace=False, random_state=random_state)

    # sorted indices keep the samples in their original order
    idx_resampled = np.sort(np.concatenate([idx_false_resampled, idx_true]))
    return training_features[idx_resampled], training_labels[idx_resampled]

Feature Importance Functions

In [None]:
def important_features_extraction(model, features_input):
    '''
    Return the names of the features whose importance is at least the mean importance.

    Args:
        model: fitted estimator exposing `feature_importances_`.
        features_input: feature names, in the same order as `model.feature_importances_`.

    Returns:
        list: selected feature names, ordered from most to least important
        (ties keep their input order).
    '''
    # pair every importance with its feature name, strongest first
    ranked = sorted(
        zip(model.feature_importances_, features_input),
        key=lambda pair: pair[0],
        reverse=True,
    )

    # the mean importance is the selection threshold
    threshold = np.mean([importance for importance, _ in ranked])

    return [name for importance, name in ranked if importance >= threshold]

In [None]:
def filtering_non_important_features(features_array, features_names, important_features_names):
    '''
    Keep only the columns of `features_array` that belong to the important features.

    Args:
        features_array: 2-D array-like with one column per entry of `features_names`.
        features_names: column names of `features_array`, in column order.
        important_features_names: names of the columns to keep.

    Returns:
        numpy.ndarray: the kept columns, in their original column order.
    '''
    # attach the names so that columns can be selected by name
    df_features = pd.DataFrame(np.array(features_array), columns=features_names)

    # Select by the function argument. The previous code read an unrelated global
    # (`important_features`) and ignored `important_features_names`.
    keep_mask = df_features.columns.isin(important_features_names)

    return df_features.loc[:, keep_mask].to_numpy()

In [None]:
def _load_backblaze_features(dataset_path):
    '''Read the Backblaze CSV and return (features, labels); column 0 of features is the period index.'''
    features_disk_failure = ['smart_1_raw', 'smart_4_raw', 'smart_5_raw', 'smart_7_raw', 'smart_9_raw', 'smart_12_raw', 'smart_187_raw', 'smart_193_raw', 'smart_194_raw', 'smart_197_raw', 'smart_199_raw',
                             'smart_4_raw_diff', 'smart_5_raw_diff', 'smart_9_raw_diff', 'smart_12_raw_diff', 'smart_187_raw_diff', 'smart_193_raw_diff', 'smart_197_raw_diff', 'smart_199_raw_diff']
    columns = ['serial_number', 'date'] + features_disk_failure + ['label']

    # The file is read as strings without header inference; the first row and the
    # first column are dropped before the column names are assigned.
    df = pd.read_csv(dataset_path, header=None, dtype='str').iloc[1:, 1:]
    df.columns = columns

    # ignore serial number
    df = df.drop(columns=['serial_number'])

    df[features_disk_failure] = df[features_disk_failure].astype(float)
    df['label'] = df['label'].map({'True': True, 'False': False})

    # Convert the date to its day of the year. The accessor keeps the frame's own
    # index; building a fresh 0-based Series here (as before) shifted every date
    # by one row and left NaN in the last row.
    # NOTE(review): obtain_intervals('b') yields terminals 1..13, so only values
    # 1..12 of this column fall into a chunk -- confirm that day-of-year (rather
    # than month or week) is the intended period unit.
    df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d').dt.day_of_year

    features = df[df.columns[:-1]].to_numpy()
    labels = df[df.columns[-1]].to_numpy()
    return features, labels


def _load_google_features(dataset_path):
    '''Read the Google job CSV and return (features, labels); column 0 of features is the start time.'''
    features_job_failure = ['User ID', 'Job Name', 'Scheduling Class',
                            'Num Tasks', 'Priority', 'Diff Machine', 'CPU Requested', 'Mem Requested', 'Disk Requested',
                            'Avg CPU', 'Avg Mem', 'Avg Disk', 'Std CPU', 'Std Mem', 'Std Disk']
    columns_initial = ['Job ID', 'Status', 'Start Time', 'End Time'] + features_job_failure

    df = pd.read_csv(dataset_path, header=None)
    df.columns = columns_initial
    # drop the first row of the file
    df = df.tail(-1)
    # ignore Job ID
    df = df.drop(['Job ID'], axis=1)

    print('Features and Labels Preprocessing')

    features = df[['Start Time'] + features_job_failure].to_numpy()
    # Status 3 is the positive class. Coerce to numbers first: because the file is
    # read with header=None, the column may hold strings, for which `== 3` is never true.
    labels = (pd.to_numeric(df['Status'], errors='coerce') == 3).to_numpy()

    # integer-encode the two identifier columns (positions within `features`)
    user_id_col, job_name_col = 1, 2
    features[:, user_id_col] = preprocessing.LabelEncoder().fit_transform(features[:, user_id_col])
    features[:, job_name_col] = preprocessing.LabelEncoder().fit_transform(features[:, job_name_col])

    return features.astype(float), labels


def features_labels_preprocessing(DATASET_PATH, dataset):
    '''
    Load a dataset and split it into per-period feature and label chunks.

    Args:
        DATASET_PATH (str): path of the CSV file to read.
        dataset (str): 'b' for the Backblaze disk data, 'g' for the Google job data.

    Returns:
        (feature_list, label_list): one array per interval of `obtain_intervals(dataset)`.

    Raises:
        ValueError: If `dataset` is neither 'b' nor 'g' (previously this printed a
            message and then failed with an UnboundLocalError).
    '''
    if dataset not in ('b', 'g'):
        raise ValueError("Incorrect Dataset %r: expected 'b' or 'g'" % (dataset,))

    print('Data Reading and Preprocessing')
    # Both loaders read from the DATASET_PATH argument; the 'b' branch used to
    # ignore it and read the global DATASET_PATH_DISK instead.
    if dataset == 'b':
        features, labels = _load_backblaze_features(DATASET_PATH)
    else:
        features, labels = _load_google_features(DATASET_PATH)

    print('Features and Labels Computing')
    feature_list, label_list = obtain_natural_chunks(features, labels, obtain_intervals(dataset))

    return feature_list, label_list

In [None]:
def ks_drift_detection(reference_data, testing_data):
    '''
    Flag drift between reference and testing data with a KS test on their density curves.

    For each input, the y-values of the first line drawn by `sns.distplot` are
    extracted; `stats.kstest` is then applied to the two curves and drift is
    reported when the p-value is below 0.05.

    Args:
        reference_data: array-like of reference (training) values.
        testing_data: array-like of testing values.

    Returns:
        (drift_alert, distribution_extraction_time, ks_test_time): 1/0 drift flag,
        seconds spent extracting both curves, seconds spent in the statistical test.
    '''
    def _density_curve(samples):
        # y-values of the first plotted line; the figure is closed right away
        curve = sns.distplot(np.array(samples)).get_lines()[0].get_data()[1]
        plt.close()
        return curve

    # extract distributions from reference and testing data (timed together)
    extraction_started = time.time()
    reference_curve = _density_curve(reference_data)
    testing_curve = _density_curve(testing_data)
    extraction_elapsed = time.time() - extraction_started

    # apply KS statistical test (timed separately)
    test_started = time.time()
    _, p_value = stats.kstest(reference_curve, testing_curve)
    test_elapsed = time.time() - test_started

    # drift is signalled at the 5% significance level
    drift_alert = 1 if p_value < 0.05 else 0

    return drift_alert, extraction_elapsed, test_elapsed

Feature Importance Functions

In [None]:
def important_features_extraction(model, features_input):
    '''
    Return the names of the features whose importance is at least the mean importance.

    NOTE: this function is defined twice in this notebook; this later definition
    is the one in effect after a top-to-bottom run.

    Args:
        model: fitted estimator exposing `feature_importances_`.
        features_input: feature names, in the same order as `model.feature_importances_`.

    Returns:
        list: selected feature names, ordered from most to least important
        (ties keep their input order).
    '''
    # pair every importance with its feature name, strongest first
    ranked = sorted(
        zip(model.feature_importances_, features_input),
        key=lambda pair: pair[0],
        reverse=True,
    )

    # the mean importance is the selection threshold
    threshold = np.mean([importance for importance, _ in ranked])

    return [name for importance, name in ranked if importance >= threshold]

In [None]:
def filtering_non_important_features(features_array, features_names, important_features_names):
    '''
    Keep only the columns of `features_array` that belong to the important features.

    NOTE: this function is defined twice in this notebook; this later definition
    is the one in effect after a top-to-bottom run.

    Args:
        features_array: 2-D array-like with one column per entry of `features_names`.
        features_names: column names of `features_array`, in column order.
        important_features_names: names of the columns to keep.

    Returns:
        numpy.ndarray: the kept columns, in their original column order.
    '''
    # attach the names so that columns can be selected by name
    df_features = pd.DataFrame(np.array(features_array), columns=features_names)

    # Select by the function argument. The previous code read an unrelated global
    # (`important_features`) and ignored `important_features_names`.
    keep_mask = df_features.columns.isin(important_features_names)

    return df_features.loc[:, keep_mask].to_numpy()

In [None]:
# Worker-count setting. It is not referenced in the visible part of this notebook:
# the hyperparameter searches below pass n_jobs=1 directly.
N_WORKERS = 1

# Extracting Labels and Features

In [None]:
# Location of the disk-failure CSV, relative to the notebook's working directory.
DATASET_PATH_DISK = '../../../Documents/phd_related/AIOps_disk_failure_prediction/raw_data_2015_2017/disk_2015_complete.csv'


In [None]:
# Names of the SMART feature columns (raw values followed by their *_diff variants).
# The same list is also declared inside features_labels_preprocessing.
features_disk_failure = ['smart_1_raw', 'smart_4_raw', 'smart_5_raw', 'smart_7_raw', 'smart_9_raw', 'smart_12_raw', 'smart_187_raw', 'smart_193_raw', 'smart_194_raw', 'smart_197_raw', 'smart_199_raw', 
                         'smart_4_raw_diff', 'smart_5_raw_diff', 'smart_9_raw_diff', 'smart_12_raw_diff', 'smart_187_raw_diff', 'smart_193_raw_diff', 'smart_197_raw_diff', 'smart_199_raw_diff']

In [None]:
# Load the disk dataset ('b') and split it into one feature/label chunk per period.
feature_list, label_list = features_labels_preprocessing(DATASET_PATH_DISK, 'b')

In [None]:
# Display the number of period chunks that were produced.
len(feature_list)

In [None]:
# Labels for consecutive period pairs (M1_2 ... M11_12), 11 entries.
months = ['M1_2', 'M2_3', 'M3_4', 'M4_5', 'M5_6', 'M6_7', 'M7_8', 'M8_9', 'M9_10', 'M10_11', 'M11_12']

# Disabled alternative that would build week-pair labels from feature_list.
# It is kept as an inert string literal, so none of it is executed.
'''
weeks = []
for i in range(0, len(feature_list)-1):
    string_week = 'W' + str(i+1) + '_' + str(i+2)
    weeks.append(string_week)
len(weeks)
'''

In [None]:
# Total number of period chunks; the first num_chunks//2 are used for the initial
# training and the remaining ones are the testing periods in the loops below.
num_chunks = len(feature_list)
num_chunks

## True Labels

In [None]:
# Ground-truth labels of all testing periods (second half of the chunks), concatenated
# in period order so they line up with the concatenated per-period predictions.
true_testing_labels = np.hstack(label_list[num_chunks//2:])
true_testing_labels

In [None]:
# Display the total number of testing samples.
len(true_testing_labels)

In [None]:
# Number of parameter settings sampled by each RandomizedSearchCV run (passed as n_iter).
N_ITER_SEARCH = 100

In [None]:
# Search space for the random-forest hyperparameter search (used as param_distributions).
# - n_estimators is drawn from a scipy discrete-uniform distribution between 1e1 and 1e2
# - max_depth candidates are 10, 30, 50, 70, 90, 110 and None
# - every other entry is a plain list of candidate values
param_dist_rf = {
            'n_estimators': stats.randint(1e1, 1e2),
            'criterion': ['gini', 'entropy'],
            'max_features': ['sqrt', 'log2'],
            'max_depth': [int(x) for x in np.linspace(10, 110, num=6)] + [None],
            'min_samples_leaf': [1, 2, 4],
            'min_samples_split': [2, 4, 8],
            'class_weight':['balanced', None],
            'bootstrap': [True, False]
        }

# Static Model (No Retraining)

# DF Results

In [None]:
# Start from an empty results table; the static-model loop below appends one row per seed.
df_results_disk = pd.DataFrame()
df_results_disk

In [None]:
# Static baseline: for every seed, tune and train one random forest on the first
# half of the chunks, then evaluate it unchanged on every remaining period.

# the incremental to_csv below fails if the output directory is missing
os.makedirs('./results', exist_ok=True)

n_train_chunks = num_chunks // 2
n_test_chunks = num_chunks - n_train_chunks
result_columns = ['Random_Seed', 'Model', 'Drifts', 'ROC_AUC_Batch', 'ROC_AUC_BATCH_MEAN', 'ROC_AUC_Total', 'Predictions', 'True_Testing_Labels', 'Train_Time', 'Hyperparam_Tunning_Time', 'Test_Time', 'Drifts_Detected', 'Label_Costs']

# Training data and its scaling do not depend on the seed, so build them once.
training_features = np.vstack(feature_list[0: n_train_chunks])
training_labels = np.hstack(label_list[0: n_train_chunks])
scaler = StandardScaler()
training_features = scaler.fit_transform(training_features)

# true testing labels, in the same period order as the concatenated predictions
true_testing_labels = np.hstack(label_list[n_train_chunks:])

for random_seed in random_seeds:
    print('Random Seed', random_seed)
    begin_total_static = time.time()
    partial_roc_auc = []
    total_test_static = 0
    predictions_test_static_model = []

    # downsampling training data (seeded through the `random_seed` loop variable)
    training_features_downsampling, training_labels_downsampling = downsampling(training_features, training_labels)

    # training time covers the hyperparameter search plus the final fit
    begin_train_time_static = time.time()

    begin_hyperparam_tunning_static = time.time()
    model = RandomForestClassifier(random_state = random_seed)
    # random_state makes the sampled candidates reproducible per seed,
    # consistent with the drift-detection experiments further down
    random_search = RandomizedSearchCV(model,
                                       param_distributions = param_dist_rf,
                                       n_iter=N_ITER_SEARCH,
                                       scoring='roc_auc',
                                       cv=4, n_jobs=1, random_state = random_seed)
    print('Finding Hyperparameters')
    random_search.fit(training_features_downsampling, training_labels_downsampling)

    static_model = random_search.best_estimator_

    end_hyperparam_tunning_static = time.time() - begin_hyperparam_tunning_static

    print('Training')
    static_model.fit(training_features_downsampling, training_labels_downsampling)
    end_train_time_static = time.time() - begin_train_time_static

    print('Testing model on periods')
    for i in tqdm(range(n_train_chunks, num_chunks)):

        # obtain testing features and labels, scaled with the training scaler
        testing_features = scaler.transform(feature_list[i])
        testing_labels = label_list[i]

        # evaluate model on testing data (only the predict call is timed)
        begin_test_time_static = time.time()
        predictions_test_updated = static_model.predict(testing_features)
        end_test_time_static = time.time() - begin_test_time_static
        total_test_static = total_test_static + end_test_time_static

        partial_roc_auc.append(roc_auc_score(testing_labels, predictions_test_updated))

        predictions_test_static_model = np.concatenate([predictions_test_static_model, predictions_test_updated])

    end_total_static = time.time() - begin_total_static

    df_results_static_rf = pd.DataFrame(columns=result_columns)
    # Drifts_Detected holds one flag per testing period (all zero for the static model);
    # its length now follows the number of testing periods instead of a hard-coded 25.
    df_results_static_rf.loc[0] = [random_seed, 'static', '0/' + str(int(num_chunks//2)), partial_roc_auc, np.mean(partial_roc_auc), roc_auc_score(true_testing_labels, predictions_test_static_model), predictions_test_static_model, true_testing_labels, end_train_time_static, end_hyperparam_tunning_static, total_test_static, np.zeros(n_test_chunks, dtype=int), 0.0]

    # append and save after every seed so partial results survive an interruption
    df_results_disk = pd.concat([df_results_disk, df_results_static_rf])
    df_results_disk = df_results_disk.reset_index(drop=True)
    df_results_disk.to_csv('./results/rq3_static_model_backblaze_data.csv')


# DF Results

In [None]:
# Reset the results table; the periodic-retraining loop below appends one row per seed.
df_results_disk = pd.DataFrame()
df_results_disk

# Periodic Model 

In [None]:
# Periodic retraining: for every seed, a new random forest is tuned and trained
# before each testing period on a sliding window of the most recent
# num_chunks//2 chunks.

# the incremental to_csv below fails if the output directory is missing
os.makedirs('./results', exist_ok=True)

n_train_chunks = num_chunks // 2
n_test_chunks = num_chunks - n_train_chunks
result_columns = ['Random_Seed', 'Model', 'Drifts', 'ROC_AUC_Batch', 'ROC_AUC_BATCH_MEAN', 'ROC_AUC_Total', 'Predictions', 'True_Testing_Labels', 'Train_Time', 'Hyperparam_Tunning_Time', 'Test_Time', 'Drifts_Detected', 'Label_Costs']

# computed here so this cell does not rely on a value left over from another cell
true_testing_labels = np.hstack(label_list[n_train_chunks:])

for random_seed in random_seeds:

    print('Random Seed', random_seed)

    predictions_test_sw = []
    lengths_training_sw = []
    partial_roc_auc_sw = []

    begin_total_sw = time.time()

    total_train_sw = 0
    total_hyperparam_sw = 0
    total_test_sw = 0

    # The first period trains on the initial half of the chunks; built once per
    # seed instead of being re-stacked on every period.
    training_features = np.vstack(feature_list[0: n_train_chunks])
    training_labels = np.hstack(label_list[0: n_train_chunks])

    for i in tqdm(range(n_train_chunks, num_chunks)):

        # scaler and downsampling for training data
        update_scaler = StandardScaler()
        training_features = update_scaler.fit_transform(training_features)
        training_features_downsampling, training_labels_downsampling = downsampling(training_features, training_labels)

        # obtain testing features and labels, scaled with the current scaler
        testing_features = update_scaler.transform(feature_list[i])
        testing_labels = label_list[i]

        # training time covers the hyperparameter search plus the final fit
        begin_train_sw = time.time()

        begin_hyperparam_tunning_update = time.time()
        model = RandomForestClassifier(random_state = random_seed)
        # random_state makes the sampled candidates reproducible per seed,
        # consistent with the drift-detection experiments further down
        random_search = RandomizedSearchCV(model,
                                           param_distributions = param_dist_rf,
                                           n_iter=N_ITER_SEARCH,
                                           scoring='roc_auc',
                                           cv=4, n_jobs=1, random_state = random_seed)
        print('Finding Hyperparameters')
        random_search.fit(training_features_downsampling, training_labels_downsampling)

        update_model = random_search.best_estimator_
        print(update_model)

        end_hyperparam_tunning_update = time.time() - begin_hyperparam_tunning_update

        print('Training')
        update_model.fit(training_features_downsampling, training_labels_downsampling)
        end_train_sw = time.time() - begin_train_sw

        total_hyperparam_sw = total_hyperparam_sw + end_hyperparam_tunning_update
        total_train_sw = total_train_sw + end_train_sw

        # evaluate model on testing data (only the predict call is timed)
        begin_test_sw = time.time()
        predictions_test_updated = update_model.predict(testing_features)
        end_test_sw = time.time() - begin_test_sw
        total_test_sw = total_test_sw + end_test_sw

        partial_roc_auc_sw.append(roc_auc_score(testing_labels, predictions_test_updated))
        predictions_test_sw = np.concatenate([predictions_test_sw, predictions_test_updated])

        # slide the window: the next period trains on the latest n_train_chunks chunks,
        # including the one that was just tested
        training_features = np.vstack(feature_list[i + 1 - n_train_chunks: i+1])
        lengths_training_sw.append(len(training_features))
        training_labels = np.hstack(label_list[i + 1 - n_train_chunks: i+1])

    end_total_sw = time.time() - begin_total_sw

    df_results_periodic_sw = pd.DataFrame(columns=result_columns)
    # Drifts_Detected holds one flag per testing period (all one: retrained every period);
    # its length now follows the number of testing periods instead of a hard-coded 25.
    df_results_periodic_sw.loc[0] = [random_seed, 'periodic-sw', str(int(num_chunks//2)) + '/' + str(int(num_chunks//2)), partial_roc_auc_sw, np.mean(partial_roc_auc_sw), roc_auc_score(true_testing_labels, predictions_test_sw), predictions_test_sw, true_testing_labels,  total_train_sw, total_hyperparam_sw, total_test_sw, np.ones(n_test_chunks, dtype=int), len(true_testing_labels)]

    # append and save after every seed so partial results survive an interruption
    df_results_disk = pd.concat([df_results_disk, df_results_periodic_sw])
    df_results_disk = df_results_disk.reset_index(drop=True)
    df_results_disk.to_csv('./results/rq3_periodic_sw_model_backblaze_data.csv')


# Build Drift Detection based Model Update

### KS on all features

# DF Results

In [None]:
# Reset the results table; the KS (all features) loop below appends one row per seed.
df_results_disk = pd.DataFrame()
df_results_disk

In [None]:
# Drift-triggered retraining (KS test on all features): for every seed the model is
# trained on the first half of the chunks and is re-tuned/re-trained only for the
# period that follows a detected drift, on a window in which the oldest chunk has
# been replaced by the chunk that triggered the alert.

# the incremental to_csv below fails if the output directory is missing
os.makedirs('./results', exist_ok=True)

n_train_chunks = num_chunks // 2
initial_training_batches_list = list(range(0, n_train_chunks))
result_columns = ['Random_Seed', 'Model', 'Drifts_Overall',  'ROC_AUC_Batch', 'ROC_AUC_BATCH_MEAN', 'ROC_AUC_Total', 'Predictions', 'True_Testing_Labels', 'Train_Time', 'Hyperparam_Tunning_Time', 'Test_Time', 'Drifts_Detected', 'Drift_Detection_Total_Time', 'Distribution_Extraction_Time', 'Statistical_Test_Time', 'Label_Costs']

# computed here so this cell does not rely on a value left over from another cell
true_testing_labels = np.hstack(label_list[n_train_chunks:])

for random_seed in random_seeds:

    print('Random Seed:', random_seed)
    necessary_label_annotation_effort = 0
    no_necessary_retrainings = 0
    partial_roc_auc_ks_all_model = []
    predictions_test_ks_all_model = []

    total_train_sw_all = 0
    total_hyperparam_sw_ks_all = 0
    total_test_time_ks_all = 0

    total_drift_detection_time = 0
    total_distribution_extraction_time = 0
    total_stat_test_time = 0

    detected_drifts = []

    # Every seed starts from the initial window; built once per seed instead of
    # being re-stacked on every period.
    current_training_batches_list = initial_training_batches_list.copy()
    print('Initial Training Batches', current_training_batches_list)
    training_features = np.vstack([feature_list[batch] for batch in current_training_batches_list])
    training_labels = np.hstack([label_list[batch] for batch in current_training_batches_list])

    # the first period always trains a model; afterwards only a drift alert does
    need_to_retrain = 1

    for i in tqdm(range(n_train_chunks, num_chunks)):

        # scaler and downsampling on the current training window
        update_scaler = StandardScaler()
        training_features_model = update_scaler.fit_transform(training_features)
        training_features_model, training_labels_model = downsampling(training_features_model, training_labels)

        # obtain testing features and labels, scaled with the current scaler
        testing_features = feature_list[i]
        testing_labels = label_list[i]
        testing_features_model = update_scaler.transform(testing_features)
        testing_labels_model = testing_labels

        if need_to_retrain == 1:
            print('RETRAINING MODEL')

            # training time covers the hyperparameter search plus the final fit
            begin_train_sw_ks_all = time.time()

            begin_hyperparam_tunning_update = time.time()
            model = RandomForestClassifier(random_state = random_seed)
            random_search = RandomizedSearchCV(model,
                                               param_distributions = param_dist_rf,
                                               n_iter=N_ITER_SEARCH,
                                               scoring='roc_auc',
                                               cv=4, n_jobs=1, random_state = random_seed)

            random_search.fit(training_features_model, training_labels_model)

            update_model_ks_all = random_search.best_estimator_

            end_hyperparam_tunning_update = time.time() - begin_hyperparam_tunning_update
            total_hyperparam_sw_ks_all = total_hyperparam_sw_ks_all + end_hyperparam_tunning_update

            update_model_ks_all.fit(training_features_model, training_labels_model)

            end_train_sw_ks_all = time.time() - begin_train_sw_ks_all
            total_train_sw_all = total_train_sw_all + end_train_sw_ks_all

        # evaluate model on testing data (only the predict call is timed)
        begin_test_time_ks_all = time.time()
        predictions_test_updated = update_model_ks_all.predict(testing_features_model)
        end_test_time_ks_all = time.time() - begin_test_time_ks_all
        total_test_time_ks_all = total_test_time_ks_all + end_test_time_ks_all

        partial_roc_auc_ks_all_model.append(roc_auc_score(testing_labels_model, predictions_test_updated))

        predictions_test_ks_all_model = np.concatenate([predictions_test_ks_all_model, predictions_test_updated])

        print('Predictions Test Batch', len(predictions_test_updated))
        print('Prediction Test All', len(predictions_test_ks_all_model))

        # Drift Detection
        need_to_retrain = 0

        print('MODEL', update_model_ks_all)

        # Run the detector once per period. It used to be called twice with identical
        # inputs, which counted every drift-detection timing twice.
        drift_time_start = time.time()
        drift_alert, distribution_extraction_time, ks_test_time = ks_drift_detection(training_features_model, testing_features_model)
        drift_time_end = time.time() - drift_time_start

        total_distribution_extraction_time = total_distribution_extraction_time + distribution_extraction_time
        total_stat_test_time = total_stat_test_time + ks_test_time
        total_drift_detection_time = total_drift_detection_time + drift_time_end

        detected_drifts.append(drift_alert)

        if(drift_alert==1):

            need_to_retrain = 1
            drift_alert = 0

            print('CHANGE OF TRAINING')

            no_necessary_retrainings = no_necessary_retrainings + 1
            # the labels of the drifting period are needed for retraining
            necessary_label_annotation_effort = necessary_label_annotation_effort + len(testing_labels)

            # slide the window: drop the oldest chunk, add the one that triggered the alert
            current_training_batches_list.pop(0)
            current_training_batches_list.append(i)

            training_features = np.vstack([feature_list[batch] for batch in current_training_batches_list])
            training_labels = np.hstack([label_list[batch] for batch in current_training_batches_list])

        print('Current Training Batches',current_training_batches_list)

    df_results_ks_all_model = pd.DataFrame(columns=result_columns)
    df_results_ks_all_model.loc[0] = [random_seed, 'KS_ALL', str(no_necessary_retrainings)+'/'+str(len(detected_drifts)), partial_roc_auc_ks_all_model, np.mean(partial_roc_auc_ks_all_model), roc_auc_score(true_testing_labels, predictions_test_ks_all_model), predictions_test_ks_all_model, true_testing_labels, total_train_sw_all, total_hyperparam_sw_ks_all, total_test_time_ks_all, detected_drifts, total_drift_detection_time, total_distribution_extraction_time, total_stat_test_time, necessary_label_annotation_effort]

    # append and save after every seed so partial results survive an interruption
    df_results_disk = pd.concat([df_results_disk, df_results_ks_all_model])
    df_results_disk = df_results_disk.reset_index(drop=True)
    df_results_disk.to_csv('./results/rq3_ks_all_sw_model_disk_data.csv')
    


### KS on PCA Features

# DF Results

In [None]:
# Reset the results table for the KS-on-PCA-features experiment that follows.
df_results_disk = pd.DataFrame()
df_results_disk

In [None]:
initial_training_batches_list = list(range(0, num_chunks//2))


for random_seed in random_seeds:
    
    
    print('Random Seed:', random_seed)
    necessary_label_annotation_effort = 0
    total_time_training = 0
    no_necessary_retrainings = 0
    lengths_training_ks_pca = []
    partial_roc_auc_ks_pca_model = []
    
    
    predictions_test_ks_pca_model = []
    
    

    total_train_sw_pca = 0
    total_hyperparam_sw_ks_pca = 0
    total_test_time_ks_pca = 0
    
    total_drift_detection_time = 0
    total_distribution_extraction_time = 0
    total_stat_test_time = 0
    total_pca_time = 0
    
    
    detected_drifts = []


    for i in tqdm(range(num_chunks//2, num_chunks)):


    
        #print('Period', i-num_chunks//2)
    
        # obtain training features and labels
        training_features_init = np.vstack(feature_list[0: i])
        training_labels_init = np.hstack(label_list[0//2: i])
        drift_alert = 0

        # check if it is the first batch
        if(i==num_chunks//2):
            training_features = training_features_init
            training_labels = training_labels_init
            current_training_batches_list = initial_training_batches_list.copy()
            print('Initial Training Batches', current_training_batches_list)

        #print('Training for Model before Scaling', training_features)
        

        # scaler and downsampling for training data
        update_scaler = StandardScaler()
        training_features_model = update_scaler.fit_transform(training_features)
        training_features_model, training_labels_model = downsampling(training_features_model, training_labels)

        # obtain testing features and labels
        testing_features = feature_list[i]
        testing_labels = label_list[i]

        
        #print('Testing Model before Scaling', testing_features)
        # scaling testing features
        testing_features_model = update_scaler.transform(testing_features)
        testing_labels_model = testing_labels


         # training model
        begin_train_sw_ks_pca = time.time()


        if(i==num_chunks//2 or need_to_retrain == 1):
            print('RETRAINING MODEL')
            
            begin_train_sw_ks_pca = time.time()
        
            begin_hyperparam_tunning_update = time.time()
            model = RandomForestClassifier(random_state = random_seed)
            random_search = RandomizedSearchCV(model,
                                                       param_distributions = param_dist_rf,
                                                       n_iter=N_ITER_SEARCH,
                                                       scoring='roc_auc',
                                                       cv=4, n_jobs=1, random_state = random_seed)

            
            random_search.fit(training_features_model, training_labels_model)
            
            update_model_ks_pca = random_search.best_estimator_
            
            

            end_hyperparam_tunning_update = time.time() - begin_hyperparam_tunning_update
            
            total_hyperparam_sw_ks_pca = total_hyperparam_sw_ks_pca + end_hyperparam_tunning_update
            
            
            
            
            update_model_ks_pca.fit(training_features_model, training_labels_model)
            
            end_train_sw_ks_pca = time.time() - begin_train_sw_ks_pca
        
            total_train_sw_pca = total_train_sw_pca + end_train_sw_ks_pca
        
        
        # evaluate model on testing data
        
        begin_test_time_ks_pca = time.time()
        predictions_test_updated = update_model_ks_pca.predict(testing_features_model)
        
        end_test_time_ks_pca = time.time() - begin_test_time_ks_pca
        total_test_time_ks_pca = total_test_time_ks_pca + end_test_time_ks_pca

        partial_roc_auc_ks_pca_model.append(roc_auc_score(testing_labels_model, predictions_test_updated))
        
        predictions_test_ks_pca_model = np.concatenate([predictions_test_ks_pca_model, predictions_test_updated])
        
        
        print('Predictions Test Batch', len(predictions_test_updated))
        print('Prediction Test All', len(predictions_test_ks_pca_model))
        
        
        # Drift Detection
        
        need_to_retrain = 0
        
        print('MODEL', update_model_ks_pca)
        
        drift_time_start = time.time()
        
        # Extract PCA Features
        
        pca_computing_time_start = time.time()
        
        pca = PCA(n_components = 0.95, random_state = random_seed)
        pca.fit(training_features_model)

        df_train_features_sorted_pca = pca.transform(training_features_model)
        df_test_features_sorted_pca = pca.transform(testing_features_model)
        
        pca_computing_time_end = time.time() - pca_computing_time_start
        
        
        # Detect Drift
        
        drift_alert, distribution_extraction_time, ks_test_time = ks_drift_detection(df_train_features_sorted_pca, df_test_features_sorted_pca)
        drift_time_end = time.time() - drift_time_start
        
        
        
        
        
        total_distribution_extraction_time = total_distribution_extraction_time + distribution_extraction_time
        total_stat_test_time = total_stat_test_time + ks_test_time
        total_pca_time = total_pca_time + pca_computing_time_end
        total_drift_detection_time = total_drift_detection_time + drift_time_end
        
        
        detected_drifts.append(drift_alert)
        
        
        
        if(drift_alert==1):
        
            need_to_retrain = 1
            drift_alert = 0

       
       
            
            print('CHANGE OF TRAINING')

            no_necessary_retrainings = no_necessary_retrainings + 1
            necessary_label_annotation_effort = necessary_label_annotation_effort + len(testing_labels)

            #new_training_features = np.concatenate([training_features[len(testing_features):], testing_features])
            #new_training_labels = np.concatenate([training_labels[len(testing_labels):], testing_labels])
            
            
            
            current_training_batches_list.remove(current_training_batches_list[0])        
            current_training_batches_list.append(i)
        
            #print('Current Training Batches',current_training_batches_list)
            
            
            training_features_list_updated = [feature_list[i] for i in current_training_batches_list]
            training_labels_list_updated = [label_list[i] for i in current_training_batches_list]
        
            training_features = np.vstack(training_features_list_updated)
            training_labels = np.hstack(training_labels_list_updated)

            #training_features = np.vstack(feature_list[i + 1 - num_chunks//2: i+1])
            #training_labels = np.hstack(label_list[i + 1 - num_chunks//2: i+1])
        
        print('Current Training Batches',current_training_batches_list)
    
    
    df_results_ks_pca_model = pd.DataFrame(columns=['Random_Seed', 'Model', 'Drifts_Overall',  'ROC_AUC_Batch', 'ROC_AUC_BATCH_MEAN', 'ROC_AUC_Total', 'Predictions', 'True_Testing_Labels', 'Train_Time', 'Hyperparam_Tunning_Time', 'Test_Time', 'Drifts_Detected', 'Drift_Detection_Total_Time', 'PCA_Computing_time', 'Distribution_Extraction_Time', 'Statistical_Test_Time', 'Label_Costs'])
    df_results_ks_pca_model.loc[0] = [random_seed, 'KS_PCA', str(no_necessary_retrainings)+'/'+str(len(detected_drifts)), partial_roc_auc_ks_pca_model, np.mean(partial_roc_auc_ks_pca_model), roc_auc_score(true_testing_labels, predictions_test_ks_pca_model), predictions_test_ks_pca_model, true_testing_labels, total_train_sw_pca, total_hyperparam_sw_ks_pca, total_test_time_ks_pca, detected_drifts, total_drift_detection_time, total_pca_time, total_distribution_extraction_time, total_stat_test_time, necessary_label_annotation_effort]
    
    
    df_results_disk = pd.concat([df_results_disk, df_results_ks_pca_model])
    df_results_disk = df_results_disk.reset_index(drop=True)
    df_results_disk.to_csv('./results/rq3_ks_pca_sw_model_backblaze_data.csv')
    

### Most Important Features

# DF Results

In [None]:
# Rebind the shared results frame to an empty DataFrame before the KS_FI
# experiment; the experiment loop concatenates one result row per random seed
# onto `df_results_disk` and rewrites the CSV after each seed.
df_results_disk = pd.DataFrame()
df_results_disk

In [None]:
# KS drift detection on the most important features (KS_FI), sliding-window retraining.
#
# For every random seed the second half of the chunks is replayed one batch at a time:
#   1. scale + downsample the current training window and scale the test batch,
#   2. (re)train a random forest via randomized search on the first batch or when the
#      previous batch raised a drift alert,
#   3. predict the test batch and record its ROC AUC,
#   4. run the KS drift test on the model's most important features only,
#   5. on drift, slide the training window (drop oldest batch, append current batch).
# One result row per seed is appended to `df_results_disk` and the CSV is rewritten.
#
# Relies on names defined in other cells: num_chunks, feature_list, label_list,
# random_seeds, param_dist_rf, N_ITER_SEARCH, features_disk_failure,
# true_testing_labels, df_results_disk, downsampling, ks_drift_detection,
# important_features_extraction, filtering_non_important_features.

initial_training_batches_list = list(range(0, num_chunks//2))

# Column layout of the single result row produced per random seed.
ks_fi_result_columns = [
    'Random_Seed', 'Model', 'Drifts_Overall', 'ROC_AUC_Batch', 'ROC_AUC_BATCH_MEAN',
    'ROC_AUC_Total', 'Predictions', 'True_Testing_Labels', 'Train_Time',
    'Hyperparam_Tunning_Time', 'Test_Time', 'Drifts_Detected',
    'Drift_Detection_Total_Time', 'FI_Extraction_Time', 'Distribution_Extraction_Time',
    'Statistical_Test_Time', 'Label_Costs',
]


for random_seed in random_seeds:

    print('Random Seed', random_seed)

    # Per-seed counters and collected outputs.
    no_necessary_retrainings = 0
    necessary_label_annotation_effort = 0
    partial_roc_auc_ks_fi_model = []
    predictions_test_ks_fi_model = []
    detected_drifts = []

    # Per-seed timing accumulators (seconds).
    total_train_sw_fi = 0
    total_hyperparam_sw_ks_fi = 0
    total_test_time_ks_fi = 0
    total_feature_importance_extraction_time = 0

    # FIX: these three totals are accumulated and reported below but were never
    # initialised in this cell (differently named, unused variables were zeroed
    # instead). They therefore carried over whatever value was left in the kernel
    # and kept growing across seeds. Reset them for every seed.
    total_distribution_extraction_time = 0
    total_stat_test_time = 0
    total_drift_detection_time = 0

    # Explicit initial state; the first batch always trains regardless of this flag.
    need_to_retrain = 0

    for i in tqdm(range(num_chunks//2, num_chunks)):

        drift_alert = 0

        # First batch of the seed: start from the initial window (chunks 0 .. i-1).
        # Built only here instead of on every iteration, since it is used only here.
        if i == num_chunks//2:
            training_features = np.vstack(feature_list[0: i])
            training_labels = np.hstack(label_list[0: i])
            current_training_batches_list = initial_training_batches_list.copy()
            print('Initial Training Batches', current_training_batches_list)

        # scaler and downsampling for training data
        update_scaler = StandardScaler()
        training_features_model = update_scaler.fit_transform(training_features)
        training_features_model, training_labels_model = downsampling(training_features_model, training_labels)

        # obtain testing features and labels, scaled with the training scaler
        testing_features = feature_list[i]
        testing_labels = label_list[i]
        testing_features_model = update_scaler.transform(testing_features)
        testing_labels_model = testing_labels

        # (re)train on the first batch or when the previous batch raised a drift alert
        if i == num_chunks//2 or need_to_retrain == 1:
            print('RETRAINING MODEL')

            begin_train_sw_ks_fi = time.time()
            begin_hyperparam_tunning_update = time.time()

            model = RandomForestClassifier(random_state=random_seed)
            random_search = RandomizedSearchCV(model,
                                               param_distributions=param_dist_rf,
                                               n_iter=N_ITER_SEARCH,
                                               scoring='roc_auc',
                                               cv=4, n_jobs=1, random_state=random_seed)
            random_search.fit(training_features_model, training_labels_model)
            update_model_ks_fi = random_search.best_estimator_

            end_hyperparam_tunning_update = time.time() - begin_hyperparam_tunning_update
            total_hyperparam_sw_ks_fi = total_hyperparam_sw_ks_fi + end_hyperparam_tunning_update

            # NOTE(review): with RandomizedSearchCV's default refit the best estimator
            # should already be fitted on the full training data; this extra fit is
            # kept so Train_Time stays comparable with earlier recorded runs.
            update_model_ks_fi.fit(training_features_model, training_labels_model)

            # Train time spans the hyperparameter search plus the explicit fit.
            end_train_sw_ks_fi = time.time() - begin_train_sw_ks_fi
            total_train_sw_fi = total_train_sw_fi + end_train_sw_ks_fi

        # evaluate model on testing data
        begin_test_time_ks_fi = time.time()
        predictions_test_updated = update_model_ks_fi.predict(testing_features_model)
        end_test_time_ks_fi = time.time() - begin_test_time_ks_fi
        total_test_time_ks_fi = total_test_time_ks_fi + end_test_time_ks_fi

        partial_roc_auc_ks_fi_model.append(roc_auc_score(testing_labels_model, predictions_test_updated))
        predictions_test_ks_fi_model = np.concatenate([predictions_test_ks_fi_model, predictions_test_updated])

        print('Predictions Test Batch', len(predictions_test_updated))
        print('Prediction Test All', len(predictions_test_ks_fi_model))

        # Drift Detection
        need_to_retrain = 0

        print('MODEL', update_model_ks_fi)

        # Drift time covers feature-importance extraction, filtering and the KS test.
        drift_time_start = time.time()

        # Extract most important features of the current model
        feature_importance_extraction_start = time.time()

        important_features = important_features_extraction(update_model_ks_fi, features_disk_failure)
        print('Important Features', important_features)
        print(len(important_features))

        # filter non-important features from train and test
        training_important_features_model = filtering_non_important_features(training_features_model, features_disk_failure, important_features)
        testing_important_features_model = filtering_non_important_features(testing_features_model, features_disk_failure, important_features)

        feature_importance_extraction_end = time.time() - feature_importance_extraction_start

        # Detect drift between the training window and the current test batch
        drift_alert, distribution_extraction_time, ks_test_time = ks_drift_detection(training_important_features_model, testing_important_features_model)
        drift_time_end = time.time() - drift_time_start

        total_distribution_extraction_time = total_distribution_extraction_time + distribution_extraction_time
        total_stat_test_time = total_stat_test_time + ks_test_time
        total_feature_importance_extraction_time = total_feature_importance_extraction_time + feature_importance_extraction_end
        total_drift_detection_time = total_drift_detection_time + drift_time_end

        detected_drifts.append(drift_alert)

        if drift_alert == 1:

            need_to_retrain = 1

            print('CHANGE OF TRAINING')

            # A retraining requires the labels of the batch that just drifted.
            no_necessary_retrainings = no_necessary_retrainings + 1
            necessary_label_annotation_effort = necessary_label_annotation_effort + len(testing_labels)

            # Slide the window: drop the oldest batch, append the current one.
            current_training_batches_list.pop(0)
            current_training_batches_list.append(i)

            training_features = np.vstack([feature_list[batch_idx] for batch_idx in current_training_batches_list])
            training_labels = np.hstack([label_list[batch_idx] for batch_idx in current_training_batches_list])

        print('Current Training Batches',current_training_batches_list)

    # One summary row for this seed.
    df_results_ks_fi_model = pd.DataFrame(columns=ks_fi_result_columns)
    df_results_ks_fi_model.loc[0] = [
        random_seed,
        'KS_FI',
        str(no_necessary_retrainings)+'/'+str(len(detected_drifts)),
        partial_roc_auc_ks_fi_model,
        np.mean(partial_roc_auc_ks_fi_model),
        roc_auc_score(true_testing_labels, predictions_test_ks_fi_model),
        predictions_test_ks_fi_model,
        true_testing_labels,
        total_train_sw_fi,
        total_hyperparam_sw_ks_fi,
        total_test_time_ks_fi,
        detected_drifts,
        total_drift_detection_time,
        total_feature_importance_extraction_time,
        total_distribution_extraction_time,
        total_stat_test_time,
        necessary_label_annotation_effort,
    ]

    # Append and persist after every seed so partial results survive an interruption.
    df_results_disk = pd.concat([df_results_disk, df_results_ks_fi_model])
    df_results_disk = df_results_disk.reset_index(drop=True)
    df_results_disk.to_csv('./results/rq3_ks_FI_sw_model_backblaze_data.csv')
    
