In [1]:
from datetime import date, datetime, timedelta
import pandas as pd

import numpy as np
import math
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample
from sklearn import metrics, preprocessing
from sklearn.model_selection import KFold

from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA

from scipy import stats
import seaborn as sns
import os
from tqdm import tqdm

import matplotlib.pyplot as plt

from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import RandomizedSearchCV
import time

In [2]:
random_seeds = list(np.arange(30))
random_seeds

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29]

In [3]:
len(random_seeds)

30

In [4]:
def obtain_intervals(dataset):
    '''
    Generate interval terminals, so that samples in each interval have:
        interval_i = (timestamp >= terminal_i) and (timestamp < terminal_{i+1})

    Args:
        dataset (chr): Assuming only Backblaze (b) and Google (g) datasets exists
    '''
    if dataset == 'g':
        # time unit in Google: millisecond, tracing time: 29 days
        start_time = 604046279
        unit_period = 24 * 60 * 60 * 1000 * 1000  # unit period: one day
        end_time = start_time + 28*unit_period
    elif dataset == 'b':
        # time unit in Backblaze: month, tracing time: one year (12 months)
        start_time = 1
        unit_period = 1  # unit period: one month
        end_time = start_time + 12*unit_period
    
    # add one unit for the open-end of range function
    terminals = [i for i in range(start_time, end_time+unit_period, unit_period)]

    return terminals

In [5]:
def obtain_natural_chunks(features, labels, terminals):
    feature_list = []
    label_list = []
    for i in range(len(terminals) - 1):
        idx = np.logical_and(features[:, 0] >= terminals[i], features[:, 0] < terminals[i + 1])
        feature_list.append(features[idx][:, 1:])
        label_list.append(labels[idx])
    return feature_list, label_list


In [6]:
def downsampling(training_features, training_labels, ratio=10):
    #return training_features, training_labels

    idx_true = np.where(training_labels == True)[0]
    idx_false = np.where(training_labels == False)[0]
    #print('Before dowmsampling:', len(idx_true), len(idx_false))
    idx_false_resampled = resample(idx_false, n_samples=len(idx_true)*ratio, replace=False, random_state = random_seed)
    idx_resampled = np.concatenate([idx_false_resampled, idx_true])
    idx_resampled.sort()
    resampled_features = training_features[idx_resampled]
    resampled_labels = training_labels[idx_resampled]
    #print('After dowmsampling:', len(idx_true), len(idx_false_resampled))
    return resampled_features, resampled_labels

Feature Importance Functions

In [7]:
def important_features_extraction(model, features_input):
    
    # extract features and their importances
    
    feature_importance_ranking = model.feature_importances_
    zipped_features = list(zip(feature_importance_ranking, features_input))
    sorted_features_zip = sorted(zipped_features, key = lambda x: x[0], reverse = True)
    
    # extract mean of importances
    
    importances = [i[0] for i in sorted_features_zip]
    mean_importances = np.mean(importances)
    
    # extract most important features and return
    
    most_important_features = [i[1] for i in sorted_features_zip if i[0]>= mean_importances]
    
    return most_important_features

In [8]:
def filtering_non_important_features(features_array, features_names, important_features_names):
    # transform array into dataframe and attach features
    df_features = pd.DataFrame(np.array(features_array), columns = features_names)
    
    # filter out columns with non-relevant features
    df_important_features = df_features[df_features.columns[~df_features.columns.isin(important_features)==0]]
    
    # transform dataframe with only into features back into array
    important_features_array = df_important_features.to_numpy()
    
    return important_features_array

In [9]:
def features_labels_preprocessing(DATASET_PATH, dataset):
    
    if(dataset=='b'):
        
        print('Data Reading and Preprocessing')
        
        # set data paths and columns names
        features_disk_failure = ['smart_1_raw', 'smart_4_raw', 'smart_5_raw', 'smart_7_raw', 'smart_9_raw', 'smart_12_raw', 'smart_187_raw', 'smart_193_raw', 'smart_194_raw', 'smart_197_raw', 'smart_199_raw', 
                         'smart_4_raw_diff', 'smart_5_raw_diff', 'smart_9_raw_diff', 'smart_12_raw_diff', 'smart_187_raw_diff', 'smart_193_raw_diff', 'smart_197_raw_diff', 'smart_199_raw_diff']
        columns = ['serial_number', 'date'] + features_disk_failure + ['label']
        
        # read dataset
        df = pd.read_csv(DATASET_PATH_DISK, header=None, dtype = 'str').iloc[1:,1:]
        df.columns = columns
        
        # ignore serial number
        df = df[df.columns[1:]]
        
        for feature in features_disk_failure:
            df[feature] = df[feature].astype(float)


        d = {'True': True, 'False': False}
        df['label'] = df['label'].map(d)

        df['label'].unique()

        # transform date to date time
        df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')
        # divide on weeks
        df['date'] = pd.Series(pd.DatetimeIndex(df['date']).day_of_year)
        
        print('Features and Labels Computing')
        
        # features and labels extraction and computation
        features = df[df.columns[:-1]].to_numpy()
        labels = df[df.columns[-1]].to_numpy()
        feature_list, label_list = obtain_natural_chunks(features, labels, obtain_intervals('b'))
        
    elif(dataset=='g'):
        
        print('Data Reading and Preprocessing')
        
        # set data paths and columns names
        features_job_failure = ['User ID', 'Job Name', 'Scheduling Class',
                   'Num Tasks', 'Priority', 'Diff Machine', 'CPU Requested', 'Mem Requested', 'Disk Requested',
                   'Avg CPU', 'Avg Mem', 'Avg Disk', 'Std CPU', 'Std Mem', 'Std Disk']
        columns_initial = ['Job ID', 'Status', 'Start Time', 'End Time'] + features_job_failure
        
        # read dataset
        df = pd.read_csv(DATASET_PATH, header=None)
        df.columns = columns_initial
        df = df.tail(-1)
        # ignore Job ID
        df = df.drop(['Job ID'], axis = 1)
        columns = features_job_failure

        include_end_time = False
        
        print('Features and Labels Preprocessing')
        
        # features and labels preprocessing
        features = df[(['Start Time']+ features_job_failure)].to_numpy()
        labels = (df['Status']==3).to_numpy()

        # FEATURES PREPROCESSING
        offset = (1 if include_end_time else 0)

        # ENCODE USER ID
        le = preprocessing.LabelEncoder()
        features[:, 1+offset] = le.fit_transform(features[:, 1+offset])

        # ENCODE JOB NAME
        le = preprocessing.LabelEncoder()
        features[:, 2+offset] = le.fit_transform(features[:, 2+offset])

        features = features.astype(float)
        
        print('Features and Labels Computing')
        
        # features and labels extraction and computation
        feature_list, label_list = obtain_natural_chunks(features, labels, obtain_intervals('g'))
        
    else:
        print('Incorrect Dataset')
    
    return feature_list, label_list

In [10]:
def ks_drift_detection(reference_data, testing_data):
    
    # extract distributions from reference and testing data
    
    distribution_extraction_time_start = time.time()
    distribution_reference = sns.distplot(np.array(reference_data)).get_lines()[0].get_data()[1]
    plt.close()
    distribution_test = sns.distplot(np.array(testing_data)).get_lines()[0].get_data()[1]
    plt.close()
    distribution_extraction_time_end = time.time() - distribution_extraction_time_start
    # apply KS statistical test
    
    ks_test_time_start = time.time()
    stat_test = stats.kstest
    v, p = stat_test(distribution_reference, distribution_test)
    ks_test_time_end = time.time() - ks_test_time_start
    # check if drift
    
    if(p<0.05):
        drift_alert = 1
    else:
        drift_alert = 0

    return drift_alert, distribution_extraction_time_end, ks_test_time_end

Feature Importance Functions

In [11]:
def important_features_extraction(model, features_input):
    
    # extract features and their importances
    
    feature_importance_ranking = model.feature_importances_
    zipped_features = list(zip(feature_importance_ranking, features_input))
    sorted_features_zip = sorted(zipped_features, key = lambda x: x[0], reverse = True)
    
    # extract mean of importances
    
    importances = [i[0] for i in sorted_features_zip]
    mean_importances = np.mean(importances)
    
    # extract most important features and return
    
    most_important_features = [i[1] for i in sorted_features_zip if i[0]>= mean_importances]
    
    return most_important_features

In [12]:
def filtering_non_important_features(features_array, features_names, important_features_names):
    # transform array into dataframe and attach features
    df_features = pd.DataFrame(np.array(features_array), columns = features_names)
    
    # filter out columns with non-relevant features
    df_important_features = df_features[df_features.columns[~df_features.columns.isin(important_features)==0]]
    
    # transform dataframe with only into features back into array
    important_features_array = df_important_features.to_numpy()
    
    return important_features_array

In [13]:
N_WORKERS = 1

# Extracting Labels and Features

In [14]:
DATASET_PATH_DISK = '../../../Documents/phd_related/AIOps_disk_failure_prediction/raw_data_2015_2017/disk_2015_complete.csv'


In [15]:
feature_list, label_list = features_labels_preprocessing(DATASET_PATH_DISK, 'b')

Data Reading and Preprocessing
Features and Labels Computing


In [16]:
len(feature_list)

12

In [17]:
num_chunks = len(feature_list)
num_chunks

12

## True Labels

In [18]:
true_testing_labels = np.hstack(label_list[num_chunks//2:])
true_testing_labels

array([False, False, False, ..., False, False, False])

In [19]:
len(true_testing_labels)

83460

In [20]:
N_ITER_SEARCH = 100

In [21]:
param_dist_rf = {
            'n_estimators': stats.randint(1e1, 1e2),
            'criterion': ['gini', 'entropy'],
            'max_features': ['sqrt', 'log2'],
            'max_depth': [int(x) for x in np.linspace(10, 110, num=6)] + [None],
            'min_samples_leaf': [1, 2, 4],
            'min_samples_split': [2, 4, 8],
            'class_weight':['balanced', None],
            'bootstrap': [True, False]
        }

# DF Results

In [22]:
df_results_disk = pd.DataFrame()
df_results_disk

# Periodic Model 

In [39]:
for random_seed in random_seeds:

    print('Random Seed', random_seed)
    
    total_time_training = 0
    predictions_test_fh = []
    
    partial_roc_auc_fh = []
    
    

    begin_total_fh = time.time()

    total_train_fh = 0 
    total_hyperparam_fh = 0
    total_test_fh = 0


    for i in tqdm(range(num_chunks//2, num_chunks)):

        # obtain training features and labels
        training_features_init = np.vstack(feature_list[0: i])
        training_labels_init = np.hstack(label_list[0//2: i])

        # check if it is the first batch
        if(i==num_chunks//2):
            training_features = training_features_init
            training_labels = training_labels_init


        # scaler and downsampling for training data
        update_scaler = StandardScaler()
        training_features = update_scaler.fit_transform(training_features)
        training_features_downsampling, training_labels_downsampling = downsampling(training_features, training_labels)

        print('LEN TRAINING', len(training_features_downsampling))
        length_training_fh = length_training_fh + len(training_features_downsampling)
        
        # obtain testing features and labels
        testing_features = feature_list[i]
        testing_labels = label_list[i]

        # scaling testing features
        testing_features = update_scaler.transform(testing_features)

        
        
        # training model
        begin_train_fh = time.time()




        begin_hyperparam_tunning_update = time.time()
        model = RandomForestClassifier(random_state = random_seed)
        random_search = RandomizedSearchCV(model,
                                                   param_distributions = param_dist_rf,
                                                   n_iter=N_ITER_SEARCH,
                                                   scoring='roc_auc',
                                                   cv=4, n_jobs=1)
        print('Finding Hyperparameters')
        random_search.fit(training_features_downsampling, training_labels_downsampling)

        update_model = random_search.best_estimator_
        print(update_model)

        end_hyperparam_tunning_update = time.time() - begin_hyperparam_tunning_update

        print('Training')
        update_model.fit(training_features_downsampling, training_labels_downsampling)
        end_train_fh = time.time() - begin_train_fh
        
        
        total_hyperparam_fh = total_hyperparam_fh + end_hyperparam_tunning_update
        total_train_fh = total_train_fh + end_train_fh
        
        
        # evaluate model on testing data
        begin_test_fh = time.time()
        predictions_test_updated = update_model.predict(testing_features)
        end_test_fh = time.time() - begin_test_fh
        total_test_fh = total_test_fh + end_test_fh


        partial_roc_auc_fh.append(roc_auc_score(testing_labels, predictions_test_updated))
        predictions_test_fh = np.concatenate([predictions_test_fh, predictions_test_updated])

        training_features = np.vstack(feature_list[0: i+1])
        training_labels = np.hstack(label_list[0: i+1])
        
        print('Length of Training', length_training_fh)

    end_total_fh = time.time() - begin_total_fh
    
    
    df_results_periodic_fh = pd.DataFrame(columns=['Random_Seed', 'Model', 'Drifts', 'ROC_AUC_Batch', 'ROC_AUC_BATCH_MEAN', 'ROC_AUC_Total', 'Predictions', 'True_Testing_Labels', 'Train_Time', 'Hyperparam_Tunning_Time', 'Test_Time', 'Drifts_Detected', 'Label_Costs'])
    df_results_periodic_fh.loc[0] = [random_seed, 'periodic-sw', str(int(num_chunks//2)) + '/' + str(int(num_chunks//2)), partial_roc_auc_fh, np.mean(partial_roc_auc_fh), roc_auc_score(true_testing_labels, predictions_test_fh), predictions_test_fh, true_testing_labels,  total_train_fh, total_hyperparam_fh, total_test_fh, np.ones(int(num_chunks//2), dtype=int), len(true_testing_labels)]

    df_results_disk = pd.concat([df_results_disk, df_results_periodic_fh])
    df_results_disk = df_results_disk.reset_index(drop=True)
    df_results_disk.to_csv('./results/periodic_fh_model_backblaze_data_green.csv')


Random Seed 0


  0%|          | 0/6 [00:00<?, ?it/s]

LEN TRAINING 1782
Finding Hyperparameters


 17%|█▋        | 1/6 [00:32<02:42, 32.49s/it]

RandomForestClassifier(bootstrap=False, max_depth=10, max_features='log2',
                       n_estimators=49, random_state=0)
Training
Length of Training 11099
LEN TRAINING 2145
Finding Hyperparameters


 33%|███▎      | 2/6 [01:13<02:30, 37.52s/it]

RandomForestClassifier(bootstrap=False, criterion='entropy', max_depth=10,
                       max_features='log2', min_samples_leaf=2,
                       min_samples_split=8, n_estimators=60, random_state=0)
Training
Length of Training 13244
LEN TRAINING 2519
Finding Hyperparameters
RandomForestClassifier(bootstrap=False, class_weight='balanced', max_depth=10,
                       min_samples_leaf=2, min_samples_split=4, n_estimators=96,
                       random_state=0)
Training


 50%|█████     | 3/6 [01:56<01:59, 39.94s/it]

Length of Training 15763
LEN TRAINING 2871
Finding Hyperparameters
RandomForestClassifier(bootstrap=False, class_weight='balanced', max_depth=90,
                       max_features='log2', min_samples_leaf=2,
                       min_samples_split=4, n_estimators=77, random_state=0)
Training


 67%|██████▋   | 4/6 [02:45<01:27, 43.53s/it]

Length of Training 18634
LEN TRAINING 3157
Finding Hyperparameters
RandomForestClassifier(bootstrap=False, class_weight='balanced', max_depth=110,
                       min_samples_leaf=4, n_estimators=63, random_state=0)
Training


 83%|████████▎ | 5/6 [03:36<00:46, 46.20s/it]

Length of Training 21791
LEN TRAINING 3443
Finding Hyperparameters
RandomForestClassifier(criterion='entropy', max_depth=50, min_samples_leaf=2,
                       min_samples_split=8, n_estimators=71, random_state=0)
Training


100%|██████████| 6/6 [04:31<00:00, 45.21s/it]


Length of Training 25234
Random Seed 1


  0%|          | 0/6 [00:00<?, ?it/s]

LEN TRAINING 1782
Finding Hyperparameters


 17%|█▋        | 1/6 [00:34<02:54, 34.89s/it]

RandomForestClassifier(class_weight='balanced', max_features='log2',
                       min_samples_leaf=2, n_estimators=62, random_state=1)
Training
Length of Training 27016
LEN TRAINING 2145
Finding Hyperparameters


 33%|███▎      | 2/6 [01:17<02:38, 39.57s/it]

RandomForestClassifier(class_weight='balanced', criterion='entropy',
                       max_depth=10, min_samples_leaf=2, min_samples_split=8,
                       n_estimators=21, random_state=1)
Training
Length of Training 29161
LEN TRAINING 2519
Finding Hyperparameters


 50%|█████     | 3/6 [01:58<02:00, 40.12s/it]

RandomForestClassifier(bootstrap=False, max_depth=10, max_features='log2',
                       min_samples_split=4, n_estimators=19, random_state=1)
Training
Length of Training 31680
LEN TRAINING 2871
Finding Hyperparameters


 67%|██████▋   | 4/6 [02:45<01:25, 42.96s/it]

RandomForestClassifier(class_weight='balanced', criterion='entropy',
                       max_depth=70, min_samples_leaf=2, n_estimators=49,
                       random_state=1)
Training
Length of Training 34551
LEN TRAINING 3157
Finding Hyperparameters


 83%|████████▎ | 5/6 [03:36<00:45, 45.78s/it]

RandomForestClassifier(bootstrap=False, criterion='entropy', max_depth=10,
                       min_samples_leaf=2, min_samples_split=4, n_estimators=38,
                       random_state=1)
Training
Length of Training 37708
LEN TRAINING 3443
Finding Hyperparameters


100%|██████████| 6/6 [04:30<00:00, 45.10s/it]


RandomForestClassifier(class_weight='balanced', max_depth=110,
                       max_features='log2', min_samples_leaf=2, n_estimators=31,
                       random_state=1)
Training
Length of Training 41151
Random Seed 2


  0%|          | 0/6 [00:00<?, ?it/s]

LEN TRAINING 1782
Finding Hyperparameters
RandomForestClassifier(max_depth=110, n_estimators=99, random_state=2)
Training


 17%|█▋        | 1/6 [00:33<02:48, 33.80s/it]

Length of Training 42933
LEN TRAINING 2145
Finding Hyperparameters


 33%|███▎      | 2/6 [01:12<02:27, 36.97s/it]

RandomForestClassifier(criterion='entropy', max_depth=90, min_samples_leaf=2,
                       n_estimators=41, random_state=2)
Training
Length of Training 45078
LEN TRAINING 2519
Finding Hyperparameters


 50%|█████     | 3/6 [01:55<01:57, 39.31s/it]

RandomForestClassifier(bootstrap=False, criterion='entropy', max_depth=70,
                       min_samples_leaf=2, min_samples_split=8, n_estimators=35,
                       random_state=2)
Training
Length of Training 47597
LEN TRAINING 2871
Finding Hyperparameters
RandomForestClassifier(bootstrap=False, max_depth=50, min_samples_leaf=2,
                       min_samples_split=4, n_estimators=63, random_state=2)
Training


 67%|██████▋   | 4/6 [02:45<01:27, 43.63s/it]

Length of Training 50468
LEN TRAINING 3157
Finding Hyperparameters
RandomForestClassifier(bootstrap=False, class_weight='balanced', max_depth=50,
                       min_samples_leaf=2, n_estimators=93, random_state=2)
Training


 83%|████████▎ | 5/6 [03:39<00:47, 47.59s/it]

Length of Training 53625
LEN TRAINING 3443
Finding Hyperparameters


100%|██████████| 6/6 [04:37<00:00, 46.24s/it]


RandomForestClassifier(criterion='entropy', max_depth=110, min_samples_leaf=2,
                       min_samples_split=4, n_estimators=39, random_state=2)
Training
Length of Training 57068
Random Seed 3


  0%|          | 0/6 [00:00<?, ?it/s]

LEN TRAINING 1782
Finding Hyperparameters


 17%|█▋        | 1/6 [00:36<03:04, 36.81s/it]

RandomForestClassifier(class_weight='balanced', criterion='entropy',
                       max_depth=10, min_samples_leaf=2, n_estimators=13,
                       random_state=3)
Training
Length of Training 58850
LEN TRAINING 2145
Finding Hyperparameters


 33%|███▎      | 2/6 [01:19<02:41, 40.31s/it]

RandomForestClassifier(criterion='entropy', max_depth=110, max_features='log2',
                       min_samples_leaf=4, n_estimators=11, random_state=3)
Training
Length of Training 60995
LEN TRAINING 2519
Finding Hyperparameters


 50%|█████     | 3/6 [02:06<02:09, 43.23s/it]

RandomForestClassifier(bootstrap=False, class_weight='balanced', max_depth=90,
                       max_features='log2', min_samples_leaf=2, n_estimators=61,
                       random_state=3)
Training
Length of Training 63514
LEN TRAINING 2871
Finding Hyperparameters


 67%|██████▋   | 4/6 [02:57<01:32, 46.49s/it]

RandomForestClassifier(bootstrap=False, max_depth=70, min_samples_leaf=2,
                       min_samples_split=4, n_estimators=24, random_state=3)
Training
Length of Training 66385
LEN TRAINING 3157
Finding Hyperparameters
RandomForestClassifier(bootstrap=False, class_weight='balanced',
                       criterion='entropy', max_depth=50, min_samples_leaf=2,
                       min_samples_split=4, n_estimators=69, random_state=3)
Training


 83%|████████▎ | 5/6 [03:52<00:49, 49.62s/it]

Length of Training 69542
LEN TRAINING 3443
Finding Hyperparameters
RandomForestClassifier(class_weight='balanced', criterion='entropy',
                       max_depth=30, max_features='log2', min_samples_leaf=4,
                       min_samples_split=8, n_estimators=96, random_state=3)
Training


100%|██████████| 6/6 [04:51<00:00, 48.66s/it]


Length of Training 72985
Random Seed 4


  0%|          | 0/6 [00:00<?, ?it/s]

LEN TRAINING 1782
Finding Hyperparameters
RandomForestClassifier(class_weight='balanced', max_depth=90,
                       max_features='log2', min_samples_split=4,
                       n_estimators=90, random_state=4)
Training


 17%|█▋        | 1/6 [00:38<03:10, 38.17s/it]

Length of Training 74767
LEN TRAINING 2145
Finding Hyperparameters


 17%|█▋        | 1/6 [01:01<05:07, 61.49s/it]


KeyboardInterrupt: 

In [40]:
df_results_disk

Unnamed: 0,Random_Seed,Model,Drifts,ROC_AUC_Batch,ROC_AUC_BATCH_MEAN,ROC_AUC_Total,Predictions,True_Testing_Labels,Train_Time,Hyperparam_Tunning_Time,Test_Time,Drifts_Detected,Label_Costs
0,0,periodic-sw,6/6,"[0.9224351517780277, 0.954291838192767, 0.9828...",0.963134,0.961766,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[False, False, False, False, False, False, Fal...",288.414711,287.32893,0.305474,"[1, 1, 1, 1, 1, 1]",83460
1,1,periodic-sw,6/6,"[0.937405939683103, 0.9839566393358964, 0.9823...",0.970212,0.970026,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[False, False, False, False, False, False, Fal...",293.298577,292.287884,0.262072,"[1, 1, 1, 1, 1, 1]",83460
2,0,periodic-sw,6/6,"[0.9073920729743763, 0.9399112884020004, 0.982...",0.958032,0.956053,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[False, False, False, False, False, False, Fal...",270.606751,269.576997,0.278546,"[1, 1, 1, 1, 1, 1]",83460
3,1,periodic-sw,6/6,"[0.937405939683103, 0.967804834484401, 0.96719...",0.964976,0.964259,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[False, False, False, False, False, False, Fal...",270.132688,269.617659,0.137549,"[1, 1, 1, 1, 1, 1]",83460
4,2,periodic-sw,6/6,"[0.9378396850745583, 0.969286905045419, 0.9828...",0.968188,0.967431,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[False, False, False, False, False, False, Fal...",276.899207,275.992308,0.245746,"[1, 1, 1, 1, 1, 1]",83460
5,3,periodic-sw,6/6,"[0.9369721942916479, 0.9104995236961181, 0.982...",0.961227,0.958919,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[False, False, False, False, False, False, Fal...",291.450793,290.608667,0.189059,"[1, 1, 1, 1, 1, 1]",83460


# Build Drift Detection based Model Update

### KS on all features

# DF Results

In [41]:
df_results_disk = pd.DataFrame()
df_results_disk

In [43]:
initial_training_batches_list = list(range(0, num_chunks//2))


for random_seed in random_seeds:

    print('Random Seed:', random_seed)
    necessary_label_annotation_effort = 0
    total_time_training = 0
    no_necessary_retrainings = 0
    lengths_training_ks_all = []
    partial_roc_auc_ks_all_model = []
    
    
    predictions_test_ks_all_model = []
    
    

    total_train_fh_all = 0
    total_hyperparam_fh_ks_all = 0
    total_test_time_ks_all = 0
    
    total_drift_detection_time = 0
    total_distribution_extraction_time = 0
    total_stat_test_time = 0
    
    
    detected_drifts = []


    for i in tqdm(range(num_chunks//2, num_chunks)):
    
        # obtain training features and labels
        training_features_init = np.vstack(feature_list[0: i])
        training_labels_init = np.hstack(label_list[0//2: i])
        
        # init drift alert
        drift_alert = 0

        # check if it is the first batch
        if(i==num_chunks//2):
            training_features = training_features_init
            training_labels = training_labels_init
            current_training_batches_list = initial_training_batches_list.copy()
            print('Initial Training Batches', current_training_batches_list)

        #print('Training for Model before Scaling', training_features)
        

        # scaler and downsampling on training data
        update_scaler = StandardScaler()
        training_features_model = update_scaler.fit_transform(training_features)
        training_features_model, training_labels_model = downsampling(training_features_model, training_labels)

        print('LEN TRAINING', len(training_features_model))
        
        # obtain testing features and labels
        testing_features = feature_list[i]
        testing_labels = label_list[i]

        
        # scaling testing features
        testing_features_model = update_scaler.transform(testing_features)
        testing_labels_model = testing_labels


         # training model
        begin_train_fh_ks_all = time.time()


        if(i==num_chunks//2 or need_to_retrain == 1):
            print('RETRAINING MODEL')
            
            begin_train_fh_ks_all = time.time()
        
            begin_hyperparam_tunning_update = time.time()
            model = RandomForestClassifier(random_state = random_seed)
            random_search = RandomizedSearchCV(model,
                                                       param_distributions = param_dist_rf,
                                                       n_iter=N_ITER_SEARCH,
                                                       scoring='roc_auc',
                                                       cv=4, n_jobs=1, random_state = random_seed)

            
            random_search.fit(training_features_model, training_labels_model)
            
            update_model_ks_all = random_search.best_estimator_
            
            

            end_hyperparam_tunning_update = time.time() - begin_hyperparam_tunning_update
            
            total_hyperparam_fh_ks_all = total_hyperparam_fh_ks_all + end_hyperparam_tunning_update
            
            
            
            
            update_model_ks_all.fit(training_features_model, training_labels_model)
            
            end_train_fh_ks_all = time.time() - begin_train_fh_ks_all
        
            total_train_fh_all = total_train_fh_all + end_train_fh_ks_all
        
        
        # evaluate model on testing data
        
        begin_test_time_ks_all = time.time()
        predictions_test_updated = update_model_ks_all.predict(testing_features_model)
        
        end_test_time_ks_all = time.time() - begin_test_time_ks_all
        total_test_time_ks_all = total_test_time_ks_all + end_test_time_ks_all

        partial_roc_auc_ks_all_model.append(roc_auc_score(testing_labels_model, predictions_test_updated))
        
        predictions_test_ks_all_model = np.concatenate([predictions_test_ks_all_model, predictions_test_updated])
        
        
        print('Predictions Test Batch', len(predictions_test_updated))
        print('Prediction Test All', len(predictions_test_ks_all_model))
        
        
        # Drift Detection
        
        need_to_retrain = 0
        
        print('MODEL', update_model_ks_all)
        
        
        drift_time_start = time.time()
        drift_alert, distribution_extraction_time, ks_test_time = ks_drift_detection(training_features_model, testing_features_model)
        drift_time_end = time.time() - drift_time_start
        
        
        total_distribution_extraction_time = total_distribution_extraction_time + distribution_extraction_time
        total_stat_test_time = total_stat_test_time + ks_test_time
        total_drift_detection_time = total_drift_detection_time + drift_time_end
        
        
        detected_drifts.append(drift_alert)
        
        
        
                
        if(drift_alert==1):
        
            need_to_retrain = 1
            drift_alert = 0

       
       
            
            print('CHANGE OF TRAINING')

            no_necessary_retrainings = no_necessary_retrainings + 1
            necessary_label_annotation_effort = necessary_label_annotation_effort + len(testing_labels)

            # add new data to the training for full history approach
            current_training_batches_list.append(i)
                    
            
            training_features_list_updated = [feature_list[i] for i in current_training_batches_list]
            training_labels_list_updated = [label_list[i] for i in current_training_batches_list]
        
            training_features = np.vstack(training_features_list_updated)
            training_labels = np.hstack(training_labels_list_updated)

        
        print('Current Training Batches',current_training_batches_list)
    
    
    df_results_ks_all_model = pd.DataFrame(columns=['Random_Seed', 'Model', 'Drifts_Overall',  'ROC_AUC_Batch', 'ROC_AUC_BATCH_MEAN', 'ROC_AUC_Total', 'Predictions', 'True_Testing_Labels', 'Train_Time', 'Hyperparam_Tunning_Time', 'Test_Time', 'Drifts_Detected', 'Drift_Detection_Total_Time', 'Distribution_Extraction_Time', 'Statistical_Test_Time', 'Label_Costs'])
    df_results_ks_all_model.loc[0] = [random_seed, 'KS_ALL', str(no_necessary_retrainings)+'/'+str(len(detected_drifts)), partial_roc_auc_ks_all_model, np.mean(partial_roc_auc_ks_all_model), roc_auc_score(true_testing_labels, predictions_test_ks_all_model), predictions_test_ks_all_model, true_testing_labels, total_train_fh_all, total_hyperparam_fh_ks_all, total_test_time_ks_all, detected_drifts, total_drift_detection_time, total_distribution_extraction_time, total_stat_test_time, necessary_label_annotation_effort]
    
    
    
    df_results_disk = pd.concat([df_results_disk, df_results_ks_all_model])
    df_results_disk = df_results_disk.reset_index(drop=True)
    df_results_disk.to_csv('./results/ks_all_fh_model_disk_data_green.csv')
    


Random Seed: 0


  0%|          | 0/6 [00:00<?, ?it/s]

Initial Training Batches [0, 1, 2, 3, 4, 5]
LEN TRAINING 1782
RETRAINING MODEL
Predictions Test Batch 13866
Prediction Test All 13866
MODEL RandomForestClassifier(criterion='entropy', max_depth=30, min_samples_leaf=2,
                       min_samples_split=4, n_estimators=95, random_state=0)



`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  distribution_reference = sns.distplot(np.array(reference_data)).get_lines()[0].get_data()[1]

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  distribution_test = sns.distplot(np.array(testing_data)).get_lines()[0].get_data()[1]
 17%|█▋        | 1/6 [00:38<03:12, 38.45s/it]

CHANGE OF TRAINING
Current Training Batches [0, 1, 2, 3, 4, 5, 6]
LEN TRAINING 2145
RETRAINING MODEL
Predictions Test Batch 13866
Prediction Test All 27732
MODEL RandomForestClassifier(bootstrap=False, max_depth=90, max_features='log2',
                       min_samples_leaf=2, min_samples_split=8, n_estimators=95,
                       random_state=0)



`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  distribution_reference = sns.distplot(np.array(reference_data)).get_lines()[0].get_data()[1]

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  distribution_test = sns.distplot(np.array(testing_data)).get_lines()[0].get_data()[1]
 33%|███▎      | 2/6 [01:20<02:42, 40.71s/it]

CHANGE OF TRAINING
Current Training Batches [0, 1, 2, 3, 4, 5, 6, 7]
LEN TRAINING 2519
RETRAINING MODEL
Predictions Test Batch 13866
Prediction Test All 41598
MODEL RandomForestClassifier(bootstrap=False, class_weight='balanced',
                       criterion='entropy', max_depth=10, max_features='log2',
                       min_samples_leaf=2, min_samples_split=8, n_estimators=89,
                       random_state=0)



`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  distribution_reference = sns.distplot(np.array(reference_data)).get_lines()[0].get_data()[1]

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  distribution_test = sns.distplot(np.array(testing_data)).get_lines()[0].get_data()[1]
 50%|█████     | 3/6 [02:13<02:18, 46.17s/it]

CHANGE OF TRAINING
Current Training Batches [0, 1, 2, 3, 4, 5, 6, 7, 8]
LEN TRAINING 2871
RETRAINING MODEL
Predictions Test Batch 13955
Prediction Test All 55553
MODEL RandomForestClassifier(bootstrap=False, class_weight='balanced', max_depth=30,
                       min_samples_leaf=2, min_samples_split=4, n_estimators=67,
                       random_state=0)



`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  distribution_reference = sns.distplot(np.array(reference_data)).get_lines()[0].get_data()[1]

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  distribution_test = sns.distplot(np.array(testing_data)).get_lines()[0].get_data()[1]
 67%|██████▋   | 4/6 [03:04<01:35, 47.93s/it]

CHANGE OF TRAINING
Current Training Batches [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
LEN TRAINING 3157
RETRAINING MODEL
Predictions Test Batch 13954
Prediction Test All 69507
MODEL RandomForestClassifier(bootstrap=False, class_weight='balanced',
                       criterion='entropy', max_depth=110, max_features='log2',
                       min_samples_leaf=2, min_samples_split=8, n_estimators=30,
                       random_state=0)



`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  distribution_reference = sns.distplot(np.array(reference_data)).get_lines()[0].get_data()[1]

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  distribution_test = sns.distplot(np.array(testing_data)).get_lines()[0].get_data()[1]
 83%|████████▎ | 5/6 [03:58<00:50, 50.16s/it]

CHANGE OF TRAINING
Current Training Batches [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
LEN TRAINING 3443
RETRAINING MODEL
Predictions Test Batch 13953
Prediction Test All 83460
MODEL RandomForestClassifier(criterion='entropy', max_depth=50, max_features='log2',
                       min_samples_leaf=2, min_samples_split=8, n_estimators=65,
                       random_state=0)



`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  distribution_reference = sns.distplot(np.array(reference_data)).get_lines()[0].get_data()[1]

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  distribution_test = sns.distplot(np.array(testing_data)).get_lines()[0].get_data()[1]
100%|██████████| 6/6 [04:56<00:00, 49.41s/it]


Current Training Batches [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
Random Seed: 1


  0%|          | 0/6 [00:00<?, ?it/s]

Initial Training Batches [0, 1, 2, 3, 4, 5]
LEN TRAINING 1782
RETRAINING MODEL
Predictions Test Batch 13866
Prediction Test All 13866
MODEL RandomForestClassifier(class_weight='balanced', max_features='log2',
                       min_samples_leaf=2, min_samples_split=4, n_estimators=63,
                       random_state=1)



`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  distribution_reference = sns.distplot(np.array(reference_data)).get_lines()[0].get_data()[1]

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  distribution_test = sns.distplot(np.array(testing_data)).get_lines()[0].get_data()[1]
 17%|█▋        | 1/6 [00:38<03:14, 38.83s/it]

CHANGE OF TRAINING
Current Training Batches [0, 1, 2, 3, 4, 5, 6]
LEN TRAINING 2145
RETRAINING MODEL
Predictions Test Batch 13866
Prediction Test All 27732
MODEL RandomForestClassifier(criterion='entropy', max_features='log2',
                       min_samples_leaf=2, min_samples_split=4, n_estimators=13,
                       random_state=1)



`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  distribution_reference = sns.distplot(np.array(reference_data)).get_lines()[0].get_data()[1]

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  distribution_test = sns.distplot(np.array(testing_data)).get_lines()[0].get_data()[1]
 33%|███▎      | 2/6 [01:28<03:01, 45.29s/it]

CHANGE OF TRAINING
Current Training Batches [0, 1, 2, 3, 4, 5, 6, 7]
LEN TRAINING 2519
RETRAINING MODEL
Predictions Test Batch 13866
Prediction Test All 41598
MODEL RandomForestClassifier(class_weight='balanced', max_depth=10,
                       min_samples_leaf=2, min_samples_split=4, n_estimators=52,
                       random_state=1)



`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  distribution_reference = sns.distplot(np.array(reference_data)).get_lines()[0].get_data()[1]

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  distribution_test = sns.distplot(np.array(testing_data)).get_lines()[0].get_data()[1]
 50%|█████     | 3/6 [02:18<02:22, 47.35s/it]

CHANGE OF TRAINING
Current Training Batches [0, 1, 2, 3, 4, 5, 6, 7, 8]
LEN TRAINING 2871
RETRAINING MODEL
Predictions Test Batch 13955
Prediction Test All 55553
MODEL RandomForestClassifier(class_weight='balanced', criterion='entropy',
                       max_depth=90, min_samples_leaf=2, min_samples_split=4,
                       n_estimators=63, random_state=1)



`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  distribution_reference = sns.distplot(np.array(reference_data)).get_lines()[0].get_data()[1]

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  distribution_test = sns.distplot(np.array(testing_data)).get_lines()[0].get_data()[1]
 67%|██████▋   | 4/6 [03:09<01:37, 48.69s/it]

CHANGE OF TRAINING
Current Training Batches [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
LEN TRAINING 3157
RETRAINING MODEL
Predictions Test Batch 13954
Prediction Test All 69507
MODEL RandomForestClassifier(bootstrap=False, criterion='entropy', max_depth=50,
                       min_samples_leaf=2, min_samples_split=8, n_estimators=50,
                       random_state=1)



`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  distribution_reference = sns.distplot(np.array(reference_data)).get_lines()[0].get_data()[1]

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  distribution_test = sns.distplot(np.array(testing_data)).get_lines()[0].get_data()[1]
 83%|████████▎ | 5/6 [04:04<00:50, 50.91s/it]

CHANGE OF TRAINING
Current Training Batches [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
LEN TRAINING 3443
RETRAINING MODEL
Predictions Test Batch 13953
Prediction Test All 83460
MODEL RandomForestClassifier(class_weight='balanced', max_depth=30,
                       min_samples_leaf=2, min_samples_split=4, n_estimators=55,
                       random_state=1)



`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  distribution_reference = sns.distplot(np.array(reference_data)).get_lines()[0].get_data()[1]

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  distribution_test = sns.distplot(np.array(testing_data)).get_lines()[0].get_data()[1]
100%|██████████| 6/6 [05:04<00:00, 50.72s/it]


Current Training Batches [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
Random Seed: 2


  0%|          | 0/6 [00:00<?, ?it/s]

Initial Training Batches [0, 1, 2, 3, 4, 5]
LEN TRAINING 1782
RETRAINING MODEL
Predictions Test Batch 13866
Prediction Test All 13866
MODEL RandomForestClassifier(criterion='entropy', max_depth=90, min_samples_split=8,
                       n_estimators=75, random_state=2)



`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  distribution_reference = sns.distplot(np.array(reference_data)).get_lines()[0].get_data()[1]

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  distribution_test = sns.distplot(np.array(testing_data)).get_lines()[0].get_data()[1]
 17%|█▋        | 1/6 [00:36<03:02, 36.43s/it]

CHANGE OF TRAINING
Current Training Batches [0, 1, 2, 3, 4, 5, 6]
LEN TRAINING 2145
RETRAINING MODEL
Predictions Test Batch 13866
Prediction Test All 27732
MODEL RandomForestClassifier(class_weight='balanced', min_samples_leaf=2,
                       min_samples_split=8, n_estimators=53, random_state=2)



`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  distribution_reference = sns.distplot(np.array(reference_data)).get_lines()[0].get_data()[1]

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  distribution_test = sns.distplot(np.array(testing_data)).get_lines()[0].get_data()[1]
 33%|███▎      | 2/6 [01:17<02:35, 38.89s/it]

CHANGE OF TRAINING
Current Training Batches [0, 1, 2, 3, 4, 5, 6, 7]
LEN TRAINING 2519
RETRAINING MODEL
Predictions Test Batch 13866
Prediction Test All 41598
MODEL RandomForestClassifier(bootstrap=False, criterion='entropy', max_depth=110,
                       min_samples_leaf=2, min_samples_split=8, n_estimators=32,
                       random_state=2)



`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  distribution_reference = sns.distplot(np.array(reference_data)).get_lines()[0].get_data()[1]

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  distribution_test = sns.distplot(np.array(testing_data)).get_lines()[0].get_data()[1]
 50%|█████     | 3/6 [02:01<02:04, 41.51s/it]

CHANGE OF TRAINING
Current Training Batches [0, 1, 2, 3, 4, 5, 6, 7, 8]
LEN TRAINING 2871
RETRAINING MODEL
Predictions Test Batch 13955
Prediction Test All 55553
MODEL RandomForestClassifier(bootstrap=False, max_features='log2', min_samples_leaf=2,
                       n_estimators=83, random_state=2)



`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  distribution_reference = sns.distplot(np.array(reference_data)).get_lines()[0].get_data()[1]

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  distribution_test = sns.distplot(np.array(testing_data)).get_lines()[0].get_data()[1]
 67%|██████▋   | 4/6 [02:51<01:29, 44.70s/it]

CHANGE OF TRAINING
Current Training Batches [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
LEN TRAINING 3157
RETRAINING MODEL
Predictions Test Batch 13954
Prediction Test All 69507
MODEL RandomForestClassifier(class_weight='balanced', min_samples_leaf=2,
                       min_samples_split=8, n_estimators=53, random_state=2)



`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  distribution_reference = sns.distplot(np.array(reference_data)).get_lines()[0].get_data()[1]

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  distribution_test = sns.distplot(np.array(testing_data)).get_lines()[0].get_data()[1]
 83%|████████▎ | 5/6 [03:42<00:47, 47.19s/it]

CHANGE OF TRAINING
Current Training Batches [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
LEN TRAINING 3443
RETRAINING MODEL
Predictions Test Batch 13953
Prediction Test All 83460
MODEL RandomForestClassifier(bootstrap=False, criterion='entropy', max_depth=110,
                       min_samples_leaf=2, min_samples_split=8, n_estimators=32,
                       random_state=2)



`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  distribution_reference = sns.distplot(np.array(reference_data)).get_lines()[0].get_data()[1]

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  distribution_test = sns.distplot(np.array(testing_data)).get_lines()[0].get_data()[1]
100%|██████████| 6/6 [04:39<00:00, 46.56s/it]


CHANGE OF TRAINING
Current Training Batches [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
Random Seed: 3


  0%|          | 0/6 [00:00<?, ?it/s]

Initial Training Batches [0, 1, 2, 3, 4, 5]
LEN TRAINING 1782
RETRAINING MODEL


  0%|          | 0/6 [00:18<?, ?it/s]


KeyboardInterrupt: 

In [44]:
df_results_disk

Unnamed: 0,Random_Seed,Model,Drifts_Overall,ROC_AUC_Batch,ROC_AUC_BATCH_MEAN,ROC_AUC_Total,Predictions,True_Testing_Labels,Train_Time,Hyperparam_Tunning_Time,Test_Time,Drifts_Detected,Drift_Detection_Total_Time,Distribution_Extraction_Time,Statistical_Test_Time,Label_Costs
0,0,KS_ALL,5/6,"[0.9224351517780277, 0.95450872656755, 0.98253...",0.963122,0.961754,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[False, False, False, False, False, False, Fal...",291.762608,290.709915,0.276728,"[1, 1, 1, 1, 1, 0]",4.042772,4.041776,0.000969,69507
1,1,KS_ALL,5/6,"[0.937405939683103, 0.9252054060490593, 0.9817...",0.960324,0.958769,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[False, False, False, False, False, False, Fal...",299.695743,299.015962,0.190906,"[1, 1, 1, 1, 1, 0]",3.986231,3.985239,0.000967,69507
2,2,KS_ALL,6/6,"[0.9229773335173466, 0.9826191610247338, 0.982...",0.967957,0.967274,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[False, False, False, False, False, False, Fal...",274.669569,273.880413,0.220179,"[1, 1, 1, 1, 1, 1]",4.029029,4.02803,0.000977,83460


# DF Results

In [45]:
df_results_disk = pd.DataFrame()
df_results_disk

# KS on PCA Features

In [46]:
initial_training_batches_list = list(range(0, num_chunks//2))


for random_seed in random_seeds:
    
    
    print('Random Seed:', random_seed)
    necessary_label_annotation_effort = 0
    no_necessary_retrainings = 0
    
    
    partial_roc_auc_ks_pca_model = []
    predictions_test_ks_pca_model = []
    
    

    total_train_fh_pca = 0
    total_hyperparam_fh_ks_pca = 0
    total_test_time_ks_pca = 0
    
    total_drift_detection_time = 0
    total_distribution_extraction_time = 0
    total_stat_test_time = 0
    total_pca_time = 0
    
    
    detected_drifts = []


    for i in tqdm(range(num_chunks//2, num_chunks)):
    
        # obtain training features and labels
        training_features_init = np.vstack(feature_list[0: i])
        training_labels_init = np.hstack(label_list[0//2: i])
        drift_alert = 0

        # check if it is the first batch
        if(i==num_chunks//2):
            training_features = training_features_init
            training_labels = training_labels_init
            current_training_batches_list = initial_training_batches_list.copy()
            print('Initial Training Batches', current_training_batches_list)
        

        # scaler and downsampling for training data
        update_scaler = StandardScaler()
        training_features_model = update_scaler.fit_transform(training_features)
        training_features_model, training_labels_model = downsampling(training_features_model, training_labels)

        print('LEN TRAINING', len(training_features_model))
        
        
        # obtain testing features and labels
        testing_features = feature_list[i]
        testing_labels = label_list[i]

        
        # scaling testing features
        testing_features_model = update_scaler.transform(testing_features)
        testing_labels_model = testing_labels


        # training model
        begin_train_fh_ks_pca = time.time()


        if(i==num_chunks//2 or need_to_retrain == 1):
            print('RETRAINING MODEL')
            
            begin_train_fh_ks_pca = time.time()
        
            begin_hyperparam_tunning_update = time.time()
            model = RandomForestClassifier(random_state = random_seed)
            random_search = RandomizedSearchCV(model,
                                                       param_distributions = param_dist_rf,
                                                       n_iter=N_ITER_SEARCH,
                                                       scoring='roc_auc',
                                                       cv=4, n_jobs=1, random_state = random_seed)

            
            random_search.fit(training_features_model, training_labels_model)
            
            update_model_ks_pca = random_search.best_estimator_
            
            

            end_hyperparam_tunning_update = time.time() - begin_hyperparam_tunning_update
            
            total_hyperparam_fh_ks_pca = total_hyperparam_fh_ks_pca + end_hyperparam_tunning_update
            
            
            
            
            update_model_ks_pca.fit(training_features_model, training_labels_model)
            
            end_train_fh_ks_pca = time.time() - begin_train_fh_ks_pca
        
            total_train_fh_pca = total_train_fh_pca + end_train_fh_ks_pca
        
        
        # evaluate model on testing data & measure testing time
        begin_test_time_ks_pca = time.time()
        predictions_test_updated = update_model_ks_pca.predict(testing_features_model)
        end_test_time_ks_pca = time.time() - begin_test_time_ks_pca
        
        total_test_time_ks_pca = total_test_time_ks_pca + end_test_time_ks_pca

        
        # ROC AUC
        partial_roc_auc_ks_pca_model.append(roc_auc_score(testing_labels_model, predictions_test_updated))
        predictions_test_ks_pca_model = np.concatenate([predictions_test_ks_pca_model, predictions_test_updated])
        
        
        print('Predictions Test Batch', len(predictions_test_updated))
        print('Prediction Test All', len(predictions_test_ks_pca_model))
        
        
        # Drift Detection
        
        need_to_retrain = 0
        
        print('MODEL', update_model_ks_pca)
        
        drift_time_start = time.time()
        
        # Extract PCA Features
        
        pca_computing_time_start = time.time()
        
        pca = PCA(n_components = 0.95, random_state = random_seed)
        pca.fit(training_features_model)

        df_train_features_sorted_pca = pca.transform(training_features_model)
        df_test_features_sorted_pca = pca.transform(testing_features_model)
        
        pca_computing_time_end = time.time() - pca_computing_time_start
        
        
        # Detect Drift
        
        drift_alert, distribution_extraction_time, ks_test_time = ks_drift_detection(df_train_features_sorted_pca, df_test_features_sorted_pca)
        drift_time_end = time.time() - drift_time_start
        
        
        
        
        
        total_distribution_extraction_time = total_distribution_extraction_time + distribution_extraction_time
        total_stat_test_time = total_stat_test_time + ks_test_time
        total_pca_time = total_pca_time + pca_computing_time_end
        total_drift_detection_time = total_drift_detection_time + drift_time_end
        
        
        detected_drifts.append(drift_alert)
        
        
        
        if(drift_alert==1):
        
            need_to_retrain = 1
            drift_alert = 0

       
       
            
            print('CHANGE OF TRAINING')

            no_necessary_retrainings = no_necessary_retrainings + 1
            necessary_label_annotation_effort = necessary_label_annotation_effort + len(testing_labels)

            
            # add new data to the training for full history approach
            current_training_batches_list.append(i)
                    
            
            training_features_list_updated = [feature_list[i] for i in current_training_batches_list]
            training_labels_list_updated = [label_list[i] for i in current_training_batches_list]
        
            training_features = np.vstack(training_features_list_updated)
            training_labels = np.hstack(training_labels_list_updated)

        
        print('Current Training Batches',current_training_batches_list)
    
    
    df_results_ks_pca_model = pd.DataFrame(columns=['Random_Seed', 'Model', 'Drifts_Overall',  'ROC_AUC_Batch', 'ROC_AUC_BATCH_MEAN', 'ROC_AUC_Total', 'Predictions', 'True_Testing_Labels', 'Train_Time', 'Hyperparam_Tunning_Time', 'Test_Time', 'Drifts_Detected', 'Drift_Detection_Total_Time', 'PCA_Computing_time', 'Distribution_Extraction_Time', 'Statistical_Test_Time', 'Label_Costs'])
    df_results_ks_pca_model.loc[0] = [random_seed, 'KS_PCA', str(no_necessary_retrainings)+'/'+str(len(detected_drifts)), partial_roc_auc_ks_pca_model, np.mean(partial_roc_auc_ks_pca_model), roc_auc_score(true_testing_labels, predictions_test_ks_pca_model), predictions_test_ks_pca_model, true_testing_labels, total_train_fh_pca, total_hyperparam_fh_ks_pca, total_test_time_ks_pca, detected_drifts, total_drift_detection_time, total_pca_time, total_distribution_extraction_time, total_stat_test_time, necessary_label_annotation_effort]
    
    
    df_results_disk = pd.concat([df_results_disk, df_results_ks_pca_model])
    df_results_disk = df_results_disk.reset_index(drop=True)
    df_results_disk.to_csv('./results/ks_pca_fh_model_backblaze_data_green.csv')
    

Random Seed: 0


  0%|          | 0/6 [00:00<?, ?it/s]

Initial Training Batches [0, 1, 2, 3, 4, 5]
LEN TRAINING 1782
RETRAINING MODEL
Predictions Test Batch 13866
Prediction Test All 13866
MODEL RandomForestClassifier(criterion='entropy', max_depth=30, min_samples_leaf=2,
                       min_samples_split=4, n_estimators=95, random_state=0)



`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  distribution_reference = sns.distplot(np.array(reference_data)).get_lines()[0].get_data()[1]

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  distribution_test = sns.distplot(np.array(testing_data)).get_lines()[0].get_data()[1]
 17%|█▋        | 1/6 [00:38<03:12, 38.59s/it]

CHANGE OF TRAINING
Current Training Batches [0, 1, 2, 3, 4, 5, 6]
LEN TRAINING 2145
RETRAINING MODEL
Predictions Test Batch 13866
Prediction Test All 27732
MODEL RandomForestClassifier(bootstrap=False, max_depth=90, max_features='log2',
                       min_samples_leaf=2, min_samples_split=8, n_estimators=95,
                       random_state=0)



`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  distribution_reference = sns.distplot(np.array(reference_data)).get_lines()[0].get_data()[1]

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  distribution_test = sns.distplot(np.array(testing_data)).get_lines()[0].get_data()[1]
 33%|███▎      | 2/6 [01:22<02:46, 41.71s/it]

CHANGE OF TRAINING
Current Training Batches [0, 1, 2, 3, 4, 5, 6, 7]
LEN TRAINING 2519
RETRAINING MODEL
Predictions Test Batch 13866
Prediction Test All 41598
MODEL RandomForestClassifier(bootstrap=False, class_weight='balanced',
                       criterion='entropy', max_depth=10, max_features='log2',
                       min_samples_leaf=2, min_samples_split=8, n_estimators=89,
                       random_state=0)



`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  distribution_reference = sns.distplot(np.array(reference_data)).get_lines()[0].get_data()[1]

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  distribution_test = sns.distplot(np.array(testing_data)).get_lines()[0].get_data()[1]
 50%|█████     | 3/6 [02:09<02:12, 44.24s/it]

CHANGE OF TRAINING
Current Training Batches [0, 1, 2, 3, 4, 5, 6, 7, 8]
LEN TRAINING 2871
RETRAINING MODEL
Predictions Test Batch 13955
Prediction Test All 55553
MODEL RandomForestClassifier(bootstrap=False, class_weight='balanced', max_depth=30,
                       min_samples_leaf=2, min_samples_split=4, n_estimators=67,
                       random_state=0)



`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  distribution_reference = sns.distplot(np.array(reference_data)).get_lines()[0].get_data()[1]

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  distribution_test = sns.distplot(np.array(testing_data)).get_lines()[0].get_data()[1]
 67%|██████▋   | 4/6 [03:00<01:33, 46.93s/it]

CHANGE OF TRAINING
Current Training Batches [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
LEN TRAINING 3157
RETRAINING MODEL
Predictions Test Batch 13954
Prediction Test All 69507
MODEL RandomForestClassifier(bootstrap=False, class_weight='balanced',
                       criterion='entropy', max_depth=110, max_features='log2',
                       min_samples_leaf=2, min_samples_split=8, n_estimators=30,
                       random_state=0)



`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  distribution_reference = sns.distplot(np.array(reference_data)).get_lines()[0].get_data()[1]

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  distribution_test = sns.distplot(np.array(testing_data)).get_lines()[0].get_data()[1]
 83%|████████▎ | 5/6 [03:55<00:49, 49.80s/it]

CHANGE OF TRAINING
Current Training Batches [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
LEN TRAINING 3443
RETRAINING MODEL
Predictions Test Batch 13953
Prediction Test All 83460
MODEL RandomForestClassifier(criterion='entropy', max_depth=50, max_features='log2',
                       min_samples_leaf=2, min_samples_split=8, n_estimators=65,
                       random_state=0)



`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  distribution_reference = sns.distplot(np.array(reference_data)).get_lines()[0].get_data()[1]

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  distribution_test = sns.distplot(np.array(testing_data)).get_lines()[0].get_data()[1]
100%|██████████| 6/6 [04:54<00:00, 49.02s/it]


CHANGE OF TRAINING
Current Training Batches [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
Random Seed: 1


  0%|          | 0/6 [00:00<?, ?it/s]

Initial Training Batches [0, 1, 2, 3, 4, 5]
LEN TRAINING 1782
RETRAINING MODEL
Predictions Test Batch 13866
Prediction Test All 13866
MODEL RandomForestClassifier(class_weight='balanced', max_features='log2',
                       min_samples_leaf=2, min_samples_split=4, n_estimators=63,
                       random_state=1)



`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  distribution_reference = sns.distplot(np.array(reference_data)).get_lines()[0].get_data()[1]

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  distribution_test = sns.distplot(np.array(testing_data)).get_lines()[0].get_data()[1]
 17%|█▋        | 1/6 [00:43<03:35, 43.18s/it]

CHANGE OF TRAINING
Current Training Batches [0, 1, 2, 3, 4, 5, 6]
LEN TRAINING 2145
RETRAINING MODEL
Predictions Test Batch 13866
Prediction Test All 27732
MODEL RandomForestClassifier(criterion='entropy', max_features='log2',
                       min_samples_leaf=2, min_samples_split=4, n_estimators=13,
                       random_state=1)



`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  distribution_reference = sns.distplot(np.array(reference_data)).get_lines()[0].get_data()[1]

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  distribution_test = sns.distplot(np.array(testing_data)).get_lines()[0].get_data()[1]
 33%|███▎      | 2/6 [01:30<03:03, 45.89s/it]

CHANGE OF TRAINING
Current Training Batches [0, 1, 2, 3, 4, 5, 6, 7]
LEN TRAINING 2519
RETRAINING MODEL
Predictions Test Batch 13866
Prediction Test All 41598
MODEL RandomForestClassifier(class_weight='balanced', max_depth=10,
                       min_samples_leaf=2, min_samples_split=4, n_estimators=52,
                       random_state=1)



`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  distribution_reference = sns.distplot(np.array(reference_data)).get_lines()[0].get_data()[1]

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  distribution_test = sns.distplot(np.array(testing_data)).get_lines()[0].get_data()[1]
 50%|█████     | 3/6 [02:19<02:21, 47.08s/it]

CHANGE OF TRAINING
Current Training Batches [0, 1, 2, 3, 4, 5, 6, 7, 8]
LEN TRAINING 2871
RETRAINING MODEL
Predictions Test Batch 13955
Prediction Test All 55553
MODEL RandomForestClassifier(class_weight='balanced', criterion='entropy',
                       max_depth=90, min_samples_leaf=2, min_samples_split=4,
                       n_estimators=63, random_state=1)



`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  distribution_reference = sns.distplot(np.array(reference_data)).get_lines()[0].get_data()[1]

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  distribution_test = sns.distplot(np.array(testing_data)).get_lines()[0].get_data()[1]
 67%|██████▋   | 4/6 [03:11<01:38, 49.13s/it]

CHANGE OF TRAINING
Current Training Batches [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
LEN TRAINING 3157
RETRAINING MODEL
Predictions Test Batch 13954
Prediction Test All 69507
MODEL RandomForestClassifier(bootstrap=False, criterion='entropy', max_depth=50,
                       min_samples_leaf=2, min_samples_split=8, n_estimators=50,
                       random_state=1)



`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  distribution_reference = sns.distplot(np.array(reference_data)).get_lines()[0].get_data()[1]

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  distribution_test = sns.distplot(np.array(testing_data)).get_lines()[0].get_data()[1]
 83%|████████▎ | 5/6 [04:07<00:51, 51.54s/it]

CHANGE OF TRAINING
Current Training Batches [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
LEN TRAINING 3443
RETRAINING MODEL
Predictions Test Batch 13953
Prediction Test All 83460
MODEL RandomForestClassifier(class_weight='balanced', max_depth=30,
                       min_samples_leaf=2, min_samples_split=4, n_estimators=55,
                       random_state=1)



`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  distribution_reference = sns.distplot(np.array(reference_data)).get_lines()[0].get_data()[1]

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  distribution_test = sns.distplot(np.array(testing_data)).get_lines()[0].get_data()[1]
100%|██████████| 6/6 [05:18<00:00, 53.16s/it]


CHANGE OF TRAINING
Current Training Batches [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
Random Seed: 2


  0%|          | 0/6 [00:00<?, ?it/s]

Initial Training Batches [0, 1, 2, 3, 4, 5]
LEN TRAINING 1782
RETRAINING MODEL


  0%|          | 0/6 [00:30<?, ?it/s]


KeyboardInterrupt: 

In [47]:
df_results_disk

Unnamed: 0,Random_Seed,Model,Drifts_Overall,ROC_AUC_Batch,ROC_AUC_BATCH_MEAN,ROC_AUC_Total,Predictions,True_Testing_Labels,Train_Time,Hyperparam_Tunning_Time,Test_Time,Drifts_Detected,Drift_Detection_Total_Time,PCA_Computing_time,Distribution_Extraction_Time,Statistical_Test_Time,Label_Costs
0,0,KS_PCA,6/6,"[0.9224351517780277, 0.95450872656755, 0.98253...",0.963122,0.961754,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[False, False, False, False, False, False, Fal...",290.710704,289.659747,0.277338,"[1, 1, 1, 1, 1, 1]",2.732353,0.032512,2.698766,0.001036,83460
1,1,KS_PCA,6/6,"[0.937405939683103, 0.9252054060490593, 0.9817...",0.960324,0.958769,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[False, False, False, False, False, False, Fal...",315.461386,314.717954,0.201767,"[1, 1, 1, 1, 1, 1]",2.841369,0.030664,2.809602,0.001071,83460


# DF Results

In [48]:
df_results_disk = pd.DataFrame()
df_results_disk

# KS on Most Important Features

In [49]:
features_disk_failure = ['smart_1_raw', 'smart_4_raw', 'smart_5_raw', 'smart_7_raw', 'smart_9_raw', 'smart_12_raw', 'smart_187_raw', 'smart_193_raw', 'smart_194_raw', 'smart_197_raw', 'smart_199_raw', 
                         'smart_4_raw_diff', 'smart_5_raw_diff', 'smart_9_raw_diff', 'smart_12_raw_diff', 'smart_187_raw_diff', 'smart_193_raw_diff', 'smart_197_raw_diff', 'smart_199_raw_diff']

In [50]:
len(features_disk_failure)

19

In [51]:
initial_training_batches_list = list(range(0, num_chunks//2))


for random_seed in random_seeds:


    print('Random Seed', random_seed)
    no_necessary_retrainings = 0
    necessary_label_annotation_effort = 0
    

    partial_roc_auc_ks_fi_model = []    
    predictions_test_ks_fi_model = []
    


    total_train_fh_fi = 0
    total_hyperparam_fh_ks_fi = 0
    total_test_time_ks_fi = 0
    
    total_feature_importance_extraction_time = 0
    total_distribution_extraction_time = 0
    total_stat_test_time = 0
    
    total_drift_detection_time = 0

    
    detected_drifts = []


    for i in tqdm(range(num_chunks//2, num_chunks)):


        
        # obtain training features and labels
        training_features_init = np.vstack(feature_list[0: i])
        training_labels_init = np.hstack(label_list[0//2: i])
        
        # init drift alert
        drift_alert = 0

        # check if it is the first batch
        if(i==num_chunks//2):
            training_features = training_features_init
            training_labels = training_labels_init
            current_training_batches_list = initial_training_batches_list.copy()
            print('Initial Training Batches', current_training_batches_list)
        

        # scaler and downsampling for training data
        update_scaler = StandardScaler()
        training_features_model = update_scaler.fit_transform(training_features)
        training_features_model, training_labels_model = downsampling(training_features_model, training_labels)

        print('LEN TRAINING', len(training_features_model))
        
        
        # obtain testing features and labels
        testing_features = feature_list[i]
        testing_labels = label_list[i]

        
        # scaling testing features
        testing_features_model = update_scaler.transform(testing_features)
        testing_labels_model = testing_labels


        # training model
        begin_train_fh_ks_fi = time.time()


        if(i==num_chunks//2 or need_to_retrain == 1):
            print('RETRAINING MODEL')
            
            begin_train_fh_ks_fi = time.time()
        
            begin_hyperparam_tunning_update = time.time()
            model = RandomForestClassifier(random_state = random_seed)
            random_search = RandomizedSearchCV(model,
                                                       param_distributions = param_dist_rf,
                                                       n_iter=N_ITER_SEARCH,
                                                       scoring='roc_auc',
                                                       cv=4, n_jobs=1, random_state = random_seed)

            
            random_search.fit(training_features_model, training_labels_model)
            
            update_model_ks_fi = random_search.best_estimator_
            
            

            end_hyperparam_tunning_update = time.time() - begin_hyperparam_tunning_update
            
            total_hyperparam_fh_ks_fi = total_hyperparam_fh_ks_fi + end_hyperparam_tunning_update
            
            
            
            
            update_model_ks_fi.fit(training_features_model, training_labels_model)
            
            end_train_fh_ks_fi = time.time() - begin_train_fh_ks_fi
        
            total_train_fh_fi = total_train_fh_fi + end_train_fh_ks_fi
        
        
        # evaluate model on testing data
        
        begin_test_time_ks_fi = time.time()
        predictions_test_updated = update_model_ks_fi.predict(testing_features_model)
        end_test_time_ks_fi = time.time() - begin_test_time_ks_fi
        
        total_test_time_ks_fi = total_test_time_ks_fi + end_test_time_ks_fi
        

        partial_roc_auc_ks_fi_model.append(roc_auc_score(testing_labels_model, predictions_test_updated))
        predictions_test_ks_fi_model = np.concatenate([predictions_test_ks_fi_model, predictions_test_updated])
        
        
        print('Predictions Test Batch', len(predictions_test_updated))
        print('Prediction Test All', len(predictions_test_ks_fi_model))
        
        
        # Drift Detection
        
        need_to_retrain = 0
        
        print('MODEL', update_model_ks_fi)
        
        drift_time_start = time.time()
        
        # Extract Most Important Features
        feature_importance_extraction_start = time.time()
        important_features = important_features_extraction(update_model_ks_fi, features_disk_failure)
        print('Important Features', important_features)
        print(len(important_features))

        # filter non-important features from train and test

        training_important_features_model = filtering_non_important_features(training_features_model, features_disk_failure, important_features)
        testing_important_features_model = filtering_non_important_features(testing_features_model, features_disk_failure, important_features)

        feature_importance_extraction_end = time.time() - feature_importance_extraction_start
        
        
        # Detect Drift
        
        drift_alert, distribution_extraction_time, ks_test_time = ks_drift_detection(training_important_features_model, testing_important_features_model)
        drift_time_end = time.time() - drift_time_start
        
        
        
        
        
        total_distribution_extraction_time = total_distribution_extraction_time + distribution_extraction_time
        total_stat_test_time = total_stat_test_time + ks_test_time
        total_feature_importance_extraction_time = total_feature_importance_extraction_time + feature_importance_extraction_end
        total_drift_detection_time = total_drift_detection_time + drift_time_end
        
        
        detected_drifts.append(drift_alert)
        
        
        
        if(drift_alert==1):
        
            need_to_retrain = 1
            drift_alert = 0

       
       
            
            print('CHANGE OF TRAINING')

            no_necessary_retrainings = no_necessary_retrainings + 1
            necessary_label_annotation_effort = necessary_label_annotation_effort + len(testing_labels)
            
            # add new data to the training for full history approach
            current_training_batches_list.append(i)
            
            training_features_list_updated = [feature_list[i] for i in current_training_batches_list]
            training_labels_list_updated = [label_list[i] for i in current_training_batches_list]
        
            training_features = np.vstack(training_features_list_updated)
            training_labels = np.hstack(training_labels_list_updated)

        
        print('Current Training Batches',current_training_batches_list)
    
    
    df_results_ks_fi_model = pd.DataFrame(columns=['Random_Seed', 'Model', 'Drifts_Overall',  'ROC_AUC_Batch', 'ROC_AUC_BATCH_MEAN', 'ROC_AUC_Total', 'Predictions', 'True_Testing_Labels', 'Train_Time', 'Hyperparam_Tunning_Time', 'Test_Time', 'Drifts_Detected', 'Drift_Detection_Total_Time', 'FI_Extraction_Time', 'Distribution_Extraction_Time', 'Statistical_Test_Time', 'Label_Costs'])
    df_results_ks_fi_model.loc[0] = [random_seed, 'KS_FI', str(no_necessary_retrainings)+'/'+str(len(detected_drifts)), partial_roc_auc_ks_fi_model, np.mean(partial_roc_auc_ks_fi_model), roc_auc_score(true_testing_labels, predictions_test_ks_fi_model), predictions_test_ks_fi_model, true_testing_labels, total_train_fh_fi, total_hyperparam_fh_ks_fi, total_test_time_ks_fi, detected_drifts, total_drift_detection_time, total_feature_importance_extraction_time, total_distribution_extraction_time, total_stat_test_time, necessary_label_annotation_effort]
    
    
    df_results_disk = pd.concat([df_results_disk, df_results_ks_fi_model])
    df_results_disk = df_results_disk.reset_index(drop=True)
    df_results_disk.to_csv('./results/ks_FI_fh_model_backblaze_data_green.csv')
    


Random Seed 0


  0%|          | 0/6 [00:00<?, ?it/s]

Initial Training Batches [0, 1, 2, 3, 4, 5]
LEN TRAINING 1782
RETRAINING MODEL
Predictions Test Batch 13866
Prediction Test All 13866
MODEL RandomForestClassifier(criterion='entropy', max_depth=30, min_samples_leaf=2,
                       min_samples_split=4, n_estimators=95, random_state=0)
Important Features ['smart_5_raw', 'smart_5_raw_diff', 'smart_193_raw', 'smart_9_raw', 'smart_4_raw']
5



`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  distribution_reference = sns.distplot(np.array(reference_data)).get_lines()[0].get_data()[1]

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  distribution_test = sns.distplot(np.array(testing_data)).get_lines()[0].get_data()[1]
 17%|█▋        | 1/6 [00:41<03:25, 41.08s/it]

CHANGE OF TRAINING
Current Training Batches [0, 1, 2, 3, 4, 5, 6]
LEN TRAINING 2145
RETRAINING MODEL
Predictions Test Batch 13866
Prediction Test All 27732
MODEL RandomForestClassifier(bootstrap=False, max_depth=90, max_features='log2',
                       min_samples_leaf=2, min_samples_split=8, n_estimators=95,
                       random_state=0)
Important Features ['smart_5_raw', 'smart_5_raw_diff', 'smart_193_raw', 'smart_9_raw', 'smart_187_raw']
5



`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  distribution_reference = sns.distplot(np.array(reference_data)).get_lines()[0].get_data()[1]

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  distribution_test = sns.distplot(np.array(testing_data)).get_lines()[0].get_data()[1]
 33%|███▎      | 2/6 [01:28<02:58, 44.58s/it]

CHANGE OF TRAINING
Current Training Batches [0, 1, 2, 3, 4, 5, 6, 7]
LEN TRAINING 2519
RETRAINING MODEL
Predictions Test Batch 13866
Prediction Test All 41598
MODEL RandomForestClassifier(bootstrap=False, class_weight='balanced',
                       criterion='entropy', max_depth=10, max_features='log2',
                       min_samples_leaf=2, min_samples_split=8, n_estimators=89,
                       random_state=0)
Important Features ['smart_5_raw', 'smart_193_raw', 'smart_9_raw', 'smart_5_raw_diff', 'smart_4_raw']
5



`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  distribution_reference = sns.distplot(np.array(reference_data)).get_lines()[0].get_data()[1]

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  distribution_test = sns.distplot(np.array(testing_data)).get_lines()[0].get_data()[1]
 50%|█████     | 3/6 [02:16<02:19, 46.44s/it]

CHANGE OF TRAINING
Current Training Batches [0, 1, 2, 3, 4, 5, 6, 7, 8]
LEN TRAINING 2871
RETRAINING MODEL
Predictions Test Batch 13955
Prediction Test All 55553
MODEL RandomForestClassifier(bootstrap=False, class_weight='balanced', max_depth=30,
                       min_samples_leaf=2, min_samples_split=4, n_estimators=67,
                       random_state=0)
Important Features ['smart_5_raw', 'smart_193_raw', 'smart_9_raw', 'smart_4_raw', 'smart_5_raw_diff', 'smart_187_raw', 'smart_7_raw']
7



`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  distribution_reference = sns.distplot(np.array(reference_data)).get_lines()[0].get_data()[1]

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  distribution_test = sns.distplot(np.array(testing_data)).get_lines()[0].get_data()[1]
 67%|██████▋   | 4/6 [03:10<01:38, 49.16s/it]

CHANGE OF TRAINING
Current Training Batches [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
LEN TRAINING 3157
RETRAINING MODEL
Predictions Test Batch 13954
Prediction Test All 69507
MODEL RandomForestClassifier(bootstrap=False, class_weight='balanced',
                       criterion='entropy', max_depth=110, max_features='log2',
                       min_samples_leaf=2, min_samples_split=8, n_estimators=30,
                       random_state=0)
Important Features ['smart_5_raw', 'smart_5_raw_diff', 'smart_193_raw', 'smart_9_raw', 'smart_4_raw', 'smart_7_raw']
6



`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  distribution_reference = sns.distplot(np.array(reference_data)).get_lines()[0].get_data()[1]

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  distribution_test = sns.distplot(np.array(testing_data)).get_lines()[0].get_data()[1]
 83%|████████▎ | 5/6 [04:04<00:51, 51.08s/it]

Current Training Batches [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
LEN TRAINING 3157
Predictions Test Batch 13953
Prediction Test All 83460
MODEL RandomForestClassifier(bootstrap=False, class_weight='balanced',
                       criterion='entropy', max_depth=110, max_features='log2',
                       min_samples_leaf=2, min_samples_split=8, n_estimators=30,
                       random_state=0)
Important Features ['smart_5_raw', 'smart_5_raw_diff', 'smart_193_raw', 'smart_9_raw', 'smart_4_raw', 'smart_7_raw']
6



`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  distribution_reference = sns.distplot(np.array(reference_data)).get_lines()[0].get_data()[1]

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  distribution_test = sns.distplot(np.array(testing_data)).get_lines()[0].get_data()[1]
100%|██████████| 6/6 [04:05<00:00, 40.84s/it]


Current Training Batches [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
Random Seed 1


  0%|          | 0/6 [00:00<?, ?it/s]

Initial Training Batches [0, 1, 2, 3, 4, 5]
LEN TRAINING 1782
RETRAINING MODEL
Predictions Test Batch 13866
Prediction Test All 13866
MODEL RandomForestClassifier(class_weight='balanced', max_features='log2',
                       min_samples_leaf=2, min_samples_split=4, n_estimators=63,
                       random_state=1)
Important Features ['smart_5_raw', 'smart_9_raw', 'smart_193_raw', 'smart_5_raw_diff', 'smart_4_raw', 'smart_12_raw', 'smart_187_raw']
7



`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  distribution_reference = sns.distplot(np.array(reference_data)).get_lines()[0].get_data()[1]

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  distribution_test = sns.distplot(np.array(testing_data)).get_lines()[0].get_data()[1]
 17%|█▋        | 1/6 [00:45<03:46, 45.35s/it]

Current Training Batches [0, 1, 2, 3, 4, 5]
LEN TRAINING 1782
Predictions Test Batch 13866
Prediction Test All 27732
MODEL RandomForestClassifier(class_weight='balanced', max_features='log2',
                       min_samples_leaf=2, min_samples_split=4, n_estimators=63,
                       random_state=1)
Important Features ['smart_5_raw', 'smart_9_raw', 'smart_193_raw', 'smart_5_raw_diff', 'smart_4_raw', 'smart_12_raw', 'smart_187_raw']
7



`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  distribution_reference = sns.distplot(np.array(reference_data)).get_lines()[0].get_data()[1]

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  distribution_test = sns.distplot(np.array(testing_data)).get_lines()[0].get_data()[1]
 33%|███▎      | 2/6 [00:45<01:15, 19.00s/it]

CHANGE OF TRAINING
Current Training Batches [0, 1, 2, 3, 4, 5, 7]
LEN TRAINING 2156
RETRAINING MODEL
Predictions Test Batch 13866
Prediction Test All 41598
MODEL RandomForestClassifier(criterion='entropy', max_depth=10, max_features='log2',
                       min_samples_leaf=2, min_samples_split=4, n_estimators=24,
                       random_state=1)
Important Features ['smart_5_raw', 'smart_9_raw', 'smart_5_raw_diff', 'smart_193_raw', 'smart_187_raw']
5



`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  distribution_reference = sns.distplot(np.array(reference_data)).get_lines()[0].get_data()[1]

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  distribution_test = sns.distplot(np.array(testing_data)).get_lines()[0].get_data()[1]
 50%|█████     | 3/6 [01:35<01:38, 32.97s/it]

CHANGE OF TRAINING
Current Training Batches [0, 1, 2, 3, 4, 5, 7, 8]
LEN TRAINING 2508
RETRAINING MODEL
Predictions Test Batch 13955
Prediction Test All 55553
MODEL RandomForestClassifier(class_weight='balanced', max_depth=10,
                       min_samples_leaf=2, min_samples_split=4, n_estimators=52,
                       random_state=1)
Important Features ['smart_5_raw', 'smart_9_raw', 'smart_193_raw', 'smart_5_raw_diff', 'smart_187_raw', 'smart_12_raw', 'smart_4_raw']
7



`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  distribution_reference = sns.distplot(np.array(reference_data)).get_lines()[0].get_data()[1]

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  distribution_test = sns.distplot(np.array(testing_data)).get_lines()[0].get_data()[1]
 67%|██████▋   | 4/6 [02:23<01:17, 38.82s/it]

CHANGE OF TRAINING
Current Training Batches [0, 1, 2, 3, 4, 5, 7, 8, 9]
LEN TRAINING 2794
RETRAINING MODEL
Predictions Test Batch 13954
Prediction Test All 69507
MODEL RandomForestClassifier(bootstrap=False, class_weight='balanced',
                       criterion='entropy', max_depth=110, min_samples_leaf=2,
                       min_samples_split=8, n_estimators=66, random_state=1)
Important Features ['smart_5_raw', 'smart_9_raw', 'smart_5_raw_diff', 'smart_193_raw', 'smart_187_raw', 'smart_4_raw', 'smart_12_raw', 'smart_7_raw']
8



`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  distribution_reference = sns.distplot(np.array(reference_data)).get_lines()[0].get_data()[1]

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  distribution_test = sns.distplot(np.array(testing_data)).get_lines()[0].get_data()[1]
 83%|████████▎ | 5/6 [03:17<00:44, 44.37s/it]

CHANGE OF TRAINING
Current Training Batches [0, 1, 2, 3, 4, 5, 7, 8, 9, 10]
LEN TRAINING 3080
RETRAINING MODEL
Predictions Test Batch 13953
Prediction Test All 83460
MODEL RandomForestClassifier(bootstrap=False, class_weight='balanced',
                       criterion='entropy', max_depth=50, max_features='log2',
                       min_samples_leaf=4, n_estimators=85, random_state=1)
Important Features ['smart_5_raw', 'smart_5_raw_diff', 'smart_9_raw', 'smart_193_raw', 'smart_187_raw', 'smart_12_raw', 'smart_4_raw', 'smart_7_raw']
8



`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  distribution_reference = sns.distplot(np.array(reference_data)).get_lines()[0].get_data()[1]

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  distribution_test = sns.distplot(np.array(testing_data)).get_lines()[0].get_data()[1]
100%|██████████| 6/6 [04:19<00:00, 43.33s/it]


Current Training Batches [0, 1, 2, 3, 4, 5, 7, 8, 9, 10]
Random Seed 2


  0%|          | 0/6 [00:00<?, ?it/s]

Initial Training Batches [0, 1, 2, 3, 4, 5]
LEN TRAINING 1782
RETRAINING MODEL


  0%|          | 0/6 [00:04<?, ?it/s]

KeyboardInterrupt



In [52]:
df_results_disk

Unnamed: 0,Random_Seed,Model,Drifts_Overall,ROC_AUC_Batch,ROC_AUC_BATCH_MEAN,ROC_AUC_Total,Predictions,True_Testing_Labels,Train_Time,Hyperparam_Tunning_Time,Test_Time,Drifts_Detected,Drift_Detection_Total_Time,FI_Extraction_Time,Distribution_Extraction_Time,Statistical_Test_Time,Label_Costs
0,0,KS_FI,4/6,"[0.9224351517780277, 0.95450872656755, 0.98253...",0.957259,0.956257,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[False, False, False, False, False, False, Fal...",242.211063,241.291457,0.269554,"[1, 1, 1, 1, 0, 0]",2.162277,0.037782,2.123311,0.001153,55553
1,1,KS_FI,4/6,"[0.937405939683103, 0.9539303575681284, 0.9664...",0.965515,0.964043,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[False, False, False, False, False, False, Fal...",256.807663,255.915696,0.271374,"[0, 1, 1, 1, 1, 0]",2.499732,0.037984,2.460605,0.001109,55641


!!!! all tested, clean code and push on github