In [None]:
from datetime import date, datetime, timedelta
import pandas as pd

import numpy as np
import math
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample
from sklearn import metrics, preprocessing
from sklearn.model_selection import KFold

from sklearn.ensemble import RandomForestClassifier

from scipy import stats
import seaborn as sns
import os
from tqdm import tqdm

import matplotlib.pyplot as plt

from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix

import time

In [None]:
#random_seeds = ['1234', '4887', '597', '1959', '413', '44', '2969', '4971', '4913', '9591']

In [None]:
# Seed passed as random_state to the random forest constructors and to the
# negative-class resampling in downsampling().
random_seed = 1234

In [None]:
def obtain_intervals(dataset):
    '''
    Generate interval terminals, so that samples in each interval have:
        interval_i = (timestamp >= terminal_i) and (timestamp < terminal_{i+1})

    Args:
        dataset (str): 'g' for the Google trace or 'b' for the Backblaze trace.
    Returns:
        (list of int): evenly spaced terminals including the closing terminal of
        the last interval: 29 terminals (28 intervals) for 'g',
        51 terminals (50 intervals) for 'b'.
    Raises:
        ValueError: if dataset is neither 'g' nor 'b'.
    '''
    if dataset == 'g':
        # One-day step: 24*60*60*1000*1000 is the number of microseconds in a day.
        # NOTE(review): the previous comment called the time unit "millisecond";
        # confirm the unit of the trace timestamps.
        start_time = 604046279
        unit_period = 24 * 60 * 60 * 1000 * 1000
        end_time = start_time + 28*unit_period
    elif dataset == 'b':
        # One-week step (7 days), 50 weeks starting at day 1; this notebook feeds
        # day-of-year values as timestamps for this dataset.
        start_time = 1
        unit_period = 7
        end_time = start_time + 50*unit_period
    else:
        # previously this fell through and failed with an UnboundLocalError
        raise ValueError("Unknown dataset %r: expected 'g' or 'b'" % (dataset,))

    # add one unit to the stop value because range() excludes it
    return list(range(start_time, end_time + unit_period, unit_period))

In [None]:
def obtain_model(model_name):
    '''
    Instantiate the baseline (untuned) classifier for a model name.

    Reads the notebook-level variables N_WORKERS (passed as n_jobs) and
    random_seed (passed as random_state) at call time.

    Args:
        model_name (str): only 'rf' (random forest) is implemented here.
    Returns:
        (instance): a RandomForestClassifier with 50 trees for 'rf';
        None for any other model name.
    '''
    # guard clause: anything except the random forest is unsupported
    if model_name != 'rf':
        return None

    rf_settings = dict(
        n_estimators=50,
        criterion='gini',
        class_weight=None,
        max_depth=None,
        min_impurity_decrease=0.0,
        min_samples_leaf=1,
        min_samples_split=2,
        n_jobs=N_WORKERS,
        random_state=random_seed,
    )
    return RandomForestClassifier(**rf_settings)

In [None]:
def obtain_model_tuned(model_name, dataset):
    '''
    Instantiate a classifier with the hyper-parameters tuned for one dataset.

    Reads the notebook-level variables N_WORKERS (passed as n_jobs) and
    random_seed (passed as random_state) at call time.

    Args:
        model_name (str): only 'rf' (random forest) is implemented here.
        dataset (str): 'g' (Google) or 'b' (Backblaze).
    Returns:
        (instance): a configured RandomForestClassifier, or None when the
        model name or the dataset is not one of the supported options.
    '''
    if model_name != 'rf':
        return None

    if dataset == 'g':
        # max_features='sqrt' replaces the former 'auto': for classifiers 'auto'
        # meant sqrt(n_features), and scikit-learn 1.3 no longer accepts 'auto'.
        return RandomForestClassifier(n_estimators=165, criterion='gini', bootstrap=True, class_weight='balanced',
                                      max_depth=40, max_features='sqrt', min_samples_leaf=4, min_samples_split=8,
                                      n_jobs=N_WORKERS, random_state=random_seed)
    if dataset == 'b':
        return RandomForestClassifier(n_estimators=160, criterion='gini', bootstrap=False, class_weight='balanced',
                                      max_depth=10, max_features='sqrt', min_samples_leaf=4, min_samples_split=8,
                                      n_jobs=N_WORKERS, random_state=random_seed)

    return None

In [None]:
def obtain_natural_chunks(features, labels, terminals):
    '''
    Split samples into consecutive chunks according to their timestamp.

    Column 0 of features is used as the timestamp. Chunk i holds the rows with
    terminals[i] <= timestamp < terminals[i + 1]; the timestamp column is
    dropped from the returned feature chunks.

    Args:
        features (np.ndarray): 2-D array whose first column is the timestamp.
        labels (np.ndarray): one label per row of features.
        terminals (sequence): ordered interval boundaries.
    Returns:
        (tuple): (feature_list, label_list), each with len(terminals) - 1 entries.
    '''
    timestamps = features[:, 0]
    feature_list, label_list = [], []

    # walk over consecutive (lower, upper) boundary pairs
    for lower, upper in zip(terminals[:-1], terminals[1:]):
        in_chunk = np.logical_and(timestamps >= lower, timestamps < upper)
        chunk_rows = features[in_chunk]
        feature_list.append(chunk_rows[:, 1:])
        label_list.append(labels[in_chunk])

    return feature_list, label_list


In [None]:
def downsampling(training_features, training_labels, ratio=10):
    '''
    Randomly under-sample the negative class to at most `ratio` negatives per positive.

    All positive samples are kept. Negatives are drawn without replacement
    using the notebook-level random_seed, and the selected rows keep their
    original relative order.

    Args:
        training_features (np.ndarray): feature matrix, one row per sample.
        training_labels (np.ndarray): boolean-like labels, one per row.
        ratio (int): number of negatives to keep per positive sample.
    Returns:
        (tuple): (resampled_features, resampled_labels)
    '''
    # element-wise comparison on purpose: the labels are an array
    idx_true = np.where(training_labels == True)[0]
    idx_false = np.where(training_labels == False)[0]

    # Sampling without replacement cannot return more negatives than exist;
    # without the cap resample() raises when negatives are scarce.
    n_false = min(len(idx_true) * ratio, len(idx_false))
    idx_false_resampled = resample(idx_false, n_samples=n_false, replace=False, random_state=random_seed)

    # merge both classes and restore the original row order
    idx_resampled = np.concatenate([idx_false_resampled, idx_true])
    idx_resampled.sort()

    resampled_features = training_features[idx_resampled]
    resampled_labels = training_labels[idx_resampled]
    return resampled_features, resampled_labels

Feature Importance Functions

In [None]:
def important_features_extraction(model, features_input):
    '''
    Return the names of the features whose importance is at least the mean importance.

    Args:
        model: fitted estimator exposing `feature_importances_`.
        features_input (list of str): feature names, aligned with the importances.
    Returns:
        (list of str): selected feature names, ordered by decreasing importance
        (ties keep their input order).
    '''
    # pair each importance with its feature name and rank by importance
    ranked_pairs = sorted(
        zip(model.feature_importances_, features_input),
        key=lambda pair: pair[0],
        reverse=True,
    )

    # the mean importance is the selection threshold
    threshold = np.mean([score for score, _ in ranked_pairs])

    selected = []
    for score, name in ranked_pairs:
        if score >= threshold:
            selected.append(name)
    return selected

In [None]:
def filtering_non_important_features(features_array, features_names, important_features_names):
    '''
    Keep only the columns of a feature matrix that belong to the important features.

    Args:
        features_array (array-like): 2-D feature matrix, one column per entry
            of features_names.
        features_names (list of str): column names of features_array, in column order.
        important_features_names (list of str): names of the columns to keep.
    Returns:
        (np.ndarray): the selected columns in their original column order
        (the order of important_features_names is not used).
    '''
    # label the columns so they can be selected by name
    df_features = pd.DataFrame(np.array(features_array), columns=features_names)

    # Select by the function argument. The previous version read the
    # notebook-level variable `important_features` and ignored this parameter.
    keep_mask = df_features.columns.isin(important_features_names)
    df_important_features = df_features[df_features.columns[keep_mask]]

    # back to a plain array for the downstream statistical test
    return df_important_features.to_numpy()

In [None]:
def ks_drift_detection(reference_data, testing_data):
    '''
    Flag drift between reference and testing data with a Kolmogorov-Smirnov test.

    For each input, the y-values of the first line drawn by seaborn's distplot
    are extracted (presumably the estimated density curve); the two resulting
    curves are then compared with scipy's kstest.

    Args:
        reference_data (array-like): reference (training) values.
        testing_data (array-like): values of the period under test.
    Returns:
        (int): 1 if the test p-value is below 0.05, otherwise 0.
    '''
    def _curve_values(data):
        # draw, read the y-data of the first line, then discard the figure
        values = sns.distplot(np.array(data)).get_lines()[0].get_data()[1]
        plt.close()
        return values

    curve_reference = _curve_values(reference_data)
    curve_test = _curve_values(testing_data)

    # two-sample KS test on the extracted curves
    _, p_value = stats.kstest(curve_reference, curve_test)

    return 1 if p_value < 0.05 else 0

In [None]:
# Number of parallel jobs handed to RandomForestClassifier through n_jobs.
N_WORKERS = 1

# Reading Data

In [None]:
# CSV file loaded with pd.read_csv further down; the path is relative to the
# notebook location and has to be adapted to the local copy of the data.
DATASET_PATH = '../../Documents/phd_related/data_sets_concept_drift/AIOps_failure_prediction/disk_failure_2015.csv'
# NOTE(review): `interval` is not read anywhere in the visible part of this notebook.
interval = 'm'

In [None]:
# Names of the 19 model input features: 11 `smart_*_raw` attributes followed by
# 8 `*_raw_diff` attributes (presumably differences of the raw values — TODO confirm).
features_disk_failure = ['smart_1_raw', 'smart_4_raw', 'smart_5_raw', 'smart_7_raw', 'smart_9_raw', 'smart_12_raw', 'smart_187_raw', 'smart_193_raw', 'smart_194_raw', 'smart_197_raw', 'smart_199_raw', 
                         'smart_4_raw_diff', 'smart_5_raw_diff', 'smart_9_raw_diff', 'smart_12_raw_diff', 'smart_187_raw_diff', 'smart_193_raw_diff', 'smart_197_raw_diff', 'smart_199_raw_diff']
# Full column layout assigned to the header-less CSV: identifier, date, features, label.
columns = ['serial_number', 'date'] + features_disk_failure + ['label']

In [None]:
# sanity check: number of model input features
len(features_disk_failure)

In [None]:
# the CSV has no header row, so pandas assigns integer column labels first
df = pd.read_csv(DATASET_PATH, header=None)
# replace the integer labels with the expected column names
df.columns = columns
# drop the first column (serial_number); it is not used as a feature
df = df[df.columns[1:]]

In [None]:
# number of loaded rows
len(df)

In [None]:
# transform date to date time
# parse the 'date' strings (YYYY-MM-DD) into datetime values
df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')

In [None]:
# display the frame after date parsing
df

In [None]:
# Replace each date by its day of the year. The weekly grouping itself happens
# later, when obtain_intervals('b') cuts these day numbers into 7-day intervals.

# original (monthly) implementation
#df['date'] = pd.Series(pd.DatetimeIndex(df['date']).month)

# day-of-year value used as the timestamp column
df['date'] = pd.Series(pd.DatetimeIndex(df['date']).day_of_year)

In [None]:
# every column except the last; column 0 is 'date', which obtain_natural_chunks
# uses as the timestamp and strips from each chunk
features = df[df.columns[:-1]].to_numpy()
# last column: the label
labels = df[df.columns[-1]].to_numpy()

In [None]:
# DIVIDE FEATURES INTO WEEKS
# one (features, labels) chunk per interval returned by obtain_intervals('b')

feature_list, label_list = obtain_natural_chunks(features, labels, obtain_intervals('b'))

In [None]:
# original (monthly) implementation
#months = ['M1_2', 'M2_3', 'M3_4', 'M4_5', 'M5_6', 'M6_7', 'M7_8', 'M8_9', 'M9_10', 'M10_11', 'M11_12']

# one label per pair of consecutive weekly chunks: 'W1_2', 'W2_3', ...
weeks = ['W' + str(week + 1) + '_' + str(week + 2) for week in range(len(feature_list) - 1)]
len(weeks)

In [None]:
# number of time chunks; the first half is used for the initial training and
# the second half is evaluated period by period
num_chunks = len(feature_list)
num_chunks

## True Labels

In [None]:
# ground-truth labels of all evaluated periods (second half of the chunks),
# concatenated in chronological chunk order
true_testing_labels = np.hstack(label_list[num_chunks//2:])
true_testing_labels

# DF Results

In [None]:
# accumulator for the one-row result frames of every strategy;
# re-running this cell resets the collected results
df_results_disk = pd.DataFrame()
df_results_disk

# Building Static Model

In [None]:
# Static baseline: train once on the first half of the chunks and evaluate that
# single model on every chunk of the second half.
# `begin`/`end` measure the wall-clock time of the whole cell.
begin = time.time()

# extracting training features and labels (first half of the chunks)
training_features = np.vstack(feature_list[0: num_chunks//2])
training_labels = np.hstack(label_list[0: num_chunks//2])

# scaling training data; the same fitted scaler is reused for all test chunks
scaler = StandardScaler()
training_features = scaler.fit_transform(training_features)

# downsampling training data
training_features_downsampling, training_labels_downsampling = downsampling(training_features, training_labels)

# training model
t = time.time()
static_model = obtain_model_tuned('rf', 'b')
static_model.fit(training_features_downsampling, training_labels_downsampling)
elapsed = time.time() - t
print('Training time: ', elapsed)

# NOTE(review): total_time_training is not updated in this cell
total_time_training = 0
# predictions of all test chunks, appended chunk by chunk
predictions_test_static_model = []

# true testing labels
true_testing_labels = np.hstack(label_list[num_chunks//2:])


# NOTE(review): len_test is initialised but never updated in this cell
len_test = 0

for i in tqdm(range(num_chunks//2, num_chunks)):
    
    # obtain testing features and labels
    testing_features = feature_list[i]
    testing_labels = label_list[i]
    
    # scaling testing features with the scaler fitted on the training data
    testing_features = scaler.transform(testing_features)
    
    # evaluate model on testing data
    print('Testing models on period', i + 1)
    predictions_test_updated = static_model.predict(testing_features)
    predictions_test_static_model = np.concatenate([predictions_test_static_model, predictions_test_updated])
    # progress output: size of this chunk and number of predictions so far
    print(len(testing_labels))
    print(len(predictions_test_static_model))
    
# elapsed wall-clock seconds of the whole cell
end = time.time() - begin


In [None]:
# Number of evaluated periods: the second half of the chunks. Derived from
# num_chunks instead of hard-coding 25, so the row stays consistent if the
# number of intervals changes.
n_test_periods = num_chunks - num_chunks//2

# One-row summary of the static baseline: it is never retrained, so no drifts
# are recorded and no labels are required.
df_results_static = pd.DataFrame(columns=['Random_Seed', 'Model', 'Scenario', 'Drifts', 'ROC_AUC', 'Run_Time', 'Drifts_Detected', 'Label_Costs'])
df_results_static.loc[0] = [random_seed, 'static', '-', '0/' + str(n_test_periods), roc_auc_score(true_testing_labels, predictions_test_static_model), end, np.zeros(n_test_periods, dtype=int), 0.0]

In [None]:
# append the static baseline row to the overall results
# (re-running this cell appends the row again)
df_results_disk = pd.concat([df_results_disk, df_results_static])
df_results_disk

# Build Periodical Model Update

In [None]:
# Periodic retraining: before every evaluated chunk a new model is trained on
# a sliding window holding the most recent num_chunks//2 chunks.
total_time_training = 0
predictions_test = []

# wall-clock time of the whole procedure
begin = time.time()

# Initial training window: the first half of the chunks. It is built once here;
# it was previously rebuilt from a growing slice on every iteration although
# only the first iteration used it.
training_features_init = np.vstack(feature_list[0: num_chunks//2])
training_labels_init = np.hstack(label_list[0: num_chunks//2])
training_features = training_features_init
training_labels = training_labels_init

for i in tqdm(range(num_chunks//2, num_chunks)):

    # scaler and downsampling for training data (a fresh scaler per window)
    update_scaler = StandardScaler()
    training_features = update_scaler.fit_transform(training_features)
    training_features, training_labels = downsampling(training_features, training_labels)

    # obtain testing features and labels
    testing_features = feature_list[i]
    testing_labels = label_list[i]

    # scaling testing features with the scaler fitted on the current window
    testing_features = update_scaler.transform(testing_features)

    # train model & track time
    t = time.time()
    update_model = obtain_model_tuned('rf', 'b')
    update_model.fit(training_features, training_labels)
    elapsed = time.time() - t

    total_time_training = total_time_training + elapsed

    # evaluate model on testing data
    print('Testing models on period', i + 1)
    predictions_test_updated = update_model.predict(testing_features)
    predictions_test = np.concatenate([predictions_test, predictions_test_updated])

    # slide the window: drop the oldest chunk, add the chunk just evaluated
    training_features = np.vstack(feature_list[i + 1 - num_chunks//2: i+1])
    training_labels = np.hstack(label_list[i + 1 - num_chunks//2: i+1])

# elapsed wall-clock seconds of the whole procedure
end = time.time() - begin


In [None]:
# Number of evaluated periods: the second half of the chunks. Derived from
# num_chunks instead of hard-coding 25.
n_test_periods = num_chunks - num_chunks//2

# One-row summary of the periodic strategy: the model is retrained before every
# evaluated period, so every period counts as an update and the label cost is
# the number of all test labels.
df_results_periodic = pd.DataFrame(columns=['Random_Seed', 'Model', 'Scenario', 'Drifts', 'ROC_AUC', 'Run_Time', 'Drifts_Detected', 'Label_Costs'])
df_results_periodic.loc[0] = [random_seed, 'periodic', '-', str(n_test_periods) + '/' + str(n_test_periods), roc_auc_score(true_testing_labels, predictions_test), end, np.ones(n_test_periods, dtype=int), len(true_testing_labels)]

In [None]:
# append the periodic-retraining row to the overall results
# (re-running this cell appends the row again)
df_results_disk = pd.concat([df_results_disk, df_results_periodic])
df_results_disk

# Build Drift Detection based Model Update

### KS on all features

In [None]:
# Drift-triggered retraining with a KS test on all features: a model is fitted
# on the current training window for every evaluated chunk, and the window is
# moved forward only when drift between the window and the chunk is detected.
detected_drifts = []
total_time_training = 0
predictions_test_dd_sc1 = []

no_necessary_retrainings = 0
necessary_label_annotation_effort = 0
overall_total_time_training = 0

# wall-clock time of the whole procedure
begin = time.time()

# number of test samples in the periods where drift was detected
length_drifts_detected = 0

# Initial training window: the first half of the chunks. It is built once here;
# it was previously rebuilt from a growing slice on every iteration although
# only the first iteration used it.
training_features_init = np.vstack(feature_list[0: num_chunks//2])
training_labels_init = np.hstack(label_list[0: num_chunks//2])
training_features = training_features_init
training_labels = training_labels_init

for i in tqdm(range(num_chunks//2, num_chunks)):

    print('Evaluated Period', i + 1)
    print(i)
    print(num_chunks//2)

    print('Training for Model before Scaling', training_features)
    print(len(training_features))

    # scaler and downsampling for training data; the unscaled window is kept in
    # training_features so it can be reused when no drift is detected
    update_scaler = StandardScaler()
    training_features_model = update_scaler.fit_transform(training_features)
    training_features_model, training_labels_model = downsampling(training_features_model, training_labels)

    # obtain testing features and labels
    testing_features = feature_list[i]
    testing_labels = label_list[i]

    # scaling testing features with the scaler fitted on the current window
    testing_features_model = update_scaler.transform(testing_features)

    # model train and prediction extractions
    t = time.time()
    update_model_dd = obtain_model_tuned('rf', 'b')
    update_model_dd.fit(training_features_model, training_labels_model)
    elapsed = time.time() - t
    total_time_training = total_time_training + elapsed

    # NOTE(review): this adds the running total, not `elapsed`, so the value
    # grows faster than the summed training time; it is not read in this cell.
    overall_total_time_training = overall_total_time_training + total_time_training

    predictions_test_current = update_model_dd.predict(testing_features_model)
    predictions_test_dd_sc1 = np.concatenate([predictions_test_dd_sc1, predictions_test_current])

    # check for concept drift between the scaled, downsampled training data and
    # the scaled test chunk (same test as before, now through the shared helper)
    drift_alert = ks_drift_detection(training_features_model, testing_features_model)
    detected_drifts.append(drift_alert)

    # Adjust Training in case of Concept Drift, otherwise keep the previous training
    if(drift_alert==1):

        length_drifts_detected = length_drifts_detected + len(testing_labels)

        print('CHANGE OF TRAINING')

        no_necessary_retrainings = no_necessary_retrainings + 1
        necessary_label_annotation_effort = necessary_label_annotation_effort + len(testing_labels)

        # new window: the most recent num_chunks//2 chunks, including the chunk just evaluated
        training_features = np.vstack(feature_list[i + 1 - num_chunks//2: i+1])
        training_labels = np.hstack(label_list[i + 1 - num_chunks//2: i+1])

        print('New Training', training_features)
        print(len(training_features))

# elapsed wall-clock seconds of the whole procedure
end = time.time() - begin


In [None]:
# One-row summary of the KS-on-all-features run (scenario 1).
results_columns_ede = ['Random_Seed', 'Model', 'Scenario', 'Drifts', 'ROC_AUC', 'Run_Time', 'Drifts_Detected', 'Label_Costs']
df_results_ede_sc1 = pd.DataFrame(columns=results_columns_ede)

# "<retrainings>/<evaluated periods>"
drift_ratio_ede = str(no_necessary_retrainings)+'/'+str(len(detected_drifts))
roc_auc_ede = roc_auc_score(true_testing_labels, predictions_test_dd_sc1)

df_results_ede_sc1.loc[0] = [random_seed, 'EDE', '1', drift_ratio_ede, roc_auc_ede, end, detected_drifts, length_drifts_detected]

In [None]:
# append the KS-on-all-features (EDE) row to the overall results
# (re-running this cell appends the row again)
df_results_disk = pd.concat([df_results_disk, df_results_ede_sc1])
df_results_disk

### McUDI

In [None]:
# McUDI: drift-triggered retraining where the KS test is applied only to the
# features whose importance in the freshly fitted model is at least the mean
# importance. The training window moves forward only when drift is detected.
detected_drifts = []
total_time_training = 0
predictions_test_dd2_sc1 = []

no_necessary_retrainings = 0
necessary_label_annotation_effort = 0
overall_total_time_training = 0

# number of test samples in the periods where drift was detected
length_drifts_detected = 0

# wall-clock time of the whole procedure
begin = time.time()

# Initial training window: the first half of the chunks. It is built once here;
# it was previously rebuilt from a growing slice on every iteration although
# only the first iteration used it.
training_features_init = np.vstack(feature_list[0: num_chunks//2])
training_labels_init = np.hstack(label_list[0: num_chunks//2])
training_features = training_features_init
training_labels = training_labels_init

for i in tqdm(range(num_chunks//2, num_chunks)):

    print('Evaluated Period', i + 1)

    print('Training for Model before Scaling', training_features)
    print(len(training_features))

    # scaler and downsampling for training data; the unscaled window is kept in
    # training_features so it can be reused when no drift is detected
    update_scaler = StandardScaler()
    training_features_model = update_scaler.fit_transform(training_features)
    training_features_model, training_labels_model = downsampling(training_features_model, training_labels)

    # obtain testing features and labels
    testing_features = feature_list[i]
    testing_labels = label_list[i]

    # scaling testing features with the scaler fitted on the current window
    testing_features_model = update_scaler.transform(testing_features)

    # model train and prediction extractions
    t = time.time()
    update_model_dd = obtain_model_tuned('rf', 'b')
    update_model_dd.fit(training_features_model, training_labels_model)
    elapsed = time.time() - t
    total_time_training = total_time_training + elapsed

    # NOTE(review): this adds the running total, not `elapsed`, so the value
    # grows faster than the summed training time; it is not read in this cell.
    overall_total_time_training = overall_total_time_training + total_time_training

    predictions_test_current = update_model_dd.predict(testing_features_model)
    predictions_test_dd2_sc1 = np.concatenate([predictions_test_dd2_sc1, predictions_test_current])

    # check for concept drift in the data

    # extract important features from the model fitted in this iteration
    important_features = important_features_extraction(update_model_dd, features_disk_failure)
    print('Important Features', important_features)
    print('len imp feats', len(important_features))

    # filter non-important features from train and test
    training_important_features_model = filtering_non_important_features(training_features_model, features_disk_failure, important_features)
    testing_important_features_model = filtering_non_important_features(testing_features_model, features_disk_failure, important_features)

    # KS-based drift test restricted to the important features
    drift_alert = ks_drift_detection(training_important_features_model, testing_important_features_model)
    detected_drifts.append(drift_alert)

    if(drift_alert==1):

        length_drifts_detected = length_drifts_detected + len(testing_labels)

        print('CHANGE OF TRAINING AT ', i - num_chunks//2 + 1)

        no_necessary_retrainings = no_necessary_retrainings + 1
        necessary_label_annotation_effort = necessary_label_annotation_effort + len(testing_labels)

        # new window: the most recent num_chunks//2 chunks, including the chunk just evaluated
        training_features = np.vstack(feature_list[i + 1 - num_chunks//2: i + 1])
        training_labels = np.hstack(label_list[i + 1 - num_chunks//2: i + 1])

        print('New Training', training_features)
        print(len(training_features))

# elapsed wall-clock seconds of the whole procedure
end = time.time() - begin

   

In [None]:
# empty results frame for the McUDI run; its single row is filled in the next cell
df_results_mcudi_sc1 = pd.DataFrame(columns=['Random_Seed', 'Model', 'Scenario', 'Drifts', 'ROC_AUC', 'Run_Time', 'Drifts_Detected', 'Label_Costs'])


In [None]:
# One-row summary of the McUDI run (scenario 1).
# "<retrainings>/<evaluated periods>"
drift_ratio_mcudi = str(no_necessary_retrainings)+'/'+str(len(detected_drifts))
roc_auc_mcudi = roc_auc_score(true_testing_labels, predictions_test_dd2_sc1)

df_results_mcudi_sc1.loc[0] = [random_seed, 'McUDI', '1', drift_ratio_mcudi, roc_auc_mcudi, end, detected_drifts, length_drifts_detected]

In [None]:
# append the McUDI row to the overall results
# (re-running this cell appends the row again)
df_results_disk = pd.concat([df_results_disk, df_results_mcudi_sc1])
df_results_disk

### ROC AUC Scenario 1

#### Static

In [None]:
# mean ROC AUC over all collected rows of the static model
np.mean(df_results_disk[df_results_disk.Model=='static'].ROC_AUC)

#### Periodic

In [None]:
# mean ROC AUC over all collected rows of the periodically retrained model
np.mean(df_results_disk[df_results_disk.Model=='periodic'].ROC_AUC)

#### EDE

In [None]:
# mean ROC AUC over all collected rows of the KS-on-all-features (EDE) strategy
np.mean(df_results_disk[df_results_disk.Model=='EDE'].ROC_AUC)

#### McUDI

In [None]:
# mean ROC AUC over all collected rows of the McUDI strategy
np.mean(df_results_disk[df_results_disk.Model=='McUDI'].ROC_AUC)