In [1]:
from datetime import date, datetime, timedelta
import pandas as pd

import numpy as np
import math
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample
from sklearn import metrics, preprocessing
from sklearn.model_selection import KFold

from sklearn.decomposition import PCA
from scipy import stats
import seaborn as sns

import os
from tqdm import tqdm

import matplotlib.pyplot as plt

from sklearn.metrics import roc_auc_score

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
def obtain_intervals(dataset):
    '''
    Generate interval terminals, so that samples in each interval have:
        interval_i = (timestamp >= terminal_i) and (timestamp < terminal_{i+1})

    Args:
        dataset (chr): Assuming only Backblaze (b) and Google (g) datasets exists
    '''
    if dataset == 'g':
        # time unit in Google: millisecond, tracing time: 29 days
        start_time = 604046279
        unit_period = 24 * 60 * 60 * 1000 * 1000  # unit period: one day
        end_time = start_time + 28*unit_period
    elif dataset == 'b':
        # time unit in Backblaze: month, tracing time: one year (12 months)
        start_time = 1
        unit_period = 1  # unit period: one month
        end_time = start_time + 12*unit_period

    # add one unit for the open-end of range function
    terminals = [i for i in range(start_time, end_time+unit_period, unit_period)]

    return terminals

In [4]:
def obtain_natural_chunks(features, labels, terminals):
    feature_list = []
    label_list = []
    for i in range(len(terminals) - 1):
        idx = np.logical_and(features[:, 0] >= terminals[i], features[:, 0] < terminals[i + 1])
        feature_list.append(features[idx][:, 1:])
        label_list.append(labels[idx])
    return feature_list, label_list


In [5]:
def sorting_features_by_importance(df_feature_importance, features_name):
    # Extract Ranks
    feature_imp_array_string = np.array(((df_feature_importance.FI[period].replace('[','')).replace(']','')).replace('\n', '').split(' '))
    #print('feature_imp_array_string', feature_imp_array_string)

    # convert to float
    feature_imp_array = [float(i) for i in feature_imp_array_string if i != '' ]

    # consider only the most important features (importance > mean(feature_importances))
    # extract mean?? of feature importance
    mean_importance = np.mean(feature_imp_array)

    zipped_features = list(zip(feature_imp_array, features_name))


    sorted_features_zip = sorted(zipped_features, key = lambda x: x[0], reverse = True)

   
    return sorted_features_zip, mean_importance

In [6]:
def KS_on_features(df_train, df_test):
    stat_test = stats.kstest
    distribution_training = sns.distplot(np.array(df_train)).get_lines()[0].get_data()[1]
    plt.close()
    distribution_test = sns.distplot(np.array(df_test)).get_lines()[0].get_data()[1]
    plt.close()
    v, p = stat_test(distribution_training, distribution_test)
    if(p<0.05):
        return 1
    else:
        return 0

In [10]:
def features_labels_preprocessing(DATASET_PATH, dataset):
    
    if(dataset=='b'):
        
        print('Data Reading and Preprocessing')
        
        # set data paths and columns names
        features_disk_failure = ['smart_1_raw', 'smart_4_raw', 'smart_5_raw', 'smart_7_raw', 'smart_9_raw', 'smart_12_raw', 'smart_187_raw', 'smart_193_raw', 'smart_194_raw', 'smart_197_raw', 'smart_199_raw', 
                         'smart_4_raw_diff', 'smart_5_raw_diff', 'smart_9_raw_diff', 'smart_12_raw_diff', 'smart_187_raw_diff', 'smart_193_raw_diff', 'smart_197_raw_diff', 'smart_199_raw_diff']
        columns = ['serial_number', 'date'] + features_disk_failure + ['label']
        
        # read dataset
        df = pd.read_csv(DATASET_PATH, header=None, dtype = 'str').iloc[1:,1:]
        df.columns = columns
        
        # ignore serial number
        df = df[df.columns[1:]]
        
        for feature in features_disk_failure:
            df[feature] = df[feature].astype(float)


        d = {'True': True, 'False': False}
        df['label'] = df['label'].map(d)

        df['label'].unique()

        # transform date to date time
        df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')
        # divide on weeks
        df['date'] = pd.Series(pd.DatetimeIndex(df['date']).day_of_year)
        
        print('Features and Labels Computing')
        
        # features and labels extraction and computation
        features = df[df.columns[:-1]].to_numpy()
        labels = df[df.columns[-1]].to_numpy()
        feature_list, label_list = obtain_natural_chunks(features, labels, obtain_intervals('b'))
        
    elif(dataset=='g'):
        
        print('Data Reading and Preprocessing')
        
        # set data paths and columns names
        features_job_failure = ['User ID', 'Job Name', 'Scheduling Class',
                   'Num Tasks', 'Priority', 'Diff Machine', 'CPU Requested', 'Mem Requested', 'Disk Requested',
                   'Avg CPU', 'Avg Mem', 'Avg Disk', 'Std CPU', 'Std Mem', 'Std Disk']
        columns_initial = ['Job ID', 'Status', 'Start Time', 'End Time'] + features_job_failure
        
        # read dataset
        df = pd.read_csv(DATASET_PATH, header=None)
        df.columns = columns_initial
        df = df.tail(-1)
        # ignore Job ID
        df = df.drop(['Job ID'], axis = 1)
        columns = features_job_failure

        include_end_time = False
        
        print('Features and Labels Preprocessing')
        
        # features and labels preprocessing
        features = df[(['Start Time']+ features_job_failure)].to_numpy()
        labels = (df['Status']==3).to_numpy()

        # FEATURES PREPROCESSING
        offset = (1 if include_end_time else 0)

        # ENCODE USER ID
        le = preprocessing.LabelEncoder()
        features[:, 1+offset] = le.fit_transform(features[:, 1+offset])

        # ENCODE JOB NAME
        le = preprocessing.LabelEncoder()
        features[:, 2+offset] = le.fit_transform(features[:, 2+offset])

        features = features.astype(float)
        
        print('Features and Labels Computing')
        
        # features and labels extraction and computation
        feature_list, label_list = obtain_natural_chunks(features, labels, obtain_intervals('g'))
        
    else:
        print('Incorrect Dataset')
    
    return feature_list, label_list

In [11]:
DATASET_PATH = '../../../Documents/phd_related/AIOps_disk_failure_prediction/raw_data_2015_2017/disk_2015_complete.csv'
interval = 'm'

In [12]:
feature_list, label_list = features_labels_preprocessing(DATASET_PATH, 'b')

Data Reading and Preprocessing
Features and Labels Computing


In [16]:
data_path_drift_localization = '../../../Documents/phd_related/alibaba/results/results_r/rf_concept_drift_localization_disk_r_2015.csv'
random_seeds = ['1234', '4887', '597', '1959', '413', '44', '2969', '4971', '4913', '9591']

In [17]:
df_concept_drift = pd.read_csv(data_path_drift_localization)
df_concept_drift = df_concept_drift.loc[:, ~df_concept_drift.columns.str.contains('^Unnamed')]
df_concept_drift

Unnamed: 0,X,Sig,Y,P,FI,Dataset,Model
0,2,True,0.684234,M1_2,[2.87743809e-02 6.64635439e-02 3.27660839e-01 ...,Backblaze,Random Forests
1,3,False,0.545522,M2_3,[1.84850482e-02 3.55154174e-02 4.27410848e-01 ...,Backblaze,Random Forests
2,4,False,0.578978,M3_4,[0.04567131 0.03080426 0.2707051 0.05105274 0...,Backblaze,Random Forests
3,5,False,0.347826,M4_5,[0.06618511 0.01426565 0.29028097 0.07428556 0...,Backblaze,Random Forests
4,6,False,0.145008,M5_6,[0.06813396 0.06776009 0.28317387 0.04438929 0...,Backblaze,Random Forests
...,...,...,...,...,...,...,...
105,107,True,0.540541,M7_8,[4.09005123e-02 6.43137864e-02 3.10339821e-01 ...,Backblaze,Random Forests
106,108,False,0.333333,M8_9,[4.31084346e-02 3.64856431e-02 3.74805455e-01 ...,Backblaze,Random Forests
107,109,False,0.374386,M9_10,[8.35422822e-02 4.73650430e-02 2.76309777e-01 ...,Backblaze,Random Forests
108,110,False,0.206840,M10_11,[2.04560417e-02 3.25895462e-02 3.75140082e-01 ...,Backblaze,Random Forests


In [19]:
df_concept_drift[df_concept_drift.P == 'M1_2']

Unnamed: 0,X,Sig,Y,P,FI,Dataset,Model
0,2,True,0.684234,M1_2,[2.87743809e-02 6.64635439e-02 3.27660839e-01 ...,Backblaze,Random Forests
11,13,True,0.666691,M1_2,[0.03542423 0.03311551 0.20080723 0.04755055 0...,Backblaze,Random Forests
22,24,False,0.473723,M1_2,[2.35320422e-02 7.51096731e-02 3.07647903e-01 ...,Backblaze,Random Forests
33,35,False,0.190536,M1_2,[2.78214643e-02 4.32873798e-02 2.50674294e-01 ...,Backblaze,Random Forests
44,46,False,0.166728,M1_2,[0.03056104 0.03631676 0.33774967 0.0203512 0...,Backblaze,Random Forests
55,57,False,0.500037,M1_2,[1.97284161e-02 4.51047141e-02 3.14581126e-01 ...,Backblaze,Random Forests
66,68,False,0.450041,M1_2,[2.79540220e-02 6.39735788e-02 3.80769634e-01 ...,Backblaze,Random Forests
77,79,False,0.500037,M1_2,[1.03765207e-02 4.25848171e-02 2.81302782e-01 ...,Backblaze,Random Forests
88,90,False,0.421095,M1_2,[2.78912653e-02 6.92914835e-02 3.44057436e-01 ...,Backblaze,Random Forests
99,101,False,0.380998,M1_2,[2.01282349e-02 7.04049712e-02 3.92014386e-01 ...,Backblaze,Random Forests


In [22]:
features_disk_failure = ['smart_1_raw', 'smart_4_raw', 'smart_5_raw', 'smart_7_raw', 'smart_9_raw', 'smart_12_raw', 'smart_187_raw', 'smart_193_raw', 'smart_194_raw', 'smart_197_raw', 'smart_199_raw', 
                     'smart_4_raw_diff', 'smart_5_raw_diff', 'smart_9_raw_diff', 'smart_12_raw_diff', 'smart_187_raw_diff', 'smart_193_raw_diff', 'smart_197_raw_diff', 'smart_199_raw_diff']


In [24]:
no_overall_correct_all_total = []
no_overall_correct_important_total = []
no_overall_correct_pca_total = []

no_drift_correct_all_total = []
no_drift_correct_important_total = []
no_drift_correct_pca_total = []

no_non_drift_correct_all_total = []
no_non_drift_correct_important_total = []
no_non_drift_correct_pca_total = []


random_seed_all = []



drifts_true_job = []
for k in range(0, len(df_concept_drift.Sig.values)):
    if(df_concept_drift.Sig.values[k]==True):
        drifts_true_job.append(1)
    else:
        drifts_true_job.append(0)

scaler = StandardScaler()


ks_results_all_features = []
ks_results_important_features = []
ks_results_pca_features = []


for period in tqdm(range(0, len(feature_list)-1)):
#for period in tqdm(range(1,2)):


    #print('Before Scale Train', feature_list[period])
    #print('Before Scale Test', feature_list[period+1])

    # extract features train and test
    training_features = scaler.fit_transform(feature_list[period])
    testing_features = scaler.transform(feature_list[period+1])

    # convert numpy array to Pandas Dataframe
    df_train_features = pd.DataFrame(training_features, columns = features_disk_failure)
    df_test_features = pd.DataFrame(testing_features, columns = features_disk_failure)

    # Sort by Feature Importance to avoid         
    sorted_features_zip, mean_importance = sorting_features_by_importance(df_concept_drift, features_disk_failure)



    # Using All Features
    sorted_features_all = [i[1] for i in sorted_features_zip]
    #print('length sorted_features All', len(sorted_features_all))
    df_train_features_sorted_all = df_train_features[sorted_features_all]
    df_test_features_sorted_all = df_test_features[sorted_features_all]

    # Using Most Important Features

    sorted_important_features_filter = [x for x in sorted_features_zip if x[0]>=mean_importance]
    sorted_features_important = [i[1] for i in sorted_important_features_filter]
    #print('length sorted_features Important', len(sorted_features_important))

    df_train_features_sorted_important = df_train_features[sorted_features_important]
    df_test_features_sorted_important = df_test_features[sorted_features_important]

    # Using PCA on Features
    # reduce features dimensionality using PCA
    pca = PCA(n_components = 0.95)
    pca.fit(df_train_features_sorted_all)

    df_train_features_sorted_pca = pca.transform(df_train_features_sorted_all)
    df_test_features_sorted_pca = pca.transform(df_test_features_sorted_all)


    ks_results_all_features.append(KS_on_features(df_train_features_sorted_all, df_test_features_sorted_all))
    ks_results_important_features.append(KS_on_features(df_train_features_sorted_important, df_test_features_sorted_important))
    ks_results_pca_features.append(KS_on_features(df_train_features_sorted_pca, df_test_features_sorted_pca))


print('Results All Features', ks_results_all_features)
print('Results Important Features', ks_results_important_features)
print('Results PCA Features', ks_results_pca_features)


100%|██████████| 11/11 [00:26<00:00,  2.43s/it]

Results All Features [0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0]
Results Important Features [0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0]
Results PCA Features [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]





In [25]:
len(ks_results_all_features)

11

In [26]:
len(ks_results_important_features)

11

In [27]:
len(ks_results_pca_features)

11

In [28]:
len(ks_results_all_features * 10)

110

In [40]:
ks_results_all_features

[0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0]

In [41]:
ks_results_important_features

[0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0]

In [42]:
ks_results_pca_features

[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]

In [29]:
ks_results_all_features_all_rs = ks_results_all_features * 10
ks_results_important_features_all_rs = ks_results_important_features * 10
ks_results_pca_features_all_rs = ks_results_pca_features * 10

In [30]:


no_overall_correct_all = 0
no_drift_correct_all = 0
no_non_drift_correct_all = 0


no_overall_correct_important = 0
no_drift_correct_important = 0
no_non_drift_correct_important = 0

no_overall_correct_pca = 0
no_drift_correct_pca = 0
no_non_drift_correct_pca = 0

for j in tqdm(range(0, len(ks_results_all_features_all_rs))):

    # Overall correct all 3 cases
    if(ks_results_all_features_all_rs[j]==drifts_true_job[j]):
        no_overall_correct_all = no_overall_correct_all + 1

    if(ks_results_important_features_all_rs[j]==drifts_true_job[j]):
        no_overall_correct_important = no_overall_correct_important + 1

    if(ks_results_pca_features_all_rs[j]==drifts_true_job[j]):
        no_overall_correct_pca = no_overall_correct_pca + 1

    # Correctly Identified Drifts all 3 cases

    # All Features
    if(drifts_true_job[j]==1):
        if(ks_results_all_features_all_rs[j]==drifts_true_job[j]):
            no_drift_correct_all = no_drift_correct_all + 1

    # Most Important Features
    if(drifts_true_job[j]==1):
        if(ks_results_important_features_all_rs[j]==drifts_true_job[j]):
            no_drift_correct_important = no_drift_correct_important + 1

    # PCA on Features
    if(drifts_true_job[j]==1):
        if(ks_results_pca_features_all_rs[j]==drifts_true_job[j]):
            no_drift_correct_pca = no_drift_correct_pca + 1

     # Correctly Identified Non-Drift all 3 cases

    # All Features
    if(drifts_true_job[j]==0):
        if(ks_results_all_features_all_rs[j]==drifts_true_job[j]):
            no_non_drift_correct_all = no_non_drift_correct_all + 1

    # Most Important Features
    if(drifts_true_job[j]==0):
        if(ks_results_important_features_all_rs[j]==drifts_true_job[j]):
            no_non_drift_correct_important = no_non_drift_correct_important + 1

    # PCA on Features
    if(drifts_true_job[j]==0):
        if(ks_results_pca_features_all_rs[j]==drifts_true_job[j]):
            no_non_drift_correct_pca = no_non_drift_correct_pca + 1

# Compute Metrics for all Random Seeds

# Overall Correct Predictions
overall_correct_prediction_score_all = no_overall_correct_all/len(drifts_true_job)
overall_correct_prediction_score_important = no_drift_correct_important/len(drifts_true_job)
overall_correct_prediction_score_pca = no_overall_correct_pca/len(drifts_true_job)

no_overall_correct_all_total.append(overall_correct_prediction_score_all)
no_overall_correct_important_total.append(overall_correct_prediction_score_important)
no_overall_correct_pca_total.append(overall_correct_prediction_score_pca)

# Correctly Identified Drifts

drift_correct_prediction_score_all = no_drift_correct_all/len(np.nonzero(drifts_true_job)[0])
drift_correct_prediction_score_important = no_drift_correct_important/len(np.nonzero(drifts_true_job)[0])
drift_correct_prediction_score_pca = no_drift_correct_pca/len(np.nonzero(drifts_true_job)[0])

no_drift_correct_all_total.append(drift_correct_prediction_score_all)
no_drift_correct_important_total.append(drift_correct_prediction_score_important)
no_drift_correct_pca_total.append(drift_correct_prediction_score_pca)


# Correctly Identified Non-Drifts

non_drift_correct_prediction_score_all = no_non_drift_correct_all/(len(drifts_true_job) - len(np.nonzero(drifts_true_job)[0]))
non_drift_correct_prediction_score_important = no_non_drift_correct_important/(len(drifts_true_job) - len(np.nonzero(drifts_true_job)[0]))
non_drift_correct_prediction_score_pca = no_non_drift_correct_pca/(len(drifts_true_job) - len(np.nonzero(drifts_true_job)[0]))


no_non_drift_correct_all_total.append(non_drift_correct_prediction_score_all)
no_non_drift_correct_important_total.append(non_drift_correct_prediction_score_important)
no_non_drift_correct_pca_total.append(non_drift_correct_prediction_score_pca)

#random_seed_all.append([random_seeds[i-1]]*len(feature_list))


100%|██████████| 110/110 [00:00<00:00, 183667.77it/s]


### Results All Features

In [31]:
np.mean(no_overall_correct_all_total)

0.6545454545454545

In [32]:
np.mean(no_drift_correct_all_total)

0.09090909090909091

In [33]:
np.mean(no_non_drift_correct_all_total)

0.7954545454545454

### Results Important Features

In [34]:
np.mean(no_overall_correct_important_total)

0.00909090909090909

In [35]:
np.mean(no_drift_correct_important_total)

0.045454545454545456

In [36]:
np.mean(no_non_drift_correct_important_total)

0.7840909090909091

### Results PCA Features

In [37]:
np.mean(no_overall_correct_pca_total)

0.7272727272727273

In [38]:
np.mean(no_drift_correct_pca_total)

0.045454545454545456

In [39]:
np.mean(no_non_drift_correct_pca_total)

0.8977272727272727