In [1]:
from datetime import date, datetime, timedelta
import pandas as pd
from utilities import obtain_period_data, obtain_metrics

import numpy as np
import math
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample
from sklearn import metrics, preprocessing
from sklearn.model_selection import KFold

from sklearn.decomposition import PCA
from scipy import stats
import seaborn as sns

import os
from tqdm import tqdm

import matplotlib.pyplot as plt

from sklearn.metrics import roc_auc_score

In [2]:
import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): a blanket 'ignore' also hides deprecation and numerical warnings
# raised by the libraries used below — consider narrowing the filter.
warnings.filterwarnings('ignore')

In [3]:
def obtain_intervals(dataset):
    '''
    Generate interval terminals, so that samples in each interval have:
        interval_i = (timestamp >= terminal_i) and (timestamp < terminal_{i+1})

    Args:
        dataset (str): Dataset key, one of 'g' (Google), 'b' (Backblaze) or 'a' (Alibaba).

    Returns:
        list[int]: Ascending terminals from the start time up to and including
        the end time, spaced one unit period apart (so N terminals delimit N-1 intervals).

    Raises:
        ValueError: If `dataset` is not one of the supported keys.
    '''
    if dataset == 'g':
        # Google: unit period is one day. The multiplier 24*60*60*1000*1000 is one
        # day expressed in microseconds, so timestamps are presumably microseconds
        # (an earlier comment said milliseconds) — TODO confirm against the trace docs.
        start_time = 604046279
        unit_period = 24 * 60 * 60 * 1000 * 1000
        end_time = start_time + 28*unit_period
    elif dataset == 'b':
        # Backblaze: time unit is the month, tracing time one year (12 months).
        start_time = 1
        unit_period = 1  # unit period: one month
        end_time = start_time + 12*unit_period
    elif dataset == 'a':
        # Alibaba: time unit is the second, unit period one week.
        start_time = 494319
        unit_period = 7 * 24 * 60 * 60
        # Skip the first three days: the first week contains only 1642 samples.
        start_time += 3 * 24 * 60 * 60
        end_time = start_time + 8*unit_period
    else:
        # Previously an unknown key fell through and crashed with UnboundLocalError
        # on `start_time`; fail with an explicit message instead.
        raise ValueError(
            "Unknown dataset key {!r}; expected one of 'g', 'b', 'a'".format(dataset)
        )

    # add one unit so that end_time itself is included despite range's open end
    terminals = list(range(start_time, end_time + unit_period, unit_period))

    return terminals

In [4]:
def obtain_natural_chunks(features, labels, terminals):
    """Split samples into consecutive chunks delimited by `terminals`.

    Column 0 of `features` is compared against each half-open window
    [terminals[i], terminals[i + 1]). Rows falling in a window form one chunk;
    column 0 is dropped from the returned feature chunks.

    Returns:
        (feature_list, label_list): two lists with one entry per window.
    """
    feature_chunks, label_chunks = [], []
    for lower, upper in zip(terminals[:-1], terminals[1:]):
        stamps = features[:, 0]
        in_window = (stamps >= lower) & (stamps < upper)
        feature_chunks.append(features[in_window][:, 1:])
        label_chunks.append(labels[in_window])
    return feature_chunks, label_chunks


In [5]:
def sorting_features_by_importance(df_feature_importance, features_name, period_idx=None):
    """Rank feature names by the importances stored for one period.

    Args:
        df_feature_importance: Frame with an `FI` column whose entries are strings
            holding a bracketed, whitespace-separated list of floats,
            e.g. "[0.5 0.1\\n 0.4]".
        features_name: Feature names, in the same order as the importances.
        period_idx: Index label of the `FI` entry to use. When omitted, the
            notebook-global `period` is used, which is what the function always
            did before this parameter existed.

    Returns:
        (sorted_features_zip, mean_importance): a list of (importance, name)
        tuples sorted by descending importance, and the mean of all importances.
    """
    if period_idx is None:
        # Backward-compatible fallback for callers that rely on the loop variable
        # `period` living in the notebook's global scope.
        period_idx = period

    raw_importances = df_feature_importance.FI[period_idx]

    # Strip the brackets and split on any whitespace. Splitting on whitespace
    # (rather than deleting newlines) keeps "0.1\n0.2" as two numbers.
    tokens = raw_importances.replace('[', '').replace(']', '').split()

    # convert to float
    feature_imp_array = [float(token) for token in tokens]

    # the mean importance is returned so callers can keep only features at or above it
    mean_importance = np.mean(feature_imp_array)

    sorted_features_zip = sorted(
        zip(feature_imp_array, features_name),
        key=lambda pair: pair[0],
        reverse=True,
    )

    return sorted_features_zip, mean_importance

In [6]:
def KS_on_features(df_train, df_test):
    """Flag a distribution change between two feature sets with a KS test.

    Each input is converted to a numpy array and drawn with `sns.distplot`; the
    y-values of the first line on the resulting axes are taken as that input's
    "distribution". The two y-value arrays are then passed to `stats.kstest`.

    NOTE(review): the test therefore compares the plotted curve ordinates
    (presumably the KDE curve), not the raw samples — confirm this is intended.

    Returns:
        int: 1 if the p-value is below 0.05, otherwise 0.
    """
    def _curve_ordinates(samples):
        # Draw, read back the y-data of the first line, then close the figure
        # so figures do not pile up across calls.
        ordinates = sns.distplot(np.array(samples)).get_lines()[0].get_data()[1]
        plt.close()
        return ordinates

    train_curve = _curve_ordinates(df_train)
    test_curve = _curve_ordinates(df_test)
    _, p_value = stats.kstest(train_curve, test_curve)
    return 1 if p_value < 0.05 else 0

In [7]:
# CSV with the Alibaba job data, given relative to the notebook's working directory.
DATASET_PATH = '../../../Documents/phd_related/data_sets_concept_drift/AIOps_failure_prediction/alibaba_job_data.csv'
# NOTE(review): `interval` is not referenced by any other cell visible in this
# notebook — confirm whether it is still needed.
interval = 'd'

In [8]:
# Column names: 'start_time' first, followed by the 12 entries that are later
# sliced off as the feature names (columns[1:]).
columns = [
    'start_time',
    'user',
    'task_name',
    'inst_num',
    'plan_cpu',
    'plan_mem',
    'plan_gpu',
    'cpu_usage',
    'gpu_wrk_util',
    'avg_mem',
    'max_mem',
    'avg_gpu_wrk_mem',
    'max_gpu_wrk_mem',
]

In [9]:
len(columns)

13

In [10]:
# Load the raw job table; the first CSV column is used as the row index.
df = pd.read_csv(DATASET_PATH, index_col=0)
# Bare expression so the notebook renders the frame.
df

Unnamed: 0,job_name,start_time,end_time,status,user,task_name,inst_num,plan_cpu,plan_mem,plan_gpu,cpu_usage,gpu_wrk_util,avg_mem,max_mem,avg_gpu_wrk_mem,max_gpu_wrk_mem
0,31fe7a4e1565891f332f2b33,494319.0,,Failed,74238accb90b,1,1.0,600.0,29.296875,25.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,6be512ebc1890951ef8fe9a3,494326.0,,Failed,74238accb90b,1,1.0,600.0,29.296875,25.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,418cf3def0eaa3389c8c743f,516002.0,,Failed,74238accb90b,1,1.0,600.0,29.296875,25.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,6ad04e6d3dd2c35e3a0c3e5f,516023.0,,Failed,74238accb90b,1,1.0,600.0,29.296875,25.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,193e6fdd5cb271f54d85f739,531744.0,1136554.0,Failed,74238accb90b,2,2.0,1200.0,58.593750,5.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
701367,ee183237255c4377b9637594,6450685.0,,Failed,d4d51aca8806,1,10.0,50.0,29.296875,100.0,19.241379,5.166667,0.553563,1.958984,0.232641,0.768555
701368,1df4ef827ff8fb19211ad760,6450729.0,6451098.0,Terminated,c4cbaac9966d,1,1.0,1000.0,19.550781,50.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
701369,4113ec499c025e364a97b440,6450736.0,,Failed,d4d51aca8806,1,10.0,50.0,29.296875,100.0,18.534050,3.215488,0.597005,2.086914,0.304398,1.254883
701370,c1577ac376105aabc2390246,6450758.0,,Failed,d4d51aca8806,1,10.0,50.0,29.296875,100.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [11]:
len(df.columns)

16

In [12]:
# Feature names: every entry of `columns` except the leading 'start_time'.
features_job_failure = columns[1:]
len(features_job_failure)

12

# Data Preprocessing

In [13]:
# obtain_period_data is imported from the local `utilities` module; its internals
# are not visible here. Later cells index feature_list[period] and
# feature_list[period+1], i.e. they treat it as one feature array per period.
# NOTE(review): 'a' is presumably the Alibaba dataset key — confirm in utilities.
feature_list, label_list = obtain_period_data('a')

Loading data from ./../../../Documents/phd_related/data_sets_concept_drift/AIOps_failure_prediction/alibaba_job_data.csv
Load complete
Preprocessing features
Preprocessing complete



In [14]:
# CSV with the drift-localization results that serve as ground truth below
# (its `Sig` and `FI` columns are read by later cells).
data_path_drift_localization = '../../../Documents/phd_related/alibaba/results/results_r/rf_concept_drift_localization_job_alibaba_r_avg.csv'
# NOTE(review): in the cells visible here `random_seeds` is only referenced from
# commented-out code — confirm whether it is still needed.
random_seeds = ['1234', '4887', '597', '1959', '413', '44', '2969', '4971', '4913', '9591']

In [15]:
df_concept_drift = pd.read_csv(data_path_drift_localization)
# Drop every column whose header starts with "Unnamed" (pandas' name for a
# header-less column, typically an index that was written into the CSV).
df_concept_drift = df_concept_drift.loc[:, ~df_concept_drift.columns.str.contains('^Unnamed')]
# Bare expression so the notebook renders the frame.
df_concept_drift

Unnamed: 0,X,Sig,Y,P,FI,Dataset,Model
0,2,False,0.067012,W1_2,[0.50017907 0.03178638 0.08468882 0.12511323 0...,Alibaba,Random Forests
1,3,True,0.177129,W2_3,[0.5076077 0.02704315 0.11099802 0.10480298 0...,Alibaba,Random Forests
2,4,True,0.126323,W3_4,[0.52997286 0.03372237 0.09005644 0.0925118 0...,Alibaba,Random Forests
3,5,True,1.590020,W4_5,[0.53187617 0.02120032 0.08967998 0.11200477 0...,Alibaba,Random Forests
4,6,True,0.087596,W5_6,[0.50906009 0.01642509 0.04265409 0.22677249 0...,Alibaba,Random Forests
...,...,...,...,...,...,...,...
65,67,True,0.115250,W3_4,[0.54680288 0.03169961 0.08175573 0.09081205 0...,Alibaba,Random Forests
66,68,True,1.598895,W4_5,[0.55059148 0.01965702 0.08594441 0.10424598 0...,Alibaba,Random Forests
67,69,True,0.054217,W5_6,[0.5894447 0.02380166 0.04022902 0.18274514 0...,Alibaba,Random Forests
68,70,True,0.495258,W6_7,[0.51463558 0.02823887 0.08189004 0.09881903 0...,Alibaba,Random Forests


In [16]:
# Score accumulators, one list per (metric, feature-set) combination. They are
# appended to by the scoring cell further down.
no_overall_correct_all_total, no_overall_correct_important_total, no_overall_correct_pca_total = [], [], []
no_drift_correct_all_total, no_drift_correct_important_total, no_drift_correct_pca_total = [], [], []
no_non_drift_correct_all_total, no_non_drift_correct_important_total, no_non_drift_correct_pca_total = [], [], []

random_seed_all = []

# Ground-truth drift flags as 0/1 integers, one per row of df_concept_drift
# (1 where `Sig` is True, 0 otherwise).
drifts_true_job = [1 if sig_flag == True else 0 for sig_flag in df_concept_drift.Sig.values]

scaler = StandardScaler()

# KS verdicts per feature set; the loop below appends one entry per period pair.
ks_results_all_features, ks_results_important_features, ks_results_pca_features = [], [], []


# Compare every period with the one that follows it: feature_list[period] is the
# reference ("train") window and feature_list[period+1] the window under test.
for period in tqdm(range(0, len(feature_list)-1)):

    # Fit the scaler on the reference window only, then apply that same scaling
    # to the following window.
    training_features = scaler.fit_transform(feature_list[period])
    testing_features = scaler.transform(feature_list[period+1])

    # convert numpy array to Pandas Dataframe
    df_train_features = pd.DataFrame(training_features, columns = features_job_failure)
    df_test_features = pd.DataFrame(testing_features, columns = features_job_failure)

    # Rank the features by importance. The helper is called without a period; it
    # picks up the loop variable `period` from the notebook's global scope to
    # select the entry of df_concept_drift.FI.
    # NOTE(review): only FI entries 0..len(feature_list)-2 are ever read here,
    # although the frame displayed above has more rows — confirm this is intended.
    sorted_features_zip, mean_importance = sorting_features_by_importance(df_concept_drift, features_job_failure)



    # Using All Features (columns reordered by descending importance)
    sorted_features_all = [i[1] for i in sorted_features_zip]
    df_train_features_sorted_all = df_train_features[sorted_features_all]
    df_test_features_sorted_all = df_test_features[sorted_features_all]

    # Using Most Important Features: keep those whose importance is at least
    # the mean importance.

    sorted_important_features_filter = [x for x in sorted_features_zip if x[0]>=mean_importance]
    sorted_features_important = [i[1] for i in sorted_important_features_filter]

    df_train_features_sorted_important = df_train_features[sorted_features_important]
    df_test_features_sorted_important = df_test_features[sorted_features_important]

    # Using PCA on Features
    # A float n_components asks scikit-learn to keep as many components as are
    # needed to explain that share (95%) of the variance. The projection is
    # fitted on the reference window only and then applied to both windows.
    pca = PCA(n_components = 0.95)
    pca.fit(df_train_features_sorted_all)

    df_train_features_sorted_pca = pca.transform(df_train_features_sorted_all)
    df_test_features_sorted_pca = pca.transform(df_test_features_sorted_all)


    # KS_on_features returns 1 when its p-value is below 0.05 and 0 otherwise.
    ks_results_all_features.append(KS_on_features(df_train_features_sorted_all, df_test_features_sorted_all))
    ks_results_important_features.append(KS_on_features(df_train_features_sorted_important, df_test_features_sorted_important))
    ks_results_pca_features.append(KS_on_features(df_train_features_sorted_pca, df_test_features_sorted_pca))


# One 0/1 verdict per consecutive period pair, for each of the three feature sets.
print('Results All Features', ks_results_all_features)
print('Results Important Features', ks_results_important_features)
print('Results PCA Features', ks_results_pca_features)


100%|██████████| 7/7 [00:44<00:00,  6.35s/it]

Results All Features [1, 1, 1, 0, 0, 1, 1]
Results Important Features [1, 1, 0, 1, 0, 0, 0]
Results PCA Features [1, 1, 0, 0, 1, 0, 0]





In [17]:
ks_results_all_features

[1, 1, 1, 0, 0, 1, 1]

In [18]:
ks_results_important_features

[1, 1, 0, 1, 0, 0, 0]

In [19]:
ks_results_pca_features

[1, 1, 0, 0, 1, 0, 0]

In [20]:
len(ks_results_all_features)

7

In [21]:
len(ks_results_important_features)

7

In [22]:
len(ks_results_pca_features)

7

In [23]:
len(ks_results_all_features * 10)

70

In [24]:
# Repeat the per-period KS verdicts 10 times so the lists can be compared
# position by position with drifts_true_job in the scoring cell.
# NOTE(review): 10 is a magic number — it presumably equals len(random_seeds)
# (one block of ground-truth rows per random seed); confirm and derive it.
ks_results_all_features_all_rs = ks_results_all_features * 10
ks_results_important_features_all_rs = ks_results_important_features * 10
ks_results_pca_features_all_rs = ks_results_pca_features * 10

In [25]:


# Score the KS verdicts of each feature set (all / important / PCA) against the
# ground-truth drift flags in drifts_true_job.
#   overall   : verdict equals the ground truth
#   drift     : ground truth is 1 and the verdict equals it
#   non-drift : ground truth is 0 and the verdict equals it
# NOTE: the *_total lists are created in the earlier setup cell, so re-running
# this cell alone appends a further entry to each of them.

no_overall_correct_all = 0
no_drift_correct_all = 0
no_non_drift_correct_all = 0

no_overall_correct_important = 0
no_drift_correct_important = 0
no_non_drift_correct_important = 0

no_overall_correct_pca = 0
no_drift_correct_pca = 0
no_non_drift_correct_pca = 0

for j in tqdm(range(0, len(ks_results_all_features_all_rs))):
    actual = drifts_true_job[j]
    all_hit = ks_results_all_features_all_rs[j] == actual
    important_hit = ks_results_important_features_all_rs[j] == actual
    pca_hit = ks_results_pca_features_all_rs[j] == actual

    # Overall correct, all 3 cases
    if all_hit:
        no_overall_correct_all += 1
    if important_hit:
        no_overall_correct_important += 1
    if pca_hit:
        no_overall_correct_pca += 1

    if actual == 1:
        # Correctly identified drifts, all 3 cases
        if all_hit:
            no_drift_correct_all += 1
        if important_hit:
            no_drift_correct_important += 1
        if pca_hit:
            no_drift_correct_pca += 1
    elif actual == 0:
        # Correctly identified non-drifts, all 3 cases
        if all_hit:
            no_non_drift_correct_all += 1
        if important_hit:
            no_non_drift_correct_important += 1
        if pca_hit:
            no_non_drift_correct_pca += 1

# Compute Metrics for all Random Seeds

# Denominators: all samples, samples labelled drift, samples labelled non-drift.
n_samples = len(drifts_true_job)
n_drift = len(np.nonzero(drifts_true_job)[0])
n_non_drift = n_samples - n_drift

# Overall Correct Predictions
overall_correct_prediction_score_all = no_overall_correct_all/n_samples
# Fixed: this previously divided no_drift_correct_important (the drift-only
# count), which under-reports overall accuracy whenever a non-drift is
# predicted correctly.
overall_correct_prediction_score_important = no_overall_correct_important/n_samples
overall_correct_prediction_score_pca = no_overall_correct_pca/n_samples

no_overall_correct_all_total.append(overall_correct_prediction_score_all)
no_overall_correct_important_total.append(overall_correct_prediction_score_important)
no_overall_correct_pca_total.append(overall_correct_prediction_score_pca)

# Correctly Identified Drifts
drift_correct_prediction_score_all = no_drift_correct_all/n_drift
drift_correct_prediction_score_important = no_drift_correct_important/n_drift
drift_correct_prediction_score_pca = no_drift_correct_pca/n_drift

no_drift_correct_all_total.append(drift_correct_prediction_score_all)
no_drift_correct_important_total.append(drift_correct_prediction_score_important)
no_drift_correct_pca_total.append(drift_correct_prediction_score_pca)

# Correctly Identified Non-Drifts
non_drift_correct_prediction_score_all = no_non_drift_correct_all/n_non_drift
non_drift_correct_prediction_score_important = no_non_drift_correct_important/n_non_drift
non_drift_correct_prediction_score_pca = no_non_drift_correct_pca/n_non_drift

no_non_drift_correct_all_total.append(non_drift_correct_prediction_score_all)
no_non_drift_correct_important_total.append(non_drift_correct_prediction_score_important)
no_non_drift_correct_pca_total.append(non_drift_correct_prediction_score_pca)


100%|██████████| 70/70 [00:00<00:00, 255527.66it/s]


### Results All Features

In [26]:
np.mean(no_overall_correct_all_total)

0.5714285714285714

In [27]:
np.mean(no_drift_correct_all_total)

0.6666666666666666

In [28]:
np.mean(no_non_drift_correct_all_total)

0.0

### Results Important Features

In [29]:
np.mean(no_overall_correct_important_total)

0.2857142857142857

In [30]:
np.mean(no_drift_correct_important_total)

0.3333333333333333

In [31]:
np.mean(no_non_drift_correct_important_total)

0.0

### Results PCA Features

In [32]:
np.mean(no_overall_correct_pca_total)

0.2857142857142857

In [33]:
np.mean(no_drift_correct_pca_total)

0.3333333333333333

In [34]:
np.mean(no_non_drift_correct_pca_total)

0.0