In [7]:
import pandas as pd
from sklearn import metrics, preprocessing
import numpy as np
from sklearn.preprocessing import StandardScaler
from scipy import stats
from tqdm import tqdm
from utilities import obtain_period_data, obtain_metrics


In [8]:
def obtain_intervals(dataset):
    '''
    Generate interval terminals, so that samples in each interval have:
        interval_i = (timestamp >= terminal_i) and (timestamp < terminal_{i+1})

    Args:
        dataset (chr): Assuming only Backblaze (b) and Google (g) datasets exists
    '''
    if dataset == 'g':
        # time unit in Google: millisecond, tracing time: 29 days
        start_time = 604046279
        unit_period = 24 * 60 * 60 * 1000 * 1000  # unit period: one day
        end_time = start_time + 28*unit_period
    elif dataset == 'b':
        # time unit in Backblaze: month, tracing time: one year (12 months)
        start_time = 1
        unit_period = 1  # unit period: one month
        end_time = start_time + 12*unit_period

    # add one unit for the open-end of range function
    terminals = [i for i in range(start_time, end_time+unit_period, unit_period)]

    return terminals

In [9]:
def obtain_natural_chunks(features, labels, terminals):
    feature_list = []
    label_list = []
    for i in range(len(terminals) - 1):
        idx = np.logical_and(features[:, 0] >= terminals[i], features[:, 0] < terminals[i + 1])
        feature_list.append(features[idx][:, 1:])
        label_list.append(labels[idx])
    return feature_list, label_list


In [10]:
DATASET_PATH = '../../../Documents/phd_related/data_sets_concept_drift/AIOps_failure_prediction/alibaba_job_data.csv'
interval = 'd'

In [11]:
alibaba_data = pd.read_csv(DATASET_PATH)
alibaba_data = alibaba_data.loc[:, ~alibaba_data.columns.str.contains('^Unnamed')]
alibaba_data

Unnamed: 0,job_name,start_time,end_time,status,user,task_name,inst_num,plan_cpu,plan_mem,plan_gpu,cpu_usage,gpu_wrk_util,avg_mem,max_mem,avg_gpu_wrk_mem,max_gpu_wrk_mem
0,31fe7a4e1565891f332f2b33,494319.0,,Failed,74238accb90b,1,1.0,600.0,29.296875,25.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,6be512ebc1890951ef8fe9a3,494326.0,,Failed,74238accb90b,1,1.0,600.0,29.296875,25.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,418cf3def0eaa3389c8c743f,516002.0,,Failed,74238accb90b,1,1.0,600.0,29.296875,25.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,6ad04e6d3dd2c35e3a0c3e5f,516023.0,,Failed,74238accb90b,1,1.0,600.0,29.296875,25.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,193e6fdd5cb271f54d85f739,531744.0,1136554.0,Failed,74238accb90b,2,2.0,1200.0,58.593750,5.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
701367,ee183237255c4377b9637594,6450685.0,,Failed,d4d51aca8806,1,10.0,50.0,29.296875,100.0,19.241379,5.166667,0.553563,1.958984,0.232641,0.768555
701368,1df4ef827ff8fb19211ad760,6450729.0,6451098.0,Terminated,c4cbaac9966d,1,1.0,1000.0,19.550781,50.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
701369,4113ec499c025e364a97b440,6450736.0,,Failed,d4d51aca8806,1,10.0,50.0,29.296875,100.0,18.534050,3.215488,0.597005,2.086914,0.304398,1.254883
701370,c1577ac376105aabc2390246,6450758.0,,Failed,d4d51aca8806,1,10.0,50.0,29.296875,100.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [12]:
feature_list, label_list = obtain_period_data('a')

Loading data from ./../../../Documents/phd_related/data_sets_concept_drift/AIOps_failure_prediction/alibaba_job_data.csv
Load complete
Preprocessing features
Preprocessing complete



In [13]:
len(feature_list[0][0])

12

# Data Preprocessing

In [14]:
df_concept_drift = pd.read_csv('../../../Documents/phd_related/alibaba/results/results_r/rf_concept_drift_localization_job_alibaba_r_avg_plot.csv')
df_concept_drift = df_concept_drift.loc[:, ~df_concept_drift.columns.str.contains('^Unnamed')]
df_concept_drift

Unnamed: 0,X,Sig,Y,W,Dataset,Model
0,2,Non-Drift,0.016279,W1_2,Alibaba,Random Forests Alibaba Job Data
1,3,Drift,0.177957,W2_3,Alibaba,Random Forests Alibaba Job Data
2,4,Drift,0.152879,W3_4,Alibaba,Random Forests Alibaba Job Data
3,5,Drift,1.584986,W4_5,Alibaba,Random Forests Alibaba Job Data
4,6,Drift,0.054189,W5_6,Alibaba,Random Forests Alibaba Job Data
5,7,Drift,0.491319,W6_7,Alibaba,Random Forests Alibaba Job Data
6,8,Drift,0.160296,W7_8,Alibaba,Random Forests Alibaba Job Data


In [15]:
features_job_failure = ['user', 'task_name', 'inst_num', 'plan_cpu', 'plan_mem', 'plan_gpu', 
        'cpu_usage', 'gpu_wrk_util', 'avg_mem', 'max_mem', 'avg_gpu_wrk_mem', 'max_gpu_wrk_mem']

In [16]:
len(features_job_failure)

12

In [17]:
scaler = StandardScaler()
stat_test = stats.kstest

# Monitoring the No of Features that Change in Each Period

In [18]:
no_times_changes_in_each_feature = []

drift_times_changes_in_each_feature = []

non_drift_times_changes_in_each_feature = []

for feature in tqdm(features_job_failure):
    
    no_times_changes = 0
    
    no_times_drift_changes = 0
    
    no_times_non_drift_changes = 0
    
    for period in tqdm(range(0, len(feature_list)-1)):
        
        # extract features train and test
        training_features = scaler.fit_transform(feature_list[period])
        testing_features = scaler.transform(feature_list[period+1])
        
        # convert numpy array to Pandas Dataframe
        df_train_features = pd.DataFrame(training_features, columns = features_job_failure)
        df_test_features = pd.DataFrame(testing_features, columns = features_job_failure)
        
        v, p = stat_test(df_train_features[feature], df_test_features[feature])
        if(p<0.05):
            no_times_changes = no_times_changes + 1
            
            if(df_concept_drift['Sig'][period]=='Drift'):
                no_times_drift_changes = no_times_drift_changes + 1
            
            if(df_concept_drift['Sig'][period]=='Non-Drift'):
                no_times_non_drift_changes = no_times_non_drift_changes + 1
            
    no_times_changes_in_each_feature.append(no_times_changes)
    
    drift_times_changes_in_each_feature.append(no_times_drift_changes)
    
    non_drift_times_changes_in_each_feature.append(no_times_non_drift_changes)

  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/7 [00:00<?, ?it/s][A
 14%|█▍        | 1/7 [00:00<00:00,  8.01it/s][A
 29%|██▊       | 2/7 [00:00<00:00,  7.00it/s][A
 43%|████▎     | 3/7 [00:00<00:00,  6.98it/s][A
 57%|█████▋    | 4/7 [00:00<00:00,  6.55it/s][A
 71%|███████▏  | 5/7 [00:00<00:00,  6.46it/s][A
 86%|████████▌ | 6/7 [00:00<00:00,  6.54it/s][A
100%|██████████| 7/7 [00:01<00:00,  6.26it/s][A
  8%|▊         | 1/12 [00:01<00:12,  1.12s/it]
  0%|          | 0/7 [00:00<?, ?it/s][A
 14%|█▍        | 1/7 [00:00<00:00,  9.69it/s][A
 29%|██▊       | 2/7 [00:00<00:00,  7.54it/s][A
 43%|████▎     | 3/7 [00:00<00:00,  8.44it/s][A
 57%|█████▋    | 4/7 [00:00<00:00,  7.70it/s][A
 71%|███████▏  | 5/7 [00:00<00:00,  6.89it/s][A
 86%|████████▌ | 6/7 [00:00<00:00,  6.74it/s][A
100%|██████████| 7/7 [00:01<00:00,  6.79it/s][A
 17%|█▋        | 2/12 [00:02<00:10,  1.07s/it]
  0%|          | 0/7 [00:00<?, ?it/s][A
 29%|██▊       | 2/7 [00:00<00:00,  8.87it/s][A
 43%|████

In [19]:
period

6

In [20]:
weeks = ['W1_2', 'W2_3', 'W3_4', 'W4_5', 'W5_6', 'W6_7', 'W7_8']

In [21]:
len(no_times_changes_in_each_feature)

12

In [22]:
no_times_changes_in_each_feature

[7, 6, 7, 7, 7, 6, 2, 1, 2, 2, 1, 1]

In [23]:
print([i/len(df_concept_drift) for i in no_times_changes_in_each_feature])

[1.0, 0.8571428571428571, 1.0, 1.0, 1.0, 0.8571428571428571, 0.2857142857142857, 0.14285714285714285, 0.2857142857142857, 0.2857142857142857, 0.14285714285714285, 0.14285714285714285]


In [24]:
drift_times_changes_in_each_feature

[6, 5, 6, 6, 6, 5, 2, 1, 2, 2, 1, 1]

In [25]:
print([i/len(df_concept_drift[df_concept_drift.Sig == 'Drift']) for i in drift_times_changes_in_each_feature])

[1.0, 0.8333333333333334, 1.0, 1.0, 1.0, 0.8333333333333334, 0.3333333333333333, 0.16666666666666666, 0.3333333333333333, 0.3333333333333333, 0.16666666666666666, 0.16666666666666666]


In [26]:
non_drift_times_changes_in_each_feature

[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]

In [27]:
print([i/len(df_concept_drift[df_concept_drift.Sig == 'Non-Drift']) for i in non_drift_times_changes_in_each_feature])

[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]


In [68]:
features_job_failure[11]

'max_gpu_wrk_mem'

In [69]:
print([i/len(df_concept_drift[df_concept_drift.Sig == 'Drift']) for i in drift_times_changes_in_each_feature][11])

0.16666666666666666


In [70]:
print([i/len(df_concept_drift[df_concept_drift.Sig == 'Non-Drift']) for i in non_drift_times_changes_in_each_feature][11])

0.0
