In [1]:
import pandas as pd
from sklearn import metrics, preprocessing
import numpy as np
from sklearn.preprocessing import StandardScaler
from scipy import stats
from tqdm import tqdm
from utilities import obtain_period_data, obtain_metrics


In [2]:
def obtain_intervals(dataset):
    '''
    Generate interval terminals, so that samples in each interval have:
        interval_i = (timestamp >= terminal_i) and (timestamp < terminal_{i+1})

    Args:
        dataset (str): Dataset identifier. Only the Google ('g') and
            Backblaze ('b') datasets are supported.

    Returns:
        list of int: Terminals spaced one unit period apart, running from the
            start time up to and including the end time. ``n`` terminals
            delimit ``n - 1`` half-open intervals.

    Raises:
        ValueError: If ``dataset`` is neither 'g' nor 'b'.
    '''
    if dataset == 'g':
        # Google: the unit period below is one day written in microseconds
        # (24 h * 60 min * 60 s * 1000 ms * 1000 us). The end time lies 28 unit
        # periods after the start, giving 29 terminals / 28 one-day intervals.
        start_time = 604046279
        unit_period = 24 * 60 * 60 * 1000 * 1000  # unit period: one day
        end_time = start_time + 28 * unit_period
    elif dataset == 'b':
        # Backblaze: time unit is the month, tracing time is one year,
        # giving 13 terminals / 12 one-month intervals.
        start_time = 1
        unit_period = 1  # unit period: one month
        end_time = start_time + 12 * unit_period
    else:
        # Fail loudly instead of hitting an UnboundLocalError further down.
        raise ValueError(
            "Unknown dataset {!r}: expected 'g' (Google) or 'b' (Backblaze)".format(dataset)
        )

    # range() excludes its stop value, so extend it by one unit to keep end_time.
    return list(range(start_time, end_time + unit_period, unit_period))

In [3]:
def obtain_natural_chunks(features, labels, terminals):
    '''
    Split samples into consecutive chunks delimited by ``terminals``.

    Column 0 of ``features`` is compared against the terminals: chunk ``i``
    holds the rows whose column-0 value lies in
    ``[terminals[i], terminals[i + 1])``.

    Args:
        features: 2-D array; column 0 is the value used for chunking and is
            dropped from the returned chunks.
        labels: Array indexed with the same boolean row mask as ``features``.
        terminals: Ordered sequence of chunk boundaries.

    Returns:
        tuple: ``(feature_list, label_list)``, two lists with one entry per
            pair of adjacent terminals.
    '''
    chunked_features, chunked_labels = [], []
    # Walk over adjacent (lower, upper) boundary pairs.
    for lower, upper in zip(terminals, terminals[1:]):
        key_column = features[:, 0]
        in_chunk = (key_column >= lower) & (key_column < upper)
        selected_rows = features[in_chunk]
        chunked_features.append(selected_rows[:, 1:])
        chunked_labels.append(labels[in_chunk])
    return chunked_features, chunked_labels


In [4]:
# Relative path to the Alibaba job-level trace (CSV); it resolves against the notebook's working directory.
DATASET_PATH = '../../../Documents/phd_related/data_sets_concept_drift/AIOps_failure_prediction/alibaba_job_data.csv'
# NOTE(review): `interval` is not referenced by any other cell visible in this notebook — confirm it is still needed.
interval = 'd'

In [5]:
# Load the raw job trace and drop every column whose name starts with 'Unnamed'.
alibaba_data = pd.read_csv(DATASET_PATH)
alibaba_data = alibaba_data.loc[:, ~alibaba_data.columns.str.contains('^Unnamed')]
# Display the frame for a quick visual check.
alibaba_data

Unnamed: 0,job_name,start_time,end_time,status,user,task_name,inst_num,plan_cpu,plan_mem,plan_gpu,cpu_usage,gpu_wrk_util,avg_mem,max_mem,avg_gpu_wrk_mem,max_gpu_wrk_mem
0,31fe7a4e1565891f332f2b33,494319.0,,Failed,74238accb90b,1,1.0,600.0,29.296875,25.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,6be512ebc1890951ef8fe9a3,494326.0,,Failed,74238accb90b,1,1.0,600.0,29.296875,25.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,418cf3def0eaa3389c8c743f,516002.0,,Failed,74238accb90b,1,1.0,600.0,29.296875,25.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,6ad04e6d3dd2c35e3a0c3e5f,516023.0,,Failed,74238accb90b,1,1.0,600.0,29.296875,25.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,193e6fdd5cb271f54d85f739,531744.0,1136554.0,Failed,74238accb90b,2,2.0,1200.0,58.593750,5.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
701367,ee183237255c4377b9637594,6450685.0,,Failed,d4d51aca8806,1,10.0,50.0,29.296875,100.0,19.241379,5.166667,0.553563,1.958984,0.232641,0.768555
701368,1df4ef827ff8fb19211ad760,6450729.0,6451098.0,Terminated,c4cbaac9966d,1,1.0,1000.0,19.550781,50.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
701369,4113ec499c025e364a97b440,6450736.0,,Failed,d4d51aca8806,1,10.0,50.0,29.296875,100.0,18.534050,3.215488,0.597005,2.086914,0.304398,1.254883
701370,c1577ac376105aabc2390246,6450758.0,,Failed,d4d51aca8806,1,10.0,50.0,29.296875,100.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [6]:
# Fetch the per-period feature and label collections from the project helper.
# NOTE(review): 'a' presumably selects the Alibaba dataset — confirm against utilities.obtain_period_data.
feature_list, label_list = obtain_period_data('a')

Loading data from ./../../../Documents/phd_related/data_sets_concept_drift/AIOps_failure_prediction/alibaba_job_data.csv
Load complete
Preprocessing features
Preprocessing complete



In [7]:
# Column labels applied to each period's feature array in the monitoring loop.
# Assumed to match the column order produced by obtain_period_data — TODO confirm.
features_names = ['user', 'task_name', 'inst_num', 'plan_cpu', 'plan_mem', 'plan_gpu', 
        'cpu_usage', 'gpu_wrk_util', 'avg_mem', 'max_mem', 'avg_gpu_wrk_mem', 'max_gpu_wrk_mem']

In [8]:
# Sanity check: number of declared feature names.
len(features_names)

12

In [9]:
# Sanity check: length of the first sample of the first period; it should equal len(features_names).
len(feature_list[0][0])

12

# Data Preprocessing

In [10]:
# Load the previously computed concept-drift localization results and drop
# every column whose name starts with 'Unnamed'.
df_concept_drift = pd.read_csv('../../../Documents/phd_related/alibaba/results/results_r/rf_concept_drift_localization_job_alibaba_r_avg_plot.csv')
df_concept_drift = df_concept_drift.loc[:, ~df_concept_drift.columns.str.contains('^Unnamed')]
# Display the frame for a quick visual check.
df_concept_drift

Unnamed: 0,X,Sig,Y,W,Dataset,Model
0,2,Non-Drift,0.016279,W1_2,Alibaba,Random Forests Alibaba Job Data
1,3,Drift,0.177957,W2_3,Alibaba,Random Forests Alibaba Job Data
2,4,Drift,0.152879,W3_4,Alibaba,Random Forests Alibaba Job Data
3,5,Drift,1.584986,W4_5,Alibaba,Random Forests Alibaba Job Data
4,6,Drift,0.054189,W5_6,Alibaba,Random Forests Alibaba Job Data
5,7,Drift,0.491319,W6_7,Alibaba,Random Forests Alibaba Job Data
6,8,Drift,0.160296,W7_8,Alibaba,Random Forests Alibaba Job Data


In [11]:
# Show the first seven rows of the drift results.
df_concept_drift[0:7]

Unnamed: 0,X,Sig,Y,W,Dataset,Model
0,2,Non-Drift,0.016279,W1_2,Alibaba,Random Forests Alibaba Job Data
1,3,Drift,0.177957,W2_3,Alibaba,Random Forests Alibaba Job Data
2,4,Drift,0.152879,W3_4,Alibaba,Random Forests Alibaba Job Data
3,5,Drift,1.584986,W4_5,Alibaba,Random Forests Alibaba Job Data
4,6,Drift,0.054189,W5_6,Alibaba,Random Forests Alibaba Job Data
5,7,Drift,0.491319,W6_7,Alibaba,Random Forests Alibaba Job Data
6,8,Drift,0.160296,W7_8,Alibaba,Random Forests Alibaba Job Data


In [12]:
# Standardizer that the monitoring loop re-fits on each earlier period.
scaler = StandardScaler()
# Kolmogorov-Smirnov test; the monitoring loop passes it two samples (one per period).
stat_test = stats.kstest

# Monitoring the Number of Features that Change in Each Period

In [13]:
# Significance level below which a feature's distribution is counted as changed.
SIGNIFICANCE_LEVEL = 0.05

# Fraction (0-1) and absolute count of changed features, one entry per pair
# of consecutive periods.
perc_changed_features_per_period = []
no_changed_features_per_period = []

# Each period is compared with the one that follows it, hence len(feature_list) - 1 pairs.
for period in tqdm(range(len(feature_list) - 1)):

    # Fit the scaler on the earlier period only and reuse it for the later one,
    # so both samples go through the same transformation.
    training_features = scaler.fit_transform(feature_list[period])
    testing_features = scaler.transform(feature_list[period + 1])

    # Wrap the arrays in DataFrames so columns can be addressed by feature name.
    df_train_features = pd.DataFrame(training_features, columns=features_names)
    df_test_features = pd.DataFrame(testing_features, columns=features_names)

    # Count the features whose two samples differ significantly.
    no_changed_features = 0
    for feature in features_names:
        # Only the p-value is needed; the test statistic is discarded.
        _, p_value = stat_test(df_train_features[feature], df_test_features[feature])
        if p_value < SIGNIFICANCE_LEVEL:
            no_changed_features += 1

    no_changed_features_per_period.append(no_changed_features)
    perc_changed_features_per_period.append(no_changed_features / len(features_names))

100%|██████████| 7/7 [00:04<00:00,  1.75it/s]


In [14]:
# Sanity check: number of consecutive-period pairs that were evaluated.
len(perc_changed_features_per_period)

7

In [15]:
# Labels for the seven consecutive-period comparisons; they mirror the `W` column of df_concept_drift.
weeks = ['W1_2', 'W2_3', 'W3_4', 'W4_5', 'W5_6', 'W6_7', 'W7_8']

In [16]:
# Drift / Non-Drift label of every period pair, taken from the `Sig` column.
# (The misspelled name is referenced by a later cell, so it is left unchanged here.)
drift_grount_truth = list(df_concept_drift.Sig)
# Sanity check: number of ground-truth labels.
len(drift_grount_truth)

7

In [17]:
# Assemble the per-period summary table in one step: one row per pair of
# consecutive periods, columns in the order listed below.
df_results_monitoring_all_individual_features = pd.DataFrame({
    'Period': weeks,
    'Drift Ground Truth': drift_grount_truth[0:7],
    'No Changed Features Per Period': no_changed_features_per_period,
    'Percentage Changed Features Per Period': perc_changed_features_per_period,
})
df_results_monitoring_all_individual_features

Unnamed: 0,Period,Drift Ground Truth,No Changed Features Per Period,Percentage Changed Features Per Period
0,W1_2,Non-Drift,6,0.5
1,W2_3,Drift,12,1.0
2,W3_4,Drift,4,0.333333
3,W4_5,Drift,6,0.5
4,W5_6,Drift,9,0.75
5,W6_7,Drift,6,0.5
6,W7_8,Drift,6,0.5


## Correlation Between the Percentage of Changed Features and the Drift Ground Truth

In [18]:
from sklearn.preprocessing import LabelEncoder
from scipy import stats

In [19]:
# Encoder used to turn the Drift / Non-Drift strings into integer codes.
le = LabelEncoder()

In [20]:
# Encode the ground-truth strings as integers.
# NOTE(review): per the recorded output, 'Non-Drift' is encoded as 1 and 'Drift' as 0,
# so a negative correlation with this vector means MORE changed features in Drift
# periods — confirm this is the intended sign convention.
drift_ground_truth = le.fit_transform(df_results_monitoring_all_individual_features['Drift Ground Truth'])
drift_ground_truth

array([1, 0, 0, 0, 0, 0, 0])

In [21]:
# Spearman rank correlation between the fraction of changed features and the encoded ground truth.
res = stats.spearmanr(df_results_monitoring_all_individual_features['Percentage Changed Features Per Period'], drift_ground_truth)
res

SignificanceResult(statistic=-0.1126106541153627, pvalue=0.8100360594259686)

In [22]:
# Store the encoded labels alongside the per-period results.
df_results_monitoring_all_individual_features['Drift_Non_Drift_Label'] = drift_ground_truth

In [23]:
# Pearson correlation (the default method of Series.corr) between the fraction of changed features and the encoded labels.
df_results_monitoring_all_individual_features['Percentage Changed Features Per Period'].corr(df_results_monitoring_all_individual_features['Drift_Non_Drift_Label'])

-0.16666666666666669

In [18]:
# Persist the summary table; the DataFrame index is written as well because index=False is not passed.
df_results_monitoring_all_individual_features.to_csv('df_percentage_of_changed_features_job_alibaba.csv')