In [1]:
import pandas as pd
from sklearn import metrics, preprocessing
import numpy as np
from sklearn.preprocessing import StandardScaler
from scipy import stats
from tqdm import tqdm


In [2]:
def obtain_intervals(dataset):
    '''
    Generate interval terminals, so that samples in each interval have:
        interval_i = (timestamp >= terminal_i) and (timestamp < terminal_{i+1})

    Args:
        dataset (chr): Assuming only Backblaze (b) and Google (g) datasets exists
    '''
    if dataset == 'g':
        # time unit in Google: millisecond, tracing time: 29 days
        start_time = 604046279
        unit_period = 24 * 60 * 60 * 1000 * 1000  # unit period: one day
        end_time = start_time + 28*unit_period
    elif dataset == 'b':
        # time unit in Backblaze: month, tracing time: one year (12 months)
        start_time = 1
        unit_period = 1  # unit period: one month
        end_time = start_time + 12*unit_period

    # add one unit for the open-end of range function
    terminals = [i for i in range(start_time, end_time+unit_period, unit_period)]
    return terminals

In [3]:
def obtain_natural_chunks(features, labels, terminals):
    feature_list = []
    label_list = []
    for i in range(len(terminals) - 1):
        idx = np.logical_and(features[:, 0] >= terminals[i], features[:, 0] < terminals[i + 1])
        feature_list.append(features[idx][:, 1:])
        label_list.append(labels[idx])
    return feature_list, label_list


In [4]:
def features_labels_preprocessing(DATASET_PATH, dataset):
    
    if(dataset=='b'):
        
        print('Data Reading and Preprocessing')
        
        # set data paths and columns names
        features_disk_failure = ['smart_1_raw', 'smart_4_raw', 'smart_5_raw', 'smart_7_raw', 'smart_9_raw', 'smart_12_raw', 'smart_187_raw', 'smart_193_raw', 'smart_194_raw', 'smart_197_raw', 'smart_199_raw', 
                         'smart_4_raw_diff', 'smart_5_raw_diff', 'smart_9_raw_diff', 'smart_12_raw_diff', 'smart_187_raw_diff', 'smart_193_raw_diff', 'smart_197_raw_diff', 'smart_199_raw_diff']
        columns = ['serial_number', 'date'] + features_disk_failure + ['label']
        
        # read dataset
        df = pd.read_csv(DATASET_PATH, header=None, dtype = 'str').iloc[1:,1:]
        df.columns = columns
        
        # ignore serial number
        df = df[df.columns[1:]]
        
        for feature in features_disk_failure:
            df[feature] = df[feature].astype(float)


        d = {'True': True, 'False': False}
        df['label'] = df['label'].map(d)

        df['label'].unique()

        # transform date to date time
        df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')
        # divide on weeks
        df['date'] = pd.Series(pd.DatetimeIndex(df['date']).day_of_year)
        
        print('Features and Labels Computing')
        
        # features and labels extraction and computation
        features = df[df.columns[:-1]].to_numpy()
        labels = df[df.columns[-1]].to_numpy()
        feature_list, label_list = obtain_natural_chunks(features, labels, obtain_intervals('b'))
        
    elif(dataset=='g'):
        
        print('Data Reading and Preprocessing')
        
        # set data paths and columns names
        features_job_failure = ['User ID', 'Job Name', 'Scheduling Class',
                   'Num Tasks', 'Priority', 'Diff Machine', 'CPU Requested', 'Mem Requested', 'Disk Requested',
                   'Avg CPU', 'Avg Mem', 'Avg Disk', 'Std CPU', 'Std Mem', 'Std Disk']
        columns_initial = ['Job ID', 'Status', 'Start Time', 'End Time'] + features_job_failure
        
        # read dataset
        df = pd.read_csv(DATASET_PATH, header=None)
        df.columns = columns_initial
        df = df.tail(-1)
        # ignore Job ID
        df = df.drop(['Job ID'], axis = 1)
        columns = features_job_failure

        include_end_time = False
        
        print('Features and Labels Preprocessing')
        
        # features and labels preprocessing
        features = df[(['Start Time']+ features_job_failure)].to_numpy()
        labels = (df['Status']==3).to_numpy()

        # FEATURES PREPROCESSING
        offset = (1 if include_end_time else 0)

        # ENCODE USER ID
        le = preprocessing.LabelEncoder()
        features[:, 1+offset] = le.fit_transform(features[:, 1+offset])

        # ENCODE JOB NAME
        le = preprocessing.LabelEncoder()
        features[:, 2+offset] = le.fit_transform(features[:, 2+offset])

        features = features.astype(float)
        
        print('Features and Labels Computing')
        
        # features and labels extraction and computation
        feature_list, label_list = obtain_natural_chunks(features, labels, obtain_intervals('g'))
        
    else:
        print('Incorrect Dataset')
    
    return df, feature_list, label_list

In [6]:
DATASET_PATH = '../../../Documents/phd_related/data_sets_concept_drift/AIOps_failure_prediction/google_job_failure.csv'
interval = 'd'


In [7]:
df, feature_list, label_list = features_labels_preprocessing(DATASET_PATH, 'g')

Data Reading and Preprocessing


  df = pd.read_csv(DATASET_PATH, header=None)


Features and Labels Preprocessing
Features and Labels Computing


In [21]:
df['Diff Machine'].value_counts()

Diff Machine
0.0    586396
0.0     32226
1.0      7946
1.0       541
Name: count, dtype: int64

In [7]:
len(feature_list)

28

# Data Preprocessing

In [8]:
data_path_drift_localization = '../../../Documents/phd_related/alibaba/results/results_r/rf_concept_drift_localization_job_google_r_avg_plot.csv'
random_seeds = ['1234', '4887', '597', '1959', '413', '44', '2969', '4971', '4913', '9591']

In [9]:
df_concept_drift = pd.read_csv(data_path_drift_localization)
df_concept_drift = df_concept_drift.loc[:, ~df_concept_drift.columns.str.contains('^Unnamed')]
df_concept_drift

Unnamed: 0,X,Sig,Y,W,Dataset,Model
0,3,Drift,2.207221,D2_3,Google,Random Forests Google Job Data
1,4,Non-Drift,0.158528,D3_4,Google,Random Forests Google Job Data
2,5,Non-Drift,0.13276,D4_5,Google,Random Forests Google Job Data
3,6,Drift,0.242976,D5_6,Google,Random Forests Google Job Data
4,7,Drift,0.305662,D6_7,Google,Random Forests Google Job Data
5,8,Drift,1.010056,D7_8,Google,Random Forests Google Job Data
6,9,Drift,0.464647,D8_9,Google,Random Forests Google Job Data
7,10,Drift,1.437104,D9_10,Google,Random Forests Google Job Data
8,11,Drift,0.545631,D10_11,Google,Random Forests Google Job Data
9,12,Non-Drift,0.15575,D11_12,Google,Random Forests Google Job Data


In [10]:
len(df_concept_drift)

26

In [11]:
scaler = StandardScaler()
stat_test = stats.kstest

# Monitoring Changes in Each Feature

In [12]:
features_job_failure = ['User ID', 'Job Name', 'Scheduling Class',
                   'Num Tasks', 'Priority', 'Diff Machine', 'CPU Requested', 'Mem Requested', 'Disk Requested',
                   'Avg CPU', 'Avg Mem', 'Avg Disk', 'Std CPU', 'Std Mem', 'Std Disk']

In [13]:
no_times_changes_in_each_feature = []

drift_times_changes_in_each_feature = []

non_drift_times_changes_in_each_feature = []

for feature in tqdm(features_job_failure):
    
    no_times_changes = 0
    
    no_times_drift_changes = 0
    
    no_times_non_drift_changes = 0
    
    for period in tqdm(range(1, len(feature_list)-1)):
        
        # extract features train and test
        training_features = scaler.fit_transform(feature_list[period])
        testing_features = scaler.transform(feature_list[period+1])
        
        # convert numpy array to Pandas Dataframe
        df_train_features = pd.DataFrame(training_features, columns = features_job_failure)
        df_test_features = pd.DataFrame(testing_features, columns = features_job_failure)
        
        v, p = stat_test(df_train_features[feature], df_test_features[feature])
        if(p<0.05):
            no_times_changes = no_times_changes + 1
            
            if(df_concept_drift['Sig'][period-1]=='Drift'):
                no_times_drift_changes = no_times_drift_changes + 1
            
            if(df_concept_drift['Sig'][period-1]=='Non-Drift'):
                no_times_non_drift_changes = no_times_non_drift_changes + 1
            
    no_times_changes_in_each_feature.append(no_times_changes)
    
    drift_times_changes_in_each_feature.append(no_times_drift_changes)
    
    non_drift_times_changes_in_each_feature.append(no_times_non_drift_changes)

  0%|          | 0/15 [00:00<?, ?it/s]
  0%|          | 0/26 [00:00<?, ?it/s][A
 12%|█▏        | 3/26 [00:00<00:00, 29.68it/s][A
 27%|██▋       | 7/26 [00:00<00:00, 33.31it/s][A
 42%|████▏     | 11/26 [00:00<00:00, 32.55it/s][A
 62%|██████▏   | 16/26 [00:00<00:00, 36.23it/s][A
 77%|███████▋  | 20/26 [00:00<00:00, 35.72it/s][A
100%|██████████| 26/26 [00:00<00:00, 35.22it/s][A
  7%|▋         | 1/15 [00:00<00:10,  1.35it/s]
  0%|          | 0/26 [00:00<?, ?it/s][A
 15%|█▌        | 4/26 [00:00<00:00, 31.77it/s][A
 31%|███       | 8/26 [00:00<00:00, 33.99it/s][A
 46%|████▌     | 12/26 [00:00<00:00, 34.45it/s][A
 62%|██████▏   | 16/26 [00:00<00:00, 33.17it/s][A
 77%|███████▋  | 20/26 [00:00<00:00, 31.37it/s][A
100%|██████████| 26/26 [00:00<00:00, 32.80it/s][A
 13%|█▎        | 2/15 [00:01<00:10,  1.29it/s]
  0%|          | 0/26 [00:00<?, ?it/s][A
 19%|█▉        | 5/26 [00:00<00:00, 42.66it/s][A
 38%|███▊      | 10/26 [00:00<00:00, 40.09it/s][A
 58%|█████▊    | 15/26 [00:00<00

In [14]:
period

26

In [15]:
len(no_times_changes_in_each_feature)

15

In [16]:
no_times_changes_in_each_feature

[26, 25, 23, 23, 24, 1, 26, 26, 25, 26, 26, 21, 26, 26, 17]

In [17]:
print([i/len(df_concept_drift) for i in no_times_changes_in_each_feature])

[1.0, 0.9615384615384616, 0.8846153846153846, 0.8846153846153846, 0.9230769230769231, 0.038461538461538464, 1.0, 1.0, 0.9615384615384616, 1.0, 1.0, 0.8076923076923077, 1.0, 1.0, 0.6538461538461539]


In [18]:
drift_times_changes_in_each_feature

[16, 16, 15, 14, 15, 1, 16, 16, 15, 16, 16, 14, 16, 16, 10]

In [19]:
print([i/len(df_concept_drift[df_concept_drift.Sig == 'Drift']) for i in drift_times_changes_in_each_feature])

[1.0, 1.0, 0.9375, 0.875, 0.9375, 0.0625, 1.0, 1.0, 0.9375, 1.0, 1.0, 0.875, 1.0, 1.0, 0.625]


In [20]:
non_drift_times_changes_in_each_feature

[10, 9, 8, 9, 9, 0, 10, 10, 10, 10, 10, 7, 10, 10, 7]

In [21]:
print([i/len(df_concept_drift[df_concept_drift.Sig == 'Non-Drift']) for i in non_drift_times_changes_in_each_feature])

[1.0, 0.9, 0.8, 0.9, 0.9, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.7, 1.0, 1.0, 0.7]


In [22]:
features_job_failure

['User ID',
 'Job Name',
 'Scheduling Class',
 'Num Tasks',
 'Priority',
 'Diff Machine',
 'CPU Requested',
 'Mem Requested',
 'Disk Requested',
 'Avg CPU',
 'Avg Mem',
 'Avg Disk',
 'Std CPU',
 'Std Mem',
 'Std Disk']

In [68]:
features_job_failure[14]

'Std Disk'

In [69]:
print([i/len(df_concept_drift[df_concept_drift.Sig == 'Drift']) for i in drift_times_changes_in_each_feature][14])

0.625


In [70]:
print([i/len(df_concept_drift[df_concept_drift.Sig == 'Non-Drift']) for i in non_drift_times_changes_in_each_feature][14])

0.7


In [14]:
len(months)

11

In [15]:
drift_grount_truth = list(df_concept_drift.Sig)

In [16]:
df_results_monitoring_all_individual_features = pd.DataFrame()
df_results_monitoring_all_individual_features['Period'] = months
df_results_monitoring_all_individual_features['Drift Ground Truth'] = drift_grount_truth
df_results_monitoring_all_individual_features['No Changed Features Per Period'] = no_changed_features_per_period
df_results_monitoring_all_individual_features['Percentage Changed Features Per Period'] = perc_changed_features_per_period
df_results_monitoring_all_individual_features

Unnamed: 0,Period,Drift Ground Truth,No Changed Features Per Period,Percentage Changed Features Per Period
0,M1_2,Drift,2,0.105263
1,M2_3,Drift,1,0.052632
2,M3_4,Non-Drift,2,0.105263
3,M4_5,Non-Drift,2,0.105263
4,M5_6,Non-Drift,2,0.105263
5,M6_7,Non-Drift,2,0.105263
6,M7_8,Drift,2,0.105263
7,M8_9,Drift,2,0.105263
8,M9_10,Non-Drift,2,0.105263
9,M10_11,Non-Drift,2,0.105263


In [17]:
df_results_monitoring_all_individual_features.to_csv('df_percentage_of_changed_features_disk_backblaze.csv')

Correlation Calculation

TODO!!!!