In [1]:
import pandas as pd
from sklearn import metrics, preprocessing
import numpy as np
from sklearn.preprocessing import StandardScaler
from scipy import stats
from tqdm import tqdm


In [2]:
def obtain_intervals(dataset):
    '''
    Generate interval terminals, so that samples in each interval have:
        interval_i = (timestamp >= terminal_i) and (timestamp < terminal_{i+1})

    Args:
        dataset (chr): Assuming only Backblaze (b) and Google (g) datasets exists
    '''
    if dataset == 'g':
        # time unit in Google: millisecond, tracing time: 29 days
        start_time = 604046279
        unit_period = 24 * 60 * 60 * 1000 * 1000  # unit period: one day
        end_time = start_time + 28*unit_period
    elif dataset == 'b':
        # time unit in Backblaze: month, tracing time: one year (12 months)
        start_time = 1
        unit_period = 1  # unit period: one month
        end_time = start_time + 12*unit_period

    # add one unit for the open-end of range function
    terminals = [i for i in range(start_time, end_time+unit_period, unit_period)]

    return terminals

In [3]:
def obtain_natural_chunks(features, labels, terminals):
    feature_list = []
    label_list = []
    for i in range(len(terminals) - 1):
        idx = np.logical_and(features[:, 0] >= terminals[i], features[:, 0] < terminals[i + 1])
        feature_list.append(features[idx][:, 1:])
        label_list.append(labels[idx])
    return feature_list, label_list


In [4]:
DATASET_PATH = '../../../Documents/phd_related/data_sets_concept_drift/AIOps_failure_prediction/google_job_failure.csv'
interval = 'd'

In [5]:
features_job_failure = ['User ID', 'Job Name', 'Scheduling Class',
                   'Num Tasks', 'Priority', 'Diff Machine', 'CPU Requested', 'Mem Requested', 'Disk Requested',
                   'Avg CPU', 'Avg Mem', 'Avg Disk', 'Std CPU', 'Std Mem', 'Std Disk']
columns_initial = ['Job ID', 'Status', 'Start Time', 'End Time'] + features_job_failure

# READ DATA

df = pd.read_csv(DATASET_PATH, header=None)
df.columns = columns_initial
df = df.tail(-1)

df = df.drop(['Job ID'], axis = 1)
columns = features_job_failure

include_end_time = False

# EXTRACT FEATURES AND LABELS

features = df[(['Start Time']+ features_job_failure)].to_numpy()
labels = (df['Status']==3).to_numpy()


# FEATURES PREPROCESSING
offset = (1 if include_end_time else 0)

# ENCODE USER ID
le = preprocessing.LabelEncoder()
features[:, 1+offset] = le.fit_transform(features[:, 1+offset])

# ENCODE JOB NAME
le = preprocessing.LabelEncoder()
features[:, 2+offset] = le.fit_transform(features[:, 2+offset])

features = features.astype(float)


  df = pd.read_csv(DATASET_PATH, header=None)


In [6]:
df

Unnamed: 0,Status,Start Time,End Time,User ID,Job Name,Scheduling Class,Num Tasks,Priority,Diff Machine,CPU Requested,Mem Requested,Disk Requested,Avg CPU,Avg Mem,Avg Disk,Std CPU,Std Mem,Std Disk
1,4,604046279,675251710,0yO12LxPGQxL+XkI4hS6Tlyx5V3KOvjHS0DQSOJeMbs=,evBe1ewvZ0q51pdt/HQXLBschAlhrg5nR0Sj7efCvfE=,1,1,9,0.0,0.1875,0.047700000000000006,1.1445e-05,0.007800700000000001,0.0004083333333333333,0.0,0.01041434489762398,0.0005774705379690138,0.0
2,4,612141652,662266479,JcpU581OuLad5EhNHS+e+YrY4kEW8pN6ymudnkWMDXs=,PqaHcqiH62FESqU41XFHq+UbXe0VvgZXuw+kdGrOucs=,1,1,0,0.0,0.1875,0.013985999999999998,2.2887e-05,0.0086698,0.001007,3.4969999999999997e-06,0.012063922361598102,0.0014241130573097067,4.49719912834644e-07
3,4,617115444,653284668,ZpQmujQYX55FcN2RFvqqUkcz5z/Tovj2FaG8sFtlG48=,Id4Jo+D+JJ4Xh+VfKZk+ByyUzeeI+CZCCPqnXMN3bsg=,0,1,1,0.0,0.09375,0.047700000000000006,0.00011445,0.0007594999999999999,0.00010966666666666667,0.0,0.0009113298890449422,0.0001550920873402495,0.0
4,4,619396391,649029930,jVEIdGnEYLp+j9YJHh5dEBhUdpD2fs+PKTWwQo5ZrJk=,RfQ/XZs68BkB9UwJNf0YK1rt2SSwD/DSIZa1cMtfba8=,1,1,2,0.0,0.018744,0.004662,2.861100000000001e-06,0.0021797666666666664,0.00015829999999999994,0.0,0.00275950578465702,0.00022387000692366093,0.0
5,4,633214029,687862270,jVEIdGnEYLp+j9YJHh5dEBhUdpD2fs+PKTWwQo5ZrJk=,1H3WBVcen2RS9lximFVb5A/HIAxc6rH8XBp0IStE/Co=,1,1,2,0.0,0.018744,0.009326999999999998,2.8610999999999998e-05,0.0019135,0.00032996666666666665,1.5892333333333334e-06,0.0024135163489536725,0.00046664333513104226,4.493899296700905e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
627105,4,2419796510564,2419812185725,F2+Gv53Pxd4KDRb/UsGECThH/XUOpcWKElUXJkhkt1c=,2N2Ubknr/Ly7V95poeYFNAXQISDBZ8iSTPs6KaZ1wQw=,0,1,1,0.0,0.0,0.000466,0.0,0.000036,0.0,0.0,0.000051,0.0,0.0
627106,4,2419801833487,2419902238681,JcpU581OuLad5EhNHS+e+YrY4kEW8pN6ymudnkWMDXs=,PqaHcqiH62FESqU41XFHq+UbXe0VvgZXuw+kdGrOucs=,1,1,0,0.0,0.1875,0.013986,0.000023,0.004023,0.000938,0.000004,0.004497,0.001327,0.0
627107,4,2419801839825,2419830750933,ZpQmujQYX55FcN2RFvqqUkcz5z/Tovj2FaG8sFtlG48=,P6CjWI0Zxr2nRvIBuXTJUU0KJNs2h43nZVBJpztHvG8=,0,1,1,0.0,0.0,0.013986,0.000034,0.001318,0.00007,0.0,0.001793,0.000098,0.0
627108,3,2419802495386,2419826567489,0yO12LxPGQxL+XkI4hS6Tlyx5V3KOvjHS0DQSOJeMbs=,BIl6NzV4dRLZtHfvkiKmn7wwW8hzsDp4vqj8REm3gv4=,1,1,9,0.0,0.1875,0.006993,0.000011,0.005246,0.000347,0.000003,0.005978,0.000491,0.000002


In [7]:
# DIVIDE FEATURES INTO DAYS 

feature_list, label_list = obtain_natural_chunks(features, labels, obtain_intervals('g'))

In [8]:
len(feature_list)

28

# Data Preprocessing

In [9]:
df_concept_drift = pd.read_csv('../../../Documents/phd_related/alibaba/results/results_r/rf_concept_drift_localization_job_google_r_avg_plot.csv')
df_concept_drift = df_concept_drift.loc[:, ~df_concept_drift.columns.str.contains('^Unnamed')]
df_concept_drift

Unnamed: 0,X,Sig,Y,W,Dataset,Model
0,3,Drift,2.207221,D2_3,Google,Random Forests Google Job Data
1,4,Non-Drift,0.158528,D3_4,Google,Random Forests Google Job Data
2,5,Non-Drift,0.13276,D4_5,Google,Random Forests Google Job Data
3,6,Drift,0.242976,D5_6,Google,Random Forests Google Job Data
4,7,Drift,0.305662,D6_7,Google,Random Forests Google Job Data
5,8,Drift,1.010056,D7_8,Google,Random Forests Google Job Data
6,9,Drift,0.464647,D8_9,Google,Random Forests Google Job Data
7,10,Drift,1.437104,D9_10,Google,Random Forests Google Job Data
8,11,Drift,0.545631,D10_11,Google,Random Forests Google Job Data
9,12,Non-Drift,0.15575,D11_12,Google,Random Forests Google Job Data


In [10]:
len(df_concept_drift)

26

In [11]:
scaler = StandardScaler()
stat_test = stats.kstest

# Monitoring the No of Features that Change in Each Period

In [12]:
perc_changed_features_per_period = []
no_changed_features_per_period = []

for period in tqdm(range(0, len(feature_list)-1)):

    # extract features train and test
    training_features = scaler.fit_transform(feature_list[period])
    testing_features = scaler.transform(feature_list[period+1])
    
    # convert numpy array to Pandas Dataframe
    df_train_features = pd.DataFrame(training_features, columns = features_job_failure)
    df_test_features = pd.DataFrame(testing_features, columns = features_job_failure)
    
    no_changed_features = 0
    for feature in features_job_failure:
        #print(feature)
        #print(df_train_features[feature])
        #print(df_test_features[feature])
        v, p = stat_test(df_train_features[feature], df_test_features[feature])
        if(p<0.05):
            no_changed_features = no_changed_features + 1
    
    no_changed_features_per_period.append(no_changed_features)
    perc_changed_features_per_period.append(no_changed_features/len(features_job_failure))

100%|██████████| 27/27 [00:08<00:00,  3.29it/s]


In [13]:
len(perc_changed_features_per_period)

27

In [14]:
days = ['D2_3',
 'D3_4',
 'D4_5',
 'D5_6',
 'D6_7',
 'D7_8',
 'D8_9',
 'D9_10',
 'D10_11',
 'D11_12',
 'D12_13',
 'D13_14',
 'D14_15',
 'D15_16',
 'D16_17',
 'D17_18',
 'D18_19',
 'D20_21',
 'D21_22',
 'D22_23',
 'D23_24',
 'D24_25',
 'D25_26',
 'D26_27',
 'D27_28',
 'D28_29']
days

['D2_3',
 'D3_4',
 'D4_5',
 'D5_6',
 'D6_7',
 'D7_8',
 'D8_9',
 'D9_10',
 'D10_11',
 'D11_12',
 'D12_13',
 'D13_14',
 'D14_15',
 'D15_16',
 'D16_17',
 'D17_18',
 'D18_19',
 'D20_21',
 'D21_22',
 'D22_23',
 'D23_24',
 'D24_25',
 'D25_26',
 'D26_27',
 'D27_28',
 'D28_29']

In [15]:
len(days)

26

In [16]:
drift_grount_truth = list(df_concept_drift.Sig)

In [17]:
df_results_monitoring_all_individual_features = pd.DataFrame()
df_results_monitoring_all_individual_features['Period'] = days
df_results_monitoring_all_individual_features['Drift Ground Truth'] = drift_grount_truth
df_results_monitoring_all_individual_features['No Changed Features Per Period'] = no_changed_features_per_period[1:]
df_results_monitoring_all_individual_features['Percentage Changed Features Per Period'] = perc_changed_features_per_period[1:]
df_results_monitoring_all_individual_features

Unnamed: 0,Period,Drift Ground Truth,No Changed Features Per Period,Percentage Changed Features Per Period
0,D2_3,Drift,14,0.933333
1,D3_4,Non-Drift,9,0.6
2,D4_5,Non-Drift,13,0.866667
3,D5_6,Drift,14,0.933333
4,D6_7,Drift,14,0.933333
5,D7_8,Drift,14,0.933333
6,D8_9,Drift,10,0.666667
7,D9_10,Drift,11,0.733333
8,D10_11,Drift,11,0.733333
9,D11_12,Non-Drift,12,0.8


## Correlation Coefficient Percentage Features that Change and Drift Ground Truth

In [18]:
from sklearn.preprocessing import LabelEncoder
from scipy import stats

In [19]:
le = LabelEncoder()

In [20]:
drift_ground_truth = le.fit_transform(df_results_monitoring_all_individual_features['Drift Ground Truth'])
drift_ground_truth

array([0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 0, 1])

In [21]:
res = stats.spearmanr(df_results_monitoring_all_individual_features['Percentage Changed Features Per Period'], drift_ground_truth)
res

SignificanceResult(statistic=-0.193674944237914, pvalue=0.34312884490034723)

In [22]:
df_results_monitoring_all_individual_features['Drift_Non_Drift_Label'] = drift_ground_truth

In [23]:
df_results_monitoring_all_individual_features['Percentage Changed Features Per Period'].corr(df_results_monitoring_all_individual_features['Drift_Non_Drift_Label'])

-0.12199306650947703

In [18]:
df_results_monitoring_all_individual_features.to_csv('df_percentage_of_changed_features_job_google.csv')