In [1]:
import pandas as pd
from sklearn import metrics, preprocessing
import numpy as np
from sklearn.preprocessing import StandardScaler
from scipy import stats
from tqdm import tqdm


In [2]:
def obtain_intervals(dataset):
    '''
    Generate interval terminals, so that samples in each interval have:
        interval_i = (timestamp >= terminal_i) and (timestamp < terminal_{i+1})

    Args:
        dataset (chr): Assuming only Backblaze (b) and Google (g) datasets exists
    '''
    if dataset == 'g':
        # time unit in Google: millisecond, tracing time: 29 days
        start_time = 604046279
        unit_period = 24 * 60 * 60 * 1000 * 1000  # unit period: one day
        end_time = start_time + 28*unit_period
    elif dataset == 'b':
        # time unit in Backblaze: month, tracing time: one year (12 months)
        start_time = 1
        unit_period = 1  # unit period: one month
        end_time = start_time + 12*unit_period

    # add one unit for the open-end of range function
    terminals = [i for i in range(start_time, end_time+unit_period, unit_period)]
    return terminals

In [3]:
def obtain_natural_chunks(features, labels, terminals):
    feature_list = []
    label_list = []
    for i in range(len(terminals) - 1):
        idx = np.logical_and(features[:, 0] >= terminals[i], features[:, 0] < terminals[i + 1])
        feature_list.append(features[idx][:, 1:])
        label_list.append(labels[idx])
    return feature_list, label_list


In [4]:
def features_labels_preprocessing(DATASET_PATH, dataset):
    
    if(dataset=='b'):
        
        print('Data Reading and Preprocessing')
        
        # set data paths and columns names
        features_disk_failure = ['smart_1_raw', 'smart_4_raw', 'smart_5_raw', 'smart_7_raw', 'smart_9_raw', 'smart_12_raw', 'smart_187_raw', 'smart_193_raw', 'smart_194_raw', 'smart_197_raw', 'smart_199_raw', 
                         'smart_4_raw_diff', 'smart_5_raw_diff', 'smart_9_raw_diff', 'smart_12_raw_diff', 'smart_187_raw_diff', 'smart_193_raw_diff', 'smart_197_raw_diff', 'smart_199_raw_diff']
        columns = ['serial_number', 'date'] + features_disk_failure + ['label']
        
        # read dataset
        df = pd.read_csv(DATASET_PATH, header=None, dtype = 'str').iloc[1:,1:]
        df.columns = columns
        
        # ignore serial number
        df = df[df.columns[1:]]
        
        for feature in features_disk_failure:
            df[feature] = df[feature].astype(float)


        d = {'True': True, 'False': False}
        df['label'] = df['label'].map(d)

        df['label'].unique()

        # transform date to date time
        df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')
        # divide on weeks
        df['date'] = pd.Series(pd.DatetimeIndex(df['date']).day_of_year)
        
        print('Features and Labels Computing')
        
        # features and labels extraction and computation
        features = df[df.columns[:-1]].to_numpy()
        labels = df[df.columns[-1]].to_numpy()
        feature_list, label_list = obtain_natural_chunks(features, labels, obtain_intervals('b'))
        
    elif(dataset=='g'):
        
        print('Data Reading and Preprocessing')
        
        # set data paths and columns names
        features_job_failure = ['User ID', 'Job Name', 'Scheduling Class',
                   'Num Tasks', 'Priority', 'Diff Machine', 'CPU Requested', 'Mem Requested', 'Disk Requested',
                   'Avg CPU', 'Avg Mem', 'Avg Disk', 'Std CPU', 'Std Mem', 'Std Disk']
        columns_initial = ['Job ID', 'Status', 'Start Time', 'End Time'] + features_job_failure
        
        # read dataset
        df = pd.read_csv(DATASET_PATH, header=None)
        df.columns = columns_initial
        df = df.tail(-1)
        # ignore Job ID
        df = df.drop(['Job ID'], axis = 1)
        columns = features_job_failure

        include_end_time = False
        
        print('Features and Labels Preprocessing')
        
        # features and labels preprocessing
        features = df[(['Start Time']+ features_job_failure)].to_numpy()
        labels = (df['Status']==3).to_numpy()

        # FEATURES PREPROCESSING
        offset = (1 if include_end_time else 0)

        # ENCODE USER ID
        le = preprocessing.LabelEncoder()
        features[:, 1+offset] = le.fit_transform(features[:, 1+offset])

        # ENCODE JOB NAME
        le = preprocessing.LabelEncoder()
        features[:, 2+offset] = le.fit_transform(features[:, 2+offset])

        features = features.astype(float)
        
        print('Features and Labels Computing')
        
        # features and labels extraction and computation
        feature_list, label_list = obtain_natural_chunks(features, labels, obtain_intervals('g'))
        
    else:
        print('Incorrect Dataset')
    
    return feature_list, label_list

In [5]:
DATASET_PATH = '../../../Documents/phd_related/AIOps_disk_failure_prediction/raw_data_2015_2017/disk_2015_complete.csv'
interval = 'm'

In [6]:
feature_list, label_list = features_labels_preprocessing(DATASET_PATH, 'b')

Data Reading and Preprocessing
Features and Labels Computing


In [7]:
len(feature_list)

12

# Data Preprocessing

In [8]:
df_concept_drift = pd.read_csv('../../../Documents/phd_related/alibaba/results/results_r/rf_concept_drift_localization_backblaze_2015_r_avg_plot.csv')
df_concept_drift = df_concept_drift.loc[:, ~df_concept_drift.columns.str.contains('^Unnamed')]
df_concept_drift

Unnamed: 0,X,Sig,Y,M,Dataset,Model
0,2,Drift,0.443412,M1_2,Backblaze Disk Data,Random Forests Backblaze Disk Data
1,3,Drift,0.512604,M2_3,Backblaze Disk Data,Random Forests Backblaze Disk Data
2,4,Non-Drift,0.396403,M3_4,Backblaze Disk Data,Random Forests Backblaze Disk Data
3,5,Non-Drift,0.280091,M4_5,Backblaze Disk Data,Random Forests Backblaze Disk Data
4,6,Non-Drift,0.191991,M5_6,Backblaze Disk Data,Random Forests Backblaze Disk Data
5,7,Non-Drift,0.219476,M6_7,Backblaze Disk Data,Random Forests Backblaze Disk Data
6,8,Drift,0.536009,M7_8,Backblaze Disk Data,Random Forests Backblaze Disk Data
7,9,Drift,0.460164,M8_9,Backblaze Disk Data,Random Forests Backblaze Disk Data
8,10,Non-Drift,0.314639,M9_10,Backblaze Disk Data,Random Forests Backblaze Disk Data
9,11,Non-Drift,0.212075,M10_11,Backblaze Disk Data,Random Forests Backblaze Disk Data


In [9]:
len(df_concept_drift)

11

In [10]:
scaler = StandardScaler()
stat_test = stats.kstest

# Monitoring Changes in Each Feature

In [11]:
features_disk_failure = ['smart_1_raw', 'smart_4_raw', 'smart_5_raw', 'smart_7_raw', 'smart_9_raw', 'smart_12_raw', 'smart_187_raw', 'smart_193_raw', 'smart_194_raw', 'smart_197_raw', 'smart_199_raw', 
                         'smart_4_raw_diff', 'smart_5_raw_diff', 'smart_9_raw_diff', 'smart_12_raw_diff', 'smart_187_raw_diff', 'smart_193_raw_diff', 'smart_197_raw_diff', 'smart_199_raw_diff']

In [12]:
no_times_changes_in_each_feature = []

drift_times_changes_in_each_feature = []

non_drift_times_changes_in_each_feature = []

for feature in tqdm(features_disk_failure):
    
    no_times_changes = 0
    
    no_times_drift_changes = 0
    
    no_times_non_drift_changes = 0
    
    for period in tqdm(range(0, len(feature_list)-1)):
        
        # extract features train and test
        training_features = scaler.fit_transform(feature_list[period])
        testing_features = scaler.transform(feature_list[period+1])
        
        # convert numpy array to Pandas Dataframe
        df_train_features = pd.DataFrame(training_features, columns = features_disk_failure)
        df_test_features = pd.DataFrame(testing_features, columns = features_disk_failure)
        
        v, p = stat_test(df_train_features[feature], df_test_features[feature])
        if(p<0.05):
            no_times_changes = no_times_changes + 1
            
            if(df_concept_drift['Sig'][period]=='Drift'):
                no_times_drift_changes = no_times_drift_changes + 1
            
            if(df_concept_drift['Sig'][period]=='Non-Drift'):
                no_times_non_drift_changes = no_times_non_drift_changes + 1
            
    no_times_changes_in_each_feature.append(no_times_changes)
    
    drift_times_changes_in_each_feature.append(no_times_drift_changes)
    
    non_drift_times_changes_in_each_feature.append(no_times_non_drift_changes)

  0%|          | 0/19 [00:00<?, ?it/s]
  0%|          | 0/11 [00:00<?, ?it/s][A
 36%|███▋      | 4/11 [00:00<00:00, 38.68it/s][A
100%|██████████| 11/11 [00:00<00:00, 51.40it/s][A
  5%|▌         | 1/19 [00:00<00:03,  4.61it/s]
  0%|          | 0/11 [00:00<?, ?it/s][A
100%|██████████| 11/11 [00:00<00:00, 68.62it/s][A
 11%|█         | 2/19 [00:00<00:03,  5.38it/s]
  0%|          | 0/11 [00:00<?, ?it/s][A
100%|██████████| 11/11 [00:00<00:00, 61.93it/s][A
 16%|█▌        | 3/19 [00:00<00:02,  5.45it/s]
  0%|          | 0/11 [00:00<?, ?it/s][A
100%|██████████| 11/11 [00:00<00:00, 67.66it/s][A
 21%|██        | 4/19 [00:00<00:02,  5.65it/s]
  0%|          | 0/11 [00:00<?, ?it/s][A
 36%|███▋      | 4/11 [00:00<00:00, 39.41it/s][A
100%|██████████| 11/11 [00:00<00:00, 39.82it/s][A
 26%|██▋       | 5/19 [00:01<00:03,  4.64it/s]
  0%|          | 0/11 [00:00<?, ?it/s][A
 36%|███▋      | 4/11 [00:00<00:00, 39.23it/s][A
100%|██████████| 11/11 [00:00<00:00, 53.29it/s][A
 32%|███▏      | 6/19 

In [13]:
len(no_times_changes_in_each_feature)

19

In [14]:
no_times_changes_in_each_feature

[0, 0, 0, 0, 11, 0, 0, 0, 1, 0, 0, 0, 0, 6, 0, 0, 4, 0, 0]

In [15]:
print([i/len(df_concept_drift) for i in no_times_changes_in_each_feature])

[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.09090909090909091, 0.0, 0.0, 0.0, 0.0, 0.5454545454545454, 0.0, 0.0, 0.36363636363636365, 0.0, 0.0]


In [16]:
drift_times_changes_in_each_feature

[0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 2, 0, 0]

In [17]:
print([i/len(df_concept_drift[df_concept_drift.Sig == 'Drift']) for i in drift_times_changes_in_each_feature])

[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.25, 0.0, 0.0, 0.5, 0.0, 0.0]


In [30]:
print([i/len(df_concept_drift[df_concept_drift.Sig == 'Drift']) for i in drift_times_changes_in_each_feature][4])

1.0


In [31]:
non_drift_times_changes_in_each_feature

[0, 0, 0, 0, 7, 0, 0, 0, 1, 0, 0, 0, 0, 5, 0, 0, 2, 0, 0]

In [32]:
print([i/len(df_concept_drift[df_concept_drift.Sig == 'Non-Drift']) for i in non_drift_times_changes_in_each_feature])

[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.14285714285714285, 0.0, 0.0, 0.0, 0.0, 0.7142857142857143, 0.0, 0.0, 0.2857142857142857, 0.0, 0.0]


In [33]:
print([i/len(df_concept_drift[df_concept_drift.Sig == 'Non-Drift']) for i in non_drift_times_changes_in_each_feature][4])

1.0


In [78]:
features_disk_failure[18]

'smart_199_raw_diff'

In [75]:
print([i/len(df_concept_drift[df_concept_drift.Sig == 'Drift']) for i in drift_times_changes_in_each_feature][18])

0.0


In [76]:
print([i/len(df_concept_drift[df_concept_drift.Sig == 'Non-Drift']) for i in non_drift_times_changes_in_each_feature][18])

0.0


In [21]:
features_disk_failure

['smart_1_raw',
 'smart_4_raw',
 'smart_5_raw',
 'smart_7_raw',
 'smart_9_raw',
 'smart_12_raw',
 'smart_187_raw',
 'smart_193_raw',
 'smart_194_raw',
 'smart_197_raw',
 'smart_199_raw',
 'smart_4_raw_diff',
 'smart_5_raw_diff',
 'smart_9_raw_diff',
 'smart_12_raw_diff',
 'smart_187_raw_diff',
 'smart_193_raw_diff',
 'smart_197_raw_diff',
 'smart_199_raw_diff']

In [22]:
months = ['M1_2', 'M2_3', 'M3_4', 'M4_5', 'M5_6', 'M6_7', 'M7_8', 'M8_9', 'M9_10', 'M10_11', 'M11_12']


In [None]:
len(months)

In [None]:
drift_grount_truth = list(df_concept_drift.Sig)

In [None]:
df_results_monitoring_all_individual_features = pd.DataFrame()
df_results_monitoring_all_individual_features['Period'] = months
df_results_monitoring_all_individual_features['Drift Ground Truth'] = drift_grount_truth
df_results_monitoring_all_individual_features['No Changed Features Per Period'] = no_changed_features_per_period
df_results_monitoring_all_individual_features['Percentage Changed Features Per Period'] = perc_changed_features_per_period
df_results_monitoring_all_individual_features

In [None]:
df_results_monitoring_all_individual_features.to_csv('df_percentage_of_changed_features_disk_backblaze.csv')

Correlation Calculation

TODO!!!!