In [1]:
import pandas as pd
from sklearn import metrics, preprocessing
import numpy as np
from sklearn.preprocessing import StandardScaler
from scipy import stats
from tqdm import tqdm


In [2]:
def obtain_intervals(dataset):
    '''
    Generate interval terminals, so that samples in each interval have:
        interval_i = (timestamp >= terminal_i) and (timestamp < terminal_{i+1})

    Args:
        dataset (chr): 'g' for the Google dataset or 'b' for the Backblaze dataset

    Returns:
        list of int: the terminals, from start_time up to and including
        end_time, spaced unit_period apart (N + 1 terminals delimit N intervals)

    Raises:
        ValueError: if dataset is neither 'g' nor 'b'
    '''
    if dataset == 'g':
        # one day is 24*60*60*1000*1000 trace ticks, i.e. the time unit is the
        # microsecond; end_time lies 28 one-day periods after start_time
        start_time = 604046279
        unit_period = 24 * 60 * 60 * 1000 * 1000  # unit period: one day
        end_time = start_time + 28*unit_period
    elif dataset == 'b':
        # time unit in Backblaze: month, tracing time: one year (12 months)
        start_time = 1
        unit_period = 1  # unit period: one month
        end_time = start_time + 12*unit_period
    else:
        # without this branch the code below would fail with an unrelated
        # UnboundLocalError on start_time
        raise ValueError(
            "unknown dataset %r: expected 'g' (Google) or 'b' (Backblaze)" % (dataset,)
        )

    # add one unit for the open-end of range function
    return list(range(start_time, end_time + unit_period, unit_period))

In [3]:
def obtain_natural_chunks(features, labels, terminals):
    '''
    Split samples into consecutive chunks using the timestamp in column 0.

    Chunk i contains the rows whose timestamp t satisfies
        terminals[i] <= t < terminals[i + 1]

    Args:
        features: 2-D array whose first column is the timestamp
        labels: array indexed the same way as the rows of features
        terminals: sequence of interval boundaries

    Returns:
        (feature_list, label_list): one entry per interval; the feature
        arrays have the timestamp column removed
    '''
    # one boolean row mask per pair of neighbouring terminals
    masks = [
        np.logical_and(features[:, 0] >= lower, features[:, 0] < upper)
        for lower, upper in zip(terminals[:-1], terminals[1:])
    ]
    feature_list = [features[mask][:, 1:] for mask in masks]
    label_list = [labels[mask] for mask in masks]
    return feature_list, label_list


In [4]:
# Input CSV (disk_2015_complete.csv); the path is relative to the working directory.
DATASET_PATH = '../../../Documents/phd_related/AIOps_disk_failure_prediction/raw_data_2015_2017/disk_2015_complete.csv'
# NOTE(review): `interval` is not read by any cell visible in this notebook;
# presumably 'd' stands for daily chunking — confirm or remove.
interval = 'd'

In [5]:
# SMART attributes used as features: raw readings followed by their "_diff" variants.
features_disk_failure = [
    'smart_1_raw', 'smart_4_raw', 'smart_5_raw', 'smart_7_raw', 'smart_9_raw', 'smart_12_raw',
    'smart_187_raw', 'smart_193_raw', 'smart_194_raw', 'smart_197_raw', 'smart_199_raw',
    'smart_4_raw_diff', 'smart_5_raw_diff', 'smart_9_raw_diff', 'smart_12_raw_diff',
    'smart_187_raw_diff', 'smart_193_raw_diff', 'smart_197_raw_diff', 'smart_199_raw_diff',
]
# uncomment for 2015, 2016
columns = ['serial_number', 'date'] + features_disk_failure + ['label']
# uncomment for 2017 (missing serial number for some reason)
#columns = ['date'] + features_disk_failure + ['label']

# READ DATA
# Every cell is read as text without a header; the first row and the first
# column are then dropped by position (presumably the header row and a saved
# index column — TODO confirm against the file).
df = pd.read_csv(DATASET_PATH, header=None, dtype='str').iloc[1:, 1:]

# put columns names
df.columns = columns
# ignore serial number; dropping by name returns a new frame and keeps 'date'
# intact when the 2017 column layout (no serial number) is selected
df = df.drop(columns=['serial_number'], errors='ignore')

df[features_disk_failure] = df[features_disk_failure].astype(float)

# labels arrive as the strings 'True' / 'False'; any other value maps to NaN
d = {'True': True, 'False': False}
df['label'] = df['label'].map(d)

# transform date to date time
df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')

# original implementation (month of year):
#df['date'] = df['date'].dt.month

# Replace the date by its day of the year.
# The .dt accessor keeps df's own index. Wrapping the values in a new
# pd.Series (indexed 0..n-1) would be re-aligned on assignment against df's
# index (1..n after .iloc[1:]), shifting every day by one row and leaving NaN
# in the last row.
# NOTE(review): the chunking cell uses the Backblaze terminals 1..13, so with
# day-of-year values only days 1-12 are chunked, while later cells label the
# periods as months (M1_2, ...) — confirm which granularity is intended.
df['date'] = df['date'].dt.day_of_year

# first column is the time key, last column is the label
features = df[df.columns[:-1]].to_numpy()
labels = df[df.columns[-1]].to_numpy()


In [6]:
# DIVIDE FEATURES INTO CHUNKS
# obtain_intervals('b') returns the terminals 1..13, so this produces 12 chunks
# keyed on the integer values 1..12 of the first feature column.
# NOTE(review): that column is built from day_of_year, so the chunks are the
# first 12 days of the year, while later cells label the periods as months
# (M1_2, ...) — confirm which granularity is intended.

feature_list, label_list = obtain_natural_chunks(features, labels, obtain_intervals('b'))

In [7]:
# Sanity check: number of chunks produced by the split above.
len(feature_list)

12

# Data Preprocessing

In [8]:
# Load the drift-localization results that serve as ground truth, then keep
# only the columns whose name does not start with 'Unnamed'.
df_concept_drift = pd.read_csv('../../../Documents/phd_related/alibaba/results/results_r/rf_concept_drift_localization_backblaze_2015_r_avg_plot.csv')
unnamed_columns = df_concept_drift.columns.str.contains('^Unnamed')
df_concept_drift = df_concept_drift.loc[:, ~unnamed_columns]
df_concept_drift

Unnamed: 0,X,Sig,Y,M,Dataset,Model
0,2,Drift,0.443412,M1_2,Backblaze Disk Data,Random Forests Backblaze Disk Data
1,3,Drift,0.512604,M2_3,Backblaze Disk Data,Random Forests Backblaze Disk Data
2,4,Non-Drift,0.396403,M3_4,Backblaze Disk Data,Random Forests Backblaze Disk Data
3,5,Non-Drift,0.280091,M4_5,Backblaze Disk Data,Random Forests Backblaze Disk Data
4,6,Non-Drift,0.191991,M5_6,Backblaze Disk Data,Random Forests Backblaze Disk Data
5,7,Non-Drift,0.219476,M6_7,Backblaze Disk Data,Random Forests Backblaze Disk Data
6,8,Drift,0.536009,M7_8,Backblaze Disk Data,Random Forests Backblaze Disk Data
7,9,Drift,0.460164,M8_9,Backblaze Disk Data,Random Forests Backblaze Disk Data
8,10,Non-Drift,0.314639,M9_10,Backblaze Disk Data,Random Forests Backblaze Disk Data
9,11,Non-Drift,0.212075,M10_11,Backblaze Disk Data,Random Forests Backblaze Disk Data


In [9]:
# Sanity check: number of ground-truth rows (one per pair of consecutive periods).
len(df_concept_drift)

11

In [10]:
# Shared scaler instance; the monitoring loop refits it on the earlier period of each pair.
scaler = StandardScaler()
# Statistical test applied per feature to the samples of two periods.
# NOTE(review): when both arguments are sample arrays, scipy's kstest is
# documented to run the two-sample Kolmogorov-Smirnov test — confirm for the
# installed SciPy version.
stat_test = stats.kstest

# Monitoring the Number of Features that Change in Each Period

In [11]:
# Results of this cell: one entry per pair of consecutive periods.
no_changed_features_per_period = []
perc_changed_features_per_period = []

n_features = len(features_disk_failure)

for period in tqdm(range(len(feature_list) - 1)):

    # scale both periods with statistics fitted on the earlier one only
    df_train_features = pd.DataFrame(
        scaler.fit_transform(feature_list[period]),
        columns=features_disk_failure,
    )
    df_test_features = pd.DataFrame(
        scaler.transform(feature_list[period + 1]),
        columns=features_disk_failure,
    )

    # p-value of the test for every feature, earlier period vs later period
    p_values = [
        stat_test(df_train_features[name], df_test_features[name])[1]
        for name in features_disk_failure
    ]

    # a feature counts as changed when its p-value is below 0.05
    no_changed_features = sum(1 for p in p_values if p < 0.05)

    no_changed_features_per_period.append(no_changed_features)
    perc_changed_features_per_period.append(no_changed_features / n_features)

100%|██████████| 11/11 [00:01<00:00,  6.91it/s]


In [12]:
# Sanity check: one result per pair of consecutive periods.
len(perc_changed_features_per_period)

11

In [13]:
# Labels for each pair of consecutive months: 'M1_2', 'M2_3', ..., 'M11_12'.
months = [f'M{first}_{first + 1}' for first in range(1, 12)]


In [14]:
# Sanity check: number of period labels.
len(months)

11

In [15]:
# 'Drift' / 'Non-Drift' flag per period pair, taken from the Sig column.
# The misspelled name is kept on purpose: the cell that builds the results table reads it.
drift_grount_truth = df_concept_drift['Sig'].tolist()

In [16]:
# Results table: one row per pair of consecutive periods, combining the
# ground-truth flag with the count and share of features that changed.
df_results_monitoring_all_individual_features = pd.DataFrame({
    'Period': months,
    'Drift Ground Truth': drift_grount_truth,
    'No Changed Features Per Period': no_changed_features_per_period,
    'Percentage Changed Features Per Period': perc_changed_features_per_period,
})
df_results_monitoring_all_individual_features

Unnamed: 0,Period,Drift Ground Truth,No Changed Features Per Period,Percentage Changed Features Per Period
0,M1_2,Drift,2,0.105263
1,M2_3,Drift,1,0.052632
2,M3_4,Non-Drift,2,0.105263
3,M4_5,Non-Drift,2,0.105263
4,M5_6,Non-Drift,2,0.105263
5,M6_7,Non-Drift,2,0.105263
6,M7_8,Drift,2,0.105263
7,M8_9,Drift,2,0.105263
8,M9_10,Non-Drift,2,0.105263
9,M10_11,Non-Drift,2,0.105263


## Determine Correlation: Percentage of Changed Features vs. Drift Ground Truth

In [23]:
from sklearn.preprocessing import LabelEncoder
from scipy import stats

In [21]:
# Encoder used to turn the ground-truth strings into integer codes.
le = LabelEncoder()

In [22]:
# Encode the 'Drift Ground Truth' strings as integers.
# NOTE: per the displayed output, 'Drift' is encoded as 0 and 'Non-Drift' as 1,
# so a positive correlation with this vector means "more changed features in
# Non-Drift periods".
drift_ground_truth = le.fit_transform(df_results_monitoring_all_individual_features['Drift Ground Truth'])
drift_ground_truth

array([0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1])

In [25]:
# Spearman rank correlation between the share of changed features and the
# integer-encoded drift label.
res = stats.spearmanr(df_results_monitoring_all_individual_features['Percentage Changed Features Per Period'], drift_ground_truth)
res

SignificanceResult(statistic=0.4432026302139591, pvalue=0.1721607156534795)

In [28]:
# Keep the integer-encoded label in the table next to the original strings.
df_results_monitoring_all_individual_features['Drift_Non_Drift_Label'] = drift_ground_truth

In [29]:
# Cross-check with pandas.
# NOTE(review): Series.corr is called with its default method (Pearson per the
# pandas docs), not Spearman — confirm this is the intended comparison.
df_results_monitoring_all_individual_features['Percentage Changed Features Per Period'].corr(df_results_monitoring_all_individual_features['Drift_Non_Drift_Label'])

0.4432026302139591

In [17]:
# Persist the results table to the working directory.
# NOTE(review): this cell carries execution count 17, i.e. it last ran before
# 'Drift_Non_Drift_Label' was added (In [28]); a top-to-bottom run would write
# that extra column as well.
df_results_monitoring_all_individual_features.to_csv('df_percentage_of_changed_features_disk_backblaze.csv')

Correlation Calculation

TODO!!!!