In [None]:
from datetime import date, datetime, timedelta
import pandas as pd

import numpy as np
import math
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample
from sklearn import metrics, preprocessing
from sklearn.model_selection import KFold

from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from rgf.sklearn import RGFClassifier
from sklearn.svm import SVC

import os
from tqdm import tqdm

import matplotlib.pyplot as plt

from sklearn.metrics import roc_auc_score

In [None]:
# random_seeds = [1234, 4887, 597, 1959, 413, 44, 2969, 4971, 4913, 9591]

In [None]:
# Seed used for this run (example value 1234). It is read by obtain_model as the
# random forest's random_state and is written to the Random_Seed results column.
random_seed = 1234

In [None]:
def obtain_intervals(dataset):
    '''
    Generate interval terminals, so that samples in each interval have:
        interval_i = (timestamp >= terminal_i) and (timestamp < terminal_{i+1})

    Args:
        dataset (str): 'g' for the Google dataset or 'b' for the Backblaze dataset.

    Returns:
        list[int]: terminals from start_time up to and including end_time, one
        unit period apart (n periods give n + 1 terminals).

    Raises:
        ValueError: if dataset is neither 'g' nor 'b'.
    '''
    if dataset == 'g':
        # unit_period is the number of MICROseconds in one day
        # (24 h * 60 min * 60 s * 1000 * 1000); 28 one-day intervals are produced.
        start_time = 604046279
        unit_period = 24 * 60 * 60 * 1000 * 1000  # unit period: one day
        end_time = start_time + 28*unit_period
    elif dataset == 'b':
        # Timestamps are day numbers; 50 one-week intervals are produced.
        # The last terminal is 351, so day numbers >= 351 fall outside every interval.
        start_time = 1
        unit_period = 7  # unit period: one week (7 days)
        end_time = start_time + 50*unit_period
    else:
        # Without this guard an unknown code crashed below with an
        # UnboundLocalError on start_time.
        raise ValueError("dataset must be 'g' (Google) or 'b' (Backblaze), got %r" % (dataset,))

    # Original monthly split for Backblaze, kept for reference:
    #     start_time = 1
    #     unit_period = 1  # unit period: one month
    #     end_time = start_time + 12*unit_period

    # add one unit for the open-end of range function
    return list(range(start_time, end_time + unit_period, unit_period))

In [None]:
def obtain_model(model_name):
    '''
    Instantiate a specific model with preset parameters.

    Note: this function reads the module-level globals ``random_seed`` (used as
    random_state by 'rf', 'nn', 'svm' and 'cart') and ``N_WORKERS`` (n_jobs of
    'rf'); both must be defined before the corresponding model is requested.

    Args:
        model_name (str): [rf, nn, svm, cart, rgf]
    Returns:
        (instance): instance of given model with preset parameters.
        Return None if the model name is not in the option
    '''
    if model_name == 'rf':
        return RandomForestClassifier(n_estimators=50, criterion='gini', class_weight=None, max_depth=None,
                                      min_impurity_decrease=0.0, min_samples_leaf=1, min_samples_split=2,
                                      n_jobs=N_WORKERS, random_state=random_seed)
    elif model_name == 'nn':
        # Seeded like the random forest so that a run is reproducible for a given seed.
        return MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000, learning_rate='adaptive',
                             random_state=random_seed)
    elif model_name == 'svm':
        # probability=True makes SVC use randomness internally, hence the seed.
        return SVC(max_iter=100000, probability=True, random_state=random_seed)
    elif model_name == 'cart':
        return DecisionTreeClassifier(criterion='gini', class_weight=None, max_depth=None,
                                      min_impurity_decrease=0.0, min_samples_leaf=1, min_samples_split=2,
                                      random_state=random_seed)
    elif model_name == 'rgf':
        # NOTE(review): SafeRGF is not defined in the visible part of this notebook
        # (only RGFClassifier is imported) - confirm it exists before selecting 'rgf'.
        return SafeRGF()

    return None


In [None]:
def obtain_natural_chunks(features, labels, terminals):
    '''
    Split samples into consecutive chunks according to their timestamp.

    Column 0 of ``features`` is used as the timestamp. Chunk k holds the rows
    whose timestamp lies in the half-open range [terminals[k], terminals[k+1]).

    Args:
        features (2-D array): column 0 is the timestamp, remaining columns are features
        labels (1-D array): one label per row of ``features``
        terminals (sequence): interval terminals; n terminals give n - 1 chunks
    Returns:
        (list, list): per-chunk feature arrays (timestamp column removed) and
        per-chunk label arrays
    '''
    feature_list, label_list = [], []
    for lower, upper in zip(terminals, terminals[1:]):
        in_chunk = (features[:, 0] >= lower) & (features[:, 0] < upper)
        # drop the timestamp column, keep only the actual features
        feature_list.append(features[in_chunk][:, 1:])
        label_list.append(labels[in_chunk])
    return feature_list, label_list


In [None]:
def cross_validation_original(model, features, labels):
    '''
    Run a 10-fold cross validation (consecutive folds, no shuffling) and count
    the misclassified samples over all folds.

    Args:
        model: estimator offering fit(X, y) and predict(X); it is re-fitted on every fold
        features (array): sample matrix, indexed with the fold index arrays
        labels (array): sample labels, indexed with the fold index arrays
    Returns:
        (int, int): number of misclassified test samples, number of tested samples
    '''
    misclassified, evaluated = 0, 0
    splitter = KFold(n_splits=10, shuffle=False)
    for train_idx, test_idx in splitter.split(features):
        x_train, y_train = features[train_idx], labels[train_idx]
        x_test, y_test = features[test_idx], labels[test_idx]

        model.fit(x_train, y_train)
        fold_preds = model.predict(x_test)

        misclassified += np.count_nonzero(fold_preds != y_test)
        evaluated += len(y_test)

    return misclassified, evaluated

In [None]:
# MODIFIED
def cross_validation(model, features, labels, n_splits=10):
    '''
    Run a k-fold cross validation (consecutive folds, no shuffling), keeping the
    per-fold true labels and predictions in addition to the error counts.

    Args:
        model: estimator offering fit(X, y) and predict(X); it is re-fitted on every fold
        features (array): sample matrix, indexed with the fold index arrays
        labels (array): sample labels, indexed with the fold index arrays
        n_splits (int): number of folds handed to KFold (default 10, the value
            that was previously hard-coded)
    Returns:
        (list, list, int, int):
            true labels per fold,
            predicted labels per fold,
            number of misclassified test samples over all folds,
            number of tested samples over all folds
    '''
    kf = KFold(n_splits=n_splits, shuffle=False)
    error_num = 0
    total_num = 0
    testing_preds_cross_val = []
    testing_true_cross_val = []

    for training_index, testing_index in kf.split(features):
        training_features, training_labels = features[training_index], labels[training_index]
        testing_features, testing_labels = features[testing_index], labels[testing_index]

        model.fit(training_features, training_labels)
        testing_preds = model.predict(testing_features)

        # keep the raw per-fold outputs so callers can compute other metrics
        testing_preds_cross_val.append(testing_preds)
        testing_true_cross_val.append(testing_labels)

        error_num += np.count_nonzero(testing_preds != testing_labels)
        total_num += len(testing_labels)

    return testing_true_cross_val, testing_preds_cross_val, error_num, total_num
        
        


# Reading Data

In [None]:
# Relative path of the input CSV; it is loaded below with header=None,
# i.e. the file is read as having no header row.
DATASET_PATH = 'datasets/disk_failure_2015.csv'

In [None]:
# Feature columns of the CSV, in file order: raw values first, then the "_diff" columns.
features_disk_failure = [
    # raw attribute values
    'smart_1_raw',
    'smart_4_raw',
    'smart_5_raw',
    'smart_7_raw',
    'smart_9_raw',
    'smart_12_raw',
    'smart_187_raw',
    'smart_193_raw',
    'smart_194_raw',
    'smart_197_raw',
    'smart_199_raw',
    # "_diff" columns
    'smart_4_raw_diff',
    'smart_5_raw_diff',
    'smart_9_raw_diff',
    'smart_12_raw_diff',
    'smart_187_raw_diff',
    'smart_193_raw_diff',
    'smart_197_raw_diff',
    'smart_199_raw_diff',
]

# Full column layout of the file: identifier, date, features, label.
columns = ['serial_number', 'date', *features_disk_failure, 'label']

In [None]:
# Load the header-less CSV, attach the column names, then discard the first
# column (the serial number), which is not used as a feature.
df = pd.read_csv(DATASET_PATH, header=None)
df.columns = columns
df = df.drop(columns=df.columns[0])

In [None]:
# number of rows that were loaded
len(df)

In [None]:
# Parse the 'date' strings (expected as YYYY-MM-DD) into pandas datetimes.
# NOTE: this overwrites df['date'] in place.
df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')

In [None]:
# display the frame after the date column was parsed
df

In [None]:
# Replace every date by its day of the year. obtain_intervals('b') later
# groups these day numbers into 7-day (weekly) chunks.
#
# original (monthly) implementation:
#df['date'] = pd.Series(pd.DatetimeIndex(df['date']).month)
#
# The .dt accessor keeps the frame's own index (no dependence on a default
# RangeIndex) and raises if this cell is re-run on the already converted
# integer column, instead of silently producing wrong day numbers.
df['date'] = df['date'].dt.dayofyear

In [None]:
# Re-read the raw CSV without any of the processing above and display it.
# NOTE(review): df_test is not referenced by any later cell shown in this notebook.
df_test = pd.read_csv(DATASET_PATH, header=None)
df_test

In [None]:
# Everything except the last column becomes the feature matrix (its column 0 is
# the 'date' value, used later as the timestamp); the last column is the label.
features = df.iloc[:, :-1].to_numpy()
labels = df.iloc[:, -1].to_numpy()

In [None]:
# quick look at the label vector
labels

In [None]:
# quick look at the feature matrix (column 0 is the 'date' value)
features

In [None]:
# DIVIDE FEATURES INTO WEEKS
# obtain_intervals('b') provides the weekly terminals; obtain_natural_chunks returns
# one feature array (timestamp column removed) and one label array per interval.

feature_list, label_list = obtain_natural_chunks(features, labels, obtain_intervals('b'))

In [None]:
# Empty dataframe that will receive the per-step results
# (its columns are filled in after the training loop).
df_expected_actual = pd.DataFrame()
df_expected_actual

In [None]:
# original implementation
#months = ['M1_2', 'M2_3', 'M3_4', 'M4_5', 'M5_6', 'M6_7', 'M7_8', 'M8_9', 'M9_10', 'M10_11', 'M11_12']

# our implementation
# One label per (train chunk, test chunk) pair of consecutive weeks: 'W1_2', 'W2_3', ...
weeks = [f'W{first}_{first + 1}' for first in range(1, len(feature_list))]
len(weeks)

In [None]:
model_name = 'rf'
N_WORKERS = 1

# Per-step results. They are (re)created here so that re-running this cell starts clean.
# NOTE: despite their names, the two *_error_rate lists hold misclassification
# COUNTS; the matching *_size lists hold the number of evaluated samples.
training_error_rate = []
testing_error_rate = []
training_size = []
testing_size = []

feature_importance = []

# Step i trains on chunk i and tests on chunk i + 1. The chunks are weekly; the
# variable names and log messages still say "month" from the monthly implementation.
for i in tqdm(range(0, len(weeks))):

    # OBTAIN TRAINING DATA + LABELS
    month = i
    print('Train month', month)

    # SCALE TRAINING DATA (the scaler fitted here is reused for the testing chunk)
    scaler = StandardScaler()
    training_features = scaler.fit_transform(feature_list[month])

    # OBTAIN TRAINING LABELS
    training_labels = label_list[month]

    # DOWNSAMPLING TRAINING
    # the model learns quite well on the training set but it cannot extrapolate that well on the testing set
    # skip this part
    #training_features, training_labels = downsampling(training_features, training_labels)

    # CROSS VALIDATION ON THE TRAINING CHUNK TO OBTAIN THE EXPECTED ERROR
    print('Calculating Expected ROC AUC')

    testing_true_cross_val, testing_pred_cross_val, error_train, total_len_train = cross_validation(
        obtain_model(model_name), training_features, training_labels)

    # OBTAIN ERROR COUNT TRAIN + SIZE
    training_error_rate.append(error_train)
    training_size.append(total_len_train)

    print('Error Rate Train', error_train)
    print('Error Rate Train Size', total_len_train)

    # Expected / actual ROC AUC are deliberately not computed:
    # WE GIVE UP ON ROC AUC BECAUSE IT CAN BE THE CASE THAT ONLY ONE CLASS IS REPRESENTED

    # FIT MODEL (a fresh instance, trained on the whole training chunk)
    print('Model Training')

    model = obtain_model(model_name)
    model.fit(training_features, training_labels)

    # OBTAIN TESTING DATA + LABELS (the following chunk)
    month_test = month + 1

    print('Test month', month_test)

    # SCALE TESTING DATA with the statistics of the training chunk
    testing_features = scaler.transform(feature_list[month_test])

    # OBTAIN TESTING LABELS
    testing_labels = label_list[month_test]

    # OBTAIN PREDICTION
    predictions_test = model.predict(testing_features)

    # OBTAIN ERROR COUNT TEST + SIZE
    testing_err = np.count_nonzero(testing_labels != predictions_test)
    testing_error_rate.append(testing_err)
    testing_size.append(len(testing_labels))

    print('Error Rate Test', testing_err)
    print('Error Rate Test Size', len(testing_labels))

    # EXTRACT FEATURE IMPORTANCE
    # Models without a feature_importances_ attribute get None instead of
    # aborting the run with an AttributeError after all the training work.
    feature_importance.append(getattr(model, 'feature_importances_', None))

In [None]:
# Fill the results frame, one column per collected list (insertion order = column order).
# Random_Seed repeats the run's seed once per row.
result_columns = {
    'Weeks': weeks,
    'Training_Error_Rate': training_error_rate,
    'Testing_Error_Rate': testing_error_rate,
    'Training_Size': training_size,
    'Testing_Size': testing_size,
    'Feature_Importance': feature_importance,
    'Random_Seed': [random_seed]*len(feature_importance),
}
for column_name, column_values in result_columns.items():
    df_expected_actual[column_name] = column_values

In [None]:
# display the assembled results table
df_expected_actual

In [None]:
# Create the output folder first: to_csv fails when the target directory is missing.
os.makedirs('./results', exist_ok=True)
# NOTE(review): the file name is hard-coded ('rf', 'rs1'); it does not follow
# model_name / random_seed, so adapt it by hand when those are changed.
df_expected_actual.to_csv('./results/concept_drift_disk_2015_rf_week_feature_importance_rs1.csv')