In [None]:
from datetime import date, datetime, timedelta
import pandas as pd

import numpy as np
import math
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample
from sklearn import metrics, preprocessing
from sklearn.model_selection import KFold

from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from rgf.sklearn import RGFClassifier
from sklearn.svm import SVC

import os
from tqdm import tqdm

import matplotlib.pyplot as plt

from sklearn.metrics import roc_auc_score

In [None]:
# random_seeds = [1234, 4887, 597, 1959, 413, 44, 2969, 4971, 4913, 9591]

In [None]:
# Seed for this run (example: 1234). obtain_model() reads it as a global for the
# classifier's random_state, and it is stored in the 'Random_Seed' results column.
# NOTE(review): the stored output of the results table shows Random_Seed 9591, so the
# saved outputs appear to come from a run with a different seed — re-run to confirm.
random_seed = 1234

In [None]:
def obtain_intervals(dataset):
    '''
    Generate interval terminals, so that samples in each interval have:
        interval_i = (timestamp >= terminal_i) and (timestamp < terminal_{i+1})

    Args:
        dataset (str): 'g' for the Google dataset or 'b' for the Backblaze dataset.
    Returns:
        (list of int): ascending terminals; N+1 terminals delimit N half-open intervals.
    Raises:
        ValueError: if dataset is neither 'g' nor 'b'.
    '''
    if dataset == 'g':
        # Google: 28 one-day intervals. The period below is one day expressed in
        # microseconds (24*60*60*1000*1000), so timestamps are presumably in
        # microseconds — TODO confirm (the earlier comment said milliseconds).
        start_time = 604046279
        unit_period = 24 * 60 * 60 * 1000 * 1000  # unit period: one day
        end_time = start_time + 28*unit_period
    elif dataset == 'b':
        # Backblaze: the notebook stores the day of the year as timestamp, so a period
        # of 7 is one week. 50 weeks -> terminals 1, 8, ..., 351, i.e. days 1..350;
        # later days fall outside every interval.
        # (The original monthly setup used start_time=1, unit_period=1, 12 periods.)
        start_time = 1
        unit_period = 7  # unit period: one week (7 days)
        end_time = start_time + 50*unit_period
    else:
        # Previously an unknown name fell through to an UnboundLocalError below.
        raise ValueError("Unknown dataset {!r}; expected 'g' or 'b'".format(dataset))

    # add one unit for the open-end of range function
    return list(range(start_time, end_time + unit_period, unit_period))

In [None]:
def obtain_model(model_name):
    '''
    Instantiate a classifier with preset hyper-parameters.

    Note: reads the notebook-level globals ``random_seed`` (random_state of every
    seeded model) and ``N_WORKERS`` (n_jobs of the random forest); both must be
    defined before this function is called.

    Args:
        model_name (str): [rf, nn, svm, cart, rgf]
    Returns:
        (instance): new, unfitted instance of the given model with preset parameters.
        Return None if the model name is not in the option
    '''
    if model_name == 'rf':
        return RandomForestClassifier(n_estimators=50, criterion='gini', class_weight=None, max_depth=None,
                                      min_impurity_decrease=0.0, min_samples_leaf=1, min_samples_split=2,
                                      n_jobs=N_WORKERS, random_state=random_seed)
    elif model_name == 'nn':
        # Seeded so that runs are reproducible for a given random_seed, like 'rf'.
        # NOTE(review): per the scikit-learn docs learning_rate is only used with
        # solver='sgd'; with the default solver it likely has no effect — confirm.
        return MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000, learning_rate='adaptive',
                             random_state=random_seed)
    elif model_name == 'svm':
        # probability=True enables predict_proba; seeded for reproducibility.
        return SVC(max_iter=100000, probability=True, random_state=random_seed)
    elif model_name == 'cart':
        return DecisionTreeClassifier(criterion='gini', class_weight=None, max_depth=None,
                                      min_impurity_decrease=0.0, min_samples_leaf=1, min_samples_split=2,
                                      random_state=random_seed)
    elif model_name == 'rgf':
        # NOTE(review): SafeRGF is not defined in the visible part of this notebook
        # (only RGFClassifier is imported) — make sure it exists before using 'rgf'.
        return SafeRGF()

    return None


In [None]:
def obtain_natural_chunks(features, labels, terminals):
    '''
    Split samples into consecutive chunks according to their timestamp.

    Column 0 of ``features`` is used as the timestamp. Chunk k holds the rows with
    terminals[k] <= timestamp < terminals[k+1]; the timestamp column itself is
    dropped from the returned feature chunks.

    Args:
        features (2-D array): rows are samples, column 0 is the timestamp.
        labels (1-D array): one label per row of ``features``.
        terminals (sequence): ascending interval boundaries.
    Returns:
        (list, list): per-chunk feature arrays and per-chunk label arrays,
        each of length len(terminals) - 1.
    '''
    chunk_features, chunk_labels = [], []
    # Walk over adjacent boundary pairs (lower inclusive, upper exclusive).
    for lower, upper in zip(terminals[:-1], terminals[1:]):
        in_chunk = np.logical_and(features[:, 0] >= lower, features[:, 0] < upper)
        selected = features[in_chunk]
        chunk_features.append(selected[:, 1:])
        chunk_labels.append(labels[in_chunk])
    return chunk_features, chunk_labels


In [None]:
def cross_validation_original(model, features, labels):
    '''
    Run 10-fold cross-validation (contiguous folds, no shuffling) and count errors.

    The same ``model`` object is re-fitted on every fold, so after the call it
    holds the fit of the last fold.

    Args:
        model: estimator exposing fit(X, y) and predict(X).
        features (array): samples, indexable by an integer index array.
        labels (array): one label per sample.
    Returns:
        (int, int): number of misclassified test samples summed over all folds,
        and the total number of test samples evaluated.
    '''
    splitter = KFold(n_splits=10, shuffle=False)
    misclassified, evaluated = 0, 0
    for train_idx, test_idx in splitter.split(features):
        model.fit(features[train_idx], labels[train_idx])
        fold_labels = labels[test_idx]
        fold_preds = model.predict(features[test_idx])
        misclassified += np.count_nonzero(fold_preds != fold_labels)
        evaluated += len(fold_labels)

    return misclassified, evaluated

In [None]:
# Variant of cross_validation_original that also returns the per-fold labels and predictions.
def cross_validation(model, features, labels):
    '''
    Run 10-fold cross-validation (contiguous folds, no shuffling).

    The same ``model`` object is re-fitted on every fold, so after the call it
    holds the fit of the last fold.

    Args:
        model: estimator exposing fit(X, y) and predict(X).
        features (array): samples, indexable by an integer index array.
        labels (array): one label per sample.
    Returns:
        (list, list, int, int):
            true labels of each test fold,
            predicted labels of each test fold,
            number of misclassified test samples summed over all folds,
            total number of test samples evaluated.
    '''
    kf = KFold(n_splits=10, shuffle=False)
    error_num = 0
    total_num = 0
    testing_preds_cross_val = []
    testing_true_cross_val = []

    for training_index, testing_index in kf.split(features):
        training_features, training_labels = features[training_index], labels[training_index]
        testing_features, testing_labels = features[testing_index], labels[testing_index]
        model.fit(training_features, training_labels)
        testing_preds = model.predict(testing_features)

        # Keep the raw fold outputs so callers can compute additional metrics.
        testing_preds_cross_val.append(testing_preds)
        testing_true_cross_val.append(testing_labels)

        error_num += np.count_nonzero(testing_preds != testing_labels)
        total_num += len(testing_labels)

    return testing_true_cross_val, testing_preds_cross_val, error_num, total_num
        
        


# Reading Data

In [None]:
# Disk-failure CSV for 2015; it is read without a header row.
# The path is relative to the notebook's working directory.
DATASET_PATH = 'datasets/disk_failure_2015.csv'

In [None]:
# SMART attribute ids whose raw values are used as features, and the subset
# for which a '<name>_diff' column is present as well.
SMART_RAW_IDS = [1, 4, 5, 7, 9, 12, 187, 193, 194, 197, 199]
SMART_DIFF_IDS = [4, 5, 9, 12, 187, 193, 197, 199]

# Feature columns in file order: all raw columns first, then all diff columns.
features_disk_failure = (
    ['smart_{}_raw'.format(smart_id) for smart_id in SMART_RAW_IDS]
    + ['smart_{}_raw_diff'.format(smart_id) for smart_id in SMART_DIFF_IDS]
)

# Full column layout of the CSV: identifier, date, features, label.
columns = ['serial_number', 'date']
columns = columns + features_disk_failure
columns = columns + ['label']

In [None]:
# Load the headerless CSV, name its columns, and discard the serial number
# (first column), which is not used as a feature.
df = (
    pd.read_csv(DATASET_PATH, header=None)
    .set_axis(columns, axis=1)
    .drop(columns=['serial_number'])
)

In [11]:
# number of rows loaded
len(df)

7036272

In [12]:
# Parse the 'date' strings (YYYY-MM-DD) into pandas datetimes so that
# calendar fields can be extracted from them.
df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')

In [13]:
# preview the parsed frame (pandas truncates the display)
df

Unnamed: 0,date,smart_1_raw,smart_4_raw,smart_5_raw,smart_7_raw,smart_9_raw,smart_12_raw,smart_187_raw,smart_193_raw,smart_194_raw,...,smart_199_raw,smart_4_raw_diff,smart_5_raw_diff,smart_9_raw_diff,smart_12_raw_diff,smart_187_raw_diff,smart_193_raw_diff,smart_197_raw_diff,smart_199_raw_diff,label
0,2015-01-01,8.984068e+07,1.0,0.0,1.139112e+08,1951.714286,1.0,0.0,21951.0,22.000000,...,0.0,0.0,0.0,143.0,0.0,0.0,0.0,0.0,0.0,False
1,2015-01-01,1.316887e+08,7.0,0.0,5.268222e+07,11557.000000,7.0,0.0,359.0,25.714286,...,0.0,0.0,0.0,144.0,0.0,0.0,0.0,0.0,0.0,False
2,2015-01-01,1.003851e+08,8.0,0.0,5.879862e+08,8786.285714,7.0,0.0,22985.0,23.000000,...,0.0,0.0,0.0,145.0,0.0,0.0,0.0,0.0,0.0,False
3,2015-01-01,7.431668e+07,4.0,0.0,7.833152e+08,9220.142857,4.0,0.0,685.0,23.000000,...,0.0,0.0,0.0,145.0,0.0,0.0,0.0,0.0,0.0,False
4,2015-01-01,1.333185e+08,34.0,0.0,1.836819e+08,27408.714286,33.0,0.0,34.0,27.142857,...,1.0,0.0,0.0,144.0,0.0,0.0,0.0,0.0,0.0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7036267,2015-12-18,1.433012e+08,6.0,0.0,5.124199e+08,10137.000000,6.0,0.0,3053.0,21.000000,...,0.0,0.0,0.0,144.0,0.0,0.0,0.0,0.0,0.0,False
7036268,2015-12-18,1.437354e+08,3.0,0.0,1.316852e+08,3727.000000,3.0,0.0,38821.0,32.000000,...,0.0,0.0,0.0,144.0,0.0,0.0,0.0,0.0,0.0,False
7036269,2015-12-18,9.491236e+07,3.0,0.0,4.019294e+07,1092.000000,3.0,0.0,10376.0,27.571429,...,0.0,0.0,0.0,144.0,0.0,0.0,0.0,0.0,0.0,False
7036270,2015-12-18,6.574741e+07,1.0,0.0,1.621107e+07,774.428571,1.0,0.0,6741.0,25.142857,...,0.0,0.0,0.0,144.0,0.0,0.0,0.0,0.0,0.0,False


In [14]:
# Replace each date with its day of the year; obtain_intervals('b') later groups
# these day numbers into 7-day (weekly) intervals.

# original implementation (monthly granularity)
#df['date'] = pd.Series(pd.DatetimeIndex(df['date']).month)

# NOTE(review): this overwrites the datetime column in place, so the cell is not
# idempotent — re-run the to_datetime cell before re-running this one.
df['date'] = pd.Series(pd.DatetimeIndex(df['date']).day_of_year)

In [15]:
# Re-read the raw CSV without column names to inspect the untouched file
# (including the serial-number column that was dropped from df).
# NOTE(review): this loads the full file a second time and df_test is not used
# again in the visible notebook — consider nrows=... or removing the cell.
df_test = pd.read_csv(DATASET_PATH, header=None)
df_test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
0,Z3023VGH,2015-01-01,8.984068e+07,1.0,0.0,1.139112e+08,1951.714286,1.0,0.0,21951.0,...,0.0,0.0,0.0,143.0,0.0,0.0,0.0,0.0,0.0,False
1,Z300JA8E,2015-01-01,1.316887e+08,7.0,0.0,5.268222e+07,11557.000000,7.0,0.0,359.0,...,0.0,0.0,0.0,144.0,0.0,0.0,0.0,0.0,0.0,False
2,Z300H2KS,2015-01-01,1.003851e+08,8.0,0.0,5.879862e+08,8786.285714,7.0,0.0,22985.0,...,0.0,0.0,0.0,145.0,0.0,0.0,0.0,0.0,0.0,False
3,Z300X922,2015-01-01,7.431668e+07,4.0,0.0,7.833152e+08,9220.142857,4.0,0.0,685.0,...,0.0,0.0,0.0,145.0,0.0,0.0,0.0,0.0,0.0,False
4,Z290YGNR,2015-01-01,1.333185e+08,34.0,0.0,1.836819e+08,27408.714286,33.0,0.0,34.0,...,1.0,0.0,0.0,144.0,0.0,0.0,0.0,0.0,0.0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7036267,Z3010VAH,2015-12-18,1.433012e+08,6.0,0.0,5.124199e+08,10137.000000,6.0,0.0,3053.0,...,0.0,0.0,0.0,144.0,0.0,0.0,0.0,0.0,0.0,False
7036268,S300Z5A9,2015-12-18,1.437354e+08,3.0,0.0,1.316852e+08,3727.000000,3.0,0.0,38821.0,...,0.0,0.0,0.0,144.0,0.0,0.0,0.0,0.0,0.0,False
7036269,Z304JN6Q,2015-12-18,9.491236e+07,3.0,0.0,4.019294e+07,1092.000000,3.0,0.0,10376.0,...,0.0,0.0,0.0,144.0,0.0,0.0,0.0,0.0,0.0,False
7036270,Z304JN6P,2015-12-18,6.574741e+07,1.0,0.0,1.621107e+07,774.428571,1.0,0.0,6741.0,...,0.0,0.0,0.0,144.0,0.0,0.0,0.0,0.0,0.0,False


In [16]:
# Split the frame positionally: every column but the last is a feature
# (column 0 being the day-of-year timestamp), the last column is the label.
features = df.iloc[:, :-1].to_numpy()
labels = df.iloc[:, -1].to_numpy()

In [17]:
# preview the label array
labels

array([False, False, False, ..., False, False, False])

In [18]:
# preview the feature matrix (column 0 is the day of the year)
features

array([[1.00000000e+00, 8.98406777e+07, 1.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [1.00000000e+00, 1.31688750e+08, 7.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [1.00000000e+00, 1.00385139e+08, 8.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       ...,
       [3.52000000e+02, 9.49123577e+07, 3.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [3.52000000e+02, 6.57474069e+07, 1.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [3.52000000e+02, 1.21146398e+08, 4.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00]])

In [19]:
# Split the samples into consecutive weekly chunks using the day-of-year in
# column 0; the timestamp column is dropped from each returned feature chunk.

feature_list, label_list = obtain_natural_chunks(features, labels, obtain_intervals('b'))

In [20]:
# Empty frame that will receive one row per (training week, testing week) pair
# once the train/test loop has finished.
df_expected_actual = pd.DataFrame()
df_expected_actual

In [21]:
# original implementation
#months = ['M1_2', 'M2_3', 'M3_4', 'M4_5', 'M5_6', 'M6_7', 'M7_8', 'M8_9', 'M9_10', 'M10_11', 'M11_12']

# our implementation
# One label per pair of consecutive weekly chunks: 'W<k>_<k+1>' means
# "trained on week k, tested on week k+1" (1-based).
weeks = ['W' + str(k + 1) + '_' + str(k + 2) for k in range(len(feature_list) - 1)]
len(weeks)

49

In [22]:
# Model under test and degree of parallelism; obtain_model() reads N_WORKERS as a global.
model_name = 'rf'
N_WORKERS = 1

# Per week-pair results. Despite the "*_error_rate" names these lists hold raw
# misclassification COUNTS; divide by the matching *_size entry to obtain a rate.
training_error_rate = []
testing_error_rate = []
training_size = []
testing_size = []

# Feature importances of the model fitted on each training week
# (None for models that do not expose feature_importances_).
feature_importance = []

# Sliding window over consecutive chunks: train on chunk k, test on chunk k + 1.
# The log messages still say "month" (left over from the monthly version of the
# experiment); the indices are weekly chunk indices.
for week_idx in tqdm(range(len(weeks))):

    print('Train month', week_idx)

    # Standardise the features using statistics of the training week only.
    scaler = StandardScaler()
    training_features = scaler.fit_transform(feature_list[week_idx])
    training_labels = label_list[week_idx]

    # Down-sampling of the training week was tried and is skipped on purpose:
    # the model learned the training set well but did not extrapolate to the test set.
    #training_features, training_labels = downsampling(training_features, training_labels)

    # Expected error: 10-fold cross-validation inside the training week.
    # NOTE(review): the scaler above is fitted on the whole week, so the CV folds
    # share scaling statistics.
    # ROC AUC is intentionally not computed (here or on the test week) because a
    # split may contain only one class; the fold outputs are kept but unused.
    print('Calculating Expected ROC AUC')

    testing_true_cross_val, testing_pred_cross_val, error_train, total_len_train = cross_validation(
        obtain_model(model_name), training_features, training_labels)

    training_error_rate.append(error_train)
    training_size.append(total_len_train)

    # These print error counts, not rates.
    print('Error Rate Train', error_train)
    print('Error Rate Train Size', total_len_train)

    # Fit a fresh model on the full training week.
    print('Model Training')

    model = obtain_model(model_name)
    model.fit(training_features, training_labels)

    # Actual error: evaluate on the following week, scaled with the training-week scaler.
    next_week_idx = week_idx + 1

    print('Test month', next_week_idx)

    testing_features = scaler.transform(feature_list[next_week_idx])
    testing_labels = label_list[next_week_idx]

    predictions_test = model.predict(testing_features)

    testing_err = np.count_nonzero(testing_labels != predictions_test)
    testing_error_rate.append(testing_err)
    testing_size.append(len(testing_labels))

    print('Error Rate Test', testing_err)
    print('Error Rate Test Size', len(testing_labels))

    # Not every model returned by obtain_model() exposes feature_importances_;
    # record None instead of failing at the very end of an expensive iteration.
    feature_importance.append(getattr(model, 'feature_importances_', None))

  0%|          | 0/49 [00:00<?, ?it/s]

Train month 0
Calculating Expected ROC AUC
Error Rate Train 60
Error Rate Train Size 95296
Model Training
Test month 1


  2%|▏         | 1/49 [00:31<25:26, 31.80s/it]

Error Rate Test 169
Error Rate Test Size 97665
Train month 1
Calculating Expected ROC AUC
Error Rate Train 45
Error Rate Train Size 97665
Model Training
Test month 2


  4%|▍         | 2/49 [01:05<25:55, 33.10s/it]

Error Rate Test 122
Error Rate Test Size 99802
Train month 2
Calculating Expected ROC AUC
Error Rate Train 67
Error Rate Train Size 99802
Model Training
Test month 3


  6%|▌         | 3/49 [01:39<25:30, 33.28s/it]

Error Rate Test 134
Error Rate Test Size 102616
Train month 3
Calculating Expected ROC AUC
Error Rate Train 61
Error Rate Train Size 102616
Model Training
Test month 4


  8%|▊         | 4/49 [02:13<25:11, 33.58s/it]

Error Rate Test 120
Error Rate Test Size 104596
Train month 4
Calculating Expected ROC AUC
Error Rate Train 50
Error Rate Train Size 104596
Model Training
Test month 5


 10%|█         | 5/49 [02:55<26:59, 36.81s/it]

Error Rate Test 81
Error Rate Test Size 106254
Train month 5
Calculating Expected ROC AUC
Error Rate Train 40
Error Rate Train Size 106254
Model Training
Test month 6


 12%|█▏        | 6/49 [03:32<26:22, 36.80s/it]

Error Rate Test 126
Error Rate Test Size 112397
Train month 6
Calculating Expected ROC AUC
Error Rate Train 40
Error Rate Train Size 112397
Model Training
Test month 7


 14%|█▍        | 7/49 [04:09<25:41, 36.70s/it]

Error Rate Test 93
Error Rate Test Size 112790
Train month 7
Calculating Expected ROC AUC
Error Rate Train 48
Error Rate Train Size 112790
Model Training
Test month 8


 16%|█▋        | 8/49 [04:55<27:04, 39.62s/it]

Error Rate Test 111
Error Rate Test Size 112953
Train month 8
Calculating Expected ROC AUC
Error Rate Train 46
Error Rate Train Size 112953
Model Training
Test month 9


 18%|█▊        | 9/49 [05:44<28:27, 42.69s/it]

Error Rate Test 117
Error Rate Test Size 114059
Train month 9
Calculating Expected ROC AUC
Error Rate Train 41
Error Rate Train Size 114059
Model Training
Test month 10


 20%|██        | 10/49 [06:29<28:11, 43.38s/it]

Error Rate Test 130
Error Rate Test Size 114420
Train month 10
Calculating Expected ROC AUC
Error Rate Train 50
Error Rate Train Size 114420
Model Training
Test month 11


 22%|██▏       | 11/49 [07:12<27:21, 43.21s/it]

Error Rate Test 72
Error Rate Test Size 113828
Train month 11
Calculating Expected ROC AUC
Error Rate Train 28
Error Rate Train Size 113828
Model Training
Test month 12


 24%|██▍       | 12/49 [07:56<26:47, 43.45s/it]

Error Rate Test 82
Error Rate Test Size 114371
Train month 12
Calculating Expected ROC AUC
Error Rate Train 33
Error Rate Train Size 114371
Model Training
Test month 13


 27%|██▋       | 13/49 [08:39<26:05, 43.48s/it]

Error Rate Test 89
Error Rate Test Size 113900
Train month 13
Calculating Expected ROC AUC
Error Rate Train 23
Error Rate Train Size 113900
Model Training
Test month 14


 29%|██▊       | 14/49 [09:18<24:30, 42.00s/it]

Error Rate Test 59
Error Rate Test Size 115008
Train month 14
Calculating Expected ROC AUC
Error Rate Train 25
Error Rate Train Size 115008
Model Training
Test month 15


 31%|███       | 15/49 [09:53<22:35, 39.86s/it]

Error Rate Test 59
Error Rate Test Size 123638
Train month 15
Calculating Expected ROC AUC
Error Rate Train 37
Error Rate Train Size 123638
Model Training
Test month 16


 33%|███▎      | 16/49 [10:33<21:55, 39.87s/it]

Error Rate Test 51
Error Rate Test Size 129923
Train month 16
Calculating Expected ROC AUC
Error Rate Train 24
Error Rate Train Size 129923
Model Training
Test month 17


 35%|███▍      | 17/49 [11:16<21:49, 40.93s/it]

Error Rate Test 45
Error Rate Test Size 131041
Train month 17
Calculating Expected ROC AUC
Error Rate Train 25
Error Rate Train Size 131041
Model Training
Test month 18


 37%|███▋      | 18/49 [12:01<21:43, 42.06s/it]

Error Rate Test 57
Error Rate Test Size 131180
Train month 18
Calculating Expected ROC AUC
Error Rate Train 30
Error Rate Train Size 131180
Model Training
Test month 19


 39%|███▉      | 19/49 [12:47<21:36, 43.22s/it]

Error Rate Test 52
Error Rate Test Size 130536
Train month 19
Calculating Expected ROC AUC
Error Rate Train 15
Error Rate Train Size 130536
Model Training
Test month 20


 41%|████      | 20/49 [13:28<20:40, 42.77s/it]

Error Rate Test 60
Error Rate Test Size 135950
Train month 20
Calculating Expected ROC AUC
Error Rate Train 28
Error Rate Train Size 135950
Model Training
Test month 21


 43%|████▎     | 21/49 [14:17<20:45, 44.48s/it]

Error Rate Test 68
Error Rate Test Size 136179
Train month 21
Calculating Expected ROC AUC
Error Rate Train 22
Error Rate Train Size 136179
Model Training
Test month 22


 45%|████▍     | 22/49 [15:12<21:29, 47.75s/it]

Error Rate Test 62
Error Rate Test Size 137793
Train month 22
Calculating Expected ROC AUC
Error Rate Train 40
Error Rate Train Size 137793
Model Training
Test month 23


 47%|████▋     | 23/49 [16:09<21:48, 50.33s/it]

Error Rate Test 79
Error Rate Test Size 142204
Train month 23
Calculating Expected ROC AUC
Error Rate Train 45
Error Rate Train Size 142204
Model Training
Test month 24


 49%|████▉     | 24/49 [17:24<24:04, 57.80s/it]

Error Rate Test 75
Error Rate Test Size 143125
Train month 24
Calculating Expected ROC AUC
Error Rate Train 35
Error Rate Train Size 143125
Model Training
Test month 25


 51%|█████     | 25/49 [18:38<25:08, 62.84s/it]

Error Rate Test 53
Error Rate Test Size 142901
Train month 25
Calculating Expected ROC AUC
Error Rate Train 24
Error Rate Train Size 142901
Model Training
Test month 26


 53%|█████▎    | 26/49 [19:29<22:38, 59.04s/it]

Error Rate Test 64
Error Rate Test Size 142714
Train month 26
Calculating Expected ROC AUC
Error Rate Train 21
Error Rate Train Size 142714
Model Training
Test month 27


 55%|█████▌    | 27/49 [20:22<21:03, 57.42s/it]

Error Rate Test 38
Error Rate Test Size 142705
Train month 27
Calculating Expected ROC AUC
Error Rate Train 14
Error Rate Train Size 142705
Model Training
Test month 28


 57%|█████▋    | 28/49 [21:07<18:45, 53.58s/it]

Error Rate Test 44
Error Rate Test Size 142674
Train month 28
Calculating Expected ROC AUC
Error Rate Train 20
Error Rate Train Size 142674
Model Training
Test month 29


 59%|█████▉    | 29/49 [21:57<17:32, 52.63s/it]

Error Rate Test 48
Error Rate Test Size 148914
Train month 29
Calculating Expected ROC AUC
Error Rate Train 22
Error Rate Train Size 148914
Model Training
Test month 30


 61%|██████    | 30/49 [22:55<17:09, 54.20s/it]

Error Rate Test 67
Error Rate Test Size 148873
Train month 30
Calculating Expected ROC AUC
Error Rate Train 39
Error Rate Train Size 148873
Model Training
Test month 31


 63%|██████▎   | 31/49 [23:55<16:45, 55.89s/it]

Error Rate Test 66
Error Rate Test Size 152571
Train month 31
Calculating Expected ROC AUC
Error Rate Train 35
Error Rate Train Size 152571
Model Training
Test month 32


 65%|██████▌   | 32/49 [25:00<16:37, 58.67s/it]

Error Rate Test 66
Error Rate Test Size 153879
Train month 32
Calculating Expected ROC AUC
Error Rate Train 34
Error Rate Train Size 153879
Model Training
Test month 33


 67%|██████▋   | 33/49 [25:53<15:11, 56.94s/it]

Error Rate Test 52
Error Rate Test Size 153862
Train month 33
Calculating Expected ROC AUC
Error Rate Train 31
Error Rate Train Size 153862
Model Training
Test month 34


 69%|██████▉   | 34/49 [26:50<14:15, 57.03s/it]

Error Rate Test 26
Error Rate Test Size 158074
Train month 34
Calculating Expected ROC AUC
Error Rate Train 18
Error Rate Train Size 158074
Model Training
Test month 35


 71%|███████▏  | 35/49 [27:47<13:16, 56.89s/it]

Error Rate Test 50
Error Rate Test Size 160210
Train month 35
Calculating Expected ROC AUC
Error Rate Train 7
Error Rate Train Size 160210
Model Training
Test month 36


 73%|███████▎  | 36/49 [28:37<11:53, 54.89s/it]

Error Rate Test 25
Error Rate Test Size 161498
Train month 36
Calculating Expected ROC AUC
Error Rate Train 15
Error Rate Train Size 161498
Model Training
Test month 37


 76%|███████▌  | 37/49 [29:22<10:21, 51.81s/it]

Error Rate Test 49
Error Rate Test Size 163161
Train month 37
Calculating Expected ROC AUC
Error Rate Train 20
Error Rate Train Size 163161
Model Training
Test month 38


 78%|███████▊  | 38/49 [30:25<10:07, 55.24s/it]

Error Rate Test 59
Error Rate Test Size 163484
Train month 38
Calculating Expected ROC AUC
Error Rate Train 41
Error Rate Train Size 163484
Model Training
Test month 39


 80%|███████▉  | 39/49 [31:57<11:02, 66.21s/it]

Error Rate Test 67
Error Rate Test Size 167806
Train month 39
Calculating Expected ROC AUC
Error Rate Train 39
Error Rate Train Size 167806
Model Training
Test month 40


 82%|████████▏ | 40/49 [33:19<10:40, 71.13s/it]

Error Rate Test 34
Error Rate Test Size 169104
Train month 40
Calculating Expected ROC AUC
Error Rate Train 12
Error Rate Train Size 169104
Model Training
Test month 41


 84%|████████▎ | 41/49 [34:23<09:12, 69.01s/it]

Error Rate Test 44
Error Rate Test Size 171308
Train month 41
Calculating Expected ROC AUC
Error Rate Train 29
Error Rate Train Size 171308
Model Training
Test month 42


 86%|████████▌ | 42/49 [35:12<07:21, 63.00s/it]

Error Rate Test 15
Error Rate Test Size 103454
Train month 42
Calculating Expected ROC AUC
Error Rate Train 9
Error Rate Train Size 103454
Model Training


 88%|████████▊ | 43/49 [35:29<04:54, 49.11s/it]

Test month 43
Error Rate Test 15
Error Rate Test Size 77461
Train month 43
Calculating Expected ROC AUC
Error Rate Train 5
Error Rate Train Size 77461
Model Training
Test month 44


 90%|████████▉ | 44/49 [35:43<03:12, 38.53s/it]

Error Rate Test 26
Error Rate Test Size 186947
Train month 44
Calculating Expected ROC AUC
Error Rate Train 9
Error Rate Train Size 186947
Model Training
Test month 45


 92%|█████████▏| 45/49 [36:43<03:00, 45.01s/it]

Error Rate Test 27
Error Rate Test Size 187811
Train month 45
Calculating Expected ROC AUC
Error Rate Train 16
Error Rate Train Size 187811
Model Training
Test month 46


 94%|█████████▍| 46/49 [37:58<02:41, 53.91s/it]

Error Rate Test 33
Error Rate Test Size 195101
Train month 46
Calculating Expected ROC AUC
Error Rate Train 23
Error Rate Train Size 195101
Model Training
Test month 47


 96%|█████████▌| 47/49 [39:30<02:10, 65.39s/it]

Error Rate Test 41
Error Rate Test Size 199277
Train month 47
Calculating Expected ROC AUC
Error Rate Train 20
Error Rate Train Size 199277
Model Training
Test month 48


 98%|█████████▊| 48/49 [40:53<01:10, 70.62s/it]

Error Rate Test 34
Error Rate Test Size 202048
Train month 48
Calculating Expected ROC AUC
Error Rate Train 19
Error Rate Train Size 202048
Model Training
Test month 49


100%|██████████| 49/49 [42:16<00:00, 51.77s/it]

Error Rate Test 35
Error Rate Test Size 210141





In [23]:
# Assemble the results table, one row per (training week, testing week) pair.
# NOTE: the "*_Error_Rate" columns hold misclassification counts; the matching
# "*_Size" columns hold the number of evaluated samples.
result_columns = [
    ('Weeks', weeks),
    ('Training_Error_Rate', training_error_rate),
    ('Testing_Error_Rate', testing_error_rate),
    ('Training_Size', training_size),
    ('Testing_Size', testing_size),
    ('Feature_Importance', feature_importance),
    # same seed repeated on every row so result files from different seeds can be concatenated
    ('Random_Seed', [random_seed] * len(feature_importance)),
]
for column_name, column_values in result_columns:
    df_expected_actual[column_name] = column_values

In [24]:
# preview the assembled results table
df_expected_actual

Unnamed: 0,Weeks,Training_Error_Rate,Testing_Error_Rate,Training_Size,Testing_Size,Feature_Importance,Random_Seed
0,W1_2,60,169,95296,97665,"[0.04508857829218932, 0.04008426421855664, 0.2...",9591
1,W2_3,45,122,97665,99802,"[0.05408664122022242, 0.046149860223713896, 0....",9591
2,W3_4,67,134,99802,102616,"[0.051881077494387605, 0.04734406007984115, 0....",9591
3,W4_5,61,120,102616,104596,"[0.06258636805055372, 0.03272518423433912, 0.2...",9591
4,W5_6,50,81,104596,106254,"[0.06243811727669681, 0.04523606086634496, 0.1...",9591
5,W6_7,40,126,106254,112397,"[0.05975288477581444, 0.04726028100813084, 0.2...",9591
6,W7_8,40,93,112397,112790,"[0.05690710282340819, 0.03823999952446405, 0.2...",9591
7,W8_9,48,111,112790,112953,"[0.046885818299636124, 0.05848152919511166, 0....",9591
8,W9_10,46,117,112953,114059,"[0.04673420727005296, 0.03747220787845044, 0.2...",9591
9,W10_11,41,130,114059,114420,"[0.045481432310084564, 0.03632000326943413, 0....",9591


In [25]:
# Persist the results table. Create the output directory first so that a fresh
# checkout does not fail only after the long training loop has finished.
# NOTE(review): the file name carries a hard-coded 'rs1' suffix that is not derived
# from random_seed — make sure it matches the seed actually used for this run.
os.makedirs('./results', exist_ok=True)
df_expected_actual.to_csv('./results/concept_drift_disk_2015_rf_week_feature_importance_rs1.csv')