In [3]:
# Load Packages

import pandas as pd
import numpy as np
from time import time

from sklearn.ensemble import RandomForestClassifier
# from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.utils import resample
# from imblearn.over_sampling import SMOTE
from collections import Counter
from sklearn import metrics 
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import IsolationForest

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import make_scorer


In [4]:
# Reading data from a combined file with both good and failed drives
def split_train_val_test_data( root = "../dataset", drive_file = "/ST12000NM0007_last_10_day.csv",  
                          ignore_cols = ["date","serial_number","model","capacity_bytes","failure"], 
                          resample_data=False, smote_data=False):

    df = pd.read_csv(root+drive_file, parse_dates=True)

    df_good = df.loc[df['failure'] == 0]
    df_bad = df.loc[df['failure'] == 1]
     
    df_good = df_good.sort_values(["date"])
    df_bad = df_bad.sort_values(["date"])

    good_y = df_good["failure"]
    bad_y = df_bad["failure"]

    # Split into train (80%) and test (20%)
    X_train_good, X_test_good, y_train_good, y_test_good = train_test_split(
        df_good, good_y, train_size=0.8, shuffle=False)
    X_train_bad, X_test_bad, y_train_bad, y_test_bad = train_test_split(
        df_bad, bad_y, train_size=0.8, shuffle=False)


    # Split train into train and validation
    # Train(60%), Val(20%), Test(20%)
#     X_train_good, X_val_good, y_train_good, y_val_good = train_test_split(
#         X_train_good, y_train_good, train_size=0.75, shuffle=False)
#     X_train_bad, X_val_bad, y_train_bad, y_val_bad = train_test_split(
#         X_train_bad, y_train_bad, train_size=0.75, shuffle=False)
        
    if resample_data:
        X_train_bad = resample(df_bad, replace=True, n_samples=len(X_train_good), random_state=1)
        X_train_bad = X_train_bad.sort_values(["date"])

    y_train_bad = X_train_bad["failure"]

    X_train = pd.concat([X_train_good, X_train_bad], axis=0)
    y_train = pd.concat([y_train_good, y_train_bad], axis=0)
#     X_val = pd.concat([X_val_good, X_val_bad], axis=0)
#     y_val = pd.concat([y_val_good, y_val_bad], axis=0)
    X_test = pd.concat([X_test_good, X_test_bad], axis=0)
    y_test = pd.concat([y_test_good, y_test_bad], axis=0)

    X_train.drop(columns=ignore_cols, inplace=True, axis=1)
#     X_val.drop(columns=ignore_cols, inplace=True, axis=1)
    X_test.drop(columns=ignore_cols, inplace=True, axis=1)

    if smote_data:
        sm = SMOTE(random_state=42)
        X_train, y_train = sm.fit_resample(X_train, y_train)

    #return (X_train, X_val, X_test, y_train, y_val, y_test)
    return (X_train, X_test, y_train, y_test)

In [5]:
# Sorting data by date
def sort_data_by_date(file_path):
    df = pd.read_csv(file_path, parse_dates=True)
    sorted_df = df.sort_values(["date"])
    return sorted_df

In [6]:
def grid_tune_isolationforest():
    iso = IsolationForest(random_state=0)
    # Number of trees in random forest
    n_estimators = [int(x) for x in np.linspace(start = 100, stop = 200, num = 5)]
    # The number of samples to draw from X to train each base estimator.
    max_samples = [0.2, 0.5, 0.8, 1]
    # the proportion of outliers in the data set
    contamination = ['auto', 0.1, 0.2, 0.3]
    # The number of features to draw from X to train each base estimator.
    max_features = [0.2, 0.5, 0.8, 1]
    # Method of selecting samples for training each tree
    bootstrap = [True, False]

    
    random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'bootstrap': bootstrap,
               'max_samples': max_samples,
               'contamination': contamination
                }
    def acc(y_true, y_pred): 
        y_pred = [0 if x > 0 else 1 for x in y_pred]
        return accuracy_score(y_true, y_pred)
    def f1(y_true, y_pred): 
        y_pred = [0 if x > 0 else 1 for x in y_pred]
        return f1_score(y_true, y_pred)
    iso_grid = GridSearchCV(
        estimator = iso, 
        param_grid = random_grid, 
        cv = 3, 
        verbose=3, 
        n_jobs = -1, 
        scoring = {'f1': make_scorer(f1), 'acc': make_scorer(acc)}, 
        refit="f1",
        return_train_score=True
    )
    
    return iso_grid
    
    
    

In [7]:
def run(models, tune_model=False):
    #X_train, X_val, X_test, y_train, y_val, y_test = split_train_val_test_data(drive_file = "/ST12000NM0007_last_day_all_q_raw.csv", smote_data=True)
    X_train, X_test, y_train, y_test = split_train_val_test_data(drive_file = "/ST12000NM0007_last_day_all_q_raw.csv", resample_data=True)
    print("Data loaded successfully...\n")
    for model in models:  
        print("\n\n *", type(model).__name__)  
        
        # if(type(model).__name__ == "XGBClassifier" and tune_model):
        #     tune_xgb(model, X_train, y_train)

        start = time()
        model.fit(X_train, y_train)
        end = time()
        print("\nTime to train:", str((end - start)/60), " mins")
        
        print("Best Parameter", model.best_params_)
        # Test set results
        print("\n- Results on test set: ")
        y_pred = model.predict(X_test)
        y_pred = y_pred = [0 if x > 0 else 1 for x in y_pred]
        print("Accuracy: ", accuracy_score(y_test, y_pred))
        print("Scores:\n", classification_report(y_test, y_pred))

In [64]:
# Entry point of function
if __name__ == "__main__":
    models_list = []
    iso = grid_tune_isolationforest()
    models_list.append(iso)
    run(models_list,tune_model=True)
'''
Time to train: 48.03469951550166  mins
Best Parameter {'bootstrap': True, 'contamination': 0.3, 'max_features': 1, 'max_samples': 0.8, 'n_estimators': 125}

- Results on test set: 
Accuracy:  0.8691845269210664
Scores:
               precision    recall  f1-score   support

           0       0.98      0.88      0.93      7425
           1       0.12      0.52      0.19       227

    accuracy                           0.87      7652
   macro avg       0.55      0.70      0.56      7652
weighted avg       0.96      0.87      0.91      7652
'''

Data loaded successfully...



 * GridSearchCV
Fitting 3 folds for each of 640 candidates, totalling 1920 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   20.2s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:  5.0min
[Parallel(n_jobs=-1)]: Done 496 tasks      | elapsed: 10.9min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed: 18.5min
[Parallel(n_jobs=-1)]: Done 1136 tasks      | elapsed: 27.0min
[Parallel(n_jobs=-1)]: Done 1552 tasks      | elapsed: 37.8min
[Parallel(n_jobs=-1)]: Done 1920 out of 1920 | elapsed: 48.0min finished

Time to train: 48.03469951550166  mins
Best Parameter {'bootstrap': True, 'contamination': 0.3, 'max_features': 1, 'max_samples': 0.8, 'n_estimators': 125}

- Results on test set: 
Accuracy:  0.8691845269210664
Scores:
               precision    recall  f1-score   support

           0       0.98     

In [8]:
def run_10(file_path="/ST12000NM0007_last_day_all_q_raw.csv"):
    model = IsolationForest(bootstrap= True, contamination= 0.3, max_features= 1, max_samples= 0.8, n_estimators= 125)
    X_train, X_test, y_train, y_test = split_train_val_test_data(drive_file = file_path, resample_data=True)
    
    print("Data loaded successfully...\n")
    print("\n\n *", type(model).__name__)  

    start = time()
    model.fit(X_train, y_train)
    end = time()
    print("\nTime to train:", str((end - start)/60), " mins")
    
    # Test set results
    print("\n- Results on test set: ")
    y_pred = model.predict(X_test)
    y_pred = y_pred = [0 if x > 0 else 1 for x in y_pred]
    print("Accuracy: ", accuracy_score(y_test, y_pred))
    print("Scores:\n", classification_report(y_test, y_pred))
    

In [9]:
run_10("/ST4000DM000_last_10_day_all_q_raw.csv")

Data loaded successfully...



 * IsolationForest

Time to train: 0.3185644030570984  mins

- Results on test set: 
Accuracy:  0.6331121575342465
Scores:
               precision    recall  f1-score   support

           0       0.98      0.64      0.77     45926
           1       0.01      0.13      0.01       794

    accuracy                           0.63     46720
   macro avg       0.49      0.39      0.39     46720
weighted avg       0.96      0.63      0.76     46720



In [10]:
run_10("/ST8000DM002_last_10_day_all_q_raw.csv")

Data loaded successfully...



 * IsolationForest

Time to train: 0.1331276337305705  mins

- Results on test set: 
Accuracy:  0.40264773985704216
Scores:
               precision    recall  f1-score   support

           0       0.98      0.41      0.57     19627
           1       0.00      0.20      0.01       239

    accuracy                           0.40     19866
   macro avg       0.49      0.30      0.29     19866
weighted avg       0.96      0.40      0.57     19866



In [14]:
run_10("/ST8000NM0055_last_10_day_all_q_raw.csv")

Data loaded successfully...



 * IsolationForest

Time to train: 0.21579244534174602  mins

- Results on test set: 
Accuracy:  0.5355463158612228
Scores:
               precision    recall  f1-score   support

           0       0.98      0.54      0.70     28906
           1       0.01      0.24      0.01       436

    accuracy                           0.54     29342
   macro avg       0.49      0.39      0.36     29342
weighted avg       0.96      0.54      0.69     29342



In [15]:
run_10("/ST12000NM0007_last_10_day_all_q_raw.csv")

Data loaded successfully...



 * IsolationForest

Time to train: 0.7989759643872579  mins

- Results on test set: 
Accuracy:  0.6798932803222516
Scores:
               precision    recall  f1-score   support

           0       0.98      0.69      0.81     74210
           1       0.05      0.50      0.08      2252

    accuracy                           0.68     76462
   macro avg       0.51      0.59      0.45     76462
weighted avg       0.95      0.68      0.78     76462



In [16]:
run_10("/TOSHIBA MQ01ABF050_last_10_day_all_q_raw.csv")

Data loaded successfully...



 * IsolationForest

Time to train: 0.006784852345784505  mins

- Results on test set: 
Accuracy:  0.48408239700374533
Scores:
               precision    recall  f1-score   support

           0       0.82      0.51      0.63       912
           1       0.11      0.35      0.17       156

    accuracy                           0.48      1068
   macro avg       0.46      0.43      0.40      1068
weighted avg       0.72      0.48      0.56      1068

