In [1]:
# Load Packages

import pandas as pd
import numpy as np
from time import time

from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE
from collections import Counter
from sklearn import metrics 
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import IsolationForest

In [2]:
# Reading data from a combined file with both good and failed drives
def split_train_val_test_data( root = "../dataset", drive_file = "/ST12000NM0007_last_10_day.csv",  
                          ignore_cols = ["date","serial_number","model","capacity_bytes","failure"], 
                          resample_data=False, smote_data=False):

    df = pd.read_csv(root+drive_file, parse_dates=True)

    df_good = df.loc[df['failure'] == 0]
    df_bad = df.loc[df['failure'] == 1]
     
    df_good = df_good.sort_values(["date"])
    df_bad = df_bad.sort_values(["date"])

    good_y = df_good["failure"]
    bad_y = df_bad["failure"]

    # Split into train (80%) and test (20%)
    X_train_good, X_test_good, y_train_good, y_test_good = train_test_split(
        df_good, good_y, train_size=0.8, shuffle=False)
    X_train_bad, X_test_bad, y_train_bad, y_test_bad = train_test_split(
        df_bad, bad_y, train_size=0.8, shuffle=False)


    # Split train into train and validation
    # Train(60%), Val(20%), Test(20%)
    X_train_good, X_val_good, y_train_good, y_val_good = train_test_split(
        X_train_good, y_train_good, train_size=0.75, shuffle=False)
    X_train_bad, X_val_bad, y_train_bad, y_val_bad = train_test_split(
        X_train_bad, y_train_bad, train_size=0.75, shuffle=False)
        
    if resample_data:
        X_train_bad = resample(df_bad, replace=True, n_samples=len(X_train_good), random_state=1)
        X_train_bad = X_train_bad.sort_values(["date"])

    y_train_bad = X_train_bad["failure"]

    X_train = pd.concat([X_train_good, X_train_bad], axis=0)
    y_train = pd.concat([y_train_good, y_train_bad], axis=0)
    X_val = pd.concat([X_val_good, X_val_bad], axis=0)
    y_val = pd.concat([y_val_good, y_val_bad], axis=0)
    X_test = pd.concat([X_test_good, X_test_bad], axis=0)
    y_test = pd.concat([y_test_good, y_test_bad], axis=0)

    X_train.drop(columns=ignore_cols, inplace=True, axis=1)
    X_val.drop(columns=ignore_cols, inplace=True, axis=1)
    X_test.drop(columns=ignore_cols, inplace=True, axis=1)

    if smote_data:
        sm = SMOTE(random_state=42)
        X_train, y_train = sm.fit_resample(X_train, y_train)

    return (X_train, X_val, X_test, y_train, y_val, y_test)
    # return (X_train, X_test, y_train, y_test)

In [3]:
# Sorting data by date
def sort_data_by_date(file_path):
    df = pd.read_csv(file_path, parse_dates=True)
    sorted_df = df.sort_values(["date"])
    return sorted_df

In [4]:
def tune_n_estimators(model, X_train, y_train):
    print("Getting optimal n_estimators! ")
    import xgboost as xgb
    # 1: Set learning rate and n_estimators
    xgtrain = xgb.DMatrix(X_train, y_train)
    params = model.get_xgb_params()
    num_boost_round = model.get_params()['n_estimators'] 
    metrics = 'auc'
    nfold = 5
    early_stopping_rounds = 50
    
    
    cvresult = xgb.cv(params, xgtrain, num_boost_round = num_boost_round, nfold = nfold,
                      metrics = metrics, early_stopping_rounds = early_stopping_rounds)
    
    n_estimators = cvresult.shape[0]
    print("n_estimators: ", n_estimators)
    return n_estimators

In [5]:
def tune_max_dep_min_child_weight(n_estimators, X_train, y_train):
    print("Getting optimal max_depth and min_child_weight!")
    param = {
        'max_depth': range(3,10,2),
        'min_child_weight': range(1,6,2)
    }
    
    gsearch = GridSearchCV(estimator = XGBClassifier( learning_rate = 0.1, n_estimators = n_estimators, 
                                                      max_depth = 5, min_child_weight = 1, gamma = 0, 
                                                      subsample = 0.8, colsample_bytree = 0.8,
                                                      objective = 'binary:logistic', scale_pos_weight = 1, seed = 27), 
                                                      param_grid = param, scoring = ['f1','accuracy'], refit='f1', iid = False, cv = 5)
    gsearch.fit(X_train, y_train)
    max_depth = gsearch.best_params_['max_depth']
    min_child_weight = gsearch.best_params_['min_child_weight']
    print("Best params: ", gsearch.best_params_)
    return (max_depth, min_child_weight)
    
    

In [6]:
def tune_gamma(n_estimators, max_depth, min_child_weight, X_train, y_train):
    print("Getting optimal gamma!")
    
    param = {
        'gamma':[i/10.0 for i in range(0,5)]
    }
    gsearch = GridSearchCV(estimator = XGBClassifier(learning_rate = 0.1, n_estimators = n_estimators, 
                                                      max_depth = max_depth, min_child_weight = min_child_weight,
                                                      gamma = 0, subsample = 0.8, colsample_bytree = 0.8,
                                                      objective= 'binary:logistic', scale_pos_weight = 1, seed = 27), 
                                                      param_grid = param,  scoring = ['f1','accuracy'], refit='f1',iid = False, cv = 5)
    gsearch.fit(X_train, y_train)
    gamma = gsearch.best_params_['gamma']
    print("Best params: ", gsearch.best_params_)
    return gamma

In [7]:
def tune_reg_params(n_estimators, max_depth, min_child_weight, gamma, X_train, y_train):
    print("Getting optimal reg_alpha!")
    param = {
        'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100]
    }
    
    gsearch = GridSearchCV(estimator = XGBClassifier(learning_rate =0.1, n_estimators = n_estimators, 
                                                      max_depth = max_depth, min_child_weight = min_child_weight,
                                                      gamma = gamma, subsample=0.8, colsample_bytree=0.8,
                                                     objective= 'binary:logistic', scale_pos_weight=1,seed=27), 
                                                       param_grid = param, scoring = ['f1','accuracy'], refit='f1', iid=False, cv=5)
    gsearch.fit(X_train, y_train)
    reg_alpha = gsearch.best_params_['reg_alpha']
    print("Best params: ", gsearch.best_params_)
    return reg_alpha
    

In [8]:
def tune_xgb(model, X_train, y_train):
    print("Start tuning")
    # 1: Tuning n_estimators
    n_estimators = tune_n_estimators(model, X_train, y_train)
    
    # 2:  Tuning max_depth and min_child_weight
    max_depth, min_child_weight = tune_max_dep_min_child_weight(n_estimators, X_train, y_train)
    
    # 3: Tuning gamma
    gamma = tune_gamma(n_estimators, max_depth, min_child_weight, X_train, y_train)
    
    # 4: Tuning regularization paramaters
    reg_alpha = tune_reg_params(n_estimators, max_depth, min_child_weight, gamma, X_train, y_train)

In [18]:
def run(models = [RandomForestClassifier(max_depth=2, random_state=0)], tune_model=False):
    X_train, X_val, X_test, y_train, y_val, y_test = split_train_val_test_data(drive_file = "/TOSHIBA MQ01ABF050_last_10_day_all_q_raw.csv", resample_data=True)
#     X_train, X_test, y_train, y_test = split_train_val_test_data(drive_file = "/ST8000DM002_last_10_day_all_q_raw.csv", resample_data=True)
    #X_train, X_test, y_train, y_test = get_train_test_data(resample_data=True)
    print("Data loaded successfully...\n")
    for model in models:  
        print("\n\n *", type(model).__name__)  
        
        if(type(model).__name__ == "XGBClassifier" and tune_model):
            tune_xgb(model, X_train, y_train)

        start = time()
        model.fit(X_train, y_train)
        end = time()
        print("\nTime to train:", str((end - start)/60), " mins")
        
        # Validation set results
        print("\n- Results on validation set: ")
        y_val_pred = model.predict(X_val)
        print("Accuracy: ", accuracy_score(y_val, y_val_pred))
        print("Scores:\n", classification_report(y_val, y_val_pred))
        
        # Test set results
        print("\n- Results on test set: ")
        y_pred = model.predict(X_test)
        print("Accuracy: ", accuracy_score(y_test, y_pred))
        print("Scores:\n", classification_report(y_test, y_pred))

In [19]:
# Entry point of function
if __name__ == "__main__":
    models_list = []
    xgbc = XGBClassifier(learning_rate = 0.2,
                         n_estimators=1251,
                         max_depth=3,
                         min_child_weight=3,
                         gamma=0.1,
                         reg_alpha=0.01,
                         subsample=0.8,
                         colsample_bytree=0.8,
                         objective= 'binary:logistic',
                         scale_pos_weight=1,
                         seed=27)
    
    models_list.append(xgbc)
    run(models_list,tune_model=False)

Data loaded successfully...



 * XGBClassifier

Time to train: 0.019732332229614256  mins

- Results on validation set: 
Accuracy:  1.0
Scores:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       912
           1       1.00      1.00      1.00       155

    accuracy                           1.00      1067
   macro avg       1.00      1.00      1.00      1067
weighted avg       1.00      1.00      1.00      1067


- Results on test set: 
Accuracy:  0.9990636704119851
Scores:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       912
           1       0.99      1.00      1.00       156

    accuracy                           1.00      1068
   macro avg       1.00      1.00      1.00      1068
weighted avg       1.00      1.00      1.00      1068



## Results:

#### ST12000NM0007_last_10_day_all_q_raw.csv


 * XGBClassifier

Time to train: 3.739137363433838  mins

- Results on validation set: 
Accuracy:  0.9919698673851063
Scores:
               precision    recall  f1-score   support

           0       1.00      0.99      1.00     74210
           1       0.79      1.00      0.88      2252

    accuracy                           0.99     76462
   macro avg       0.89      1.00      0.94     76462
weighted avg       0.99      0.99      0.99     76462


- Results on test set: 
Accuracy:  0.9851821819989014
Scores:
               precision    recall  f1-score   support

           0       1.00      0.98      0.99     74210
           1       0.67      1.00      0.80      2252

    accuracy                           0.99     76462
   macro avg       0.83      0.99      0.90     76462
weighted avg       0.99      0.99      0.99     76462


#### ST4000DM000_last_10_day_all_q_raw.csv

 * XGBClassifier

Time to train: 2.175768995285034  mins

- Results on validation set: 
Accuracy:  0.9982876712328768
Scores:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     45926
           1       0.91      1.00      0.95       794

    accuracy                           1.00     46720
   macro avg       0.95      1.00      0.98     46720
weighted avg       1.00      1.00      1.00     46720


- Results on test set: 
Accuracy:  0.9941780821917808
Scores:
               precision    recall  f1-score   support

           0       1.00      0.99      1.00     45926
           1       0.74      1.00      0.85       794

    accuracy                           0.99     46720
   macro avg       0.87      1.00      0.93     46720
weighted avg       1.00      0.99      0.99     46720


#### ST8000NM0055_last_10_day_all_q_raw.csv

* XGBClassifier

Time to train: 0.7112944483757019  mins

- Results on validation set: 
Accuracy:  0.9999318383204963
Scores:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     28906
           1       1.00      1.00      1.00       436

    accuracy                           1.00     29342
   macro avg       1.00      1.00      1.00     29342
weighted avg       1.00      1.00      1.00     29342


- Results on test set: 
Accuracy:  0.9997273532819849
Scores:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     28906
           1       0.98      1.00      0.99       436

    accuracy                           1.00     29342
   macro avg       0.99      1.00      1.00     29342
weighted avg       1.00      1.00      1.00     29342


#### ST8000DM002_last_10_day_all_q_raw.csv

* XGBClassifier

Time to train: 0.47229440212249757  mins

- Results on validation set: 
Accuracy:  0.9998993254807208
Scores:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     19627
           1       0.99      1.00      1.00       239

    accuracy                           1.00     19866
   macro avg       1.00      1.00      1.00     19866
weighted avg       1.00      1.00      1.00     19866


- Results on test set: 
Accuracy:  0.9997483137018021
Scores:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     19627
           1       0.98      1.00      0.99       239

    accuracy                           1.00     19866
   macro avg       0.99      1.00      0.99     19866
weighted avg       1.00      1.00      1.00     19866

#### TOSHIBA MQ01ABF050

* XGBClassifier

Time to train: 0.019732332229614256  mins

- Results on validation set: 
Accuracy:  1.0
Scores:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       912
           1       1.00      1.00      1.00       155

    accuracy                           1.00      1067
   macro avg       1.00      1.00      1.00      1067
weighted avg       1.00      1.00      1.00      1067


- Results on test set: 
Accuracy:  0.9990636704119851
Scores:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       912
           1       0.99      1.00      1.00       156

    accuracy                           1.00      1068
   macro avg       1.00      1.00      1.00      1068
weighted avg       1.00      1.00      1.00      1068


### Isolation Forest

In [10]:
X_train, X_test, _, y_test = split_train_val_test_data(resample_data=True)
clf = IsolationForest(random_state=0, contamination=0.1)
clf.fit(X_train)
y_pred = clf.predict(X_test) 
y_pred = [0 if x > 0 else 1 for x in y_pred]
print("Scores:\n", classification_report(y_test, y_pred))

Counter({1: 66885, -1: 2877})
Counter({0: 66885, 1: 2877})
Scores:
               precision    recall  f1-score   support

           0       1.00      0.96      0.98     69419
           1       0.02      0.19      0.04       343

    accuracy                           0.96     69762
   macro avg       0.51      0.57      0.51     69762
weighted avg       0.99      0.96      0.97     69762



In [11]:
X_train, X_test, _, y_test = split_train_val_test_data(resample_data=False)
clf = IsolationForest(random_state=0, contamination=0.1)
clf.fit(X_train)
y_pred = clf.predict(X_test) 
y_pred = [0 if x > 0 else 1 for x in y_pred]
print("Scores:\n", classification_report(y_test, y_pred))

Scores:
               precision    recall  f1-score   support

           0       1.00      0.90      0.94     69419
           1       0.02      0.41      0.04       343

    accuracy                           0.90     69762
   macro avg       0.51      0.65      0.49     69762
weighted avg       0.99      0.90      0.94     69762

