In [13]:
# Load Packages

import pandas as pd
import numpy as np
from time import time

from sklearn.ensemble import RandomForestClassifier
# from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.utils import resample
# from imblearn.over_sampling import SMOTE
from collections import Counter
from sklearn import metrics 
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV


In [18]:
# Reading data from a combined file with both good and failed drives
def split_train_val_test_data( root = "./", drive_file = "/ST12000NM0007_last_10_day.csv",  
                          ignore_cols = ["date","serial_number","model","capacity_bytes","failure"], 
                          resample_data=False, smote_data=False):

    df = pd.read_csv(root+drive_file, parse_dates=True)

    df_good = df.loc[df['failure'] == 0]
    df_bad = df.loc[df['failure'] == 1]
     
    df_good = df_good.sort_values(["date"])
    df_bad = df_bad.sort_values(["date"])

    good_y = df_good["failure"]
    bad_y = df_bad["failure"]

    # Split into train (80%) and test (20%)
    X_train_good, X_test_good, y_train_good, y_test_good = train_test_split(
        df_good, good_y, train_size=0.8, shuffle=False)
    X_train_bad, X_test_bad, y_train_bad, y_test_bad = train_test_split(
        df_bad, bad_y, train_size=0.8, shuffle=False)


    # Split train into train and validation
    # Train(60%), Val(20%), Test(20%)
#     X_train_good, X_val_good, y_train_good, y_val_good = train_test_split(
#         X_train_good, y_train_good, train_size=0.75, shuffle=False)
#     X_train_bad, X_val_bad, y_train_bad, y_val_bad = train_test_split(
#         X_train_bad, y_train_bad, train_size=0.75, shuffle=False)
        
    if resample_data:
        X_train_bad = resample(df_bad, replace=True, n_samples=len(X_train_good), random_state=1)
        X_train_bad = X_train_bad.sort_values(["date"])

    y_train_bad = X_train_bad["failure"]

    X_train = pd.concat([X_train_good, X_train_bad], axis=0)
    y_train = pd.concat([y_train_good, y_train_bad], axis=0)
#     X_val = pd.concat([X_val_good, X_val_bad], axis=0)
#     y_val = pd.concat([y_val_good, y_val_bad], axis=0)
    X_test = pd.concat([X_test_good, X_test_bad], axis=0)
    y_test = pd.concat([y_test_good, y_test_bad], axis=0)

    X_train.drop(columns=ignore_cols, inplace=True, axis=1)
#     X_val.drop(columns=ignore_cols, inplace=True, axis=1)
    X_test.drop(columns=ignore_cols, inplace=True, axis=1)

    if smote_data:
        sm = SMOTE(random_state=42)
        X_train, y_train = sm.fit_resample(X_train, y_train)

    #return (X_train, X_val, X_test, y_train, y_val, y_test)
    return (X_train, X_test, y_train, y_test)

In [5]:
# Sorting data by date
def sort_data_by_date(file_path):
    df = pd.read_csv(file_path, parse_dates=True)
    sorted_df = df.sort_values(["date"])
    return sorted_df

In [14]:
def random_tune_randomforest():
    rf = RandomForestClassifier(random_state = 1)
    # Number of trees in random forest
    n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
    # Number of features to consider at every split
    max_features = ['auto', 'sqrt']
    # Maximum number of levels in tree
    max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
    max_depth.append(None)
    # Minimum number of samples required to split a node
    min_samples_split = [2, 5, 10]
    # Minimum number of samples required at each leaf node
    min_samples_leaf = [1, 2, 4]
    # Method of selecting samples for training each tree
    bootstrap = [True, False]
    #Entropy calculations
    criterion = ["gini", "entropy"]
    
    random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap,
               'criterion': criterion
                }
    
    rf_random = RandomizedSearchCV(
        estimator = rf, 
        param_distributions = random_grid, 
        n_iter = 100, 
        cv = 3, 
        verbose=2, 
        random_state=1, 
        n_jobs = -1, 
        scoring = ["f1", "accuracy"], 
        refit="f1"
    )
    
    return rf_random
    
    
    

In [17]:
def run(models = [RandomForestClassifier(max_depth=2, random_state=0)], tune_model=False):
    #X_train, X_val, X_test, y_train, y_val, y_test = split_train_val_test_data(drive_file = "/ST12000NM0007_last_day_all_q_raw.csv", smote_data=True)
    X_train, X_test, y_train, y_test = split_train_val_test_data(drive_file = "/ST12000NM0007_last_day_all_q_raw.csv", resample_data=True)
    #X_train, X_test, y_train, y_test = get_train_test_data(resample_data=True)
    print("Data loaded successfully...\n")
    for model in models:  
        print("\n\n *", type(model).__name__)  
        
        if(type(model).__name__ == "XGBClassifier" and tune_model):
            tune_xgb(model, X_train, y_train)

        start = time()
        model.fit(X_train, y_train)
        end = time()
        print("\nTime to train:", str((end - start)/60), " mins")
        
        print(model.best_params_)
        # Test set results
        print("\n- Results on test set: ")
        y_pred = model.predict(X_test)
        print("Accuracy: ", accuracy_score(y_test, y_pred))
        print("Scores:\n", classification_report(y_test, y_pred))

In [19]:
# Entry point of function
if __name__ == "__main__":
    models_list = []
    rf = random_tune_randomforest()
    models_list.append(rf)
#     rfc = RandomForestClassifier(max_depth=2, random_state=0)
#     models_list.append(rfc)
    run(models_list,tune_model=True)

Data loaded successfully...



 * RandomizedSearchCV
Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 18.3min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 94.1min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 154.2min finished



Time to train: 157.08657914797465  mins
{'n_estimators': 2000, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_features': 'auto', 'max_depth': 40, 'criterion': 'entropy', 'bootstrap': True}

- Results on test set: 
Accuracy:  0.9849712493465761
Scores:
               precision    recall  f1-score   support

           0       1.00      0.98      0.99      7425
           1       0.66      1.00      0.80       227

    accuracy                           0.98      7652
   macro avg       0.83      0.99      0.90      7652
weighted avg       0.99      0.98      0.99      7652



In [26]:
def run_rf_10(file_path="/ST12000NM0007_last_day_all_q_raw.csv"):

    model = RandomForestClassifier(
        n_estimators = 2000, 
        min_samples_split = 5, 
        min_samples_leaf = 4,
        max_features = 'auto', 
        max_depth = 40, 
        criterion = 'entropy',
        bootstrap = True
    )
    X_train, X_test, y_train, y_test = split_train_val_test_data(drive_file = file_path, resample_data=True)
    
    print("Data loaded successfully...\n")
    print("\n\n *", type(model).__name__)  

    start = time()
    model.fit(X_train, y_train)
    end = time()
    print("\nTime to train:", str((end - start)/60), " mins")
    
    # Test set results
    print("\n- Results on test set: ")
    y_pred = model.predict(X_test)
    print("Accuracy: ", accuracy_score(y_test, y_pred))
    print("Scores:\n", classification_report(y_test, y_pred))
    

In [28]:
run_rf_10("/ST4000DM000_last_10_day_all_q_raw.csv")

Data loaded successfully...



 * RandomForestClassifier

Time to train: 19.21467758019765  mins

- Results on test set: 
Accuracy:  0.9999357876712329
Scores:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     45926
           1       1.00      1.00      1.00       794

    accuracy                           1.00     46720
   macro avg       1.00      1.00      1.00     46720
weighted avg       1.00      1.00      1.00     46720



In [29]:
run_rf_10("/ST8000DM002_last_10_day_all_q_raw.csv")

Data loaded successfully...



 * RandomForestClassifier

Time to train: 5.710624718666077  mins

- Results on test set: 
Accuracy:  1.0
Scores:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     19627
           1       1.00      1.00      1.00       239

    accuracy                           1.00     19866
   macro avg       1.00      1.00      1.00     19866
weighted avg       1.00      1.00      1.00     19866



In [30]:
run_rf_10("/ST8000NM0055_last_10_day_all_q_raw.csv")

Data loaded successfully...



 * RandomForestClassifier

Time to train: 9.502617053190868  mins

- Results on test set: 
Accuracy:  1.0
Scores:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     28906
           1       1.00      1.00      1.00       436

    accuracy                           1.00     29342
   macro avg       1.00      1.00      1.00     29342
weighted avg       1.00      1.00      1.00     29342



In [27]:
run_rf_10("/ST12000NM0007_last_10_day_all_q_raw.csv")

Data loaded successfully...



 * RandomForestClassifier

Time to train: 57.33980790376663  mins

- Results on test set: 
Accuracy:  0.9997122753786194
Scores:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     74210
           1       0.99      1.00      1.00      2252

    accuracy                           1.00     76462
   macro avg       1.00      1.00      1.00     76462
weighted avg       1.00      1.00      1.00     76462



In [31]:
run_rf_10("/TOSHIBA MQ01ABF050_last_10_day_all_q_raw.csv")

Data loaded successfully...



 * RandomForestClassifier

Time to train: 0.16820285320281983  mins

- Results on test set: 
Accuracy:  0.9990636704119851
Scores:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       912
           1       0.99      1.00      1.00       156

    accuracy                           1.00      1068
   macro avg       1.00      1.00      1.00      1068
weighted avg       1.00      1.00      1.00      1068



#### Default parameters

* XGBClassifier

Time to train: 0.03838756481806437  mins

- Results on validation set: 
Accuracy:  0.9845791949817041
Scores:
               precision    recall  f1-score   support

           0       0.99      1.00      0.99      7425
           1       0.85      0.59      0.69       227

    accuracy                           0.98      7652
   macro avg       0.92      0.79      0.84      7652
weighted avg       0.98      0.98      0.98      7652


- Results on test set: 
Accuracy:  0.9801359121798223
Scores:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99      7425
           1       0.91      0.37      0.52       227

    accuracy                           0.98      7652
   macro avg       0.95      0.68      0.76      7652
weighted avg       0.98      0.98      0.98      7652
