# Training and Evaluating Models

As a baseline for comparison, we will use simple decision rules based on [domain knowledge](https://www.backblaze.com/blog/what-smart-stats-indicate-hard-drive-failures/).

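We will train the models without any hyperparameter tuning, using default settings (apart from a fixed random seed for the random forest and a raised iteration limit for LinearSVC). For each dataset, we will output several metrics for every model and for the baseline.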

We chose the following models:
- XGBoost
- KNN
- RandomForest
- LinearSVC

All datasets have been under-sampled so that the 'majority' class is roughly the same size as the 'minority' class.

In [1]:
import os
import time

import pandas as pd
import numpy as np

import xgboost as xgb
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC

from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler

import pickle

# Directory holding the input CSVs, and the directory (with trailing slash)
# that receives pickled models and result tables.
HDD_PATH = os.path.join('datasets', 'drive_stats', '50_50')
RESULTS_DIR = 'exploring_models/'

# File stems of the datasets, one per prediction horizon.
FILENAME_LIST = [
    'fail_today',
    'fail_today_or_tomorrow',
    'fail_this_week',
    'fail_this_month',
]

# Label column for each dataset. Every dataset is labelled by the column
# that shares its name, except 'fail_today', whose label column is 'failure'.
LABEL_DICT = {name: name for name in FILENAME_LIST}
LABEL_DICT['fail_today'] = 'failure'

In [2]:
def get_csv_as_frame(filename, hdd_path=HDD_PATH):
    """Load one dataset CSV and coerce its columns to fixed dtypes.

    Parameters
    ----------
    filename : str
        File stem; ``'.csv'`` is appended.
    hdd_path : str
        Directory that contains the CSV file.

    Returns
    -------
    pandas.DataFrame
        The file contents without any column whose name contains
        ``'normalized'``, with nulls forward-filled, and with dtypes set by
        column name: ``'model'`` / ``'serial_number'`` -> object,
        ``'date'`` / ``'last_day'`` -> datetime, everything else -> int64.
    """
    df = pd.read_csv(os.path.join(hdd_path, filename + '.csv'))
    # Drop 'normalized' columns
    df = df.drop(columns=[col for col in df.columns if 'normalized' in col])
    # Fill nulls with the previous row's value. `ffill()` replaces the
    # deprecated `fillna(method='ffill')` spelling.
    # NOTE(review): the fill runs over the whole file in row order, so a value
    # can be carried over from the preceding row even if that row belongs to
    # another drive - confirm this is intended.
    df = df.ffill()
    # Affirm dtypes of columns
    for col in list(df.columns):
        if 'model' in col or 'serial_number' in col:
            df[col] = df[col].astype('object')
        elif 'date' in col or 'last_day' in col:
            df[col] = pd.to_datetime(df[col], format='%Y-%m-%d')
        else:
            # A null in the first row survives a forward fill; int64 cannot
            # represent it, so this cast raises in that case.
            df[col] = df[col].astype('int64')
    return df

def get_features(groups, cols, history=False):
    """Aggregate each group into a single feature row.

    Parameters
    ----------
    groups : pandas GroupBy
        Grouped frame to aggregate.
    cols : list of str
        Columns to aggregate.
    history : bool
        When False only the ``'last'`` value of each column is kept. When
        True the per-group mean, variance, min and max are added.

    Returns
    -------
    pandas.DataFrame
        One row per group, with ``(column, aggregation)`` column labels.
        Remaining nulls are replaced by the most frequent value of the
        column they appear in.
    """
    aggs = ['last']
    if history:
        # String names select the same pandas group-by implementations that
        # np.mean / np.var were being mapped to, without the deprecation
        # warning newer pandas emits for the NumPy callables.
        aggs += ['mean', 'var', 'min', 'max']
    features = groups[cols].agg(aggs)
    # Nulls appear with the history aggregations - most likely from 'var',
    # which is the sample variance and is undefined for a one-row group.
    # Impute them with the column mode.
    for col in features.columns:
        modes = features[col].mode()
        if modes.empty:
            # Column is entirely null: there is no mode to impute with, so
            # leave it untouched instead of failing on modes[0].
            continue
        features[col] = features[col].fillna(modes.iloc[0])
    return features

In [3]:
def xgboost_train_and_predict(X_train, y_train, X_test):
    """Fit an XGBClassifier built with no arguments and predict on X_test.

    Returns a ``(predictions, fitted_classifier)`` pair.
    """
    booster = xgb.XGBClassifier()
    booster.fit(X_train, y_train)
    return booster.predict(X_test), booster

def knn_train_and_predict(X_train, y_train, X_test):
    """Fit a RobustScaler -> KNeighborsClassifier pipeline and predict.

    Both steps are built with no arguments. Returns a
    ``(predictions, fitted_pipeline)`` pair.
    """
    steps = (RobustScaler(), KNeighborsClassifier())
    pipeline = make_pipeline(*steps)
    pipeline.fit(X_train, y_train)
    return pipeline.predict(X_test), pipeline

def forest_train_and_predict(X_train, y_train, X_test):
    """Fit a RandomForestClassifier (seed fixed at 42) and predict on X_test.

    Returns a ``(predictions, fitted_classifier)`` pair.
    """
    classifier = RandomForestClassifier(random_state=42)
    classifier.fit(X_train, y_train)
    return classifier.predict(X_test), classifier

def lin_svc_train_and_predict(X_train, y_train, X_test):
    """Fit a RobustScaler -> LinearSVC pipeline and predict on X_test.

    LinearSVC is given ``max_iter=80000``; everything else is left at the
    library defaults. Returns a ``(predictions, fitted_pipeline)`` pair.
    """
    svc = LinearSVC(max_iter=80000)
    pipeline = make_pipeline(RobustScaler(), svc)
    pipeline.fit(X_train, y_train)
    return pipeline.predict(X_test), pipeline

In [4]:
def get_metrics(preds, labels):
    """Score hard predictions against the true labels.

    Returns a flat list: accuracy, recall, precision, F1 and ROC AUC,
    followed by the four cells of the confusion matrix computed with label
    order (0, 1), flattened row by row.
    """
    scorers = (
        metrics.accuracy_score,
        metrics.recall_score,
        metrics.precision_score,
        metrics.f1_score,
        metrics.roc_auc_score,
    )
    scores = [scorer(labels, preds) for scorer in scorers]
    cells = metrics.confusion_matrix(labels, preds, labels=(0, 1)).ravel()
    return scores + list(cells)

def evaluate(preds, labels, table, end_time, model=None):
    """Build one result row: ``[table, model, end_time]`` plus the metrics.

    The metric values are whatever ``get_metrics(preds, labels)`` returns,
    appended in the same order.
    """
    header = [table, model, end_time]
    return header + get_metrics(preds, labels)

def class_name(classifier, history):
    """Return ``(model_name, file_name)`` for a classifier key.

    ``model_name`` is the key itself, with ``'_history'`` appended when
    `history` is truthy. ``file_name`` is that name wrapped as
    ``'_<model_name>.pickle.dat'``; callers prepend the dataset name.
    """
    suffix = '_history' if history else ''
    model_name = '{}{}'.format(classifier, suffix)
    file_name = '_{}.pickle.dat'.format(model_name)
    return model_name, file_name

def train_eval_and_save(grouped,
                        labels,
                        cols_to_keep,
                        table,
                        history=False,
                        results_dir=RESULTS_DIR,
                        classifier='xgb'):
    """Build features, train one classifier, score it and pickle the model.

    Parameters
    ----------
    grouped : pandas GroupBy
        Grouped dataset, passed to ``get_features``.
    labels : pandas.Series
        One label per group.
    cols_to_keep : list of str
        Columns to aggregate into features.
    table : str
        Dataset name; recorded in the result row and used as the prefix of
        the pickle file name.
    history : bool
        Passed to ``get_features`` and ``class_name``.
    results_dir : str
        Directory the pickled model is written to.
    classifier : str
        One of ``'xgb'``, ``'knn'``, ``'forest'`` or ``'lin_svc'``.

    Returns
    -------
    tuple
        ``(result_row, X_test, y_test)``: the row from ``evaluate`` and the
        held-out split the model was scored on.

    Raises
    ------
    ValueError
        If `classifier` is not one of the supported keys. Previously this
        case only printed a message and then failed with an
        UnboundLocalError.
    """
    trainers = {
        'xgb': xgboost_train_and_predict,
        'knn': knn_train_and_predict,
        'forest': forest_train_and_predict,
        'lin_svc': lin_svc_train_and_predict,
    }
    if classifier not in trainers:
        raise ValueError(
            'No Classifier: type either xgb, knn, forest, or lin_svc')

    # The reported time covers feature building, the split, fitting and
    # predicting; scoring and pickling happen after the clock stops.
    start_time = time.time()
    features = get_features(grouped, cols_to_keep, history=history)
    X_train, X_test, y_train, y_test = train_test_split(features, labels,
                                                        test_size=0.2,
                                                        random_state=123)
    preds, model = trainers[classifier](X_train, y_train, X_test)
    end_time = time.time() - start_time
    print('Time taken: {}s'.format(end_time))

    model_name, file_name = class_name(classifier, history)
    result = evaluate(preds, y_test, table, end_time, model_name)
    # Save the model; the context manager closes the file handle even if
    # pickling fails.
    with open(os.path.join(results_dir, table + file_name), 'wb') as model_file:
        pickle.dump(model, model_file)
    return result, X_test, y_test

In [5]:
# Create the output directory if needed. `exist_ok=True` avoids the
# check-then-create race of a separate `os.path.exists` test.
os.makedirs(RESULTS_DIR, exist_ok=True)

# Columns checked by the hand-written baseline in `run_tests`: a sample is
# predicted as failing when any of them is greater than zero.
# Kept as a list because it is used to index a DataFrame, where a tuple
# would be read as a single key.
BASELINE_FEATURES = ['smart_5_raw', 'smart_187_raw',
                     'smart_188_raw', 'smart_197_raw',
                     'smart_198_raw']

# Keys accepted by the `classifier` argument of `train_eval_and_save`.
CLASSIFIER_LIST = ['xgb', 'knn', 'forest', 'lin_svc']

def run_tests(filename_list=FILENAME_LIST,
              label_dict=LABEL_DICT,
              class_list=CLASSIFIER_LIST,
              baseline_features=BASELINE_FEATURES):
    """Train and score every classifier on every dataset, plus a baseline.

    For each dataset, each classifier is trained twice through
    ``train_eval_and_save`` (with and without history features). A rule-based
    baseline is then scored on the held-out split of the last run.

    Parameters
    ----------
    filename_list : list of str
        Dataset file stems to load.
    label_dict : dict
        Maps each dataset name to its label column.
    class_list : list of str
        Classifier keys to train.
    baseline_features : list of str
        Columns used by the baseline rule (any value > 0 predicts failure).

    Returns
    -------
    list of list
        One result row per trained model and one per baseline, as built by
        ``evaluate``.
    """
    results = []
    for table in filename_list:
        print("{:=^80}".format(" Training on {} ".format(table)))
        dataset = get_csv_as_frame(table)
        cols_to_keep = [col for col in dataset.columns if 'smart' in col]
        # Groupby will get 10 day windows
        grouped = dataset.groupby(['serial_number', 'last_day'])
        # Label should always be what the last day in the windows is
        labels = grouped[label_dict[table]].agg('last')
        # Held-out split of the most recent no-history run; the baseline
        # below is scored on it.
        X_test = y_test = None
        for classifier in class_list:
            print("{:=^80}".format(" Evaluating {} with History ".format(classifier)))
            try:
                result_history, _, _ = train_eval_and_save(grouped,
                                                           labels,
                                                           cols_to_keep,
                                                           table,
                                                           history=True,
                                                           classifier=classifier)
            except ValueError as err:
                # Best effort: skip this history model, but report why.
                print('Stupid things happened to {} with history: {}'.format(
                    classifier, err))
            else:
                results.append(result_history)
            print("{:=^80}".format(" Evaluating {} without History ".format(classifier)))
            result, X_test, y_test = train_eval_and_save(grouped,
                                                         labels,
                                                         cols_to_keep,
                                                         table,
                                                         history=False,
                                                         classifier=classifier)
            results.append(result)
        if X_test is not None:
            print('{:-^80}'.format(" Evaluating Baseline "))
            # Time only the rule itself; there is no training step.
            start_time = time.time()
            preds_manual = np.any(
                X_test[baseline_features].values > 0, axis=1).astype(int)
            baseline_time = time.time() - start_time
            # Pass the model name by keyword: given positionally, 'baseline'
            # landed in the time slot and the model slot stayed None.
            results.append(evaluate(preds_manual, y_test, table,
                                    baseline_time, model='baseline'))
        print("\n \n")
    return results

In [6]:
# Run the full sweep over all datasets and classifiers. Per the recorded
# output below this is slow: single fits take up to about 6300 s.
results = run_tests()

Time taken: 4.658549785614014s
Time taken: 1.1000597476959229s
Time taken: 1.9378204345703125s
Time taken: 0.47672557830810547s
Time taken: 3.296919345855713s
Time taken: 1.3875246047973633s




Time taken: 118.92120313644409s




Time taken: 32.525153398513794s
----------------------------- Evaluating Baseline ------------------------------

 

Time taken: 6.6488518714904785s
Time taken: 1.8824799060821533s
Time taken: 5.310841798782349s
Time taken: 1.2899343967437744s
Time taken: 7.159361124038696s
Time taken: 2.7207283973693848s




Time taken: 264.8507146835327s




Time taken: 99.57688426971436s
----------------------------- Evaluating Baseline ------------------------------

 

Time taken: 23.388913869857788s
Time taken: 6.452430963516235s
Time taken: 40.6409957408905s
Time taken: 13.59852385520935s
Time taken: 25.363605976104736s
Time taken: 10.621796369552612s




Time taken: 1218.452787399292s




Time taken: 645.3991339206696s
----------------------------- Evaluating Baseline ------------------------------

 

Time taken: 99.89101648330688s
Time taken: 23.835524797439575s
Time taken: 576.5933685302734s
Time taken: 198.99973702430725s
Time taken: 140.58676958084106s
Time taken: 57.53584814071655s




Time taken: 6336.98339676857s




Time taken: 3794.034133195877s
----------------------------- Evaluating Baseline ------------------------------

 



In [7]:
# Column names for the rows built by `evaluate`: the three header fields
# followed by the metrics in the order `get_metrics` returns them.
eval_cols = ['Dataset', 'model', 'time',
             'accuracy', 'recall',
             'precision', 'F1',
             'auc', 'TN', 'FP', 'FN', 'TP']

results_frame = pd.DataFrame(results, columns=eval_cols)

# os.path.join gives the same path while RESULTS_DIR ends with a slash and
# stays correct if that trailing slash is ever dropped.
results_frame.to_csv(os.path.join(RESULTS_DIR, 'full_results.csv'))

## Checking for Overfitting

I want to make sure the models are not overfitting, especially the forest model, which reaches over 90% accuracy.

In [8]:
# CLASSIFIER_LIST = ['xgb', 'forest']

def test_overfitting(filename_list=FILENAME_LIST,
                     label_dict=LABEL_DICT,
                     class_list=CLASSIFIER_LIST,
                     baseline_features=BASELINE_FEATURES,
                     results_dir=RESULTS_DIR):
    """Score the saved models on their train and test splits.

    The features and the 80/20 split (``random_state=123``) are rebuilt the
    same way as at training time, so each pickled model is scored on the data
    it was fitted on and on its held-out data.

    Parameters
    ----------
    filename_list : list of str
        Dataset file stems to load.
    label_dict : dict
        Maps each dataset name to its label column.
    class_list : list of str
        Classifier keys whose saved models are loaded.
    baseline_features : list of str
        Unused; kept so the signature matches ``run_tests``.
    results_dir : str
        Directory holding the pickled models.

    Returns
    -------
    list of list
        For each dataset and classifier, four rows from ``evaluate``:
        history/train, history/test, no-history/train, no-history/test.
        The third field of each row is ``'train'`` or ``'test'``.
    """
    results = []
    for table in filename_list:
        print("{:=^80}".format(" Checking on {} ".format(table)))
        # Making Training and Test sets
        dataset = get_csv_as_frame(table)
        cols_to_keep = [col for col in dataset.columns if 'smart' in col]
        # Groupby will get 10 day windows
        grouped = dataset.groupby(['serial_number', 'last_day'])
        # Label should always be what the last day in the windows is
        labels = grouped[label_dict[table]].agg('last')
        # Features and splits depend only on the dataset and the history
        # flag, so build them once instead of once per classifier.
        splits = {}
        for history in (True, False):
            features = get_features(grouped, cols_to_keep, history=history)
            splits[history] = train_test_split(features,
                                               labels,
                                               test_size=0.2,
                                               random_state=123)
        for classifier in class_list:
            for history, wording in ((True, 'with'), (False, 'without')):
                print("{:=^80}".format(
                    " Evaluating {} {} History ".format(classifier, wording)))
                model_name, file_name = class_name(classifier, history=history)
                X_train, X_test, y_train, y_test = splits[history]
                # NOTE: pickle executes code on load; only load model files
                # from a trusted source.
                model_path = os.path.join(results_dir, table + file_name)
                with open(model_path, 'rb') as model_file:
                    loaded_model = pickle.load(model_file)
                pred_train = loaded_model.predict(X_train)
                pred_test = loaded_model.predict(X_test)

                results.append(evaluate(pred_train, y_train, table,
                                        end_time='train', model=model_name))
                results.append(evaluate(pred_test, y_test, table,
                                        end_time='test', model=model_name))
        print("\n \n")
    return results

In [9]:
# Rebinds `results`; the training results from the earlier run were already
# written to full_results.csv.
results = test_overfitting()


 


 


 


 



In [10]:
# Same layout as the training results, except that the third field holds
# 'train' or 'test' instead of a duration.
eval_cols = ['Dataset', 'model', 'type',
             'accuracy', 'recall',
             'precision', 'F1',
             'auc', 'TN', 'FP', 'FN', 'TP']

results_frame = pd.DataFrame(results, columns=eval_cols)

# os.path.join gives the same path while RESULTS_DIR ends with a slash and
# stays correct if that trailing slash is ever dropped.
results_frame.to_csv(os.path.join(RESULTS_DIR, 'overfitting_results.csv'))