In [1]:
import pandas as pd
pd.set_option("display.max_columns", 100)
import numpy as np

from kmodes.kmodes import KModes

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer, KNNImputer

from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score, classification_report, 
                             confusion_matrix, f1_score, fbeta_score, 
                             matthews_corrcoef, brier_score_loss,
                                r2_score, mean_absolute_error)

from sklearn.calibration import CalibrationDisplay

from imblearn.over_sampling import RandomOverSampler


from sklearn.ensemble import HistGradientBoostingClassifier

In [2]:
faults_diagnostics = pd.read_csv('../data/faults_diagnostics.csv')

  faults_diagnostics = pd.read_csv('../data/faults_diagnostics.csv')


In [3]:
faults_diagnostics_sorted = faults_diagnostics.sort_values('EventTimeStamp').reset_index(drop = True)

# to find the time at which the 80% event happens, use loc to get just derate rows
derates = faults_diagnostics_sorted.loc[(faults_diagnostics_sorted['spn'] == 5246)].reset_index(drop = True)

# len * .8 to find how many events to include in train
ts_08 = int(len(derates) * 0.8)
ts_06 = int(len(derates) * 0.6)

# get timestamp corresponding to that row number, everything at or before is train
split_08 = derates.loc[ts_08, 'EventTimeStamp']
split_06 = derates.loc[ts_06, 'EventTimeStamp']

In [4]:
# find optimal threshold
def find_optimal_threshold(y_val, y_proba):
    optimal_threshold = 0
    max_savings = 0

    # test threshold values to find the best
    for threshold in np.arange(0, 1, 0.001):

        y_pred = y_proba > threshold

        # create confusion matrix
        conf_matrix = confusion_matrix(y_val, y_pred)

        tp = conf_matrix[1, 1]
        fp = conf_matrix[0, 1]

        # savings calculated
        savings = (4000 * tp) - (500 * fp)

        # check to see if it's best
        if savings > max_savings:
            max_savings = savings
            optimal_threshold = threshold
    
    return optimal_threshold

In [5]:
def pred_with_optimal_threshold(y_test, y_proba, threshold):
    
    y_pred = (y_proba >= threshold).astype(int)

    return y_pred

In [6]:
train_set = faults_diagnostics_sorted[faults_diagnostics_sorted['EventTimeStamp'] <= split_06]
val_set = faults_diagnostics_sorted[(faults_diagnostics_sorted['EventTimeStamp'] < split_06) & (faults_diagnostics_sorted['EventTimeStamp'] <= split_08)]
test_set = faults_diagnostics_sorted[(faults_diagnostics_sorted['EventTimeStamp'] > split_08)]


#Creating the predictors & specifiying the categorical predictors.
predictors = ['AcceleratorPedal', 'BarometricPressure',
       'CruiseControlSetSpeed', 'DistanceLtd', 'EngineCoolantTemperature',
       'EngineLoad', 'EngineOilPressure', 'EngineOilTemperature', 'EngineRpm',
       'EngineTimeLtd', 'FuelLevel', 'FuelLtd', 'FuelRate', 'FuelTemperature',
       'IntakeManifoldTemperature','Speed','SwitchedBatteryVoltage', 'Throttle', 
       'TurboBoostPressure', 'CruiseControlActive_True', 'IgnStatus_True', 
       'ParkingBrake_True', 'spn','fmi', 'active', 'activeTransitionCount', 
       'LampStatus', 'EquipmentID']

categorical_predictors = ['fmi', 'LampStatus','active',
                          'ParkingBrake_True', 'IgnStatus_True',
                          'CruiseControlActive_True', 'EquipmentID']
target = ['had_derate']


X_train = pd.get_dummies(train_set[predictors], columns=categorical_predictors).copy()
X_val = pd.get_dummies(val_set[predictors], columns=categorical_predictors).copy()
X_test = pd.get_dummies(test_set[predictors], columns=categorical_predictors).copy()

y_train = train_set[target].copy()
y_val = val_set[target].copy()
y_test = test_set[target].copy()






In [7]:
#boolean predictor variables.
predictors_bool = [ 'LampStatus_0',
'LampStatus_2',
'LampStatus_9',
'LampStatus_11',
'LampStatus_255',
'LampStatus_511',
'LampStatus_617',
'LampStatus_1023',
'LampStatus_1279',
'LampStatus_2035',
'LampStatus_2047',
'LampStatus_4351',
'LampStatus_5119',
'LampStatus_5375',
'LampStatus_6143',
'LampStatus_16639',
'LampStatus_16895',
'LampStatus_17407',
'LampStatus_17663',
'LampStatus_18419',
'LampStatus_18431',
'LampStatus_20735',
'LampStatus_21503',
'LampStatus_22515',
'LampStatus_22527',
'LampStatus_50175',
'LampStatus_51199',
'LampStatus_62463',
'LampStatus_63487',
 'LampStatus_65535',
     'ParkingBrake_True_False',
     'ParkingBrake_True_True',
     'IgnStatus_True_False',
     'IgnStatus_True_True',
     'CruiseControlActive_True_False',
     'CruiseControlActive_True_True',]

#numeric predictor variables.
predictors_num = list(X_train.columns[X_train.dtypes == 'float']) + ['activeTransitionCount']

#imputing the boolean predictors by the mode.
bool_cols = pd.DataFrame(SimpleImputer(strategy = 'most_frequent').fit_transform(X_train[predictors_bool].astype(str)), 
            columns = predictors_bool)

#creating the scaler and iterative variables to be used on all the numeric predictors. 
scaler = StandardScaler()
iterative_imputer = IterativeImputer()

#initializes scaler & fits to the numeric predictors. 
X_train_scaled = scaler.fit_transform(X_train[predictors_num])
X_train_imputed = iterative_imputer.fit_transform(X_train_scaled)

#scale validation and test data
X_val_scaled = scaler.transform(X_val[predictors_num])  
X_val_imputed = iterative_imputer.transform(X_val_scaled)

X_test_scaled = scaler.transform(X_test[predictors_num])  
X_test_imputed = iterative_imputer.transform(X_test_scaled)

# Initialize and fit HistGradientBoostingClassifier
hgb = HistGradientBoostingClassifier()
hgb.fit(X_train_imputed, y_train)

val_preds = hgb.predict(X_val_imputed)

  y = column_or_1d(y, warn=True)


In [8]:
# calculate and print metrics
def calculate_metrics(y_val, val_preds):
    
    metrics = {}
    
    metrics['accuracy'] = accuracy_score(y_val, val_preds)
    metrics['matthews_corrcoef'] = matthews_corrcoef(y_val, val_preds)
    metrics['classification_report'] = classification_report(y_val, val_preds, zero_division = 0.0)
    metrics['confusion_matrix'] = confusion_matrix(y_val, val_preds)
    metrics['model_savings'] = metrics['confusion_matrix'][1, 1] * 4000 - metrics['confusion_matrix'][0, 1] * 500
    
    return metrics

def print_metrics(metrics):
    
    print(f'Accuracy score: {metrics["accuracy"]}')
    print(f'Matthews corr coef: {metrics["matthews_corrcoef"]}')
    print('Classification Report:')
    print(metrics['classification_report'])
    print('Confusion Matrix:')
    print(metrics['confusion_matrix'])
    print(f'Model savings: {metrics["model_savings"]}')

In [9]:
metrics = calculate_metrics(y_val, val_preds)
print_metrics(metrics)

Accuracy score: 0.9989580011304109
Matthews corr coef: 0.1538941234393452
Classification Report:
              precision    recall  f1-score   support

       False       1.00      1.00      1.00    632836
        True       0.26      0.09      0.14       562

    accuracy                           1.00    633398
   macro avg       0.63      0.55      0.57    633398
weighted avg       1.00      1.00      1.00    633398

Confusion Matrix:
[[632686    150]
 [   510     52]]
Model savings: 133000


In [10]:
# Predict on test data
test_preds = hgb.predict(X_test_imputed)

metrics = calculate_metrics(y_test, test_preds)
print_metrics(metrics)



Accuracy score: 0.9988651191018055
Matthews corr coef: 0.0
Classification Report:
              precision    recall  f1-score   support

       False       1.00      1.00      1.00    164588
        True       0.00      0.00      0.00       187

    accuracy                           1.00    164775
   macro avg       0.50      0.50      0.50    164775
weighted avg       1.00      1.00      1.00    164775

Confusion Matrix:
[[164588      0]
 [   187      0]]
Model savings: 0
