In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.impute import SimpleImputer

Potentially important factors:
- ecuSource: try ignoring 3 and 11 to see if it boosts model performance
- engine temperature columns, TurboBoostPressure
- BarometricPressure factors into load
- EngineLoad is complicated (percentage 0-1 if not turbo-boosted)
- generally, engine-related columns would be interesting

In [2]:
# Load the fault diagnostics dataset.
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, which avoids mixed-type columns.
# NOTE(review): the original load appears to have emitted a warning on this line
# (presumably a mixed-type DtypeWarning) -- confirm, and consider passing an
# explicit dtype= mapping for the affected columns instead.
fault_diag = pd.read_csv('../data/fault_diag.csv', low_memory=False)

  fault_diag = pd.read_csv('../data/fault_diag.csv')


First model, simple logistic regression - not expected to do well.

In [6]:
# Baseline feature set for the first, deliberately simple model.
variables = [
    'BarometricPressure',
    'EngineCoolantTemperature',
    'EngineLoad',
    'EngineOilPressure',
    'EngineOilTemperature',
]

X = fault_diag[variables]
y = fault_diag['derate_full']

# Hold out a stratified test set first, so the imputer below is fit on
# training rows only.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, stratify=y
)

# Fill missing values with per-column means learned from the training split.
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train_imputed, y_train)

# Predict once and reuse the result for both reports.
y_pred = logreg.predict(X_test_imputed)
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, zero_division=0.0))

0.9953939517420031
              precision    recall  f1-score   support

       False       1.00      1.00      1.00    136795
        True       0.00      0.00      0.00       633

    accuracy                           1.00    137428
   macro avg       0.50      0.50      0.50    137428
weighted avg       0.99      1.00      0.99    137428



Second model. Incorporates cost information, still with the simple model. *Very* bad performance on cost.

In [10]:
variables = ['BarometricPressure', 'EngineCoolantTemperature', 'EngineLoad', 'EngineOilPressure', 'EngineOilTemperature']

X = fault_diag[variables]
y = fault_diag['derate_full']

# Split into train/test before imputation so the imputer only learns from
# training rows (no information from the test set leaks into preprocessing).
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, stratify = y)

# Impute missing values with the mean of each column
imputer = SimpleImputer(strategy = 'mean')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

logreg = LogisticRegression(max_iter = 1000).fit(X_train_imputed, y_train)

# Predicted probability of the positive class (second column) for the test data
y_prob = logreg.predict_proba(X_test_imputed)[:, 1]

# Misclassification costs: a missed derate (false negative) vs. a false alarm
# (false positive).
cost_fn = 4000
cost_fp = 500

# Cost-minimising decision threshold.
# Predicting positive has expected cost (1 - p) * cost_fp; predicting negative
# has expected cost p * cost_fn. Positive is cheaper when
#     p >= cost_fp / (cost_fp + cost_fn)
# so an expensive false negative LOWERS the threshold (here 500 / 4500 ~= 0.11).
# The previous formula, cost_fn / (cost_fn + cost_fp) ~= 0.89, was inverted and
# made the model even less willing to flag a derate.
# This assumes correct predictions cost nothing and that y_prob is reasonably
# calibrated.
threshold = cost_fp / (cost_fp + cost_fn)

# Probabilities converted to binary predictions based on the cost-aware threshold
y_pred_adjusted = (y_prob >= threshold).astype(int)

# Evaluation of adjusted predictions
print(f'Accuracy: {accuracy_score(y_test, y_pred_adjusted)}')
print(classification_report(y_test, y_pred_adjusted, zero_division = 0.0))

# Confusion matrix: rows are actual classes, columns are predicted classes
conf_matrix = confusion_matrix(y_test, y_pred_adjusted)

# Cost calculation: [1, 0] counts false negatives, [0, 1] counts false positives
total_cost = conf_matrix[1, 0] * cost_fn + conf_matrix[0, 1] * cost_fp
print(f'Total cost: {total_cost}')

Accuracy: 0.9953939517420031
              precision    recall  f1-score   support

       False       1.00      1.00      1.00    136795
        True       0.00      0.00      0.00       633

    accuracy                           1.00    137428
   macro avg       0.50      0.50      0.50    137428
weighted avg       0.99      1.00      0.99    137428

Total cost: 2532000


In [7]:
# Display every column name in the loaded dataset.
fault_diag.columns

Index(['RecordID', 'ESS_Id', 'EventTimeStamp', 'eventDescription', 'ecuSource',
       'spn', 'fmi', 'active', 'activeTransitionCount', 'EquipmentID',
       'Latitude', 'Longitude', 'event_date', 'event_time',
       'false_eventTimeStamp', 'event_year', 'event_month', 'event_day',
       'event_dayofweek', 'event_dayname', 'event_hour', 'event_time_quadrant',
       'derate_full', 'derate_partial', 'AcceleratorPedal',
       'BarometricPressure', 'CruiseControlActive', 'CruiseControlSetSpeed',
       'DistanceLtd', 'EngineCoolantTemperature', 'EngineLoad',
       'EngineOilPressure', 'EngineOilTemperature', 'EngineRpm',
       'EngineTimeLtd', 'FuelLevel', 'FuelLtd', 'FuelRate', 'IgnStatus',
       'IntakeManifoldTemperature', 'LampStatus', 'Speed',
       'TurboBoostPressure'],
      dtype='object')

Third model. Modify variables to account for information from Rob. Needs a lot more work.

In [11]:
# Extended feature set: the five baseline readings plus engine RPM, engine
# time, intake manifold temperature and turbo boost pressure.
variables = ['BarometricPressure', 'EngineCoolantTemperature', 'EngineLoad', 'EngineOilPressure', 'EngineOilTemperature', 'EngineRpm', 'EngineTimeLtd', 'IntakeManifoldTemperature', 'TurboBoostPressure']

X = fault_diag[variables]
y = fault_diag['derate_full']

# Split into train/test before imputation so the imputer only learns from
# training rows (no information from the test set leaks into preprocessing).
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, stratify = y)

# Impute missing values with the mean of each column
imputer = SimpleImputer(strategy = 'mean')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

logreg = LogisticRegression(max_iter = 1000).fit(X_train_imputed, y_train)

# Predicted probability of the positive class (second column) for the test data
y_prob = logreg.predict_proba(X_test_imputed)[:, 1]

# Misclassification costs: a missed derate (false negative) vs. a false alarm
# (false positive).
cost_fn = 4000
cost_fp = 500

# Cost-minimising decision threshold.
# Predicting positive has expected cost (1 - p) * cost_fp; predicting negative
# has expected cost p * cost_fn. Positive is cheaper when
#     p >= cost_fp / (cost_fp + cost_fn)
# so an expensive false negative LOWERS the threshold (here 500 / 4500 ~= 0.11).
# The previous formula, cost_fn / (cost_fn + cost_fp) ~= 0.89, was inverted and
# made the model even less willing to flag a derate.
# This assumes correct predictions cost nothing and that y_prob is reasonably
# calibrated.
threshold = cost_fp / (cost_fp + cost_fn)

# Probabilities converted to binary predictions based on the cost-aware threshold
y_pred_adjusted = (y_prob >= threshold).astype(int)

# Evaluation of adjusted predictions
print(f'Accuracy: {accuracy_score(y_test, y_pred_adjusted)}')
print(classification_report(y_test, y_pred_adjusted, zero_division = 0.0))

# Confusion matrix: rows are actual classes, columns are predicted classes
conf_matrix = confusion_matrix(y_test, y_pred_adjusted)

# Cost calculation: [1, 0] counts false negatives, [0, 1] counts false positives
total_cost = conf_matrix[1, 0] * cost_fn + conf_matrix[0, 1] * cost_fp
print(f'Total cost: {total_cost}')

Accuracy: 0.9953939517420031
              precision    recall  f1-score   support

       False       1.00      1.00      1.00    136795
        True       0.00      0.00      0.00       633

    accuracy                           1.00    137428
   macro avg       0.50      0.50      0.50    137428
weighted avg       0.99      1.00      0.99    137428

Total cost: 2532000
