In [1]:
import pandas as pd
pd.set_option("display.max_columns", 100)
import numpy as np

from kmodes.kmodes import KModes

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer, KNNImputer

from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score, classification_report, 
                             confusion_matrix, f1_score, fbeta_score, 
                             matthews_corrcoef, brier_score_loss)

from sklearn.calibration import CalibrationDisplay

from imblearn.over_sampling import RandomOverSampler

In [2]:
cols = ['RecordID', 'ESS_Id', 'EventTimeStamp', 'eventDescription', 'spn', 'fmi', 'active', 'activeTransitionCount',
            'EquipmentID', 'Latitude', 'Longitude', 'LocationTimeStamp']
faults = pd.read_csv("../data/J1939Faults.csv", usecols = cols, dtype = str)
diagnostics = pd.read_csv("../data/VehicleDiagnosticOnboardData.csv")
sfc = pd.read_excel("../data/Service Fault Codes_1_0_0_167.xlsx")

  for idx, row in parser.parse():


In [3]:
diagnostics = pd.pivot(diagnostics, 
                       columns = ['Name'], 
                       values = 'Value', 
                       index = 'FaultId')


In [4]:
diagnostics = diagnostics.reset_index()
diagnostics['FaultId'] = diagnostics['FaultId'].astype(str)
diagnostics = diagnostics.drop(columns = 'ServiceDistance')

In [5]:
# Convert columns to numeric
diagnostics[['AcceleratorPedal', 'BarometricPressure', 'CruiseControlSetSpeed', 'DistanceLtd', 'EngineCoolantTemperature', 'EngineLoad', 'EngineOilPressure', 'EngineOilTemperature', 'EngineRpm', 'EngineTimeLtd', 'FuelLevel', 'FuelLtd', 'FuelRate', 'FuelTemperature', 'IntakeManifoldTemperature', 'Speed', 'SwitchedBatteryVoltage', 'Throttle', 'TurboBoostPressure']] = diagnostics[['AcceleratorPedal', 'BarometricPressure', 'CruiseControlSetSpeed', 'DistanceLtd', 'EngineCoolantTemperature', 'EngineLoad', 'EngineOilPressure', 'EngineOilTemperature', 'EngineRpm', 'EngineTimeLtd', 'FuelLevel', 'FuelLtd', 'FuelRate', 'FuelTemperature', 'IntakeManifoldTemperature', 'Speed', 'SwitchedBatteryVoltage', 'Throttle', 'TurboBoostPressure']].apply(pd.to_numeric, errors = 'coerce')

In [6]:
diagnostics = pd.get_dummies(diagnostics, drop_first=True, columns = ['CruiseControlActive', 'IgnStatus', 'ParkingBrake'])


**There are service locations at (36.0666667, -86.4347222), (35.5883333, -86.4438888), and (36.1950, -83.174722), so you should remove any records in the vicinity of these locations, as fault codes may be tripped when working on the vehicles.**

In [7]:
station_1 = (faults['Latitude'].str.contains('36.06')) & faults['Longitude'].str.contains('86.43')
station_2 = (faults['Latitude'].str.contains('35.58')) & faults['Longitude'].str.contains('86.44')
station_3 = (faults['Latitude'].str.contains('36.19')) & faults['Longitude'].str.contains('83.17')
faults = faults.drop(faults[station_1 | station_2 | station_3].index)

In [8]:
# creating a column that has the lat long values together.
faults["lat_long"] = list(zip(faults['Latitude'], faults['Longitude']))

In [9]:
# convert dtypes as needed
# some columns have numeric-appearing values but they're not continuous variables, leaving them as strings
faults['EventTimeStamp'] = pd.to_datetime(faults['EventTimeStamp'])
faults['activeTransitionCount'] = faults['activeTransitionCount'].astype(int)

In [10]:
# time of day for faults could be interesting, split event column to date and timestamp columns
faults['event_date'] = faults['EventTimeStamp'].dt.date
faults['event_time'] = faults['EventTimeStamp'].dt.time

In [11]:
faults = faults.copy()

faults['event_year'] = faults['EventTimeStamp'].dt.year
faults['event_month'] = faults['EventTimeStamp'].dt.month
faults['event_day'] = faults['EventTimeStamp'].dt.day
faults['event_dayofweek'] = faults['EventTimeStamp'].dt.dayofweek
faults['event_dayname'] = faults['EventTimeStamp'].dt.day_name()
faults['event_hour'] = faults['EventTimeStamp'].dt.hour
faults['event_time_quadrant'] = faults['EventTimeStamp'].dt.hour // 4

In [12]:
# sort df by equipment, spn, timestamp
faults = faults.copy()
faults = faults.sort_values(by = ['EquipmentID', 'spn', 'EventTimeStamp'])

# get index for first row in each group, yields list of integers
first_index = faults.groupby(['EquipmentID', 'spn']).head(1).index

# check first_index for rows where active == False
drop_index = first_index[faults.loc[first_index, 'active'] == 'False']

# drop rows where first index is an active False row
faults = faults.drop(drop_index)

# now this should work as expected
faults['false_eventTimeStamp'] = faults.sort_values(by = ['EventTimeStamp']).groupby(by = ['EquipmentID', 'spn'])['EventTimeStamp'].shift(-1)

Merging Dataframes

In [13]:
faults_diagnostics = pd.merge(diagnostics, faults, left_on='FaultId', right_on='RecordID')

In [14]:
faults_diagnostics[['FaultId', 'LampStatus']] = faults_diagnostics[['FaultId', 'LampStatus']].astype(str)

In [15]:
faults_diagnostics[['CruiseControlActive_True', 'IgnStatus_True', 'ParkingBrake_True']] = faults_diagnostics[['CruiseControlActive_True', 'IgnStatus_True', 'ParkingBrake_True']].astype(bool)

In [16]:
faults_diagnostics['had_derate']= faults_diagnostics['spn'] == '5246'

In [17]:
faults_diagnostics['had_derate'].value_counts()

False    1057555
True         938
Name: had_derate, dtype: int64

In [18]:
faults_diagnostics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1058493 entries, 0 to 1058492
Data columns (total 48 columns):
 #   Column                     Non-Null Count    Dtype         
---  ------                     --------------    -----         
 0   FaultId                    1058493 non-null  object        
 1   AcceleratorPedal           481595 non-null   float64       
 2   BarometricPressure         531933 non-null   float64       
 3   CruiseControlSetSpeed      522854 non-null   float64       
 4   DistanceLtd                531794 non-null   float64       
 5   EngineCoolantTemperature   531927 non-null   float64       
 6   EngineLoad                 531476 non-null   float64       
 7   EngineOilPressure          532039 non-null   float64       
 8   EngineOilTemperature       530403 non-null   float64       
 9   EngineRpm                  532367 non-null   float64       
 10  EngineTimeLtd              528069 non-null   float64       
 11  FuelLevel                  456371 non

In [19]:
#Creating the predictors & specifiying the categorical predictors.
predictors = ['AcceleratorPedal', 'BarometricPressure',
       'CruiseControlSetSpeed', 'DistanceLtd', 'EngineCoolantTemperature',
       'EngineLoad', 'EngineOilPressure', 'EngineOilTemperature', 'EngineRpm',
       'EngineTimeLtd', 'FuelLevel', 'FuelLtd', 'FuelRate', 'FuelTemperature',
       'IntakeManifoldTemperature','Speed','SwitchedBatteryVoltage', 'Throttle', 
       'TurboBoostPressure', 'CruiseControlActive_True', 'IgnStatus_True', 
       'ParkingBrake_True', 'spn','fmi', 'active', 'activeTransitionCount', 
       'LampStatus', 'EquipmentID']

categorical_predictors = ['spn', 'fmi', 'LampStatus','active',
                          'ParkingBrake_True', 'IgnStatus_True',
                          'CruiseControlActive_True', 'EquipmentID']

#creating the X and y varibales 
X = faults_diagnostics[predictors]
X = pd.get_dummies(X, columns = categorical_predictors)
y = faults_diagnostics['had_derate'] 

# Remove correlated features to improve interpretability
cols_to_drop = ['AcceleratorPedal', 'EngineLoad', 'EngineRpm', 'FuelRate', 'FuelLevel', 'TurboBoostPressure','EngineCoolantTemperature']
X = X.drop(columns = cols_to_drop)

#splitting the X and y varibles for training and testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 117, stratify = y)

#splitting the traing data so that there is now data used for validation.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, stratify = y_train, random_state = 117, train_size = 0.6/0.8)

In [20]:
faults_diagnostics.columns

Index(['FaultId', 'AcceleratorPedal', 'BarometricPressure',
       'CruiseControlSetSpeed', 'DistanceLtd', 'EngineCoolantTemperature',
       'EngineLoad', 'EngineOilPressure', 'EngineOilTemperature', 'EngineRpm',
       'EngineTimeLtd', 'FuelLevel', 'FuelLtd', 'FuelRate', 'FuelTemperature',
       'IntakeManifoldTemperature', 'LampStatus', 'Speed',
       'SwitchedBatteryVoltage', 'Throttle', 'TurboBoostPressure',
       'CruiseControlActive_True', 'IgnStatus_True', 'ParkingBrake_True',
       'RecordID', 'ESS_Id', 'EventTimeStamp', 'eventDescription', 'spn',
       'fmi', 'active', 'activeTransitionCount', 'EquipmentID', 'Latitude',
       'Longitude', 'LocationTimeStamp', 'lat_long', 'event_date',
       'event_time', 'event_year', 'event_month', 'event_day',
       'event_dayofweek', 'event_dayname', 'event_hour', 'event_time_quadrant',
       'false_eventTimeStamp', 'had_derate'],
      dtype='object')

In [21]:
X.dtypes.value_counts()


uint8      1552
float64      12
int64         1
dtype: int64

In [22]:
list(X.columns[X.dtypes == 'uint8'])

['spn_0',
 'spn_100',
 'spn_101',
 'spn_102',
 'spn_1023',
 'spn_1024',
 'spn_1028',
 'spn_103',
 'spn_1043',
 'spn_1045',
 'spn_105',
 'spn_1056',
 'spn_1059',
 'spn_1067',
 'spn_1068',
 'spn_107',
 'spn_1071',
 'spn_1072',
 'spn_1075',
 'spn_1078',
 'spn_108',
 'spn_1081',
 'spn_110',
 'spn_111',
 'spn_1127',
 'spn_114863',
 'spn_116',
 'spn_1172',
 'spn_1176',
 'spn_118',
 'spn_1209',
 'spn_1213',
 'spn_1231',
 'spn_1235',
 'spn_1236',
 'spn_1239',
 'spn_1247',
 'spn_125',
 'spn_127',
 'spn_1279',
 'spn_1321',
 'spn_1322',
 'spn_1323',
 'spn_1324',
 'spn_1325',
 'spn_1326',
 'spn_1327',
 'spn_1328',
 'spn_1347',
 'spn_1349',
 'spn_13600',
 'spn_139296',
 'spn_1464',
 'spn_1481',
 'spn_1482',
 'spn_1483',
 'spn_1487',
 'spn_153',
 'spn_153931',
 'spn_1569',
 'spn_157',
 'spn_158',
 'spn_16',
 'spn_160',
 'spn_1612',
 'spn_1659',
 'spn_1668',
 'spn_167',
 'spn_1675',
 'spn_168',
 'spn_17096',
 'spn_171',
 'spn_173',
 'spn_174',
 'spn_175',
 'spn_17590',
 'spn_1761',
 'spn_177',
 'spn_

IMPUTING THE TRAINING DATA USING ITERATIVE IMPUTER AND SCALING

In [25]:
#boolean predictor variables.
    predictors_bool = [ 'LampStatus_0', 'LampStatus_1023',
    'LampStatus_11', 'LampStatus_11801','LampStatus_1279', 
    'LampStatus_16639', 'LampStatus_16895', 'LampStatus_17407', 
    'LampStatus_17663', 'LampStatus_18419', 'LampStatus_18431',
    'LampStatus_2', 'LampStatus_2035','LampStatus_2047', 
    'LampStatus_20735', 'LampStatus_21503', 'LampStatus_22515', 
    'LampStatus_22527', 'LampStatus_255', 'LampStatus_28436', 'LampStatus_4351', 
    'LampStatus_50175', 'LampStatus_50431', 'LampStatus_511', 'LampStatus_5119',
    'LampStatus_51199', 'LampStatus_5375', 'LampStatus_544', 'LampStatus_55295', 
     'LampStatus_6131', 'LampStatus_6143', 'LampStatus_617', 
     'LampStatus_62463', 'LampStatus_63487', 'LampStatus_65535',
     'LampStatus_9',
     'ParkingBrake_True_False',
     'ParkingBrake_True_True',
     'IgnStatus_True_False',
     'IgnStatus_True_True',
     'CruiseControlActive_True_False',
     'CruiseControlActive_True_True',]

#numeric predictor variables.
predictors_num = list(X.columns[X.dtypes == 'float']) + ['activeTransitionCount']

#imputing the boolean predictors by the mode.
bool_cols = pd.DataFrame(SimpleImputer(strategy = 'most_frequent').fit_transform(X_train[predictors_bool].astype(str)), 
            columns = predictors_bool)

#creating the scaler and iterative variables to be used on all the numeric predictors. 
scaler = StandardScaler()
iterative_imputer = IterativeImputer()

#initializes scaler & fits to the numeric predictors. 
X_train_scaler = scaler.fit(X_train[predictors_num])
X_train_imputed = iterative_imputer.fit(scaler.transform(X_train[predictors_num]))

#scaling and imputing on X_train.
X_train[predictors_num] = X_train_imputed.transform(scaler.transform(X_train[predictors_num]))

In [26]:

# #creating the scaler and iterative variables to be used on all the numeric predictors. 
# scaler = StandardScaler()
# iterative_imputer = IterativeImputer()

# #initializes scaler & fits to the numeric predictors. 
# X_train_scaler = scaler.fit(X_train[predictors_num])
# X_train_imputed = iterative_imputer.fit(scaler.transform(X_train[predictors_num]))

# #scaling and imputing on X_train.
# X_train[predictors_num] = X_train_imputed.transform(scaler.transform(X_train[predictors_num]))



In [27]:
#scale validation and test data
X_val_scaled = scaler.transform(X_val[predictors_num])
X_test_scaled = scaler.transform(X_test[predictors_num])

#impute missing values in validation and test data
X_val[predictors_num] = iterative_imputer.transform(X_val_scaled)
X_test[predictors_num] = iterative_imputer.transform(X_test_scaled)


In [29]:
lr = LogisticRegression()
lr.fit(X_train, y_train)

In [30]:
y_val_pred_proba = lr.predict_proba(X_val)[:,1]

In [31]:
y_pred = lr.predict(X_test)
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(f'MCC: {matthews_corrcoef(y_test, y_pred)}')
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 1.0
MCC: 1.0
[[264389      0]
 [     0    235]]
              precision    recall  f1-score   support

       False       1.00      1.00      1.00    264389
        True       1.00      1.00      1.00       235

    accuracy                           1.00    264624
   macro avg       1.00      1.00      1.00    264624
weighted avg       1.00      1.00      1.00    264624



In [32]:
candidate_thresholds = np.arange(start = 0.1, stop = 0.925, step = 0.01)
thresholds = pd.DataFrame({'threshold': candidate_thresholds})
thresholds['f1'] = thresholds['threshold'].apply(lambda x: f1_score(y_val, y_val_pred_proba > x))
thresholds.sort_values('f1', ascending = False).head()

Unnamed: 0,threshold,f1
0,0.1,1.0
36,0.46,1.0
52,0.62,1.0
51,0.61,1.0
50,0.6,1.0


IMPUTING THE TRAINING VALIDATION DATA USING ITERATIVE IMPUTER AND SCALING

In [None]:
# predictors_bool = ['active_False', 'active_True' ,'ParkingBrake_True_False', 'ParkingBrake_True_True', 'IgnStatus_True_False', 'IgnStatus_True_True', 'CruiseControlActive_True_False', 'CruiseControlActive_True_True']

# bool_cols = pd.DataFrame(SimpleImputer(strategy = 'most_frequent').fit_transform(X_val[predictors_bool].astype(str)), 
#             columns = predictors_bool)

In [None]:
# predictors_num = ['FuelLevel', 'FuelRate', 'SwitchedBatteryVoltage', 'activeTransitionCount']

# scaler = StandardScaler().fit(X_val[predictors_num])
# X_val_iterative = IterativeImputer().fit(scaler.transform(X_val[predictors_num]))

In [None]:
# X_val_scaled = scaler.transform(X_val[predictors_num])
# X_test_scaled = scaler.transform(X_test[predictors_num])

# X_val_iterative = iterative_imputer.fit(X_val_scaled)
# X_test_iterative = iterative_imputer.fit(X_test_scaled)

# X_val[predictors_num] = X_val_iterative.transform(scaler.transform(X_val[predictors_num]))

# X_test[predictors_num] = X_test_iterative.transform(scaler.transform(X_test[predictors_num]))


In [None]:
# X_val[predictors_num] = X_val_iterative.transform(scaler.transform(X_val[predictors_num]))
# X_val

IMPUTING THE TESTING DATA USING ITERATIVE IMPUTER AND SCALING

In [None]:
# predictors_bool = ['active_False', 'active_True' ,'ParkingBrake_True_False', 'ParkingBrake_True_True', 'IgnStatus_True_False', 'IgnStatus_True_True', 'CruiseControlActive_True_False', 'CruiseControlActive_True_True']

# bool_cols = pd.DataFrame(SimpleImputer(strategy = 'most_frequent').fit_transform(X_test[predictors_bool].astype(str)), 
#             columns = predictors_bool)

In [None]:
# predictors_num = ['FuelLevel', 'FuelRate', 'SwitchedBatteryVoltage', 'activeTransitionCount']

# scaler = StandardScaler().fit(X_test[predictors_num])
# X_test_iterative = IterativeImputer().fit(scaler.transform(X_test[predictors_num]))

In [None]:
# X_test[predictors_num] = X_test_iterative.transform(scaler.transform(X_test[predictors_num]))
# X_test

In [None]:
candidate_thresholds = np.arange(start = 0.1, stop = 0.925, step = 0.01)
thresholds = pd.DataFrame({'threshold': candidate_thresholds})
thresholds['f1'] = thresholds['threshold'].apply(lambda x: f1_score(y_val, y_val_pred_proba > x))
thresholds.sort_values('f1', ascending = False).head()

In [None]:
# predictors_bool = ['IgnStatus_True', 'CruiseControlActive_True', 'ParkingBrake_True']

# bool_cols = pd.DataFrame(SimpleImputer(strategy = 'most_frequent').fit_transform(faults_diagnostics[predictors_bool].astype(str)), 
#             columns = predictors_bool)


In [None]:
# predictors_num = ['AcceleratorPedal', 'activeTransitionCount','BarometricPressure', 'CruiseControlSetSpeed', 'DistanceLtd',
#                'EngineCoolantTemperature', 'EngineLoad', 'EngineOilPressure','EngineOilTemperature', 
#                'EngineRpm', 'EngineTimeLtd', 'FuelLevel', 'FuelLtd', 'FuelRate', 'IntakeManifoldTemperature', 
#                'Speed', 'TurboBoostPressure', 'SwitchedBatteryVoltage', 'FuelTemperature','Throttle']

# scaler = StandardScaler().fit(faults_diagnostics[predictors_num])
# faults_diagnostics_iterative = IterativeImputer().fit(scaler.transform(faults_diagnostics[predictors_num]))

In [None]:
# faults_diagnostics[predictors_num] = faults_diagnostics_iterative.transform(scaler.transform(faults_diagnostics[predictors_num]))
# faults_diagnostics

In [None]:
# diag_features=['FaultId', 'AcceleratorPedal', 'BarometricPressure', 'CruiseControlSetSpeed', 'DistanceLtd',
#                'EngineCoolantTemperature', 'EngineLoad', 'EngineOilPressure','EngineOilTemperature', 
#                'EngineRpm', 'EngineTimeLtd', 'FuelLevel', 'FuelLtd', 'FuelRate', 'IntakeManifoldTemperature', 
#                'LampStatus', 'Speed', 'TurboBoostPressure', 'CruiseControlActive_True', 'IgnStatus_True', 'ParkingBrake_True']

In [None]:
# derate_equip = tuple(had_derate['EquipmentID'].to_list())

In [None]:
# faults_diagnostics[faults_diagnostics['EquipmentID'].isin(derate_equip)].reset_index(drop = True)

In [None]:
# diagnostics_drop = faults_diagnostics.dropna(subset = diag_features)

CHECK A COUPLE OF EQUIPMENT IDS AND SEE WHAT VALUES THEY HAD IN COMMON. MAYBE DO SOME IMPUTING SO IM NOT LOOKING AT NAN VALUES

In [None]:
#predictors = ['AcceleratorPedal', 'BarometricPressure', 'CruiseControlSetSpeed', 'DistanceLtd', 'EngineCoolantTemperature', 'EngineLoad', 'EngineOilPressure', 'EngineOilTemperature', 'EngineRpm', 'EngineTimeLtd', 'FuelLevel', 'FuelLtd', 'FuelRate', 'FuelTemperature', 'IntakeManifoldTemperature', 'ServiceDistance', 'Speed', 'SwitchedBatteryVoltage', 'Throttle', 'TurboBoostPressure']

In [None]:
#predictors = ['AcceleratorPedal', 'BarometricPressure', 'CruiseControlSetSpeed', 'DistanceLtd', 'EngineCoolantTemperature', 'EngineLoad', 'EngineOilPressure', 'EngineOilTemperature', 'EngineRpm', 'EngineTimeLtd', 'FuelLevel', 'FuelLtd', 'FuelRate', 'FuelTemperature', 'IntakeManifoldTemperature', 'ServiceDistance', 'Speed', 'SwitchedBatteryVoltage', 'Throttle', 'TurboBoostPressure']
#scaler = StandardScaler().fit(diagnostics[predictors])
#diagnostics_KNN = IterativeImputer().fit(scaler.transform(diagnostics[predictors]))

In [None]:
#diagnostics_KNN.transform(scaler.transform(diagnostics[predictors]))

In [None]:
#diagnostics[predictors] = diagnostics_KNN.transform(scaler.transform(diagnostics[predictors]))