In [1]:
import pandas as pd
pd.set_option("display.max_columns", 100)
import numpy as np

from kmodes.kmodes import KModes

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer, KNNImputer

from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score, classification_report, 
                             confusion_matrix, f1_score, fbeta_score, 
                             matthews_corrcoef, brier_score_loss)

from sklearn.calibration import CalibrationDisplay

from imblearn.over_sampling import RandomOverSampler

In [2]:
cols = ['RecordID', 'ESS_Id', 'EventTimeStamp', 'eventDescription', 'spn', 'fmi', 'active', 'activeTransitionCount',
            'EquipmentID', 'Latitude', 'Longitude', 'LocationTimeStamp']
faults = pd.read_csv("../data/J1939Faults.csv", usecols = cols, dtype = str)
diagnostics = pd.read_csv("../data/VehicleDiagnosticOnboardData.csv")
sfc = pd.read_excel("../data/Service Fault Codes_1_0_0_167.xlsx")

  for idx, row in parser.parse():


In [3]:
diagnostics = pd.pivot(diagnostics, 
                       columns = ['Name'], 
                       values = 'Value', 
                       index = 'FaultId')


In [4]:
diagnostics = diagnostics.reset_index()
diagnostics['FaultId'] = diagnostics['FaultId'].astype(str)
diagnostics = diagnostics.drop(columns = 'ServiceDistance')

In [5]:
# Convert columns to numeric
diagnostics[['AcceleratorPedal', 'BarometricPressure', 'CruiseControlSetSpeed', 'DistanceLtd', 'EngineCoolantTemperature', 'EngineLoad', 'EngineOilPressure', 'EngineOilTemperature', 'EngineRpm', 'EngineTimeLtd', 'FuelLevel', 'FuelLtd', 'FuelRate', 'FuelTemperature', 'IntakeManifoldTemperature', 'Speed', 'SwitchedBatteryVoltage', 'Throttle', 'TurboBoostPressure']] = diagnostics[['AcceleratorPedal', 'BarometricPressure', 'CruiseControlSetSpeed', 'DistanceLtd', 'EngineCoolantTemperature', 'EngineLoad', 'EngineOilPressure', 'EngineOilTemperature', 'EngineRpm', 'EngineTimeLtd', 'FuelLevel', 'FuelLtd', 'FuelRate', 'FuelTemperature', 'IntakeManifoldTemperature', 'Speed', 'SwitchedBatteryVoltage', 'Throttle', 'TurboBoostPressure']].apply(pd.to_numeric, errors = 'coerce')

In [6]:
diagnostics = pd.get_dummies(diagnostics, drop_first=True, columns = ['CruiseControlActive', 'IgnStatus', 'ParkingBrake'])


**There are service locations at (36.0666667, -86.4347222), (35.5883333, -86.4438888), and (36.1950, -83.174722), so you should remove any records in the vicinity of these locations, as fault codes may be tripped when working on the vehicles.**

In [7]:
station_1 = (faults['Latitude'].str.contains('36.06')) & faults['Longitude'].str.contains('86.43')
station_2 = (faults['Latitude'].str.contains('35.58')) & faults['Longitude'].str.contains('86.44')
station_3 = (faults['Latitude'].str.contains('36.19')) & faults['Longitude'].str.contains('83.17')
faults = faults.drop(faults[station_1 | station_2 | station_3].index)

In [8]:
# creating a column that has the lat long values together.
faults["lat_long"] = list(zip(faults['Latitude'], faults['Longitude']))

In [9]:
# convert dtypes as needed
# some columns have numeric-appearing values but they're not continuous variables, leaving them as strings
faults['EventTimeStamp'] = pd.to_datetime(faults['EventTimeStamp'])
faults['activeTransitionCount'] = faults['activeTransitionCount'].astype(int)

In [10]:
# time of day for faults could be interesting, split event column to date and timestamp columns
faults['event_date'] = faults['EventTimeStamp'].dt.date
faults['event_time'] = faults['EventTimeStamp'].dt.time

In [11]:
faults = faults.copy()

faults['event_year'] = faults['EventTimeStamp'].dt.year
faults['event_month'] = faults['EventTimeStamp'].dt.month
faults['event_day'] = faults['EventTimeStamp'].dt.day
faults['event_dayofweek'] = faults['EventTimeStamp'].dt.dayofweek
faults['event_dayname'] = faults['EventTimeStamp'].dt.day_name()
faults['event_hour'] = faults['EventTimeStamp'].dt.hour
faults['event_time_quadrant'] = faults['EventTimeStamp'].dt.hour // 4

In [12]:
# sort df by equipment, spn, timestamp
faults = faults.copy()
faults = faults.sort_values(by = ['EquipmentID', 'spn', 'EventTimeStamp'])

# get index for first row in each group, yields list of integers
first_index = faults.groupby(['EquipmentID', 'spn']).head(1).index

# check first_index for rows where active == False
drop_index = first_index[faults.loc[first_index, 'active'] == 'False']

# drop rows where first index is an active False row
faults = faults.drop(drop_index)

# now this should work as expected
faults['false_eventTimeStamp'] = faults.sort_values(by = ['EventTimeStamp']).groupby(by = ['EquipmentID', 'spn'])['EventTimeStamp'].shift(-1)

Merging Dataframes

In [13]:
faults_diagnostics = pd.merge(diagnostics, faults, left_on='FaultId', right_on='RecordID')

In [14]:
faults_diagnostics[['FaultId', 'LampStatus']] = faults_diagnostics[['FaultId', 'LampStatus']].astype(str)

In [15]:
faults_diagnostics[['CruiseControlActive_True', 'IgnStatus_True', 'ParkingBrake_True']] = faults_diagnostics[['CruiseControlActive_True', 'IgnStatus_True', 'ParkingBrake_True']].astype(bool)

In [16]:
faults_diagnostics['had_derate']= faults_diagnostics['spn'] == '5246'

In [17]:
faults_diagnostics['had_derate'].value_counts()

False    1057555
True         938
Name: had_derate, dtype: int64

In [18]:
faults_diagnostics.head(1)

Unnamed: 0,FaultId,AcceleratorPedal,BarometricPressure,CruiseControlSetSpeed,DistanceLtd,EngineCoolantTemperature,EngineLoad,EngineOilPressure,EngineOilTemperature,EngineRpm,EngineTimeLtd,FuelLevel,FuelLtd,FuelRate,FuelTemperature,IntakeManifoldTemperature,LampStatus,Speed,SwitchedBatteryVoltage,Throttle,TurboBoostPressure,CruiseControlActive_True,IgnStatus_True,ParkingBrake_True,RecordID,ESS_Id,EventTimeStamp,eventDescription,spn,fmi,active,activeTransitionCount,EquipmentID,Latitude,Longitude,LocationTimeStamp,lat_long,event_date,event_time,event_year,event_month,event_day,event_dayofweek,event_dayname,event_hour,event_time_quadrant,false_eventTimeStamp,had_derate
0,1,0.0,14.21,66.48672,423178.7,100.4,11.0,0.0,96.74375,0.0,1632.2,43.2,12300.907429,0.0,,78.8,1023,0.0,3276.75,,0.0,False,False,True,1,990349,2015-02-21 10:47:13,Low (Severity Low) Engine Coolant Level,111,17,True,2,1439,38.857638,-84.626851,2015-02-21 11:34:25.000,"(38.857638, -84.626851)",2015-02-21,10:47:13,2015,2,21,5,Saturday,10,2,2015-02-21 11:43:18,False


In [19]:
predictors = ['FuelLevel', 'FuelRate', 'SwitchedBatteryVoltage', 'activeTransitionCount', 'active','ParkingBrake_True', 'IgnStatus_True', 'CruiseControlActive_True' ]
categorical_predictors = ['active','ParkingBrake_True', 'IgnStatus_True', 'CruiseControlActive_True']

X = faults_diagnostics[predictors]
X = pd.get_dummies(X, columns = categorical_predictors)
y = faults_diagnostics['had_derate'] 

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 117, stratify = y)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, stratify = y_train, random_state = 117, train_size = 0.6/0.8)

IMPUTING THE TRAINING DATA USING ITERATIVE IMPUTER AND SCALING

In [20]:
predictors_bool = ['active_False', 'active_True' ,'ParkingBrake_True_False', 'ParkingBrake_True_True', 'IgnStatus_True_False', 'IgnStatus_True_True', 'CruiseControlActive_True_False', 'CruiseControlActive_True_True']

bool_cols = pd.DataFrame(SimpleImputer(strategy = 'most_frequent').fit_transform(X_train[predictors_bool].astype(str)), 
            columns = predictors_bool)

In [23]:
predictors_num = ['FuelLevel', 'FuelRate', 'SwitchedBatteryVoltage', 'activeTransitionCount']

scaler = StandardScaler()
iterative_imputer = IterativeImputer()

X_train_scaler = scaler.fit(X_train[predictors_num])
X_train_iterative = iterative_imputer.fit(scaler.transform(X_train[predictors_num]))

In [34]:
X_val_scaled = scaler.transform(X_val[predictors_num])
X_test_scaled = scaler.transform(X_test[predictors_num])

X_val_iterative = iterative_imputer.fit(X_val_scaled)
X_test_iterative = iterative_imputer.fit(X_test_scaled)

In [25]:
X_train[predictors_num] = X_train_iterative.transform(scaler.transform(X_train[predictors_num]))
X_train

Unnamed: 0,FuelLevel,FuelRate,SwitchedBatteryVoltage,activeTransitionCount,active_False,active_True,ParkingBrake_True_False,ParkingBrake_True_True,IgnStatus_True_False,IgnStatus_True_True,CruiseControlActive_True_False,CruiseControlActive_True_True
432626,0.037752,0.712538,0.068836,0.806094,0,1,1,0,0,1,1,0
1017898,-0.211578,-0.903053,-0.085307,-1.323678,0,1,0,1,0,1,1,0
710835,0.018175,-0.128440,-0.050767,0.499407,1,0,1,0,1,0,1,0
637938,0.170298,-1.091432,-0.211143,0.806094,0,1,1,0,0,1,1,0
285099,0.029956,-0.211697,-0.083676,0.823132,1,0,1,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1043261,-0.016549,0.116950,0.046226,-0.454731,1,0,1,0,1,0,1,0
365131,-0.048173,0.340430,0.134559,-1.323678,1,0,1,0,1,0,1,0
798658,0.029336,-0.207315,-0.081944,0.806094,1,0,1,0,1,0,1,0
181798,-0.048173,0.340430,0.134559,-1.323678,1,0,1,0,1,0,1,0


In [35]:
X_val[predictors_num] = X_val_iterative.transform(scaler.transform(X_val[predictors_num]))

X_test[predictors_num] = X_test_iterative.transform(scaler.transform(X_test[predictors_num]))


In [29]:
X_val

Unnamed: 0,FuelLevel,FuelRate,SwitchedBatteryVoltage,activeTransitionCount,active_False,active_True,ParkingBrake_True_False,ParkingBrake_True_True,IgnStatus_True_False,IgnStatus_True_True,CruiseControlActive_True_False,CruiseControlActive_True_True
1038500,93.877683,5.074306,9159.481791,1.260000e+02,1,0,1,0,1,0,1,0
41384,91.907530,8.195460,9295.811762,1.000000e+00,1,0,1,0,1,0,1,0
912089,93.877683,5.074306,9159.481791,1.260000e+02,1,0,1,0,1,0,1,0
702811,93.814638,5.174183,9163.844350,1.220000e+02,1,0,1,0,1,0,1,0
552832,93.877683,5.074306,9159.481791,1.260000e+02,1,0,1,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
632780,152.166176,2.334440,9542.576209,1.000000e+00,0,1,1,0,0,1,1,0
34402,107.122753,23.520304,10146.200641,-1.813945e-18,0,1,1,0,0,1,1,0
510808,93.877683,5.074306,9159.481791,1.260000e+02,1,0,1,0,1,0,1,0
553431,46.253263,2.095984,8775.427257,1.260000e+02,0,1,1,0,0,1,1,0


In [30]:
X_test

Unnamed: 0,FuelLevel,FuelRate,SwitchedBatteryVoltage,activeTransitionCount,active_False,active_True,ParkingBrake_True_False,ParkingBrake_True_True,IgnStatus_True_False,IgnStatus_True_True,CruiseControlActive_True_False,CruiseControlActive_True_True
158178,99.209720,0.720279,10146.200641,126.0,0,1,1,0,0,1,1,0
578783,93.877683,5.074306,9159.481791,126.0,1,0,1,0,1,0,1,0
366309,69.383670,25.189498,9571.256652,1.0,0,1,1,0,0,1,1,0
730825,90.079296,2.884722,9080.366106,126.0,0,1,0,1,0,1,1,0
396232,77.905398,2.261069,8985.612466,126.0,0,1,1,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
770609,93.877683,5.074306,9159.481791,126.0,1,0,1,0,1,0,1,0
75782,92.317322,7.546260,9267.455128,27.0,1,0,1,0,1,0,1,0
731795,93.877683,5.074306,9159.481791,126.0,1,0,1,0,1,0,1,0
301107,93.877683,5.074306,9159.481791,126.0,1,0,1,0,1,0,1,0


In [None]:
predictors_bool = ['active_False', 'active_True' ,'ParkingBrake_True_False', 'ParkingBrake_True_True', 'IgnStatus_True_False', 'IgnStatus_True_True', 'CruiseControlActive_True_False', 'CruiseControlActive_True_True']

bool_cols = pd.DataFrame(SimpleImputer(strategy = 'most_frequent').fit_transform(X_train[predictors_bool].astype(str)), 
            columns = predictors_bool)

predictors_num = ['FuelLevel', 'FuelRate', 'SwitchedBatteryVoltage', 'activeTransitionCount']

scaler = StandardScaler().fit(X_train[predictors_num])
X_train_iterative = IterativeImputer().fit(scaler.transform(X_train[predictors_num]))

X_train[predictors_num] = X_train_iterative.transform(scaler.transform(X_train[predictors_num]))


IMPUTING THE TRAINING VALIDATION DATA USING ITERATIVE IMPUTER AND SCALING

In [None]:
predictors_bool = ['active_False', 'active_True' ,'ParkingBrake_True_False', 'ParkingBrake_True_True', 'IgnStatus_True_False', 'IgnStatus_True_True', 'CruiseControlActive_True_False', 'CruiseControlActive_True_True']

bool_cols = pd.DataFrame(SimpleImputer(strategy = 'most_frequent').fit_transform(X_val[predictors_bool].astype(str)), 
            columns = predictors_bool)

In [None]:
predictors_num = ['FuelLevel', 'FuelRate', 'SwitchedBatteryVoltage', 'activeTransitionCount']

scaler = StandardScaler().fit(X_val[predictors_num])
X_val_iterative = IterativeImputer().fit(scaler.transform(X_val[predictors_num]))

In [None]:
X_val[predictors_num] = X_val_iterative.transform(scaler.transform(X_val[predictors_num]))
X_val

IMPUTING THE TESTING DATA USING ITERATIVE IMPUTER AND SCALING

In [None]:
predictors_bool = ['active_False', 'active_True' ,'ParkingBrake_True_False', 'ParkingBrake_True_True', 'IgnStatus_True_False', 'IgnStatus_True_True', 'CruiseControlActive_True_False', 'CruiseControlActive_True_True']

bool_cols = pd.DataFrame(SimpleImputer(strategy = 'most_frequent').fit_transform(X_test[predictors_bool].astype(str)), 
            columns = predictors_bool)

In [None]:
predictors_num = ['FuelLevel', 'FuelRate', 'SwitchedBatteryVoltage', 'activeTransitionCount']

scaler = StandardScaler().fit(X_test[predictors_num])
X_test_iterative = IterativeImputer().fit(scaler.transform(X_test[predictors_num]))

In [None]:
X_test[predictors_num] = X_test_iterative.transform(scaler.transform(X_test[predictors_num]))
X_test

In [None]:
lr = LogisticRegression()
lr.fit(X_train, y_train)

In [None]:
y_pred = lr.predict(X_test)
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(f'MCC: {matthews_corrcoef(y_test, y_pred)}')
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:
y_val_pred_proba = lr.predict_proba(X_val)[:,1]

In [None]:
candidate_thresholds = np.arange(start = 0.1, stop = 0.925, step = 0.01)
thresholds = pd.DataFrame({'threshold': candidate_thresholds})
thresholds['f1'] = thresholds['threshold'].apply(lambda x: f1_score(y_val, y_val_pred_proba > x))
thresholds.sort_values('f1', ascending = False).head()

In [None]:
# predictors_bool = ['IgnStatus_True', 'CruiseControlActive_True', 'ParkingBrake_True']

# bool_cols = pd.DataFrame(SimpleImputer(strategy = 'most_frequent').fit_transform(faults_diagnostics[predictors_bool].astype(str)), 
#             columns = predictors_bool)


In [None]:
# predictors_num = ['AcceleratorPedal', 'activeTransitionCount','BarometricPressure', 'CruiseControlSetSpeed', 'DistanceLtd',
#                'EngineCoolantTemperature', 'EngineLoad', 'EngineOilPressure','EngineOilTemperature', 
#                'EngineRpm', 'EngineTimeLtd', 'FuelLevel', 'FuelLtd', 'FuelRate', 'IntakeManifoldTemperature', 
#                'Speed', 'TurboBoostPressure', 'SwitchedBatteryVoltage', 'FuelTemperature','Throttle']

# scaler = StandardScaler().fit(faults_diagnostics[predictors_num])
# faults_diagnostics_iterative = IterativeImputer().fit(scaler.transform(faults_diagnostics[predictors_num]))

In [None]:
# faults_diagnostics[predictors_num] = faults_diagnostics_iterative.transform(scaler.transform(faults_diagnostics[predictors_num]))
# faults_diagnostics

In [None]:
# diag_features=['FaultId', 'AcceleratorPedal', 'BarometricPressure', 'CruiseControlSetSpeed', 'DistanceLtd',
#                'EngineCoolantTemperature', 'EngineLoad', 'EngineOilPressure','EngineOilTemperature', 
#                'EngineRpm', 'EngineTimeLtd', 'FuelLevel', 'FuelLtd', 'FuelRate', 'IntakeManifoldTemperature', 
#                'LampStatus', 'Speed', 'TurboBoostPressure', 'CruiseControlActive_True', 'IgnStatus_True', 'ParkingBrake_True']

In [None]:
# derate_equip = tuple(had_derate['EquipmentID'].to_list())

In [None]:
# faults_diagnostics[faults_diagnostics['EquipmentID'].isin(derate_equip)].reset_index(drop = True)

In [None]:
# diagnostics_drop = faults_diagnostics.dropna(subset = diag_features)

CHECK A COUPLE OF EQUIPMENT IDS AND SEE WHAT VALUES THEY HAD IN COMMON. MAYBE DO SOME IMPUTING SO IM NOT LOOKING AT NAN VALUES

In [None]:
#predictors = ['AcceleratorPedal', 'BarometricPressure', 'CruiseControlSetSpeed', 'DistanceLtd', 'EngineCoolantTemperature', 'EngineLoad', 'EngineOilPressure', 'EngineOilTemperature', 'EngineRpm', 'EngineTimeLtd', 'FuelLevel', 'FuelLtd', 'FuelRate', 'FuelTemperature', 'IntakeManifoldTemperature', 'ServiceDistance', 'Speed', 'SwitchedBatteryVoltage', 'Throttle', 'TurboBoostPressure']

In [None]:
#predictors = ['AcceleratorPedal', 'BarometricPressure', 'CruiseControlSetSpeed', 'DistanceLtd', 'EngineCoolantTemperature', 'EngineLoad', 'EngineOilPressure', 'EngineOilTemperature', 'EngineRpm', 'EngineTimeLtd', 'FuelLevel', 'FuelLtd', 'FuelRate', 'FuelTemperature', 'IntakeManifoldTemperature', 'ServiceDistance', 'Speed', 'SwitchedBatteryVoltage', 'Throttle', 'TurboBoostPressure']
#scaler = StandardScaler().fit(diagnostics[predictors])
#diagnostics_KNN = IterativeImputer().fit(scaler.transform(diagnostics[predictors]))

In [None]:
#diagnostics_KNN.transform(scaler.transform(diagnostics[predictors]))

In [None]:
#diagnostics[predictors] = diagnostics_KNN.transform(scaler.transform(diagnostics[predictors]))