In [1]:
import pandas as pd
pd.set_option("display.max_columns", 100)
import numpy as np

from kmodes.kmodes import KModes

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer, KNNImputer

from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# Columns kept from the J1939 faults CSV.
cols = [
    'RecordID',
    'ESS_Id',
    'EventTimeStamp',
    'eventDescription',
    'spn',
    'fmi',
    'active',
    'activeTransitionCount',
    'EquipmentID',
    'Latitude',
    'Longitude',
    'LocationTimeStamp',
]

# Every fault column is read as text (dtype=str); later cells convert the ones that need a real dtype.
faults = pd.read_csv("../data/J1939Faults.csv", dtype=str, usecols=cols)

# Onboard diagnostics are read with pandas' default dtype inference.
diagnostics = pd.read_csv("../data/VehicleDiagnosticOnboardData.csv")

# Service fault code workbook (contents not inspected in this part of the notebook).
sfc = pd.read_excel("../data/Service Fault Codes_1_0_0_167.xlsx")

  for idx, row in parser.parse():


In [3]:
# Reshape diagnostics to one row per FaultId, with one column per distinct 'Name'
# whose cells hold the matching 'Value'.
diagnostics = diagnostics.pivot(index='FaultId', columns=['Name'], values='Value')


In [4]:
# Turn the FaultId index back into a column, store it as text, and discard ServiceDistance.
# NOTE: this cell is not re-runnable on its own output (ServiceDistance is gone after the first run).
diagnostics = (
    diagnostics
    .reset_index()
    .astype({'FaultId': str})
    .drop(columns='ServiceDistance')
)

In [5]:
# Convert the sensor-reading columns to numeric dtypes.
# errors='coerce' turns any value that cannot be parsed as a number into NaN.
diag_numeric_cols = [
    'AcceleratorPedal', 'BarometricPressure', 'CruiseControlSetSpeed', 'DistanceLtd',
    'EngineCoolantTemperature', 'EngineLoad', 'EngineOilPressure', 'EngineOilTemperature',
    'EngineRpm', 'EngineTimeLtd', 'FuelLevel', 'FuelLtd', 'FuelRate', 'FuelTemperature',
    'IntakeManifoldTemperature', 'Speed', 'SwitchedBatteryVoltage', 'Throttle',
    'TurboBoostPressure',
]
diagnostics[diag_numeric_cols] = diagnostics[diag_numeric_cols].apply(pd.to_numeric, errors='coerce')

In [6]:
# One-hot encode the three flag columns. drop_first=True drops the first level of each flag,
# leaving one indicator column per flag (later cells refer to them as '<flag>_True').
# NOTE(review): get_dummies is called without dummy_na, so a missing flag value is
# presumably encoded as all-zero indicators rather than kept as missing; confirm this is intended.
flag_cols = ['CruiseControlActive', 'IgnStatus', 'ParkingBrake']
diagnostics = pd.get_dummies(diagnostics, columns=flag_cols, drop_first=True)


**There are service locations at (36.0666667, -86.4347222), (35.5883333, -86.4438888), and (36.1950, -83.174722), so you should remove any records in the vicinity of these locations, as fault codes may be tripped when working on the vehicles.**

In [7]:
# Remove faults recorded at the three service locations, where codes may be tripped while
# a vehicle is being worked on. Latitude/Longitude are still text here (the CSV is read with
# dtype=str), so a station is recognised by the leading digits of its coordinates.
#
# regex=False: match the coordinate text literally. With the default regex matching, '.' is a
#              wildcard, so '36.06' would also match unrelated values such as '...36906...'.
# na=False:    rows with a missing coordinate are never treated as being at a station, and the
#              resulting masks are strictly boolean (no NaN), so they are safe to combine and index with.
literal_match = {'regex': False, 'na': False}

station_1 = (faults['Latitude'].str.contains('36.06', **literal_match)
             & faults['Longitude'].str.contains('86.43', **literal_match))
station_2 = (faults['Latitude'].str.contains('35.58', **literal_match)
             & faults['Longitude'].str.contains('86.44', **literal_match))
station_3 = (faults['Latitude'].str.contains('36.19', **literal_match)
             & faults['Longitude'].str.contains('83.17', **literal_match))

# Keep only the rows that are not at any of the three stations.
faults = faults.loc[~(station_1 | station_2 | station_3)]

In [8]:
# Pair each row's Latitude and Longitude into a single (Latitude, Longitude) tuple column.
faults["lat_long"] = list(faults[['Latitude', 'Longitude']].itertuples(index=False, name=None))

In [9]:
# Give the timestamp and the transition counter real dtypes.
# Other numeric-looking columns are not continuous variables, so they are left as strings.
faults = faults.assign(
    EventTimeStamp=pd.to_datetime(faults['EventTimeStamp']),
    activeTransitionCount=faults['activeTransitionCount'].astype(int),
)

In [10]:
# Time of day may be informative for faults, so split the event timestamp into
# a calendar-date column and a time-of-day column.
event_ts = faults['EventTimeStamp'].dt
faults['event_date'] = event_ts.date
faults['event_time'] = event_ts.time

In [11]:
# Derive calendar features from the event timestamp. assign() returns a new frame,
# so the result is an independent copy, as with the explicit copy it replaces.
stamp = faults['EventTimeStamp'].dt
faults = faults.assign(
    event_year=stamp.year,
    event_month=stamp.month,
    event_day=stamp.day,
    event_dayofweek=stamp.dayofweek,
    event_dayname=stamp.day_name(),
    event_hour=stamp.hour,
    # hour // 4 yields 0-5, i.e. six four-hour blocks of the day (despite the "quadrant" name).
    event_time_quadrant=stamp.hour // 4,
)

In [14]:
#faults['RecordID'] = faults[['RecordID']].astype(str)

In [15]:
# Goal of this cell: for every (EquipmentID, spn) group, remove a leading inactive event and
# then record, on each row, the timestamp of the next event in the same group.
# Sort by equipment, spn, then timestamp so the first row of each group is its earliest event.
faults = faults.copy()
faults = faults.sort_values(by = ['EquipmentID', 'spn', 'EventTimeStamp'])

# Row labels (an Index, not positions) of the first row in each (EquipmentID, spn) group.
first_index = faults.groupby(['EquipmentID', 'spn']).head(1).index

# Keep only those first-row labels whose 'active' value is the string 'False'
# ('active' is text because the faults CSV is read with dtype=str).
drop_index = first_index[faults.loc[first_index, 'active'] == 'False']

# Drop groups' leading inactive rows. Only ONE leading row per group is removed.
# NOTE(review): if a group starts with several inactive rows, the rest remain; confirm that is acceptable.
faults = faults.drop(drop_index)

# Within each (EquipmentID, spn) group ordered by time, store the NEXT row's EventTimeStamp;
# the last row of each group gets NaT. The assignment aligns on the row index.
# NOTE(review): the next row is not checked to be an inactive ('False') event; presumably
# active/inactive rows are assumed to alternate. Confirm against the data.
faults['false_eventTimeStamp'] = faults.sort_values(by = ['EventTimeStamp']).groupby(by = ['EquipmentID', 'spn'])['EventTimeStamp'].shift(-1)

Merging Dataframes

In [16]:
# Join each diagnostics row to its fault record (diagnostics.FaultId == faults.RecordID).
# No 'how' is given, so pandas' default join type applies.
faults_diagnostics = diagnostics.merge(faults, left_on='FaultId', right_on='RecordID')

In [17]:
# Store FaultId and LampStatus as text.
# NOTE(review): astype(str) presumably turns a missing LampStatus into the literal string 'nan',
# which a later dropna() would no longer see as missing; confirm this is intended.
faults_diagnostics = faults_diagnostics.astype({'FaultId': str, 'LampStatus': str})

In [18]:
# Store the three one-hot flag indicators as booleans.
faults_diagnostics = faults_diagnostics.astype({
    'CruiseControlActive_True': bool,
    'IgnStatus_True': bool,
    'ParkingBrake_True': bool,
})

***IMPUTING USING ITERATIVE IMPUTER AND SCALING***

In [19]:
predictors_bool = ['IgnStatus_True', 'CruiseControlActive_True', 'ParkingBrake_True']

# Run a most-frequent-value imputer over the flag columns. The flags are cast to strings
# first, so the resulting frame holds text values, and it gets a fresh default index
# rather than the index of faults_diagnostics.
# NOTE(review): the flags were already cast to bool in the previous cell, so there may be
# nothing left to impute here; confirm this step is still needed.
mode_imputer = SimpleImputer(strategy='most_frequent')
flags_as_text = faults_diagnostics[predictors_bool].astype(str)
bool_cols = pd.DataFrame(mode_imputer.fit_transform(flags_as_text), columns=predictors_bool)


In [20]:
# Numeric diagnostic readings to be standardized and imputed (order is preserved downstream).
predictors_num = [
    'AcceleratorPedal', 'BarometricPressure', 'CruiseControlSetSpeed', 'DistanceLtd',
    'EngineCoolantTemperature', 'EngineLoad', 'EngineOilPressure', 'EngineOilTemperature',
    'EngineRpm', 'EngineTimeLtd', 'FuelLevel', 'FuelLtd', 'FuelRate',
    'IntakeManifoldTemperature', 'Speed', 'TurboBoostPressure', 'SwitchedBatteryVoltage',
    'FuelTemperature', 'Throttle',
]

# Fit the standardizer on the raw readings.
scaler = StandardScaler()
scaler.fit(faults_diagnostics[predictors_num])

# Fit the iterative imputer on the standardized readings.
# Despite its name, faults_diagnostics_iterative is the fitted imputer, not a DataFrame.
faults_diagnostics_iterative = IterativeImputer()
faults_diagnostics_iterative.fit(scaler.transform(faults_diagnostics[predictors_num]))



In [25]:
# Replace the raw numeric readings with their standardized, imputed values.
# NOTE: this overwrites the raw columns in place, so re-running this cell without re-running
# the earlier ones would pass already-standardized values through the scaler again.
scaled_readings = scaler.transform(faults_diagnostics[predictors_num])
faults_diagnostics[predictors_num] = faults_diagnostics_iterative.transform(scaled_readings)
faults_diagnostics

Unnamed: 0,FaultId,AcceleratorPedal,BarometricPressure,CruiseControlSetSpeed,DistanceLtd,EngineCoolantTemperature,EngineLoad,EngineOilPressure,EngineOilTemperature,EngineRpm,EngineTimeLtd,FuelLevel,FuelLtd,FuelRate,FuelTemperature,IntakeManifoldTemperature,LampStatus,Speed,SwitchedBatteryVoltage,Throttle,TurboBoostPressure,CruiseControlActive_True,IgnStatus_True,ParkingBrake_True,RecordID,ESS_Id,EventTimeStamp,eventDescription,spn,fmi,active,activeTransitionCount,EquipmentID,Latitude,Longitude,LocationTimeStamp,lat_long,event_date,event_time,event_year,event_month,event_day,event_dayofweek,event_dayname,event_hour,event_time_quadrant,false_eventTimeStamp
0,1,-0.764085,0.004772,0.366264,0.435578,-2.136328,-0.673683,-2.895517,-1.439362,-2.419510,-1.570137,-0.797310,-1.593882,-0.788824,-0.885845,-1.086887,1023,-0.916878,0.313730,0.117896,-0.722624,False,False,True,1,990349,2015-02-21 10:47:13,Low (Severity Low) Engine Coolant Level,111,17,True,2,1439,38.857638,-84.626851,2015-02-21 11:34:25.000,"(38.857638, -84.626851)",2015-02-21,10:47:13,2015,2,21,5,Saturday,10,2,2015-02-21 11:43:18
1,2,0.040136,-0.000569,-0.037479,-0.000758,0.003698,-0.000428,-0.002810,-0.001866,-0.000015,0.007924,0.001256,-0.000284,-0.000982,0.000262,0.002765,1279,-0.003960,-0.053351,0.030052,-0.000998,False,True,False,2,990360,2015-02-21 11:34:34,,629,12,True,127,1439,38.857638,-84.626851,2015-02-21 11:35:10.000,"(38.857638, -84.626851)",2015-02-21,11:34:34,2015,2,21,5,Saturday,11,2,2015-02-21 16:45:27
2,4,0.040136,-0.000569,-0.037479,-0.000758,0.003698,-0.000428,-0.002810,-0.001866,-0.000015,0.007924,0.001256,-0.000284,-0.000982,0.000262,0.002765,1279,-0.003960,-0.053351,0.030052,-0.000998,False,True,False,4,990370,2015-02-21 11:35:33,Incorrect Data Steering Wheel Angle,1807,2,True,127,1369,41.421018,-87.767361,2015-02-21 11:36:08.000,"(41.421018, -87.767361)",2015-02-21,11:35:33,2015,2,21,5,Saturday,11,2,2015-02-21 11:57:37
3,6,0.545375,0.196584,0.237154,0.770364,0.352293,-0.049585,0.454459,0.135821,1.328516,0.644343,-0.761500,0.862555,0.008786,-0.631448,0.175950,1023,-0.438004,0.313730,-0.033519,0.063906,False,True,False,6,990431,2015-02-21 11:40:22,Low (Severity Low) Engine Coolant Level,111,17,True,1,1417,33.043564,-96.179722,2015-02-21 11:40:59.000,"(33.043564, -96.179722)",2015-02-21,11:40:22,2015,2,21,5,Saturday,11,2,2015-02-21 11:41:55
4,7,1.494733,0.068709,0.237154,-0.588879,0.573503,1.592778,0.555974,0.348923,1.815735,-0.255185,0.169572,-0.381079,1.698247,-1.003352,-1.086887,1023,0.545380,0.313730,-0.064013,1.705360,False,True,False,7,990439,2015-02-21 11:40:52,Low (Severity Low) Engine Coolant Level,111,17,True,2,1597,36.902916,-86.436481,2015-02-21 11:41:29.000,"(36.902916, -86.436481)",2015-02-21,11:40:52,2015,2,21,5,Saturday,11,2,2015-02-21 11:43:10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1058488,1248454,0.040136,-0.000569,-0.037479,-0.000758,0.003698,-0.000428,-0.002810,-0.001866,-0.000015,0.007924,0.001256,-0.000284,-0.000982,0.000262,0.002765,1023,-0.003960,-0.053351,0.030052,-0.000998,False,False,False,1248454,123904424,2020-03-06 14:00:26,Low (Severity Low) Catalyst Tank Level,1761,17,False,3,2282,37.094768,-85.897407,2020-03-06 14:00:21.000,"(37.094768, -85.897407)",2020-03-06,14:00:26,2020,3,6,4,Friday,14,3,NaT
1058489,1248455,1.963956,0.260522,0.237154,0.440963,0.462898,0.640208,0.352945,0.372008,0.823046,0.995006,1.584084,0.381388,0.542058,-0.215897,-0.315153,18431,1.371874,-0.745741,0.069118,0.200694,True,True,False,1248455,123905139,2020-03-06 14:04:23,Condition Exists Engine Protection Torque Derate,1569,31,True,5,1994,34.39074,-79.461805,2020-03-06 14:04:59.000,"(34.39074, -79.461805)",2020-03-06,14:04:23,2020,3,6,4,Friday,14,3,NaT
1058490,1248456,-0.764085,0.132647,0.366264,0.738760,0.518201,1.001528,0.708245,0.393319,0.898526,0.601099,1.745231,0.639559,0.776514,-0.271001,-0.595784,17407,1.426905,0.040137,0.669541,0.098103,True,True,False,1248456,123905996,2020-03-06 14:13:38,Abnormal Rate of Change Aftertreatment 1 Intak...,3216,10,True,1,1850,34.43037,-84.920509,2020-03-06 14:14:14.000,"(34.43037, -84.920509)",2020-03-06,14:13:38,2020,3,6,4,Friday,14,3,NaT
1058491,1248457,-0.720437,0.196584,0.452337,-2.362930,0.352293,-1.035003,-0.509928,0.533610,-0.283172,-1.865140,-1.674666,-1.943351,-0.788824,3.726947,-0.244996,1023,-0.499872,-3.186937,0.669541,-0.517442,False,True,False,1248457,123906113,2020-03-06 14:14:13,Low (Severity Medium) Engine Coolant Level,111,18,True,8,2377,35.030925,-85.321527,2020-03-06 14:14:49.000,"(35.030925, -85.321527)",2020-03-06,14:14:13,2020,3,6,4,Friday,14,3,2020-03-06 14:15:34


In [None]:
# Display the merged fault/diagnostics frame (pandas truncates the rendered rows).
faults_diagnostics

In [None]:
# Rows whose spn is '5246' (compared as text). Going by the variable name this is presumably
# the derate fault code; confirm.
had_derate = faults_diagnostics.loc[faults_diagnostics['spn'].eq('5246')]

In [None]:
# Equipment IDs of every spn-5246 row, as a tuple (an ID appears once per matching row).
derate_equip = tuple(had_derate['EquipmentID'].tolist())

In [None]:
# Show every record belonging to equipment that had at least one spn-5246 row.
# The result is only displayed, not stored.
is_derate_equipment = faults_diagnostics['EquipmentID'].isin(derate_equip)
faults_diagnostics.loc[is_derate_equipment].reset_index(drop=True)

In [None]:
# Diagnostic columns that must be non-missing for a row to be kept by the following dropna.
# Unlike predictors_num, this list leaves out 'FuelTemperature', 'SwitchedBatteryVoltage' and 'Throttle'.
diag_features = [
    'FaultId',
    'AcceleratorPedal', 'BarometricPressure', 'CruiseControlSetSpeed', 'DistanceLtd',
    'EngineCoolantTemperature', 'EngineLoad', 'EngineOilPressure', 'EngineOilTemperature',
    'EngineRpm', 'EngineTimeLtd', 'FuelLevel', 'FuelLtd', 'FuelRate',
    'IntakeManifoldTemperature',
    'LampStatus',
    'Speed', 'TurboBoostPressure',
    'CruiseControlActive_True', 'IgnStatus_True', 'ParkingBrake_True',
]

In [None]:
# Keep only the rows that have a value in every diag_features column.
has_all_features = faults_diagnostics[diag_features].notna().all(axis=1)
diagnostics_drop = faults_diagnostics[has_all_features]

**TODO:** Check a couple of equipment IDs and see what values they had in common. Maybe do some imputing first so that I'm not looking at NaN values.

In [None]:
#predictors = ['AcceleratorPedal', 'BarometricPressure', 'CruiseControlSetSpeed', 'DistanceLtd', 'EngineCoolantTemperature', 'EngineLoad', 'EngineOilPressure', 'EngineOilTemperature', 'EngineRpm', 'EngineTimeLtd', 'FuelLevel', 'FuelLtd', 'FuelRate', 'FuelTemperature', 'IntakeManifoldTemperature', 'ServiceDistance', 'Speed', 'SwitchedBatteryVoltage', 'Throttle', 'TurboBoostPressure']

In [None]:
#predictors = ['AcceleratorPedal', 'BarometricPressure', 'CruiseControlSetSpeed', 'DistanceLtd', 'EngineCoolantTemperature', 'EngineLoad', 'EngineOilPressure', 'EngineOilTemperature', 'EngineRpm', 'EngineTimeLtd', 'FuelLevel', 'FuelLtd', 'FuelRate', 'FuelTemperature', 'IntakeManifoldTemperature', 'ServiceDistance', 'Speed', 'SwitchedBatteryVoltage', 'Throttle', 'TurboBoostPressure']
#scaler = StandardScaler().fit(diagnostics[predictors])
#diagnostics_KNN = IterativeImputer().fit(scaler.transform(diagnostics[predictors]))

In [None]:
#diagnostics_KNN.transform(scaler.transform(diagnostics[predictors]))

In [None]:
#diagnostics[predictors] = diagnostics_KNN.transform(scaler.transform(diagnostics[predictors]))