In [29]:
import pandas as pd
import numpy as np

In [30]:
cols = ['RecordID', 'ESS_Id', 'EventTimeStamp', 'eventDescription', 'spn', 'fmi', 'active', 'activeTransitionCount',
           'EquipmentID', 'Latitude', 'Longitude']

faults = pd.read_csv("../data/J1939Faults.csv", usecols = cols, dtype = str)
faults.head()

Unnamed: 0,RecordID,ESS_Id,EventTimeStamp,eventDescription,spn,fmi,active,activeTransitionCount,EquipmentID,Latitude,Longitude
0,1,990349,2015-02-21 10:47:13.000,Low (Severity Low) Engine Coolant Level,111,17,True,2,1439,38.857638,-84.626851
1,2,990360,2015-02-21 11:34:34.000,,629,12,True,127,1439,38.857638,-84.626851
2,3,990364,2015-02-21 11:35:31.000,Incorrect Data Steering Wheel Angle,1807,2,False,127,1369,41.42125,-87.767361
3,4,990370,2015-02-21 11:35:33.000,Incorrect Data Steering Wheel Angle,1807,2,True,127,1369,41.421018,-87.767361
4,5,990416,2015-02-21 11:39:41.000,,4364,17,False,2,1674,38.416481,-89.442638


In [31]:
faults.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1187335 entries, 0 to 1187334
Data columns (total 11 columns):
 #   Column                 Non-Null Count    Dtype 
---  ------                 --------------    ----- 
 0   RecordID               1187335 non-null  object
 1   ESS_Id                 1187335 non-null  object
 2   EventTimeStamp         1187335 non-null  object
 3   eventDescription       1126490 non-null  object
 4   spn                    1187335 non-null  object
 5   fmi                    1187335 non-null  object
 6   active                 1187335 non-null  object
 7   activeTransitionCount  1187335 non-null  object
 8   EquipmentID            1187335 non-null  object
 9   Latitude               1187335 non-null  object
 10  Longitude              1187335 non-null  object
dtypes: object(11)
memory usage: 99.6+ MB


In [32]:
# convert dtypes as needed
# some columns have numeric-appearing values but they're not continuous variables, leaving them as strings
faults['EventTimeStamp'] = pd.to_datetime(faults['EventTimeStamp'])
faults['activeTransitionCount'] = faults['activeTransitionCount'].astype(int)

In [33]:
# time of day for faults could be interesting, split event column to date and timestamp columns
faults['event_date'] = faults['EventTimeStamp'].dt.date
faults['event_time'] = faults['EventTimeStamp'].dt.time

In [34]:
faults.head()

Unnamed: 0,RecordID,ESS_Id,EventTimeStamp,eventDescription,spn,fmi,active,activeTransitionCount,EquipmentID,Latitude,Longitude,event_date,event_time
0,1,990349,2015-02-21 10:47:13,Low (Severity Low) Engine Coolant Level,111,17,True,2,1439,38.857638,-84.626851,2015-02-21,10:47:13
1,2,990360,2015-02-21 11:34:34,,629,12,True,127,1439,38.857638,-84.626851,2015-02-21,11:34:34
2,3,990364,2015-02-21 11:35:31,Incorrect Data Steering Wheel Angle,1807,2,False,127,1369,41.42125,-87.767361,2015-02-21,11:35:31
3,4,990370,2015-02-21 11:35:33,Incorrect Data Steering Wheel Angle,1807,2,True,127,1369,41.421018,-87.767361,2015-02-21,11:35:33
4,5,990416,2015-02-21 11:39:41,,4364,17,False,2,1674,38.416481,-89.442638,2015-02-21,11:39:41


In [35]:
# sort df by equipment, spn, timestamp
faults = faults.copy()
faults = faults.sort_values(by = ['EquipmentID', 'spn', 'EventTimeStamp'])

# get index for first row in each group, yields list of integers
first_index = faults.groupby(['EquipmentID', 'spn']).head(1).index

# check first_index for rows where active == False
drop_index = first_index[faults.loc[first_index, 'active'] == 'False']

# drop rows where first index is an active False row
faults = faults.drop(drop_index)

# now this should work as expected
faults['false_eventTimeStamp'] = faults.sort_values(by = ['EventTimeStamp']).groupby(by = ['EquipmentID', 'spn'])['EventTimeStamp'].shift(-1)

In [36]:
# test to reveal if the process worked (this group previously had a False as the first timestamp)
faults.loc[(faults['EquipmentID'] == '1369') & (faults['spn'] == '1807')].sort_values(by = ['EventTimeStamp'])

Unnamed: 0,RecordID,ESS_Id,EventTimeStamp,eventDescription,spn,fmi,active,activeTransitionCount,EquipmentID,Latitude,Longitude,event_date,event_time,false_eventTimeStamp
3,4,990370,2015-02-21 11:35:33,Incorrect Data Steering Wheel Angle,1807,2,True,127,1369,41.421018,-87.767361,2015-02-21,11:35:33,2015-02-21 11:57:37
31,32,990702,2015-02-21 11:57:37,Incorrect Data Steering Wheel Angle,1807,2,False,127,1369,41.42787,-87.756759,2015-02-21,11:57:37,2015-02-21 12:13:47
49,50,990999,2015-02-21 12:13:47,Incorrect Data Steering Wheel Angle,1807,2,True,127,1369,41.431574,-87.758981,2015-02-21,12:13:47,2015-02-21 18:26:34
421,422,995975,2015-02-21 18:26:34,Incorrect Data Steering Wheel Angle,1807,2,False,127,1369,38.330833,-85.757037,2015-02-21,18:26:34,2015-02-21 18:26:37
422,423,995979,2015-02-21 18:26:37,Incorrect Data Steering Wheel Angle,1807,2,True,127,1369,38.330833,-85.757083,2015-02-21,18:26:37,2015-02-21 18:32:35
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6426,6427,1088496,2015-02-26 12:14:25,Incorrect Data Steering Wheel Angle,1807,2,True,127,1369,36.936018,-86.50726800000001,2015-02-26,12:14:25,2015-02-26 13:00:12
6434,6435,1089226,2015-02-26 13:00:12,Incorrect Data Steering Wheel Angle,1807,2,False,127,1369,36.717453000000006,-86.525,2015-02-26,13:00:12,2015-02-26 13:11:23
6437,6438,1089547,2015-02-26 13:11:23,Incorrect Data Steering Wheel Angle,1807,2,True,127,1369,36.718148,-86.525324,2015-02-26,13:11:23,2015-02-26 16:05:38
6492,6493,1094483,2015-02-26 16:05:38,Incorrect Data Steering Wheel Angle,1807,2,False,127,1369,35.679212,-88.745046,2015-02-26,16:05:38,2015-02-26 16:05:41


In [37]:
# mask for each service station, lat and long to 2 decimal places
station_1 = (faults['Latitude'].str.contains('36.06')) & (faults['Longitude'].str.contains('86.43'))
station_2 = (faults['Latitude'].str.contains('35.58')) & (faults['Longitude'].str.contains('86.44'))
station_3 = (faults['Latitude'].str.contains('36.19')) & (faults['Longitude'].str.contains('83.17'))

# dataframe without faults associated with service locations
faults_nonservice = faults[~(station_1 | station_2 | station_3)]

### next steps
- add date and time breakout columns (year, month, date, weekday, hour, day segment (divide 24 hours into quadrants))
- merge diagnostic data with filtered faults data (inner join)
- figure out imputing methods
- for models, imput after train/test split
- do some eda on the data

In [38]:
faults_nonservice = faults_nonservice.copy()

faults_nonservice['event_year'] = faults_nonservice['EventTimeStamp'].dt.year
faults_nonservice['event_month'] = faults_nonservice['EventTimeStamp'].dt.month
faults_nonservice['event_day'] = faults_nonservice['EventTimeStamp'].dt.day
faults_nonservice['event_dayofweek'] = faults_nonservice['EventTimeStamp'].dt.dayofweek
faults_nonservice['event_dayname'] = faults_nonservice['EventTimeStamp'].dt.day_name()
faults_nonservice['event_hour'] = faults_nonservice['EventTimeStamp'].dt.hour
faults_nonservice['event_time_quadrant'] = faults_nonservice['EventTimeStamp'].dt.hour // 4

In [39]:
faults_nonservice.head()

Unnamed: 0,RecordID,ESS_Id,EventTimeStamp,eventDescription,spn,fmi,active,activeTransitionCount,EquipmentID,Latitude,...,event_date,event_time,false_eventTimeStamp,event_year,event_month,event_day,event_dayofweek,event_dayname,event_hour,event_time_quadrant
1001106,1038243,55748536,2018-07-20 09:31:33,High (Severity Medium) J1939 Network #2,1231,16,True,2,105406655,36.139351,...,2018-07-20,09:31:33,2018-07-20 09:42:30,2018,7,20,4,Friday,9,2
1001107,1038244,55748640,2018-07-20 09:42:30,High (Severity Medium) J1939 Network #2,1231,16,False,2,105406655,36.138981,...,2018-07-20,09:42:30,NaT,2018,7,20,4,Friday,9,2
358800,366301,7171498,2016-01-31 07:12:25,,629,12,True,127,105301976,41.987175,...,2016-01-31,07:12:25,NaT,2016,1,31,6,Sunday,7,1
898852,923783,33800297,2017-12-04 22:25:09,Low (Severity Medium) Engine Coolant Level,111,18,False,1,105311240,35.609444,...,2017-12-04,22:25:09,NaT,2017,12,4,0,Monday,22,5
1093996,1141606,81618595,2019-03-27 08:10:52,Low (Severity Medium) Engine Coolant Level,111,18,True,1,105338729,36.060324,...,2019-03-27,08:10:52,2019-03-27 08:12:58,2019,3,27,2,Wednesday,8,2


In [40]:
# faults_nonservice['eventDescription'].value_counts(dropna = False)
faults_nonservice.loc[faults_nonservice['spn'] == '5246']['eventDescription'].value_counts(dropna = False)

NaN    938
Name: eventDescription, dtype: int64

In [41]:
diagnostics = pd.read_csv('../data/VehicleDiagnosticOnboardData.csv')

In [42]:
diagnostics.head(30)

Unnamed: 0,Id,Name,Value,FaultId
0,1,IgnStatus,False,1
1,2,EngineOilPressure,0,1
2,3,EngineOilTemperature,96.74375,1
3,4,TurboBoostPressure,0,1
4,5,EngineLoad,11,1
5,6,AcceleratorPedal,0,1
6,7,IntakeManifoldTemperature,78.8,1
7,8,FuelRate,0,1
8,9,FuelLtd,12300.907429328,1
9,10,EngineRpm,0,1


### target:
- group by equipment ID and sort by event datetime
- if equipment had derate (fault id) boolean column true for records within a certain timeframe prior
- looking for a signal with enough time to get the truck to a service location

### predictors:
- fault codes, diagnostics (downside is that a lot of values are missing, but they seem to be more present with active true)
- might not be value in looking at active false (Tomo's group only looked at active true)

### do this as part of data cleaning:
- pull timestamp from active false to get duration of active true (there should be paired rows based on fault code, different lights for different faults)
- do this before dropping service locations (don't want to miss active false which happened as a result of service)

### other thoughts:
- Code P0606 tends to be set when a PCM/ECM has failed. Depending on component condition and the make and model of a vehicle, it may be possible to resolve an internal integrity fault in the PCM/ECM by upgrading or re-flashing the control module.
- since it's a fault with the monitoring device itself, might not have any predictive power
- how many are there?
- if we get rid of this (and other faults not actually related to the engine) might reduce noise in the dataset?
- since there are active true with no diagnostics, could the rows with null diagnostics be faults like this one that don't have anything to do with the engine?