In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
# Columns kept for the fault-event analysis; all other CSV columns are skipped.
cols = [
    'RecordID',
    'ESS_Id',
    'EventTimeStamp',
    'eventDescription',
    'spn',
    'fmi',
    'active',
    'activeTransitionCount',
    'EquipmentID',
    'Latitude',
    'Longitude',
]

# dtype=str loads every kept column as text, so identifier-like codes
# (spn, fmi, EquipmentID, ...) are not coerced to numbers on read.
faults = pd.read_csv(
    "../data/J1939Faults.csv",
    usecols=cols,
    dtype=str,
)
faults.head()

Unnamed: 0,RecordID,ESS_Id,EventTimeStamp,eventDescription,spn,fmi,active,activeTransitionCount,EquipmentID,Latitude,Longitude
0,1,990349,2015-02-21 10:47:13.000,Low (Severity Low) Engine Coolant Level,111,17,True,2,1439,38.857638,-84.626851
1,2,990360,2015-02-21 11:34:34.000,,629,12,True,127,1439,38.857638,-84.626851
2,3,990364,2015-02-21 11:35:31.000,Incorrect Data Steering Wheel Angle,1807,2,False,127,1369,41.42125,-87.767361
3,4,990370,2015-02-21 11:35:33.000,Incorrect Data Steering Wheel Angle,1807,2,True,127,1369,41.421018,-87.767361
4,5,990416,2015-02-21 11:39:41.000,,4364,17,False,2,1674,38.416481,-89.442638


ESS_Id – the event subscriber service event that contained the fault
EventTimeStamp – when the event took place
eventDescription – brief text of meaning of the code (not always present)
actionDescription – never seen this filled in
ecuSoftwareVersion – version string from the reporting vehicle computer system
ecuSerialNumber – Serial number of the reporting Engine Control Module (ECM)
ecuModel – Model of the reporting ECM
ecuMake – Manufacturer of the reporting ECM
ecuSource – source code of the reporting ECU (values seen in this data: 0, 3, 11, 49, 61)
spn – Fault code being reported
fmi – Failure Mode associated with the Fault Code
active – whether the code is being set or being removed
activeTransitionCount – Number of times code has been set/unset
faultValue – never seen used
EquipmentID – Assigned truck number of the unit in question
MCTNumber – Communications Terminal assigned to the truck
Latitude – Latitude at time of event
Longitude – Longitude at time of event
LocationTimeStamp – Time latitude and longitude were obtained

ecuSoftwareVersion
ecuSerialNumber 
ecuModel
ecuMake
ecuSource

In [4]:
# Same fault file, this time keeping the ECU identification columns
# (and dropping Latitude/Longitude) to profile the reporting control units.
ecu_cols = [
    'RecordID',
    'ESS_Id',
    'EventTimeStamp',
    'eventDescription',
    'ecuSoftwareVersion',
    'ecuSerialNumber',
    'ecuModel',
    'ecuMake',
    'ecuSource',
    'spn',
    'fmi',
    'active',
    'activeTransitionCount',
    'EquipmentID',
]

# Every column is read as text (dtype=str); nothing is cast on load.
ecu = pd.read_csv(
    "../data/J1939Faults.csv",
    usecols=ecu_cols,
    dtype=str,
)
ecu.head()

Unnamed: 0,RecordID,ESS_Id,EventTimeStamp,eventDescription,ecuSoftwareVersion,ecuSerialNumber,ecuModel,ecuMake,ecuSource,spn,fmi,active,activeTransitionCount,EquipmentID
0,1,990349,2015-02-21 10:47:13.000,Low (Severity Low) Engine Coolant Level,unknown,unknown,unknown,unknown,0,111,17,True,2,1439
1,2,990360,2015-02-21 11:34:34.000,,unknown,unknown,unknown,unknown,11,629,12,True,127,1439
2,3,990364,2015-02-21 11:35:31.000,Incorrect Data Steering Wheel Angle,unknown,unknown,unknown,unknown,11,1807,2,False,127,1369
3,4,990370,2015-02-21 11:35:33.000,Incorrect Data Steering Wheel Angle,unknown,unknown,unknown,unknown,11,1807,2,True,127,1369
4,5,990416,2015-02-21 11:39:41.000,,22281684P01*22357957P01*22362082P01*,13063430,0USA13_13_0415_2238A,VOLVO,0,4364,17,False,2,1674


ECU source codes: 0, 11, 49, 3, 61
Most important for engine and exhaust, and therefore relevant to derates: 0, 49, 61

In [12]:
# Distinct fault descriptions reported by ECU source '61'.
# ecuSource was read as text, so the comparison is against the string '61'.
from_source_61 = ecu['ecuSource'] == '61'
ecu.loc[from_source_61, 'eventDescription'].unique()

array([nan, 'Low Voltage (Sensor supply voltage 1)',
       'Low Current Aftertreatment 1 Particulate Trap Differential Pressure',
       'Low Current Aftertreatment 1 Exhaust Gas Temperature 3',
       'Low Current Aftertreatment 1 Exhaust Gas Temperature 1',
       'Low Current Aftertreatment 1 Exhaust Gas Temperature 2',
       'Abnormal Update Rate Source Address 23',
       'Low Voltage (ECU Power Output Supply Voltage #1)',
       'Abnormal Update Rate Source Address 0',
       'Abnormal Update Rate Catalyst Tank Controller',
       'Low Current Catalyst Tank Heater',
       'High Voltage (ECU Power Output Supply Voltage #1)',
       'High (Severity High) Catalyst Tank Temperature',
       'Incorrect Data J1939 Network #4',
       'Abnormal Update Rate Source Address 17',
       'Low Voltage (Catalyst Tank Heater)'], dtype=object)

In [4]:
# Column dtypes and non-null counts for the ECU-focused frame
# (shows which ECU identification columns have missing values).
ecu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1187335 entries, 0 to 1187334
Data columns (total 14 columns):
 #   Column                 Non-Null Count    Dtype 
---  ------                 --------------    ----- 
 0   RecordID               1187335 non-null  object
 1   ESS_Id                 1187335 non-null  object
 2   EventTimeStamp         1187335 non-null  object
 3   eventDescription       1126490 non-null  object
 4   ecuSoftwareVersion     891285 non-null   object
 5   ecuSerialNumber        844318 non-null   object
 6   ecuModel               1122577 non-null  object
 7   ecuMake                1122577 non-null  object
 8   ecuSource              1187335 non-null  object
 9   spn                    1187335 non-null  object
 10  fmi                    1187335 non-null  object
 11  active                 1187335 non-null  object
 12  activeTransitionCount  1187335 non-null  object
 13  EquipmentID            1187335 non-null  object
dtypes: object(14)
memory usage: 126.8+

In [67]:
# Profile ecuMake: number of distinct values, the distinct values themselves,
# then how often each one occurs.
# nunique() and value_counts() leave NaN out by default; unique() lists it.
ecu_make = ecu['ecuMake']
print(ecu_make.nunique())
print(ecu_make.unique())
print(ecu_make.value_counts())

22
['unknown' 'VOLVO' 'CMMNS' '?????' 'PCAR' nan '?CAR' '?MMNS' '???R'
 '?????MX' '??MNS' 'BNDWS' 'PACCR' '?ACCR' '????S' '?NDWS' '????R' 'EATON'
 '?????MX16U13D13' '?ATON' '??DWS' '???CR' '5516014']
ecuMake
CMMNS              433403
unknown            298549
PACCR              277021
BNDWS               71001
PCAR                20229
EATON               12612
VOLVO                7252
?????                 755
????S                 627
????R                 589
?MMNS                 289
?CAR                  152
?ACCR                  39
???CR                  20
?NDWS                  15
?????MX16U13D13         9
?????MX                 6
??MNS                   3
?ATON                   3
???R                    1
??DWS                   1
5516014                 1
Name: count, dtype: int64


In [66]:
# Profile ecuModel: distinct count, distinct values, frequency table.
ecu_model = ecu['ecuModel']
for model_summary in (
    ecu_model.nunique(),
    ecu_model.unique(),
    ecu_model.value_counts(),
):
    print(model_summary)

29
['unknown' '0USA13_13_0415_2238A' '6X1u10D1500000000' '6X1u13D1500000000'
 'MX' nan '20412511P07' '________Y043718' 'EC60-adv' 'CECU3B-NAMUX4'
 'EC80ESP' 'CE' '6U13D13' '0USA10_13_0405_2237A' '6L u13D0890000000'
 '6X1u17D1500000000' 'MX16U13D13' 'EEO-xxF112C' 'FAOM-xx810S-EC3'
 'CECU3-NAMUX3' 'Gen 4 Boot Loader' 'EC80ESP AM000036' 'Y044053'
 'EC80ESP+' 'EC80ESP AM000038' 'Y049568' 'E0031' 'MX16U15D13' '202.35.0'
 '6X1u20D1500000000']
ecuModel
unknown                 298549
CECU3B-NAMUX4           277919
6X1u10D1500000000       216230
6X1u13D1500000000       203685
EC60-adv                 48816
EC80ESP                  22202
MX                       16362
6X1u17D1500000000        14499
EEO-xxF112C               8131
0USA13_13_0415_2238A      7252
FAOM-xx810S-EC3           4464
MX16U13D13                3629
MX16U15D13                 391
CE                         297
EC80ESP+                    42
6L u13D0890000000           28
Gen 4 Boot Loader           21
CECU3-NAMUX3           

In [70]:
# Profile ecuSerialNumber: distinct count, distinct values, and the
# 30 most frequent serial numbers.
serial_numbers = ecu['ecuSerialNumber']
print(serial_numbers.nunique())
print(serial_numbers.unique())
most_common_serials = serial_numbers.value_counts().head(30)
print(most_common_serials)

1989
['unknown' '13063430' '79466580' ... '80239683' '80239366' 'Z0098155']
ecuSerialNumber
unknown       298549
6U13D13        11207
79845785       10302
79856768        8158
79845329        7199
79623056        7066
79621048        6828
79845786        6349
79840984        6345
79844876        5872
79623054        5663
79623410        5578
79844877        5506
79620769        5482
79844882        5417
79614871        5326
79615187        5218
79857688        5059
79857689        5019
79614865        4374
79619434        4206
79620768        4133
79615184        3857
79845331        3819
79619117        3797
79614866        3788
S381222841      3176
79615183        3169
79845327        3149
79844880        3113
Name: count, dtype: int64


In [91]:
# For serial number 'S381222841': first and last fault timestamp per truck.
# EventTimeStamp is still text in `ecu`, so min/max compare strings; that
# matches chronological order only while the format stays fixed-width,
# as it is in the rows displayed above.
serial_rows = ecu.loc[ecu['ecuSerialNumber'] == 'S381222841']
(
    serial_rows
    .groupby('EquipmentID')['EventTimeStamp']
    .agg(['min', 'max'])
)

Unnamed: 0_level_0,min,max
EquipmentID,Unnamed: 1_level_1,Unnamed: 2_level_1
105434215,2017-01-02 18:24:21.000,2017-01-02 18:24:21.000
1641,2016-04-12 19:11:38.000,2018-03-26 18:38:44.000


In [94]:
# Trucks (EquipmentID) that report the serial number '6U13D13', with the
# number of fault rows for each.
has_shared_serial = ecu['ecuSerialNumber'] == '6U13D13'
ecu.loc[has_shared_serial, 'EquipmentID'].value_counts()

EquipmentID
1749         5474
1748         1747
304          1497
1745          758
305           554
1746          479
1747          269
301           219
302           209
105349493       1
Name: count, dtype: int64

105369518

In [71]:
# Profile ecuSoftwareVersion: distinct count, distinct values, and the
# 30 most frequent version strings.
software_versions = ecu['ecuSoftwareVersion']
print(
    software_versions.nunique(),
    software_versions.unique(),
    software_versions.value_counts().head(30),
    sep='\n',
)

1898
['unknown' '22281684P01*22357957P01*22362082P01*'
 '04993120*00025921*082113134117*07700053*I0*BBZ*' ...
 '05572391*03002387*110719010101*62602004*N1*BKR*'
 '5516018*202.56.0*5516502*E003.e003*5539634*86.21*5538103*4*'
 '05317106*05100987*050719120655*09401585*G1*BDR*']
ecuSoftwareVersion
unknown                                                                                                                                                        298549
BB41103*   BB41104*                                                                                                                                             45381
P30-1011-124*1*                                                                                                                                                 22068
PC4__1284P4C_2*                                                                                                                                                 15082
AAAI000031*AAAM000036*BB41259   *A82J1406

In [90]:
# Profile ecuSource: distinct count, distinct values, rows per source code.
ecu_source = ecu['ecuSource']
source_code_count = ecu_source.nunique()
source_codes = ecu_source.unique()
rows_per_source = ecu_source.value_counts()
print(source_code_count)
print(source_codes)
print(rows_per_source)

5
['0' '11' '49' '61' '3']
ecuSource
0     528044
49    514059
11    131122
3      13484
61       626
Name: count, dtype: int64


In [10]:
# Column dtypes and non-null counts for the fault-event frame; everything is
# still object (text) here because the CSV was read with dtype=str.
faults.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1187335 entries, 0 to 1187334
Data columns (total 11 columns):
 #   Column                 Non-Null Count    Dtype 
---  ------                 --------------    ----- 
 0   RecordID               1187335 non-null  object
 1   ESS_Id                 1187335 non-null  object
 2   EventTimeStamp         1187335 non-null  object
 3   eventDescription       1126490 non-null  object
 4   spn                    1187335 non-null  object
 5   fmi                    1187335 non-null  object
 6   active                 1187335 non-null  object
 7   activeTransitionCount  1187335 non-null  object
 8   EquipmentID            1187335 non-null  object
 9   Latitude               1187335 non-null  object
 10  Longitude              1187335 non-null  object
dtypes: object(11)
memory usage: 99.6+ MB


In [11]:
# Cast only the columns that are genuinely temporal or countable.
# Code-like columns (spn, fmi, IDs, ...) look numeric but are labels rather
# than continuous variables, so they stay as strings.
parsed_timestamps = pd.to_datetime(faults['EventTimeStamp'])
transition_counts = faults['activeTransitionCount'].astype(int)

faults['EventTimeStamp'] = parsed_timestamps
faults['activeTransitionCount'] = transition_counts

In [12]:
# Split the event timestamp into a calendar-date column and a time-of-day
# column, since the time of day of a fault may be informative on its own.
event_stamp = faults['EventTimeStamp'].dt
faults['event_date'] = event_stamp.date
faults['event_time'] = event_stamp.time

In [13]:
# Check the result of the dtype conversion and the new event_date / event_time columns.
faults.head()

Unnamed: 0,RecordID,ESS_Id,EventTimeStamp,eventDescription,spn,fmi,active,activeTransitionCount,EquipmentID,Latitude,Longitude,event_date,event_time
0,1,990349,2015-02-21 10:47:13,Low (Severity Low) Engine Coolant Level,111,17,True,2,1439,38.857638,-84.626851,2015-02-21,10:47:13
1,2,990360,2015-02-21 11:34:34,,629,12,True,127,1439,38.857638,-84.626851,2015-02-21,11:34:34
2,3,990364,2015-02-21 11:35:31,Incorrect Data Steering Wheel Angle,1807,2,False,127,1369,41.42125,-87.767361,2015-02-21,11:35:31
3,4,990370,2015-02-21 11:35:33,Incorrect Data Steering Wheel Angle,1807,2,True,127,1369,41.421018,-87.767361,2015-02-21,11:35:33
4,5,990416,2015-02-21 11:39:41,,4364,17,False,2,1674,38.416481,-89.442638,2015-02-21,11:39:41


In [14]:
# mask for each service station, lat and long to 2 decimal places
station_1 = (faults['Latitude'].str.contains('36.06')) & (faults['Longitude'].str.contains('86.43'))
station_2 = (faults['Latitude'].str.contains('35.58')) & (faults['Longitude'].str.contains('86.44'))
station_3 = (faults['Latitude'].str.contains('36.19')) & (faults['Longitude'].str.contains('83.17'))

# dataframe without faults associated with service locations
faults_nonservice = faults[~(station_1 | station_2 | station_3)]

### Next steps
- add date and time breakout columns (year, month, date, weekday, hour, day segment (divide 24 hours into quadrants))
- merge diagnostic data with filtered faults data (inner join)
- figure out imputing methods
- for models, impute after train/test split
- do some EDA on the data

In [84]:
# Calendar and clock features derived from the fault timestamp.
# assign() returns a new frame, so the filtered rows are detached from
# `faults` before the columns are added (no SettingWithCopyWarning).
event_stamp = faults_nonservice['EventTimeStamp'].dt

faults_nonservice = faults_nonservice.assign(
    event_year=event_stamp.year,
    event_month=event_stamp.month,
    event_day=event_stamp.day,
    event_dayofweek=event_stamp.dayofweek,  # Monday=0 ... Sunday=6
    event_dayname=event_stamp.day_name(),
    event_hour=event_stamp.hour,
    # Six-hour segment of the day: 0 = 00-05h, 1 = 06-11h, 2 = 12-17h, 3 = 18-23h.
    event_time_quadrant=event_stamp.hour // 6,
)

In [85]:
# Derate events among the non-service faults: SPN 1569 with FMI 31, or any
# row with SPN 5246. spn/fmi are text columns, hence the string literals.
is_spn_1569_fmi_31 = (faults_nonservice['spn'] == '1569') & (faults_nonservice['fmi'] == '31')
is_spn_5246 = faults_nonservice['spn'] == '5246'

derate = faults_nonservice.loc[is_spn_1569_fmi_31 | is_spn_5246]
derate.head()

Unnamed: 0,RecordID,ESS_Id,EventTimeStamp,eventDescription,spn,fmi,active,activeTransitionCount,EquipmentID,Latitude,Longitude,event_date,event_time,event_year,event_month,event_day,event_dayofweek,event_dayname,event_hour,event_time_quadrant
40,41,990856,2015-02-21 12:06:22,Condition Exists Engine Protection Torque Derate,1569,31,True,5,1721,39.051805,-84.560509,2015-02-21,12:06:22,2015,2,21,5,Saturday,12,2
249,250,993542,2015-02-21 15:01:49,Condition Exists Engine Protection Torque Derate,1569,31,False,5,1721,38.113240000000005,-85.667361,2015-02-21,15:01:49,2015,2,21,5,Saturday,15,2
290,291,994045,2015-02-21 15:35:45,Condition Exists Engine Protection Torque Derate,1569,31,True,6,1721,37.735185,-85.80810100000001,2015-02-21,15:35:45,2015,2,21,5,Saturday,15,2
306,307,994237,2015-02-21 15:51:58,Condition Exists Engine Protection Torque Derate,1569,31,False,6,1721,37.54037,-85.880972,2015-02-21,15:51:58,2015,2,21,5,Saturday,15,2
340,341,994680,2015-02-21 16:22:24,Condition Exists Engine Protection Torque Derate,1569,31,True,7,1721,37.166666,-85.964027,2015-02-21,16:22:24,2015,2,21,5,Saturday,16,2


In [86]:
# Number of derate events per weekday name, most frequent first.
dayofweek=derate['event_dayname'].value_counts()
dayofweek

event_dayname
Wednesday    1955
Tuesday      1907
Thursday     1762
Monday       1743
Friday       1649
Saturday      922
Sunday        797
Name: count, dtype: int64

In [87]:
# Number of derate events per six-hour segment of the day (event_hour // 6:
# 0 = 00-05h, 1 = 06-11h, 2 = 12-17h, 3 = 18-23h), most frequent first.
timequadrant=derate['event_time_quadrant'].value_counts()
timequadrant

event_time_quadrant
2    3954
1    3883
3    1519
0    1379
Name: count, dtype: int64