# SNCB train events classification into incidents

- **Goal**: automatically suggest incident types based on new sequences of events
- **Method**: Predictive models
    * Decision tree
    * Random Forest
    * PCA

## 1. Import librairies

In [155]:
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import fpgrowth, fpmax

# models
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# utils
import pandas as pd
import ast
import numpy as np

## 2. Load Data

In [140]:
dataset = 'data/sncb_data_challenge.csv'
df = pd.read_csv(dataset, delimiter=';')

In [141]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1011 entries, 0 to 1010
Data columns (total 11 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Unnamed: 0                    1011 non-null   int64  
 1   incident_id                   1011 non-null   int64  
 2   vehicles_sequence             1011 non-null   object 
 3   events_sequence               1011 non-null   object 
 4   seconds_to_incident_sequence  1011 non-null   object 
 5   approx_lat                    1011 non-null   float64
 6   approx_lon                    1011 non-null   float64
 7   train_kph_sequence            1011 non-null   object 
 8   dj_ac_state_sequence          1011 non-null   object 
 9   dj_dc_state_sequence          1011 non-null   object 
 10  incident_type                 1011 non-null   int64  
dtypes: float64(2), int64(3), object(6)
memory usage: 87.0+ KB


## 3. Data Preprocessing
- drop irrelevant columns
- evaluate sequences as python lists

In [142]:
# Drop unused/irrelevant columns
df = df.drop(columns=['Unnamed: 0', 'incident_id'])

# Convert the string representations to lists
df['vehicles_sequence'] = df['vehicles_sequence'].apply(ast.literal_eval)
df['events_sequence'] = df['events_sequence'].apply(ast.literal_eval)
df['seconds_to_incident_sequence'] = df['seconds_to_incident_sequence'].apply(ast.literal_eval)
df['train_kph_sequence'] = df['train_kph_sequence'].apply(ast.literal_eval)
df['dj_ac_state_sequence'] = df['dj_ac_state_sequence'].apply(ast.literal_eval)
df['dj_dc_state_sequence'] = df['dj_dc_state_sequence'].apply(ast.literal_eval)

df.head()

Unnamed: 0,vehicles_sequence,events_sequence,seconds_to_incident_sequence,approx_lat,approx_lon,train_kph_sequence,dj_ac_state_sequence,dj_dc_state_sequence,incident_type
0,"[609, 609, 609, 609, 609, 609, 609, 609, 609, ...","[2744, 4004, 2852, 4110, 2854, 4396, 1132, 414...","[-5510, -5510, -5507, -5507, -5506, -5506, -55...",50.876601,4.718143,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[False, False, False, False, False, False, Fal...","[False, False, False, False, False, False, Fal...",4
1,"[526, 526, 526, 526, 526, 526, 526, 526, 526, ...","[2744, 4148, 4394, 1566, 1570, 4396, 3634, 412...","[-8573, -8573, -8032, -8032, -8032, -7859, -61...",51.037435,4.431218,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 29.1,...","[False, False, False, False, False, False, Fal...","[True, True, True, True, True, True, True, Tru...",13
2,"[592, 592, 592, 592, 592, 592, 592, 592, 592, ...","[4394, 1566, 1570, 4114, 4168, 4168, 4156, 406...","[-12291, -12291, -12291, -10932, -10932, -1091...",50.864083,4.162115,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[False, False, False, False, False, False, Fal...","[True, True, True, True, True, False, True, Tr...",14
3,"[576, 576, 576, 576, 576, 576, 576, 576, 576, ...","[4066, 4066, 4066, 4066, 4068, 2742, 4026, 270...","[-14351, -14204, -13890, -13383, -12739, -1243...",51.18322,4.276025,"[0.0, 0.0, 0.0, 0.015625, 0.0, 0.0, 0.0, 0.0, ...","[False, False, False, False, False, False, Fal...","[True, True, True, True, True, True, True, Tru...",2
4,"[634, 634, 634, 634, 634, 634, 634, 634, 634, ...","[4002, 4032, 4028, 2852, 4026, 4110, 2742, 285...","[-224, -224, -223, -222, -222, -222, -220, -22...",50.818727,3.253601,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[False, False, False, False, False, False, Fal...","[False, False, False, False, False, False, Fal...",14


## 4. Feature engineering
- transform raw sequence data into highly informative features:
    - energy_switch
    - on_battery_count
    - multi_vehicle
    - ...
- Mine and one hot encode frequent events sequences as new features

### 4.1 raw sequence to informative features

In [143]:
# Energy source switch (check if both energy sources (dj_ac and dj_dc) were active (True) at any point in the sequence)
df['energy_switch'] = df.apply(lambda row: int(any(row['dj_ac_state_sequence']) and any(row['dj_dc_state_sequence'])), axis=1)

# On battery (counts occurences of both energy sources (dj_ac and dj_dc) being simultaneously inactive (False) in the sequence)
df['on_battery_count'] = df.apply(lambda row: sum(1 for ac, dc in zip(row['dj_ac_state_sequence'], row['dj_dc_state_sequence']) if not ac and not dc), axis=1)

# On both energies (counts occurences of both energy sources (dj_ac and dj_dc) being simultaneously active (True) in the sequence)
df['on_both_count'] = df.apply(lambda row: sum(1 for ac, dc in zip(row['dj_ac_state_sequence'], row['dj_dc_state_sequence']) if ac and dc), axis=1)

# Vehicle count (counts number of different vehicles involved in incident - reported tokens)
df['vehicle_count'] = df['vehicles_sequence'].apply(lambda x: len(set(x)))

# Average speed
df['avg_speed'] = df['train_kph_sequence'].apply(lambda x: np.mean(x))

# Max speed
df['max_speed'] = df['train_kph_sequence'].apply(lambda x: max(x))

# Break count (counts number of speed decrease - breaks)
df['break_count'] = df['train_kph_sequence'].apply(lambda x: sum(1 for i in range(len(x) - 1) if x[i] > x[i + 1]))

df.head()

Unnamed: 0,vehicles_sequence,events_sequence,seconds_to_incident_sequence,approx_lat,approx_lon,train_kph_sequence,dj_ac_state_sequence,dj_dc_state_sequence,incident_type,energy_switch,on_battery_count,on_both_count,vehicle_count,avg_speed,max_speed,break_count
0,"[609, 609, 609, 609, 609, 609, 609, 609, 609, ...","[2744, 4004, 2852, 4110, 2854, 4396, 1132, 414...","[-5510, -5510, -5507, -5507, -5506, -5506, -55...",50.876601,4.718143,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[False, False, False, False, False, False, Fal...","[False, False, False, False, False, False, Fal...",4,0,108,0,1,0.000312,0.125,1
1,"[526, 526, 526, 526, 526, 526, 526, 526, 526, ...","[2744, 4148, 4394, 1566, 1570, 4396, 3634, 412...","[-8573, -8573, -8032, -8032, -8032, -7859, -61...",51.037435,4.431218,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 29.1,...","[False, False, False, False, False, False, Fal...","[True, True, True, True, True, True, True, Tru...",13,0,0,0,2,33.363578,119.1,140
2,"[592, 592, 592, 592, 592, 592, 592, 592, 592, ...","[4394, 1566, 1570, 4114, 4168, 4168, 4156, 406...","[-12291, -12291, -12291, -10932, -10932, -1091...",50.864083,4.162115,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[False, False, False, False, False, False, Fal...","[True, True, True, True, True, False, True, Tr...",14,0,234,0,3,0.746736,22.8,69
3,"[576, 576, 576, 576, 576, 576, 576, 576, 576, ...","[4066, 4066, 4066, 4066, 4068, 2742, 4026, 270...","[-14351, -14204, -13890, -13383, -12739, -1243...",51.18322,4.276025,"[0.0, 0.0, 0.0, 0.015625, 0.0, 0.0, 0.0, 0.0, ...","[False, False, False, False, False, False, Fal...","[True, True, True, True, True, True, True, Tru...",2,0,0,0,2,0.044463,3.625,20
4,"[634, 634, 634, 634, 634, 634, 634, 634, 634, ...","[4002, 4032, 4028, 2852, 4026, 4110, 2742, 285...","[-224, -224, -223, -222, -222, -222, -220, -22...",50.818727,3.253601,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[False, False, False, False, False, False, Fal...","[False, False, False, False, False, False, Fal...",14,0,71,0,2,0.023934,1.4,6


### 4.2 raw sequence to frequent sequence features

In [144]:
transactions = df['events_sequence']
te = TransactionEncoder()
te_ary = te.fit(transactions).transform(transactions)
encoded_events = pd.DataFrame(te_ary, columns=te.columns_)

encoded_events.head()

Unnamed: 0,10,12,28,30,42,52,60,64,66,74,...,4394,4396,4406,4408,4410,4412,4414,4416,4418,4420
0,False,False,False,False,False,False,False,False,False,False,...,True,True,True,True,True,True,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,True,True,False,True,True,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,True,True,True,True,True,True,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,True,True,True,True,True,True,False,False,False,False


In [145]:
frequent_events_sequences = fpmax(encoded_events, min_support=0.7, use_colnames=True)
frequent_events_sequences

Unnamed: 0,support,itemsets
0,0.701286,"(4168, 4026, 2708)"
1,0.706231,"(2708, 4026, 4394)"
2,0.731949,"(4026, 2708, 4030)"
3,0.701286,"(4120, 2956, 4148, 2708)"
4,0.705242,"(2708, 2956, 4148, 4124)"
...,...,...
68,0.703264,"(4066, 2744, 2708, 4148, 4120)"
69,0.719090,"(2744, 2708, 4148, 2742, 4120, 4026)"
70,0.743818,"(4066, 4068, 2708, 4148, 2742, 4120, 4026)"
71,0.701286,"(4068, 2708, 4148, 2742, 2744)"


In [146]:
for ind, itemset in enumerate(frequent_events_sequences['itemsets']):
    for events in df['events_sequence']:
        df[f'freq_event_seq_{ind}'] = 1 if itemset in events else 0

# drop sequences features
df = df.drop(columns=['vehicles_sequence', 'events_sequence', 'seconds_to_incident_sequence', 'train_kph_sequence', 'dj_ac_state_sequence', 'dj_dc_state_sequence'])

df.head()

Unnamed: 0,approx_lat,approx_lon,incident_type,energy_switch,on_battery_count,on_both_count,vehicle_count,avg_speed,max_speed,break_count,...,freq_event_seq_63,freq_event_seq_64,freq_event_seq_65,freq_event_seq_66,freq_event_seq_67,freq_event_seq_68,freq_event_seq_69,freq_event_seq_70,freq_event_seq_71,freq_event_seq_72
0,50.876601,4.718143,4,0,108,0,1,0.000312,0.125,1,...,0,0,0,0,0,0,0,0,0,0
1,51.037435,4.431218,13,0,0,0,2,33.363578,119.1,140,...,0,0,0,0,0,0,0,0,0,0
2,50.864083,4.162115,14,0,234,0,3,0.746736,22.8,69,...,0,0,0,0,0,0,0,0,0,0
3,51.18322,4.276025,2,0,0,0,2,0.044463,3.625,20,...,0,0,0,0,0,0,0,0,0,0
4,50.818727,3.253601,14,0,71,0,2,0.023934,1.4,6,...,0,0,0,0,0,0,0,0,0,0


## 5. Feature/Target

In [153]:
X = df.drop(columns=['incident_type'])
y = df['incident_type']

# Standardize features
scaler = StandardScaler()
X = scaler.fit_transform(X)

## 6. Decision tree
- class = 'incident_type'
- criterion = Information gain ('entropy')

In [154]:
# init
tree_clf = DecisionTreeClassifier(criterion="entropy", max_depth=None, random_state=42)

# cross validation
stratified_kfold = StratifiedKFold(n_splits=4)
tree_scores = cross_val_score(tree_clf, X, y, cv=stratified_kfold, scoring='accuracy')  # 5-fold cv

# results
print("%0.2f accuracy with a standard deviation of %0.2f" % (tree_scores.mean(), tree_scores.std()))

0.27 accuracy with a standard deviation of 0.01


## KNN

In [166]:
# init
knn_clf = KNeighborsClassifier()

# cross validation
stratified_kfold = StratifiedKFold(n_splits=4) # 5-fold cv
knn_scores = cross_val_score(knn_clf, X, y, cv=stratified_kfold, scoring='accuracy')

# results
print("%0.2f accuracy with a standard deviation of %0.2f" % (tree_scores.mean(), tree_scores.std()))

0.27 accuracy with a standard deviation of 0.01
