# SNCB train incidents association rule mining

- **Goal**: find subsequences of events (scenarios) that appear to be strongly associated with particular types of incidents
- **Method**: Association rule mining from frequent itemsets using the FP-Growth algorithm

## 1. Import libraries

In [4]:
# pip install numpy pandas mlxtend

In [2]:
from mlxtend.frequent_patterns import fpgrowth, fpmax, association_rules
from mlxtend.preprocessing import TransactionEncoder
import pandas as pd
import ast

## 2. Load Data

In [7]:
# Read the semicolon-separated challenge file and preview the first rows.
dataset = "data/sncb_data_challenge.csv"
df = pd.read_csv(dataset, sep=";")
df.head()

Unnamed: 0.1,Unnamed: 0,incident_id,vehicles_sequence,events_sequence,seconds_to_incident_sequence,approx_lat,approx_lon,train_kph_sequence,dj_ac_state_sequence,dj_dc_state_sequence,incident_type
0,0,4432881,"[609, 609, 609, 609, 609, 609, 609, 609, 609, ...","[2744, 4004, 2852, 4110, 2854, 4396, 1132, 414...","[-5510, -5510, -5507, -5507, -5506, -5506, -55...",50.876601,4.718143,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[False, False, False, False, False, False, Fal...","[False, False, False, False, False, False, Fal...",4
1,1,4432943,"[526, 526, 526, 526, 526, 526, 526, 526, 526, ...","[2744, 4148, 4394, 1566, 1570, 4396, 3634, 412...","[-8573, -8573, -8032, -8032, -8032, -7859, -61...",51.037435,4.431218,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 29.1,...","[False, False, False, False, False, False, Fal...","[True, True, True, True, True, True, True, Tru...",13
2,2,4432955,"[592, 592, 592, 592, 592, 592, 592, 592, 592, ...","[4394, 1566, 1570, 4114, 4168, 4168, 4156, 406...","[-12291, -12291, -12291, -10932, -10932, -1091...",50.864083,4.162115,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[False, False, False, False, False, False, Fal...","[True, True, True, True, True, False, True, Tr...",14
3,3,4433021,"[576, 576, 576, 576, 576, 576, 576, 576, 576, ...","[4066, 4066, 4066, 4066, 4068, 2742, 4026, 270...","[-14351, -14204, -13890, -13383, -12739, -1243...",51.18322,4.276025,"[0.0, 0.0, 0.0, 0.015625, 0.0, 0.0, 0.0, 0.0, ...","[False, False, False, False, False, False, Fal...","[True, True, True, True, True, True, True, Tru...",2
4,4,4433129,"[634, 634, 634, 634, 634, 634, 634, 634, 634, ...","[4002, 4032, 4028, 2852, 4026, 4110, 2742, 285...","[-224, -224, -223, -222, -222, -222, -220, -22...",50.818727,3.253601,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[False, False, False, False, False, False, Fal...","[False, False, False, False, False, False, Fal...",14


## 3. Feature engineering
- One-hot encode 'events_sequence'
- Discretize ['train_kph_sequence', 'dj_ac_state_sequence', 'dj_dc_state_sequence', 'seconds_to_incident_sequence']

In [8]:
def discretize_speed(speed):
    """Map a speed reading to a coarse motion label.

    Returns 'stop' when ``speed`` equals zero and 'moving' for any other
    value.
    """
    return 'stop' if speed == 0 else 'moving'

def discretize_dj_states(ac_state, dc_state):
    """Combine the AC and DC state flags into a single label.

    Returns 'ac_and_dc' when both flags are truthy, 'ac' or 'dc' when only
    the corresponding flag is truthy, and 'battery' when neither is.
    """
    if not ac_state:
        return 'dc' if dc_state else 'battery'
    return 'ac_and_dc' if dc_state else 'ac'

def discretize_seconds_to(second):
    """Label a time offset relative to the incident.

    Returns 'before' for strictly negative offsets and 'after' otherwise
    (zero included).
    """
    return 'before' if second < 0 else 'after'

def create_transactions(events_seq, speeds_seq, seconds_to_seq, ac_states_seq, dc_states_seq, incidents):
    """Build one transaction (list of string items) per incident.

    The six arguments are iterated in lockstep, one entry per incident.
    Within an incident, the per-event sequences are also iterated in
    lockstep and each event becomes an item of the form
    ``"<event>_<speed label>_<dj label>_<time label>"``. The incident value,
    converted with ``str``, is appended as the final item of the transaction.

    Returns:
        A list of transactions:
        ``[[event1_speed_dj_time, ..., eventn_speed_dj_time, incident], ...]``.
    """
    per_incident = zip(events_seq, speeds_seq, seconds_to_seq, ac_states_seq, dc_states_seq, incidents)
    rows = []
    for events, speeds, offsets, ac_flags, dc_flags, incident in per_incident:
        items = [
            f"{event}_{discretize_speed(kph)}_{discretize_dj_states(ac, dc)}_{discretize_seconds_to(offset)}"
            for event, kph, offset, ac, dc in zip(events, speeds, offsets, ac_flags, dc_flags)
        ]
        items.append(str(incident))  # incident dimension is always the last item
        rows.append(items)
    return rows
    
# The sequence columns are stored as stringified Python lists; parse each one
# back into a real list with ast.literal_eval.
events_seq, speeds_seq, seconds_to_seq, ac_states_seq, dc_states_seq = (
    df[column].apply(ast.literal_eval)
    for column in (
        'events_sequence',
        'train_kph_sequence',
        'seconds_to_incident_sequence',
        'dj_ac_state_sequence',
        'dj_dc_state_sequence',
    )
)
incidents = df['incident_type']

# One transaction per incident row.
transactions = create_transactions(
    events_seq,
    speeds_seq,
    seconds_to_seq,
    ac_states_seq,
    dc_states_seq,
    incidents,
)

In [74]:
# One-hot encode the transactions: one boolean column per distinct item.
te = TransactionEncoder()
te.fit(transactions)
te_ary = te.transform(transactions)
processed_df = pd.DataFrame(data=te_ary, columns=te.columns_)

processed_df

Unnamed: 0,1000_moving_battery_after,1000_moving_battery_before,1000_moving_dc_before,1000_stop_battery_after,1000_stop_battery_before,1000_stop_dc_before,1002_stop_battery_after,1002_stop_battery_before,1002_stop_dc_after,1002_stop_dc_before,...,986_stop_battery_before,986_stop_dc_before,99,990_stop_dc_before,992_moving_dc_after,992_moving_dc_before,992_stop_dc_after,992_stop_dc_before,998_stop_battery_before,998_stop_dc_after
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1006,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1007,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1008,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1009,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


## 4. Frequent itemset mining (Fpgrowth)

In [85]:
# Mine frequent itemsets with FP-Growth. min_support=0.7 keeps only itemsets
# present in at least 70% of the transactions (rows of processed_df);
# use_colnames=True reports items by column name rather than column index.
frequent_itemsets = fpgrowth(processed_df, min_support=0.7, use_colnames=True)

In [86]:
def get_incident_containing_itemset(itemsets):
    """Return the itemsets that contain at least one incident-type item.

    Incident items are the ``str`` form of the values found in the
    module-level ``df['incident_type']`` column.

    Args:
        itemsets: Iterable of containers supporting ``in`` (for example the
            ``itemsets`` column produced by ``fpgrowth``).

    Returns:
        A list with the matching itemsets, in their original order; each
        itemset appears at most once.
    """
    # Build the set of distinct labels once, instead of re-scanning (and
    # re-stringifying) every row of the column for every itemset.
    incident_labels = {str(incident_type) for incident_type in df['incident_type']}
    return [
        itemset
        for itemset in itemsets
        if any(label in itemset for label in incident_labels)
    ]

# Keep only the frequent itemsets that include an incident-type item; the bare
# expression on the last line displays the result in the notebook.
incidents_frequent_itemsets = get_incident_containing_itemset(frequent_itemsets['itemsets'])
incidents_frequent_itemsets

[]

Adding incident_type_X directly to each row means it will only appear in itemsets with sufficient frequency if that incident type occurs very frequently in the data.
- add more contextual information to each row to see if that improves the support of each incident_type

It is impossible to mine association rules containing incident_type items, since no incident_type is frequent (each one is too rare compared to the number of transactions):
- Use incident_type as the Consequent (Target) Only
- Leverage Conditional Pattern Mining


## 5. Association rules generation

In [87]:
# Derive association rules from the frequent itemsets, keeping only rules
# whose confidence is at least 0.8.
association_rules(frequent_itemsets, metric="confidence", min_threshold=0.8)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(4026_stop_dc_before),(2708_stop_dc_before),0.853610,0.864491,0.832839,0.975666,1.128603,0.094901,5.568791,0.778391
1,(2708_stop_dc_before),(4026_stop_dc_before),0.864491,0.853610,0.832839,0.963387,1.128603,0.094901,3.998269,0.840890
2,(4068_stop_dc_before),(2708_stop_dc_before),0.789318,0.864491,0.767557,0.972431,1.124860,0.085199,4.915295,0.526862
3,(2708_stop_dc_before),(4068_stop_dc_before),0.864491,0.789318,0.767557,0.887872,1.124860,0.085199,1.878944,0.819136
4,(4026_stop_dc_before),(4068_stop_dc_before),0.853610,0.789318,0.753709,0.882966,1.118645,0.079940,1.800188,0.724516
...,...,...,...,...,...,...,...,...,...,...
95,(3658_stop_dc_before),"(3636_stop_dc_before, 2708_stop_dc_before)",0.734916,0.711177,0.711177,0.967699,1.360700,0.188522,8.941477,1.000000
96,(3636_stop_dc_before),"(3658_stop_dc_before, 2708_stop_dc_before)",0.730959,0.714144,0.711177,0.972936,1.362380,0.189166,10.562364,0.988664
97,(2708_stop_dc_before),"(3658_stop_dc_before, 3636_stop_dc_before)",0.864491,0.730959,0.711177,0.822654,1.125445,0.079269,1.517042,0.822543
98,(3658_stop_dc_after),(3636_stop_dc_after),0.721068,0.724036,0.718101,0.995885,1.375464,0.196022,67.059347,0.978635


## 6. Rule evaluation
- lift
- correlation

(Justify)