In [61]:
import pandas as pd

# Read the training and test event logs from the working directory, then
# preview the first rows of the training log.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))
train.head()

Unnamed: 0,case_id,activity,accumulated_time,day_of_month,day_of_week,day_of_year,hour_of_day,min_of_hour,month_of_year,sec_of_min,secs_within_day,week_of_year
0,173688,A_SUBMITTED,0.0,0.466667,0.166667,0.245205,0.456522,0.144068,0.227273,0.245763,0.443565,0.230769
1,173688,A_PARTLYSUBMITTED,4e-06,0.466667,0.166667,0.245205,0.456522,0.144068,0.227273,0.245763,0.443565,0.230769
2,173688,A_PREACCEPTED,0.000618,0.466667,0.166667,0.245205,0.456522,0.161017,0.227273,0.127119,0.444178,0.230769
3,173688,W_Completeren aanvraag,0.000629,0.466667,0.166667,0.245205,0.456522,0.161017,0.227273,0.144068,0.44419,0.230769
4,173688,W_Completeren aanvraag,0.456966,-0.5,0.333333,0.247945,-0.108696,0.110169,0.318182,0.279661,-0.099468,0.230769


In [67]:
def convert(data, activity_map=None):
    """Collapse a flat event log into one row per case holding its event sequence.

    Parameters
    ----------
    data : pandas.DataFrame
        Event log with at least the columns ``case_id``, ``activity`` and
        ``accumulated_time``. Events of a case keep the row order they have
        in ``data``.
    activity_map : dict, optional
        Mapping from activity label to integer code. When omitted, labels are
        numbered in order of first appearance in ``data``. Because that
        numbering depends on the frame, two independent calls (for example one
        for a training log and one for a test log) generally assign different
        codes to the same activity; pass one shared mapping to every call to
        keep the encodings aligned.

    Returns
    -------
    pandas.DataFrame
        One row per case, in order of first appearance of the case, with a
        ``RangeIndex`` named ``seq_idx`` and the columns ``type_event`` (list
        of activity codes), ``time_since_start`` (list of ``accumulated_time``
        values), ``time_since_last_event`` (list of consecutive differences,
        first entry ``0``), ``seq_len`` (number of events) and ``dim_process``
        (number of entries in the activity mapping).

    Raises
    ------
    KeyError
        If ``data`` contains an activity that is missing from ``activity_map``.
    """
    if activity_map is None:
        activity_map = {act: i for i, act in enumerate(data['activity'].unique())}
    n_activities = len(activity_map)

    lines = []
    # A single groupby pass replaces re-filtering the whole frame once per
    # case. sort=False keeps cases in order of first appearance, matching
    # data['case_id'].unique(). Rows with a missing case_id are skipped by
    # groupby's default dropna behaviour.
    for _, case in data.groupby('case_id', sort=False):
        events = [activity_map[act] for act in case['activity']]
        since_start = case['accumulated_time'].tolist()
        # The first event of a case has no predecessor, so its gap is 0.
        since_last = [0] + [
            cur - prev for prev, cur in zip(since_start, since_start[1:])
        ]
        lines.append([events, since_start, since_last, len(events), n_activities])

    df_new = pd.DataFrame(
        lines,
        columns=['type_event', 'time_since_start', 'time_since_last_event', 'seq_len', 'dim_process'],
    )
    df_new.index.name = 'seq_idx'
    return df_new

In [70]:
from sklearn.model_selection import train_test_split    

# Build per-case sequences for both logs, then hold out 20% of the training
# sequences for validation (fixed seed so the split is repeatable).
# NOTE(review): each convert() call numbers activities in order of first
# appearance within the frame it receives, so the integer codes in df_train
# and df_test only agree if both logs introduce activities in the same order.
df_test = convert(test)
df_train, df_valid = train_test_split(
    convert(train),
    test_size=0.2,
    random_state=42,
)


In [78]:
import json 
# One dict per test sequence, keyed by column name (the index is not included).
records = df_test.to_dict(orient="records")

# Write the list of records to disk as an indented JSON array.
with open("test.json", "w") as json_file:
    json.dump(records, json_file, indent=2)

In [71]:
import json

# Write every split the same way: a JSON array with one object per sequence
# (records orientation; the seq_idx index is not written). Serialising with
# the json module keeps the full float repr of the timestamps, whereas
# DataFrame.to_json rounds floats to its default double_precision of 10
# decimal places. Using one format for all three files also means test.json
# is always records-oriented, whichever export cell ran last.
for split, path in (
    (df_train, 'train.json'),
    (df_valid, 'validation.json'),
    (df_test, 'test.json'),
):
    with open(path, 'w') as f:
        json.dump(split.to_dict(orient='records'), f, indent=2)

In [72]:
# Preview the first five training sequences.
df_train.head(n=5)

Unnamed: 0_level_0,type_event,time_since_start,time_since_last_event,seq_len,dim_process
seq_idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1452,"[0, 1, 19, 19, 19, 19, 17, 19]","[0.0, 3.125e-06, 0.0004658564814814, 0.4764326...","[0, 3.125e-06, 0.0004627314814814, 0.475966759...",8,24
3269,"[0, 1, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...","[0.0, 1.5740740740740742e-06, 0.00036065972222...","[0, 1.5740740740740742e-06, 0.0003590856481481...",44,24
5267,"[0, 1, 2, 3, 3, 3, 3, 4, 6, 5, 7, 8, 9, 3, 9, ...","[0.0, 3.8657407407407406e-06, 0.00037872685185...","[0, 3.8657407407407406e-06, 0.0003748611111110...",34,24
1323,"[0, 1, 17]","[0.0, 1.0879629629629629e-06, 0.0003424421296296]","[0, 1.0879629629629629e-06, 0.000341354166666637]",3,24
5189,"[0, 1, 17]","[0.0, 5.127314814814815e-06, 0.0004020717592592]","[0, 5.127314814814815e-06, 0.0003969444444443852]",3,24


In [73]:
# Preview the first five validation sequences.
df_valid.head(n=5)

Unnamed: 0_level_0,type_event,time_since_start,time_since_last_event,seq_len,dim_process
seq_idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
3909,"[0, 1, 19, 19, 19, 19, 17, 19]","[0.0, 3.0439814814814815e-06, 0.00020884259259...","[0, 3.0439814814814815e-06, 0.0002057986111110...",8,24
2462,"[0, 1, 19, 19, 2, 3, 19, 3, 4, 6, 5, 7, 8, 9, ...","[0.0, 1.423611111111111e-06, 0.001020416666666...","[0, 1.423611111111111e-06, 0.00101899305555548...",53,24
2879,"[0, 1, 2, 3, 3, 3, 3, 4, 6, 5, 7, 8, 9, 3, 9, ...","[0.0, 1.9328703703703703e-06, 0.00053951388888...","[0, 1.9328703703703703e-06, 0.0005375810185184...",37,24
3761,"[0, 1, 17]","[0.0, 2.5810185185185184e-06, 0.0003488888888888]","[0, 2.5810185185185184e-06, 0.0003463078703702...",3,24
3745,"[0, 1, 19, 19, 17, 19]","[0.0, 2.4814814814814816e-05, 0.00073938657407...","[0, 2.4814814814814816e-05, 0.0007145717592591...",6,24


In [74]:
# Preview the first five test sequences.
df_test.head(n=5)

Unnamed: 0_level_0,type_event,time_since_start,time_since_last_event,seq_len,dim_process
seq_idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,"[0, 1, 2, 2, 3, 4, 2, 4, 4, 4, 4, 4, 5, 6, 7, ...","[0.0, 4.699074074074074e-06, 0.000550798611111...","[0, 4.699074074074074e-06, 0.00054609953703702...",39,24
1,"[0, 1, 2, 2, 3, 4, 2, 4, 4, 4, 4, 4, 4, 4, 5, ...","[0.0, 2.128472222222222e-05, 0.000485347222222...","[0, 2.128472222222222e-05, 0.00046406249999997...",76,24
2,"[0, 1, 2, 2, 3, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, ...","[0.0, 4.0625e-06, 0.0002425810185185, 0.002120...","[0, 4.0625e-06, 0.0002385185185185, 0.00187760...",56,24
3,"[0, 1, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 6, 7, 8, ...","[0.0, 1.0752314814814816e-05, 0.00061980324074...","[0, 1.0752314814814816e-05, 0.0006090509259258...",45,24
4,"[0, 1, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 6, 7, 8, ...","[0.0, 7.465277777777778e-06, 0.000488726851851...","[0, 7.465277777777778e-06, 0.00048126157407402...",61,24
