# Label each row with time remaining until end of each segment

In [2]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import matplotlib.pyplot as plt
import pandas as pds
import datetime as dt
import time
from datetime import datetime

In [2]:
df = pds.read_csv('../../bus203_all.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,timestamp,event,vehicle_id,line,longitude,latitude,direction,speed,station,journey_number,segment_number
0,0,2018-02-16T04:48:40.0000000+01:00,JourneyStartedEvent,5432,203,58.414238,15.571015,-1.0,-1.0,,1,1
1,1,2018-02-16T04:48:40.0000000+01:00,ObservedPositionEvent,5432,0,58.414238,15.571015,147.300003,0.0,,1,1
2,2,2018-02-16T04:48:40.0000000+01:00,ArrivedEvent,5432,203,58.414238,15.571015,-1.0,-1.0,Rydsv\xe4gens \xe4ndh\xe5llpl.,1,1
3,3,2018-02-16T04:48:41.0000000+01:00,ObservedPositionEvent,5432,0,58.414246,15.571012,147.300003,0.0,,1,1
4,4,2018-02-16T04:48:42.0000000+01:00,ObservedPositionEvent,5432,0,58.414249,15.571008,147.300003,0.0,,1,1


Ignore entries that are not `ObservedPositionEvent`

In [None]:
df = df[df['event'] == "ObservedPositionEvent"]

This leaves the indexes of rows untouched, reset.

In [None]:
df = df.reset_index().drop(columns=['index'])

A triple loop as it seems, but it is only to finally group rows from each individual segment from every journey. Should be linear in time, as the innermost loop will do all executions and every row is visited once. This only took ~50% of my 8GB of RAM but took 850 seconds to run.

In [None]:
ts = pds.DataFrame(np.zeros(len(df.index)), columns=list('t'))

t0 = time.time()

for j, df_j in df.groupby('journey_number'):
    for k, df_s in df_j.groupby('segment_number'):
        end_time = df_s['timestamp'].iloc[-1]
        for idx, row in df_s.iterrows():
            # The subtraction returns timedelta between the two timestamp objects
            # and total seconds convert the pandas datetime object to seconds
            ts.iloc[idx] = (end_time - row['timestamp']).total_seconds()

elapsed = time.time() - t0
print("Data processed in", elapsed, " seconds")

Add time left to dataframe `df`

In [None]:
datta = pds.concat([df, ts], axis=1)
datta.head()

... and save it.

In [None]:
datta.to_pickle('added_t.pkl')

Extra work to add `time_since_journey_start`

In [4]:
data = pds.read_pickle('added_t.pkl')
df = pds.read_csv('../../bus203_all_labelled.csv')
df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,timestamp,event,vehicle_id,line,longitude,latitude,direction,speed,station,journey_number,segment_number,time_since_journey_start,label
0,0,0,2018-02-16 04:48:40+01:00,JourneyStartedEvent,5432,203,58.414238,15.571015,-1.0,-1.0,,1,1,0,72
1,1,1,2018-02-16 04:48:40+01:00,ObservedPositionEvent,5432,0,58.414238,15.571015,147.300003,0.0,,1,1,0,72
2,2,2,2018-02-16 04:48:40+01:00,ArrivedEvent,5432,203,58.414238,15.571015,-1.0,-1.0,Rydsv\xe4gens \xe4ndh\xe5llpl.,1,1,0,72
3,3,3,2018-02-16 04:48:41+01:00,ObservedPositionEvent,5432,0,58.414246,15.571012,147.300003,0.0,,1,1,0,72
4,4,4,2018-02-16 04:48:42+01:00,ObservedPositionEvent,5432,0,58.414249,15.571008,147.300003,0.0,,1,1,0,72


In [9]:
df = df[df['event'] == "ObservedPositionEvent"]
df = df.reset_index().drop(columns=['index'])

In [10]:
data['ttjs'] = df['time_since_journey_start']
data.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,timestamp,event,vehicle_id,line,longitude,latitude,direction,speed,station,journey_number,segment_number,label,t,ttjs
0,1,1,2018-02-16 04:48:40+01:00,ObservedPositionEvent,5432,0,58.414238,15.571015,147.300003,0.0,,1,1,72,71.0,0
1,3,3,2018-02-16 04:48:41+01:00,ObservedPositionEvent,5432,0,58.414246,15.571012,147.300003,0.0,,1,1,72,70.0,0
2,4,4,2018-02-16 04:48:42+01:00,ObservedPositionEvent,5432,0,58.414249,15.571008,147.300003,0.0,,1,1,72,69.0,0
3,5,5,2018-02-16 04:48:43+01:00,ObservedPositionEvent,5432,0,58.414257,15.571004,147.300003,0.0,,1,1,72,68.0,0
4,6,6,2018-02-16 04:48:44+01:00,ObservedPositionEvent,5432,0,58.414257,15.571006,147.300003,0.0,,1,1,72,67.0,0


In [11]:
data.to_pickle('added_t_ttjs.pkl')