# Label each row with features required to run ANN notebooks

The new features are:
* The time left until next bus stop (seconds)
* The time it takes to travel the full segment (seconds)
* The time from the start of the journey to the start of the current segment (seconds)

In [1]:
import numpy as np
import pandas as pds
import datetime as dt
import time

In [2]:
df = pds.read_pickle('211_train.p')
df.head()

Unnamed: 0.1,Unnamed: 0,timestamp,event,vehicle_id,line,longitude,latitude,direction,speed,station,journey_number,segment_number
0,0,2018-02-16 07:03:00+01:00,JourneyStartedEvent,5333,211,58.417591,15.622906,-1.0,-1.0,,1,1
1,1,2018-02-16 07:03:00+01:00,EnteredEvent,5333,211,58.417591,15.622906,-1.0,-1.0,Link\xf6pings resecentrum,1,1
2,2,2018-02-16 07:03:00+01:00,ArrivedEvent,5333,211,58.417591,15.622906,-1.0,-1.0,Link\xf6pings resecentrum,1,1
3,3,2018-02-16 07:03:01+01:00,ObservedPositionEvent,5333,0,58.417591,15.622907,328.700012,0.0,,1,1
4,4,2018-02-16 07:03:02+01:00,ObservedPositionEvent,5333,0,58.417591,15.622907,328.700012,0.0,,1,1


In [3]:
#df = pds.read_csv('bus203_all.csv')
#df.head()

Ignore entries that are not `ObservedPositionEvent`

In [4]:
df = df[df['event'] == "ObservedPositionEvent"]

This leaves the indexes of rows untouched, reset.

In [5]:
df = df.reset_index().drop(columns=['index'])

Convert `timestamp` to pandas datetime object.

In [6]:
#df['timestamp'] = pds.to_datetime(df['timestamp']).dt.tz_localize("UTC").dt.tz_convert("Europe/Stockholm")

A triple loop as it seems, but it is only to finally group rows from each individual segment from every journey. Should be linear in time, as the innermost loop will do all executions and every row is visited once. This only took ~50% of my 8GB of RAM but took ~30 minutes to run.

In [7]:
time_left = pds.DataFrame(np.zeros(len(df.index)), columns=['time_left'])
segment_time = pds.DataFrame(np.zeros(len(df.index)), columns=['segment_time'])
# Time since journey start
tsjs = pds.DataFrame(np.zeros(len(df.index)), columns=['tsjs'])


t0 = time.time()

for j, df_j in df.groupby('journey_number'):
    journey_start = df_j['timestamp'].iloc[0]
    for k, df_s in df_j.groupby('segment_number'):
        end_time = df_s['timestamp'].iloc[-1]
        start_time = df_s['timestamp'].iloc[0]
        for idx, row in df_s.iterrows():
            # The subtraction returns timedelta between the two timestamp objects
            # and total seconds convert the pandas datetime object to seconds
            time_left.iloc[idx] = (end_time - row['timestamp']).total_seconds()
            segment_time.iloc[idx] = (end_time - start_time).total_seconds()
            tsjs.iloc[idx] = (start_time - journey_start).total_seconds()

elapsed = time.time() - t0
print("Data processed in", elapsed, " seconds")

Data processed in 611.1575894355774  seconds


Add new features to dataframe `df`

In [8]:
data = pds.concat([df, time_left,segment_time, tsjs], axis=1)
data.head()

Unnamed: 0.1,Unnamed: 0,timestamp,event,vehicle_id,line,longitude,latitude,direction,speed,station,journey_number,segment_number,time_left,segment_time,tsjs
0,3,2018-02-16 07:03:01+01:00,ObservedPositionEvent,5333,0,58.417591,15.622907,328.700012,0.0,,1,1,271.0,271.0,0.0
1,4,2018-02-16 07:03:02+01:00,ObservedPositionEvent,5333,0,58.417591,15.622907,328.700012,0.0,,1,1,270.0,271.0,0.0
2,5,2018-02-16 07:03:03+01:00,ObservedPositionEvent,5333,0,58.417591,15.622907,328.700012,0.0,,1,1,269.0,271.0,0.0
3,6,2018-02-16 07:03:04+01:00,ObservedPositionEvent,5333,0,58.417591,15.622907,328.700012,0.0,,1,1,268.0,271.0,0.0
4,7,2018-02-16 07:03:05+01:00,ObservedPositionEvent,5333,0,58.417591,15.622907,328.700012,0.0,,1,1,267.0,271.0,0.0


Renaming some columns to keep them similar to the GP model for easier understanding.

In [9]:
data.rename(columns={'longitude': 'lon', 'latitude': 'lat', 'segment_number': 'seg', 'journey_number': 'journey'}, inplace=True)
data.head()

Unnamed: 0.1,Unnamed: 0,timestamp,event,vehicle_id,line,lon,lat,direction,speed,station,journey,seg,time_left,segment_time,tsjs
0,3,2018-02-16 07:03:01+01:00,ObservedPositionEvent,5333,0,58.417591,15.622907,328.700012,0.0,,1,1,271.0,271.0,0.0
1,4,2018-02-16 07:03:02+01:00,ObservedPositionEvent,5333,0,58.417591,15.622907,328.700012,0.0,,1,1,270.0,271.0,0.0
2,5,2018-02-16 07:03:03+01:00,ObservedPositionEvent,5333,0,58.417591,15.622907,328.700012,0.0,,1,1,269.0,271.0,0.0
3,6,2018-02-16 07:03:04+01:00,ObservedPositionEvent,5333,0,58.417591,15.622907,328.700012,0.0,,1,1,268.0,271.0,0.0
4,7,2018-02-16 07:03:05+01:00,ObservedPositionEvent,5333,0,58.417591,15.622907,328.700012,0.0,,1,1,267.0,271.0,0.0


Add history entries for model 3

In [10]:
history_speed = pds.DataFrame(0, index=np.arange(len(data)), columns=['speed_one', 'speed_two', 'speed_three'])
history_pos = pds.DataFrame(0, index=np.arange(len(data)), columns=['lat_one', 'lon_one', 'lat_two', 'lon_two', 'lat_three', 'lon_three'])

Add columns for the three latest positions for each sample. Replace the first sample from each journey with itself.

In [11]:
for i, a in data.groupby('journey'):
    idxs = a.index

    history_pos.lat_one.iloc[idxs] = a.lat.shift(1)
    history_pos.lat_one.iloc[idxs[0]] = history_pos.lat_one.iloc[1]

    history_pos.lon_one.iloc[idxs] = a.lon.shift(1)
    history_pos.lon_one.iloc[idxs[0]] = history_pos.lon_one.iloc[1]

    history_pos.lat_two.iloc[idxs] = a.lat.shift(2)
    history_pos.lat_two.iloc[idxs[0]] = history_pos.lat_two.iloc[2]
    history_pos.lat_two.iloc[idxs[1]] = history_pos.lat_two.iloc[2]

    history_pos.lon_two.iloc[idxs] = a.lon.shift(2)
    history_pos.lon_two.iloc[idxs[0]] = history_pos.lon_two.iloc[2]
    history_pos.lon_two.iloc[idxs[1]] = history_pos.lon_two.iloc[2]

    history_pos.lat_three.iloc[idxs] = a.lat.shift(3)
    history_pos.lat_three.iloc[idxs[0]] = history_pos.lat_three.iloc[3]
    history_pos.lat_three.iloc[idxs[1]] = history_pos.lat_three.iloc[3]
    history_pos.lat_three.iloc[idxs[2]] = history_pos.lat_three.iloc[3]

    history_pos.lon_three.iloc[idxs] = a.lon.shift(3)
    history_pos.lon_three.iloc[idxs[0]] = history_pos.lon_three.iloc[3]
    history_pos.lon_three.iloc[idxs[1]] = history_pos.lon_three.iloc[3]
    history_pos.lon_three.iloc[idxs[2]] = history_pos.lon_three.iloc[3]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


and speed

In [12]:
for i, a in data.groupby('journey'):
    idxs = a.index

    history_speed.speed_one.iloc[idxs] = a.lat.shift(1)
    history_speed.speed_one.iloc[idxs[0]] = history_speed.speed_one.iloc[1]

    history_speed.speed_two.iloc[idxs] = a.lat.shift(2)
    history_speed.speed_two.iloc[idxs[0]] = history_speed.speed_two.iloc[2]
    history_speed.speed_two.iloc[idxs[1]] = history_speed.speed_two.iloc[2]

    history_speed.speed_three.iloc[idxs] = a.lat.shift(3)
    history_speed.speed_three.iloc[idxs[0]] = history_speed.speed_three.iloc[3]
    history_speed.speed_three.iloc[idxs[1]] = history_speed.speed_three.iloc[3]
    history_speed.speed_three.iloc[idxs[2]] = history_speed.speed_three.iloc[3]

Make sure there are no NaN:s produced by the shifting

In [13]:
history_pos.isna().sum()

lat_one      0
lon_one      0
lat_two      0
lon_two      0
lat_three    0
lon_three    0
dtype: int64

In [14]:
history_speed.isna().sum()

speed_one      0
speed_two      0
speed_three    0
dtype: int64

The two cells above should contain only zeros, otherwise, model training will fail!

In [15]:
new_data = pds.concat([history_pos, history_speed], axis=1)
data = pds.concat([data, new_data], axis=1)
data.head()

Unnamed: 0.1,Unnamed: 0,timestamp,event,vehicle_id,line,lon,lat,direction,speed,station,...,tsjs,lat_one,lon_one,lat_two,lon_two,lat_three,lon_three,speed_one,speed_two,speed_three
0,3,2018-02-16 07:03:01+01:00,ObservedPositionEvent,5333,0,58.417591,15.622907,328.700012,0.0,,...,0.0,15.622907,58.417591,15.622907,58.417591,15.622907,58.417591,15.622907,15.622907,15.622907
1,4,2018-02-16 07:03:02+01:00,ObservedPositionEvent,5333,0,58.417591,15.622907,328.700012,0.0,,...,0.0,15.622907,58.417591,15.622907,58.417591,15.622907,58.417591,15.622907,15.622907,15.622907
2,5,2018-02-16 07:03:03+01:00,ObservedPositionEvent,5333,0,58.417591,15.622907,328.700012,0.0,,...,0.0,15.622907,58.417591,15.622907,58.417591,15.622907,58.417591,15.622907,15.622907,15.622907
3,6,2018-02-16 07:03:04+01:00,ObservedPositionEvent,5333,0,58.417591,15.622907,328.700012,0.0,,...,0.0,15.622907,58.417591,15.622907,58.417591,15.622907,58.417591,15.622907,15.622907,15.622907
4,7,2018-02-16 07:03:05+01:00,ObservedPositionEvent,5333,0,58.417591,15.622907,328.700012,0.0,,...,0.0,15.622907,58.417591,15.622907,58.417591,15.622907,58.417591,15.622907,15.622907,15.622907


... remove unwanted columns ...

In [16]:
data = data.drop(columns=['Unnamed: 0', 'event', 'vehicle_id', 'line', 'station'])
data.head()

Unnamed: 0,timestamp,lon,lat,direction,speed,journey,seg,time_left,segment_time,tsjs,lat_one,lon_one,lat_two,lon_two,lat_three,lon_three,speed_one,speed_two,speed_three
0,2018-02-16 07:03:01+01:00,58.417591,15.622907,328.700012,0.0,1,1,271.0,271.0,0.0,15.622907,58.417591,15.622907,58.417591,15.622907,58.417591,15.622907,15.622907,15.622907
1,2018-02-16 07:03:02+01:00,58.417591,15.622907,328.700012,0.0,1,1,270.0,271.0,0.0,15.622907,58.417591,15.622907,58.417591,15.622907,58.417591,15.622907,15.622907,15.622907
2,2018-02-16 07:03:03+01:00,58.417591,15.622907,328.700012,0.0,1,1,269.0,271.0,0.0,15.622907,58.417591,15.622907,58.417591,15.622907,58.417591,15.622907,15.622907,15.622907
3,2018-02-16 07:03:04+01:00,58.417591,15.622907,328.700012,0.0,1,1,268.0,271.0,0.0,15.622907,58.417591,15.622907,58.417591,15.622907,58.417591,15.622907,15.622907,15.622907
4,2018-02-16 07:03:05+01:00,58.417591,15.622907,328.700012,0.0,1,1,267.0,271.0,0.0,15.622907,58.417591,15.622907,58.417591,15.622907,58.417591,15.622907,15.622907,15.622907


... and save it.

In [17]:
data.to_pickle('ANN_data_train_211.pkl')