In [1]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set()

In [16]:
def parse_line(line):
    """Clean one raw log record and split it on the pipe symbol.

    Parameters
    ----------
    line : bytes or str
        One record of the event log, typically read from a file opened in
        binary mode (with or without the trailing newline).

    Returns
    -------
    list of str
        The non-empty pipe-separated fields, in order. Empty fields are
        dropped, so an empty or whitespace-only record yields ``[]``.
    """
    # Decode the bytes instead of going through ``str(bytes)``: the repr-based
    # approach needed ``.strip("'b")`` to peel off the ``b'...'`` wrapper, and
    # that call also removed genuine leading/trailing "b" and "'" characters
    # of the record (e.g. a last field equal to "b" silently disappeared).
    if isinstance(line, (bytes, bytearray)):
        text = line.decode('utf-8', errors='replace')
    else:
        text = str(line)
    return [field for field in text.strip().split('|') if field]

In [19]:
# Whitelist of the event types we care about; records with any other event
# type are ignored. 'StartedEvent' and 'StoppedEvent' are deliberately left out.
events = {
    'ArrivedEvent',
    'DepartedEvent',
    'EnteredEvent',
    'ExitedEvent',
    'JourneyAssignedEvent',
    'JourneyCompletedEvent',
    'JourneyStartedEvent',
    'ObservedPositionEvent',
    'ParameterChangedEvent',
    'PassedEvent',
}

In [34]:
def extract_fields(x):
    """Pick the columns of interest out of one parsed log record.

    ``x`` is the list of fields returned by parse_line(). The result is a
    nine-element list (timestamp, event, vehicle id, line, the two coordinate
    components, direction, speed, station) or None when the record is skipped:
    a ParameterChangedEvent/JourneyAssignedEvent that does not have exactly 14
    fields, an event that is not in ``events``, or a record whose field 6 is
    not 'Bus'.
    """
    event = x[2]

    # Guard clauses, evaluated in this order.
    if event in {'ParameterChangedEvent', 'JourneyAssignedEvent'} and len(x) != 14:
        return None
    if event not in events:
        return None
    if x[6] != 'Bus':
        return None

    timestamp = x[0]
    vehicle_id = x[9]

    # Characters 7-10 of field 11 are taken as the bus line number, but only
    # when the field starts with one of the known prefixes; 0 is the placeholder.
    if x[11][:4] in {'9011', '9012', '9015'}:
        line_number = x[11][7:11]
    else:
        line_number = 0

    # Field 10 holds the position as "first,second".
    # NOTE(review): the rows displayed further down in this notebook show ~58.4
    # in the 'longitude' column and ~15.6 in 'latitude', so the first component
    # looks like a latitude - the existing labels are kept; confirm before renaming.
    position = x[10].split(",")

    # Direction and speed only exist for position observations.
    if event == 'ObservedPositionEvent':
        direction, speed = x[12], x[13]
    else:
        direction, speed = -1, -1

    # These event types carry no bus station number.
    events_without_station = {
        'ObservedPositionEvent',
        'ParameterChangedEvent',
        'JourneyStartedEvent',
        'JourneyCompletedEvent',
        'JourneyAssignedEvent'
    }
    station = None if event in events_without_station else x[15]

    return [
        timestamp,
        event,
        vehicle_id,
        line_number,
        position[0],  # stored as 'longitude'
        position[1],  # stored as 'latitude'
        direction,
        speed,
        station,
    ]

In [35]:
# Column names, in the order in which extract_fields() returns its values.
cols = [
    'timestamp',
    'event',
    'vehicle_id',
    'line',
    'longitude',
    'latitude',
    'direction',
    'speed',
    'station',
]

In [36]:
# Start from an empty frame that already carries the final column dtypes.
df = pd.DataFrame(columns=cols).astype(dict(
    timestamp='object',
    event='object',
    vehicle_id='int64',
    line='int64',
    longitude='float64',
    latitude='float64',
    direction='float64',
    speed='float64',
    station='object',
))

Since all of the data won't fit into memory, we must parse it in chunks. The cell below collects 2M rows from the log into a dataframe, serializes it to disk, clears it from memory, and repeats until the end of the log is reached.

In [37]:
# Column dtypes applied to every batch of extracted rows.
col_dtypes = {
    'timestamp': 'object',
    'event': 'object',
    'vehicle_id': 'int64',
    'line': 'int64',
    'longitude': 'float64',
    'latitude': 'float64',
    'direction': 'float64',
    'speed': 'float64',
    'station': 'object'
}

lines = list()      # extracted rows that have not been moved into the dataframe yet
flush = 100000      # move the buffered rows into the dataframe once this many are collected
max_rows = 2000000  # serialize the dataframe once it holds this many rows
i = 1               # sequence number of the next pickle file

# Always start from an empty frame so that re-running this cell never appends
# to whatever `df` happens to hold from another cell.
df = pd.DataFrame(columns=cols).astype(col_dtypes)

with open('VehicleEvents.20180216.log', 'rb') as f:
    while True:
        read = f.readline()
        eof = not read  # readline() returns b'' only at the end of the file

        if not eof:
            # Only parsing/extraction is guarded: a malformed record is reported
            # and skipped. Errors while flushing or serializing are allowed to
            # propagate - swallowing them at EOF would make this loop spin
            # forever, and swallowing a failed flush would let `lines` grow
            # without bound.
            try:
                line = parse_line(read)
                extracted_fields = extract_fields(line) if line else None
            except Exception as e:
                print(f'this line caused exception:\n{read!r}')
                print(e)
                continue
            if extracted_fields:
                lines.append(extracted_fields)

        # move the buffered rows into the dataframe when the buffer is full,
        # or at EOF when anything is still buffered
        if lines and (len(lines) >= flush or eof):
            chunk = pd.DataFrame(data=lines, columns=cols).astype(col_dtypes)
            # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
            # As with append, the per-chunk row index is kept (no ignore_index).
            df = chunk if df.empty else pd.concat([df, chunk])
            print(f'df has {len(df.index)} rows')
            lines = list()

        # if we have reached 2M rows in our dataframe, or if we reached EOF with
        # rows left over, serialize & save the dataframe and clear it from memory
        if len(df.index) >= max_rows or (eof and len(df.index) > 0):
            with open(f'20180216-{i}.p', 'wb') as out:
                pickle.dump(df, out)
            df = pd.DataFrame(columns=cols).astype(col_dtypes)
            print(f'processed and serialized {i} dataframes')
            i += 1

        if eof:
            print('DONE!')
            break

df has 100000 rows
df has 200000 rows
df has 300000 rows
df has 400000 rows
df has 500000 rows
df has 600000 rows
df has 700000 rows
df has 800000 rows
df has 900000 rows
df has 1000000 rows
df has 1100000 rows
df has 1200000 rows
df has 1300000 rows
df has 1400000 rows
df has 1500000 rows
df has 1600000 rows
df has 1700000 rows
df has 1800000 rows
df has 1900000 rows
df has 2000000 rows
processed and serialized 1 dataframes
df has 100000 rows
df has 200000 rows
df has 300000 rows
df has 400000 rows
df has 500000 rows
df has 600000 rows
df has 700000 rows
df has 800000 rows
df has 900000 rows
df has 1000000 rows
df has 1100000 rows
df has 1200000 rows
df has 1300000 rows
df has 1400000 rows
df has 1500000 rows
df has 1600000 rows
df has 1700000 rows
df has 1800000 rows
df has 1900000 rows
df has 2000000 rows
processed and serialized 2 dataframes
df has 100000 rows
df has 200000 rows
df has 300000 rows
df has 400000 rows
df has 500000 rows
df has 600000 rows
df has 700000 rows
df has 80

In [48]:
# Load serialized chunk number 5. The context manager closes the file handle,
# which a bare open() inside the call would leave dangling.
# NOTE: only unpickle files you produced yourself - pickle can execute arbitrary code.
with open('20180216-5.p', 'rb') as pickle_file:
    df = pickle.load(pickle_file)

If we sort first by vehicle id and then by timestamp, we get the events of each vehicle in chronological order.

In [43]:
# Order the events per vehicle, and by timestamp within each vehicle.
df = df.sort_values(by=['vehicle_id', 'timestamp'])

In [46]:
# Count the non-null values of every column per bus line; reset_index() turns
# the 'line' group key back into an ordinary column.
grouped = df.groupby('line').count().reset_index()

In [59]:
# Bus lines ranked by number of events, busiest first.
grouped.sort_values(by='event', ascending=False)

Unnamed: 0,line,timestamp,event,vehicle_id,longitude,latitude
0,0,1971272,1971272,1971272,1971272,1971272
16,201,2429,2429,2429,2429,2429
18,203,1474,1474,1474,1474,1474
13,117,1413,1413,1413,1413,1413
11,115,1391,1391,1391,1391,1391
19,204,1259,1259,1259,1259,1259
23,213,1045,1045,1045,1045,1045
14,119,952,952,952,952,952
17,202,867,867,867,867,867
22,212,787,787,787,787,787


Ignoring the placeholder line 0 (events that carry no line number), the most common bus line is 201. Judging from the plots on gmaps this is bus line number 1 - the internal representation of bus lines 0-9 seems to be the prefix 20 followed by the actual line number.

In [49]:
# All events of vehicle 5434 in timestamp order, as an independent copy.
df_ = df.loc[df['vehicle_id'] == 5434].copy().sort_values(by='timestamp')

In [113]:
# Re-sort the single-vehicle frame by timestamp.
df_ = df_.sort_values(by='timestamp')

In [59]:
# Columns of the per-journey frame: six event columns followed by the two
# numbers assigned while walking through the events.
journey_cols = [
    'timestamp',
    'event',
    'line',
    'vehicle_id',
    'longitude',
    'latitude',
    'journey_number',
    'segment_number',
]

In [60]:
# Empty, correctly typed frame that receives the extracted journeys.
df2 = pd.DataFrame(columns=journey_cols).astype(dict(
    timestamp='object',
    event='object',
    line='int64',
    vehicle_id='int64',
    longitude='float64',
    latitude='float64',
    journey_number='int64',
    segment_number='int64',
))

In [61]:
# Walk through the chronologically sorted events of one vehicle (df_) and
# collect every complete journey of `bus_line`: from a JourneyStartedEvent on
# that line up to the next JourneyCompletedEvent. Each collected row gets the
# running journey number and a segment number that increases at every
# EnteredEvent.
journey_dtypes = {
    'timestamp': 'object',
    'event': 'object',
    'line': 'int64',
    'vehicle_id': 'int64',
    'longitude': 'float64',
    'latitude': 'float64',
    'journey_number': 'int64',
    'segment_number': 'int64'
}
# Columns copied from df_; the last two journey columns are computed below.
base_cols = journey_cols[:-2]

vals = list()            # rows of the journey currently being followed
journey_frames = list()  # one dataframe per completed journey
started = False
bus_line = 203
journey_number = 1
for row in df_.itertuples():
    # Fields are looked up by column name instead of tuple position: `cols`
    # lists vehicle_id before line while `journey_cols` lists line first, so
    # positional access depends on which layout df_ happens to have.
    record = [getattr(row, col) for col in base_cols]

    if not started:
        if row.event == 'JourneyStartedEvent' and row.line == bus_line:
            started = True
            segment_number = 1
            # a new journey always starts from a clean row buffer
            vals = [record + [journey_number, segment_number]]
        continue

    if row.line != bus_line and row.line != 0:
        # The vehicle switched to another line before the journey completed:
        # abandon the journey AND drop its rows, otherwise they would be
        # merged into the next journey under the same journey number.
        started = False
        vals = list()
        continue

    if row.event == 'EnteredEvent':
        segment_number += 1
    vals.append(record + [journey_number, segment_number])

    if row.event == 'JourneyCompletedEvent':
        journey_frames.append(pd.DataFrame(data=vals, columns=journey_cols))
        started = False
        vals = list()
        journey_number += 1

# Build the result once (DataFrame.append was removed in pandas 2.0 and is
# quadratic inside a loop). Like append, this keeps each journey's own 0..n-1
# row index and keeps whatever df2 already contained.
if journey_frames:
    frames = journey_frames if df2.empty else [df2] + journey_frames
    df2 = pd.concat(frames).astype(journey_dtypes)

In [62]:
df2

Unnamed: 0,timestamp,event,line,vehicle_id,longitude,latitude,journey_number,segment_number
0,2018-02-16T09:31:00.0000000+01:00,JourneyStartedEvent,203,5434,58.417046,15.624240,1,1
1,2018-02-16T09:31:00.0000000+01:00,ObservedPositionEvent,0,5434,58.417046,15.624240,1,1
2,2018-02-16T09:31:00.0000000+01:00,ArrivedEvent,203,5434,58.417046,15.624240,1,1
3,2018-02-16T09:31:00.0000000+01:00,EnteredEvent,203,5434,58.417046,15.624240,1,2
4,2018-02-16T09:31:01.0000000+01:00,ObservedPositionEvent,0,5434,58.417046,15.624240,1,2
5,2018-02-16T09:31:02.0000000+01:00,ObservedPositionEvent,0,5434,58.417046,15.624240,1,2
6,2018-02-16T09:31:03.0000000+01:00,ObservedPositionEvent,0,5434,58.417046,15.624240,1,2
7,2018-02-16T09:31:04.0000000+01:00,ObservedPositionEvent,0,5434,58.417046,15.624240,1,2
8,2018-02-16T09:31:05.0000000+01:00,ObservedPositionEvent,0,5434,58.417046,15.624240,1,2
9,2018-02-16T09:31:06.0000000+01:00,ObservedPositionEvent,0,5434,58.417046,15.624240,1,2


In [133]:
# Overwrite the whole column: the ObservedPositionEvent rows inside a journey
# carry the placeholder line 0 (see the df2 output above), so after this every
# row of the extracted journeys reads 203.
df2['line'] = 203

In [135]:
# Serialize the journeys of line 203; the context manager guarantees that the
# file is flushed and closed, which a bare open() inside the call does not.
with open('bus203.p', 'wb') as pickle_file:
    pickle.dump(df2, pickle_file)

In [143]:
len(df2.index)

4258

In [142]:
# Export the journeys as CSV; the row index is written as the first, unnamed column.
df2.to_csv('bus203.csv', index=True)

In [64]:
# Keep only the rows of the first journey (note: this rebinds `df`).
df = df2.loc[df2['journey_number'] == 1]

In [39]:
# Load the journeys of line 203; the context manager closes the file handle,
# which a bare open() inside the call would leave dangling.
# NOTE: only unpickle files you produced yourself - pickle can execute arbitrary code.
with open('buslines/bus203.p', 'rb') as pickle_file:
    df = pickle.load(pickle_file)

In [4]:
# Set the line number on every row of the loaded frame (in place).
# NOTE(review): presumably needed because position events were stored with the
# placeholder line 0 - confirm against how 'buslines/bus203.p' was produced.
df['line'] = 203

In [100]:
df['journey_number'].unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
       52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68,
       69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85,
       86, 87, 88, 89, 90])

In [101]:
# List the distinct segment numbers that occur within every journey.
for jn in df['journey_number'].unique():
    journey_segments = df.loc[df['journey_number'] == jn, 'segment_number']
    print(jn, journey_segments.unique())

1 [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16]
2 [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17]
3 [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17]
4 [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16]
5 [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17]
6 [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16]
7 [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18]
8 [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17]
9 [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17]
10 [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16]
11 [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16]
12 [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17]
13 [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17]
14 [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17]
15 [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17]
16 [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16]
17 [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16]
18 [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16]
19 [ 1  2  3  4  5  6 

In [40]:
df[(df['journey_number'] == 7) & (df['event'] == 'JourneyStartedEvent')]

Unnamed: 0,timestamp,event,line,vehicle_id,longitude,latitude,journey_number,segment_number
0,2018-02-16T18:21:00.0000000+01:00,JourneyStartedEvent,203,5485,58.416988,15.624218,7,1
1505,2018-02-16T18:45:03.0000000+01:00,JourneyStartedEvent,203,5485,58.414215,15.571053,7,18
0,2018-02-16T18:21:00.0000000+01:00,JourneyStartedEvent,203,5485,58.416988,15.624218,7,1
1505,2018-02-16T18:45:03.0000000+01:00,JourneyStartedEvent,203,5485,58.414215,15.571053,7,18
0,2018-02-16T18:21:00.0000000+01:00,JourneyStartedEvent,203,5485,58.416988,15.624218,7,1
1505,2018-02-16T18:45:03.0000000+01:00,JourneyStartedEvent,203,5485,58.414215,15.571053,7,18


In [53]:
# Scratch check: 58.414215 is the first coordinate of the second
# JourneyStartedEvent listed for journey 7 above.
# NOTE(review): the origin of 58.414223 is not shown in this notebook -
# presumably a reference stop coordinate; confirm.
abs(58.414215 - 58.414223) < 0.001

True

In [52]:
# Same check for the first JourneyStartedEvent of journey 7 (58.416988): it is
# more than 0.001 away from the 58.414223 reference value.
abs(58.416988 - 58.414223) > 0.001

True

In [10]:
# Marker colour per event type; markers are drawn in this order.
# ObservedPositionEvent ('blue') is commented out and therefore gets no marker.
event_colors = dict(
    EnteredEvent='green',
    ExitedEvent='orange',
    PassedEvent='yellow',
    ArrivedEvent='black',
    DepartedEvent='white',
    ParameterChangedEvent='purple',
    JourneyStartedEvent='grey',
    JourneyCompletedEvent='brown',
    JourneyAssignedEvent='pink',
)

In [2]:
import gmplot
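# SECURITY: a Google Maps API key was written here in clear text.
# Keys must not be committed: revoke the exposed key and load its replacement
# from configuration outside the notebook (e.g. an environment variable).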

In [14]:
def create_map(df, vehicle_id=None, name=None, api_key=None):
    """
    Create an IFrame with a google map inside the notebook, drawing a line between the GPS coordinates of the input,
    and adding points for all events which are not ObservedPositionEvents.

    Parameters
    ----------
    df : pandas.DataFrame
        Events with at least the columns 'vehicle_id', 'event', 'longitude' and 'latitude'.
    vehicle_id : optional
        When given, only the rows of this vehicle are drawn and the id is used in the output file name.
    name : optional
        Used in the output file name when no vehicle_id is given.
    api_key : str, optional
        Google Maps API key. Defaults to the GOOGLE_MAPS_API_KEY environment variable, so that the key
        does not have to live in the notebook.

    Returns
    -------
    The IFrame returned by jupyter_display() for the generated '<vehicle_id or name>_gmplot.html' file.

    Raises
    ------
    RuntimeError
        If no API key is passed and GOOGLE_MAPS_API_KEY is not set.
    """
    import os  # local import keeps this cell independent of the notebook's import cell

    if api_key is None:
        api_key = os.environ.get('GOOGLE_MAPS_API_KEY')
    if not api_key:
        raise RuntimeError(
            'No Google Maps API key: pass api_key=... or set the GOOGLE_MAPS_API_KEY environment variable'
        )

    # `is not None` rather than truthiness, so that a vehicle id of 0 still filters
    if vehicle_id is not None:
        df_ = df[df['vehicle_id'] == vehicle_id]
    else:
        df_ = df
        vehicle_id = name

    # NOTE(review): gmplot takes the latitude first. The rows shown in this notebook have ~58.4 in
    # 'longitude' and ~15.6 in 'latitude', so the column names appear to be swapped relative to their
    # content and the calls below rely on that - confirm before renaming any column.
    gmap = gmplot.GoogleMapPlotter(df_['longitude'].iloc[0], df_['latitude'].iloc[0], 13)
    for event, color in event_colors.items():
        event_rows = df_[df_['event'] == event]  # filter once per event type
        for first, second in zip(event_rows['longitude'].values, event_rows['latitude'].values):
            gmap.marker(
                first,
                second,
                color,
                title=event
            )

    gmap.plot(
        df_['longitude'].values,
        df_['latitude'].values,
        'blue',
        edge_width=3
    )

    gmap.draw(f'{vehicle_id}_gmplot.html')
    return jupyter_display(f'{vehicle_id}_gmplot.html', api_key)

In [9]:
def create_heatmap(df, api_key=None):
    """
    Create an IFrame with a google map heatmap of all positions in the input.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows with at least the columns 'longitude' and 'latitude'; the map is centred on the first row.
    api_key : str, optional
        Google Maps API key. Defaults to the GOOGLE_MAPS_API_KEY environment variable, so that the key
        does not have to live in the notebook.

    Returns
    -------
    The IFrame returned by jupyter_display() for the generated 'heatmap.html' file.

    Raises
    ------
    RuntimeError
        If no API key is passed and GOOGLE_MAPS_API_KEY is not set.
    """
    import os  # local import keeps this cell independent of the notebook's import cell

    if api_key is None:
        api_key = os.environ.get('GOOGLE_MAPS_API_KEY')
    if not api_key:
        raise RuntimeError(
            'No Google Maps API key: pass api_key=... or set the GOOGLE_MAPS_API_KEY environment variable'
        )

    # NOTE(review): gmplot takes the latitude first; the rows shown in this notebook have ~58.4 in
    # 'longitude' and ~15.6 in 'latitude', so the column names appear swapped relative to their content.
    gmap = gmplot.GoogleMapPlotter(df['longitude'].iloc[0], df['latitude'].iloc[0], 13)
    gmap.heatmap(
        df['longitude'].values,
        df['latitude'].values
    )

    gmap.draw('heatmap.html')
    return jupyter_display('heatmap.html', api_key)

In [10]:
from IPython.display import IFrame

def jupyter_display(gmplot_filename, google_api_key):
    """Add API key to the google maps html file and return the IFrame to display in the notebook cell

    The file is rewritten in place: the Google Maps script URL gets ``&key=<google_api_key>`` appended.
    A file whose script URL already carries a key is left untouched, so calling this function twice
    on the same file does not append the key a second time.

    Parameters
    ----------
    gmplot_filename : str
        Path of the html file written by gmplot.
    google_api_key : str
        Key to insert into the script URL.

    Returns
    -------
    IFrame (990x500) pointing at ``gmplot_filename``.
    """
    url_pattern = 'https://maps.googleapis.com/maps/api/js?libraries=visualization&sensor=true_or_false'
    keyed_prefix = f'{url_pattern}&key='

    with open(gmplot_filename, 'rt') as f:
        f_string = f.read()

    # url_pattern is a prefix of the keyed URL, so an unconditional replace
    # would match again on an already-keyed file and stack a second "&key=".
    if keyed_prefix not in f_string:
        f_string = f_string.replace(url_pattern, f'{keyed_prefix}{google_api_key}')
        with open(gmplot_filename, 'wt') as f:
            f.write(f_string)
    return IFrame(gmplot_filename, width=990, height=500)

In [11]:
# Load the journeys used for the maps below.
# NOTE(review): 'bus203_all.csv' is not written by any cell visible in this
# notebook (only 'bus203.csv' is) - confirm where this file comes from.
df = pd.read_csv('bus203_all.csv')

In [12]:
create_heatmap(df)

In [15]:
create_map(df[df['journey_number'] == 1], name='jn_1')

In [21]:
create_map(df, 5487)

In [22]:
create_map(df, 6031)

In [24]:
create_map(df, 5420)

In [12]:
# Reload the serialized journeys of line 203; the context manager closes the
# file handle, which a bare open() inside the call would leave dangling.
# NOTE: only unpickle files you produced yourself - pickle can execute arbitrary code.
with open('bus203.p', 'rb') as pickle_file:
    df2 = pickle.load(pickle_file)