In [30]:
import pandas as pd
import folium # Plot geospatial data

from folium import plugins

from tqdm.auto import tqdm  # progress bars for Jupyter notebooks (or other notebook IDE)
from ast import literal_eval # Interperate datatypes ('[str object]' == [list object])
from datetime import datetime as dt # Convert Unix Timestamps to datetime objects


In [2]:
taxi = pd.read_csv('taxi-service/train.csv')

In [3]:
taxi.head()

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,MISSING_DATA,POLYLINE
0,1372636858620000589,C,,,20000589,1372636858,A,False,"[[-8.618643,41.141412],[-8.618499,41.141376],[..."
1,1372637303620000596,B,,7.0,20000596,1372637303,A,False,"[[-8.639847,41.159826],[-8.640351,41.159871],[..."
2,1372636951620000320,C,,,20000320,1372636951,A,False,"[[-8.612964,41.140359],[-8.613378,41.14035],[-..."
3,1372636854620000520,C,,,20000520,1372636854,A,False,"[[-8.574678,41.151951],[-8.574705,41.151942],[..."
4,1372637091620000337,C,,,20000337,1372637091,A,False,"[[-8.645994,41.18049],[-8.645949,41.180517],[-..."


In [4]:
# Use datetime library to convert unix timestamp to understandable datetime object.
taxi['TIMESTAMP'] = taxi['TIMESTAMP'].apply(lambda x: dt.fromtimestamp(x))

In [5]:
# That's.... That's a lot of data.
taxi.shape

(1710670, 9)

In [6]:
taxi.head()

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,MISSING_DATA,POLYLINE
0,1372636858620000589,C,,,20000589,2013-06-30 18:00:58,A,False,"[[-8.618643,41.141412],[-8.618499,41.141376],[..."
1,1372637303620000596,B,,7.0,20000596,2013-06-30 18:08:23,A,False,"[[-8.639847,41.159826],[-8.640351,41.159871],[..."
2,1372636951620000320,C,,,20000320,2013-06-30 18:02:31,A,False,"[[-8.612964,41.140359],[-8.613378,41.14035],[-..."
3,1372636854620000520,C,,,20000520,2013-06-30 18:00:54,A,False,"[[-8.574678,41.151951],[-8.574705,41.151942],[..."
4,1372637091620000337,C,,,20000337,2013-06-30 18:04:51,A,False,"[[-8.645994,41.18049],[-8.645949,41.180517],[-..."


In [7]:
# This is a problem. The datatype of each row is a string but clearly should be a list of lists.
# Also, the coordinates are in lon/lat format, but folium needs lat/lon. Will need to convert.
taxi['POLYLINE'][0]

'[[-8.618643,41.141412],[-8.618499,41.141376],[-8.620326,41.14251],[-8.622153,41.143815],[-8.623953,41.144373],[-8.62668,41.144778],[-8.627373,41.144697],[-8.630226,41.14521],[-8.632746,41.14692],[-8.631738,41.148225],[-8.629938,41.150385],[-8.62911,41.151213],[-8.629128,41.15124],[-8.628786,41.152203],[-8.628687,41.152374],[-8.628759,41.152518],[-8.630838,41.15268],[-8.632323,41.153022],[-8.631144,41.154489],[-8.630829,41.154507],[-8.630829,41.154516],[-8.630829,41.154498],[-8.630838,41.154489]]'

In [8]:
# We also have so much data that any conditioning takes too long for effectively conducting trial
# and error. Here I am limiting taxi data to just July 2013. Still 145000+ rows... But reduces
# the time it takes to clean the data.
pattern = '%Y-%m-%d %H:%M:%S' # Google datetime string formatting for more information.
taxi_subset = taxi[(taxi['TIMESTAMP'] >= dt.strptime('2013-07-01 00:00:00', pattern)) &
                   (taxi['TIMESTAMP'] <= dt.strptime('2013-07-07 23:59:59', pattern))].reset_index(drop=True)

In [9]:
# Initiate progress bar
tqdm.pandas() 
# Use progress_apply instead of apply to see progress bar.
# literal_eval should convert strings to lists.
taxi_subset['POLYLINE'] = taxi_subset['POLYLINE'].progress_apply(literal_eval)
# Get some coffee........

  from pandas import Panel


HBox(children=(FloatProgress(value=0.0, max=34791.0), HTML(value='')))




In [10]:
# Reverse the latitude and longitude
# [::-1] is the expression to reverse an iterable such as list or a string.
# [start:stop:step]. Leaving the start and stop blank just means that we want the whole thing.
# A negative step means to start at the end and go back to the beginning.
# Since this is a list of lists, we have to say give me the reverse of x for x in lst. 
taxi_subset['POLYLINE'] = taxi_subset['POLYLINE'].progress_apply(lambda lst: [x[::-1] for x in lst]) 

HBox(children=(FloatProgress(value=0.0, max=34791.0), HTML(value='')))




In [11]:
# Success
type(taxi_subset['POLYLINE'][0])

list

In [12]:
# Double success
taxi_subset['POLYLINE']

0                                 [[41.159394, -8.580051]]
1        [[41.161059, -8.621037], [41.160726, -8.621109...
2        [[41.145516, -8.610705], [41.145471, -8.610768...
3        [[41.161059, -8.621019], [41.160465, -8.621379...
4        [[41.146308, -8.579025], [41.147037, -8.580186...
                               ...                        
34786    [[41.157126, -8.666631], [41.157369, -8.666793...
34787    [[41.161122, -8.604783], [41.160951, -8.604954...
34788    [[41.146911, -8.587386], [41.146965, -8.58744]...
34789                                                   []
34790    [[41.153706, -8.610408], [41.15367, -8.611821]...
Name: POLYLINE, Length: 34791, dtype: object

In [13]:
# Determine the mean of the latitude and longitudes so we can auto-center the map at the correct spot.
lats = [lat[0] for lst in taxi_subset['POLYLINE'] for lat in lst]
lons = [lon[1] for lst in taxi_subset['POLYLINE'] for lon in lst]
mean_lat = sum(lats) / len(lats)
mean_lon = sum(lons) / len(lons)
print(mean_lat, mean_lon)

41.15702356037936 -8.618215222134841


In [17]:
# folium.map natively has an interactive a basemap for us.

my_map = folium.Map(zoom_start=14, location=(mean_lat, mean_lon))

# Add polylines. DO NOT recommend plotting all lines at once as this will take a looooooooong time
# and probably look messy. Maybe just a day at a time, or maybe a week. We've already converted
# Unix timestamps to datetime objects so this should be easy enough to filter the original dataframe.

for line in taxi_subset['POLYLINE'][:100]:
    if len(line) > 1:
        folium.PolyLine(line, color="red", weight=2.5, opacity=1).add_to(my_map)
    
 
# Note that Folium is highly interactive. You can add markers to each point on the line with popups
# that show whatever metadata you want

# Optional command to save off your map

my_map




In [45]:
my_map = folium.Map(zoom_start=14, location=(mean_lat, mean_lon))

locs = pd.DataFrame()
locs['lats'] = lats
locs['lons'] = lons

# plot heatmap
my_map.add_child(plugins.HeatMap(locs[:2000], radius=15))
my_map.save("./my_map.html")