# Taxi Trajectories Exploration

Which are the most important areas in Beijing?

In [8]:
from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

import folium
from folium.plugins import HeatMap

import pyspark
# `sc` is not created in the cells shown here; presumably the kernel (e.g. a
# PySpark kernel/shell) injects a SparkContext under this name — TODO confirm.
sc

## Load the dataset

In [9]:
# Read the log file as an RDD of raw text lines and peek at the first ten.
rdd = sc.textFile( '/root/data/taxi_log_2008_by_id/1.txt')
rdd.take(10)

['1,2008-02-02 15:36:08,116.51172,39.92123',
 '1,2008-02-02 15:46:08,116.51135,39.93883',
 '1,2008-02-02 15:46:08,116.51135,39.93883',
 '1,2008-02-02 15:56:08,116.51627,39.91034',
 '1,2008-02-02 16:06:08,116.47186,39.91248',
 '1,2008-02-02 16:16:08,116.47217,39.92498',
 '1,2008-02-02 16:26:08,116.47179,39.90718',
 '1,2008-02-02 16:36:08,116.45617,39.90531',
 '1,2008-02-02 17:00:24,116.47191,39.90577',
 '1,2008-02-02 17:10:24,116.50661,39.9145']

In [10]:
# Split each comma-separated line into a list of string fields.
rdd = rdd.map(lambda x: x.split(","))

In [11]:
from datetime import datetime

def normalize(values):
    """Convert one split log line into a typed record.

    Args:
        values: sequence of strings ordered as taxi id, timestamp
            ('%Y-%m-%d %H:%M:%S'), longitude, latitude.

    Returns:
        dict with keys 'id' (int), 'timestamp' (float, epoch seconds),
        'lng' (float) and 'lat' (float).
    """
    taxi_id = int(values[0])
    # The parsed datetime is naive, so .timestamp() interprets it in the
    # local timezone of the process doing the conversion.
    parsed_at = datetime.strptime(values[1], '%Y-%m-%d %H:%M:%S')
    epoch_seconds = parsed_at.timestamp()
    longitude = float(values[2])
    latitude = float(values[3])
    return dict(id=taxi_id, timestamp=epoch_seconds, lng=longitude, lat=latitude)

# Replace each list of strings with a typed record dict.
rdd = rdd.map(normalize)

In [12]:
# Sanity-check the first normalized record.
rdd.take(1)

[{'id': 1, 'lat': 39.92123, 'lng': 116.51172, 'timestamp': 1201966568.0}]

In [13]:
# Order the records by their epoch timestamp (ascending is sortBy's default).
rdd = rdd.sortBy(lambda values: values['timestamp'])

In [14]:
# Base map that the heatmap layer is added to.
# NOTE(review): the centre longitude (115.837) lies west of the sampled taxi
# points (~116.45-116.52 in the previews) — confirm this is the intended centre.
beijing_map = folium.Map(
    location = [39.9375346, 115.837023],
    zoom_start= 9
)

def filter_by_period(date_a, date_b):
    """Build a predicate selecting records strictly inside a time window.

    Args:
        date_a: window start as a '%Y-%m-%d %H:%M:%S' string.
        date_b: window end, same format.

    Returns:
        A function that takes a record dict and returns True when
        date_a < record['timestamp'] < date_b (both bounds exclusive).
    """
    date_format = '%Y-%m-%d %H:%M:%S'
    # Both bounds are naive datetimes converted with .timestamp(), i.e. they
    # are interpreted in the local timezone of the process.
    lower, upper = (
        datetime.strptime(bound, date_format).timestamp()
        for bound in (date_a, date_b)
    )

    def in_period(values):
        return lower < values['timestamp'] < upper

    return in_period

def prepare_coords(values):
    """Return a record's position as a two-element [lat, lng] list.

    Args:
        values: record dict holding 'lat' and 'lng' entries.

    Returns:
        [values['lat'], values['lng']] — latitude first, longitude second.
    """
    latitude, longitude = values['lat'], values['lng']
    return [latitude, longitude]

# Keep only records strictly between the two dates, then preview them.
heatmap_rdd = rdd.filter(filter_by_period('2008-02-01 00:00:00', '2008-02-09 00:00:00'))
heatmap_rdd.take(5)

# Reduce each record to a [lat, lng] pair for plotting, then preview again.
heatmap_rdd = heatmap_rdd.map(prepare_coords)
heatmap_rdd.take(5)

[{'id': 1, 'lat': 39.92123, 'lng': 116.51172, 'timestamp': 1201966568.0},
 {'id': 1, 'lat': 39.93883, 'lng': 116.51135, 'timestamp': 1201967168.0},
 {'id': 1, 'lat': 39.93883, 'lng': 116.51135, 'timestamp': 1201967168.0},
 {'id': 1, 'lat': 39.91034, 'lng': 116.51627, 'timestamp': 1201967768.0},
 {'id': 1, 'lat': 39.91248, 'lng': 116.47186, 'timestamp': 1201968368.0}]

[[39.92123, 116.51172],
 [39.93883, 116.51135],
 [39.93883, 116.51135],
 [39.91034, 116.51627],
 [39.91248, 116.47186]]

In [15]:
# Number of [lat, lng] points in the selected window.
heatmap_rdd.count()

588

In [16]:
# Bring all points to the driver and add them to the map as a heat layer.
HeatMap(heatmap_rdd.collect(), radius=10).add_to(beijing_map)

# Display the map inline.
beijing_map

<folium.plugins.heat_map.HeatMap at 0x7f783ed26ac8>

## Choropleth Maps

In [17]:
import json
from branca.colormap import linear
import numpy as np
from shapely.geometry import Polygon
from shapely.geometry import Point

In [18]:
# Load the region boundaries. The context manager closes the file handle as
# soon as parsing finishes instead of leaving it open until garbage collection.
# 'utf-8-sig' transparently drops a leading byte-order mark if the file has one.
with open('/root/beijing.geojson', encoding='utf-8-sig') as geojson_file:
    geo_json_data = json.load(geojson_file)

# Inspect the first feature to see the structure of the data.
geo_json_data['features'][0]

{'geometry': {'coordinates': [[[116.45414, 40.773655],
    [116.45686, 40.776627],
    [116.45746, 40.778458],
    [116.45806, 40.780285],
    [116.45776, 40.78257],
    [116.45776, 40.785084],
    [116.45596, 40.787827],
    [116.45445, 40.790112],
    [116.45084, 40.793537],
    [116.44511, 40.797653],
    [116.43697, 40.807018],
    [116.42944, 40.813873],
    [116.4225, 40.817757],
    [116.41315, 40.822552],
    [116.4056, 40.828033],
    [116.39806, 40.833511],
    [116.39233, 40.839451],
    [116.38418, 40.84927],
    [116.37784, 40.85635],
    [116.3724, 40.862747],
    [116.36666, 40.868454],
    [116.3582, 40.874386],
    [116.35277, 40.877354],
    [116.34309, 40.886944],
    [116.33645, 40.891735],
    [116.33101, 40.894699],
    [116.32677, 40.900177],
    [116.32435, 40.904289],
    [116.32344, 40.909088],
    [116.32464, 40.914574],
    [116.32644, 40.922806],
    [116.33036, 40.929668],
    [116.33489, 40.935162],
    [116.34093, 40.937912],
    [116.34879, 40.938606],


In [77]:
# Separate base map for the region-level (choropleth) view.
beijing_chropleth_map = folium.Map(
    location=[39.9375346, 115.837023],
    zoom_start=9
)

# Draw the GeoJSON features on the map.
folium.GeoJson(geo_json_data).add_to(beijing_chropleth_map)

beijing_chropleth_map

<folium.features.GeoJson at 0x7f783ebb39b0>

In [78]:
# Map each feature's ADCODE99 property to a shapely Polygon built from the
# first coordinate ring of its geometry.
# NOTE(review): only coordinates[0] is used, so any further rings (holes) or
# MultiPolygon parts would be ignored — confirm the file contains plain polygons.
regions = {
    district['properties']['ADCODE99']: Polygon(district['geometry']['coordinates'][0])
    for district in geo_json_data['features']
}

regions

{'110100': <shapely.geometry.polygon.Polygon at 0x7f784950acc0>,
 '110112': <shapely.geometry.polygon.Polygon at 0x7f783eb51860>,
 '110113': <shapely.geometry.polygon.Polygon at 0x7f783eb51828>,
 '110221': <shapely.geometry.polygon.Polygon at 0x7f783eb517b8>,
 '110224': <shapely.geometry.polygon.Polygon at 0x7f783eb51898>,
 '110226': <shapely.geometry.polygon.Polygon at 0x7f783eb517f0>,
 '110227': <shapely.geometry.polygon.Polygon at 0x7f783eb51710>,
 '110228': <shapely.geometry.polygon.Polygon at 0x7f783eb51748>,
 '110229': <shapely.geometry.polygon.Polygon at 0x7f783eb51780>}

In [79]:
# Select records strictly between the two dates for the per-region counts.
chropleth_rdd = rdd.filter(filter_by_period('2008-02-01 00:00:00', '2008-02-09 00:00:00'))

def set_region(values):
    """Tag a trajectory record with the region that contains it.

    Args:
        values: record dict holding float 'lng' and 'lat' entries.

    Returns:
        A (code, 1) tuple, where code is the key in `regions` of the first
        polygon containing the point, or None when no polygon contains it.
        The constant 1 is a count meant to be summed per key.
    """
    # shapely points are (x, y), i.e. (longitude, latitude).
    point = Point(values['lng'], values['lat'])

    for key, polygon in regions.items():
        if point.within(polygon):
            return (key, 1)

    # No polygon matched (or `regions` is empty). Previously the loop fell
    # through and returned whichever key was iterated last, silently counting
    # the point in the wrong region; report it under None instead.
    return (None, 1)

# Preview the records, turn each into a (region code, 1) pair, and preview the result.
chropleth_rdd.take(5)
chropleth_rdd = chropleth_rdd.map(set_region)
chropleth_rdd.take(5)

[{'id': 1, 'lat': 39.92123, 'lng': 116.51172, 'timestamp': 1201966568.0},
 {'id': 1, 'lat': 39.93883, 'lng': 116.51135, 'timestamp': 1201967168.0},
 {'id': 1, 'lat': 39.93883, 'lng': 116.51135, 'timestamp': 1201967168.0},
 {'id': 1, 'lat': 39.91034, 'lng': 116.51627, 'timestamp': 1201967768.0},
 {'id': 1, 'lat': 39.91248, 'lng': 116.47186, 'timestamp': 1201968368.0}]

[('110224', 1), ('110224', 1), ('110224', 1), ('110224', 1), ('110224', 1)]

In [80]:
from operator import add

# Sum the 1s per region code to get the number of records in each region.
chropleth_rdd = chropleth_rdd.reduceByKey(add)
chropleth_rdd.take(3)

[('110224', 109), ('110112', 479)]

In [81]:
# The (code, count) pair with the highest count; the key function compares on the count.
_max = chropleth_rdd.max(lambda x: x[1])

In [82]:
# Six evenly spaced bin edges from 0 up to the highest count, cast to int.
threshold_scale = np.linspace(0, _max[1], 6, dtype=int).tolist()

In [83]:
# Colour each GeoJSON feature by its record count, matching data keys against
# the feature's ADCODE99 property.
# NOTE(review): `data` is passed as a plain list of (code, count) tuples rather
# than a pandas DataFrame/Series — confirm the installed folium accepts this.
# NOTE(review): newer folium releases expose this as folium.Choropleth —
# confirm Map.choropleth is available in the installed version.
beijing_chropleth_map.choropleth(
    geo_data=geo_json_data,
    data=chropleth_rdd.collect(),
    columns=['ADCODE99','count'],
    key_on='feature.properties.ADCODE99',
    fill_color='YlOrRd',
    legend_name='Taxi Activity during 8 days (2 fev to 8 fev)',
    name ='Choropleth Map Taxi Activity',
    highlight=True,
    threshold_scale = threshold_scale
)

# Add a layer toggle so the choropleth can be switched on and off.
folium.LayerControl().add_to(beijing_chropleth_map)

beijing_chropleth_map

<folium.map.LayerControl at 0x7f783eaeb080>