# Taxi Trajectories Exploration

Which areas of Beijing are the most important, judging by how often taxis are recorded there?

In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last
# one, so a cell with several `take(...)` calls shows each result.
InteractiveShell.ast_node_interactivity = "all"

import folium
from folium.plugins import HeatMap

import pyspark

# `sc` is only predefined when the kernel is started through the pyspark
# launcher. getOrCreate() returns that running context when there is one and
# builds a new one otherwise, so the notebook also runs on a plain kernel.
sc = pyspark.SparkContext.getOrCreate()
sc

## Load the dataset

In [24]:
# Read the log file as an RDD with one string element per line.
# NOTE(review): absolute path, tied to this machine's filesystem layout.
rdd = sc.textFile( '/root/data/taxi_log_2008_by_id/1.txt')
# Peek at the first raw lines to check the comma-separated layout.
rdd.take(10)

['1,2008-02-02 15:36:08,116.51172,39.92123',
 '1,2008-02-02 15:46:08,116.51135,39.93883',
 '1,2008-02-02 15:46:08,116.51135,39.93883',
 '1,2008-02-02 15:56:08,116.51627,39.91034',
 '1,2008-02-02 16:06:08,116.47186,39.91248',
 '1,2008-02-02 16:16:08,116.47217,39.92498',
 '1,2008-02-02 16:26:08,116.47179,39.90718',
 '1,2008-02-02 16:36:08,116.45617,39.90531',
 '1,2008-02-02 17:00:24,116.47191,39.90577',
 '1,2008-02-02 17:10:24,116.50661,39.9145']

In [25]:
# Split every comma-separated line into a list of string fields.
rdd = rdd.map(lambda line: line.split(","))

In [26]:
from datetime import datetime

def normalize(values):
    """Convert the string fields of one log line into a typed record.

    `values` is read positionally: index 0 is the id, index 1 a
    'YYYY-MM-DD HH:MM:SS' time, index 2 the longitude and index 3 the latitude.
    Returns a dict with the keys 'id' (int), 'timestamp' (float), 'lng' (float)
    and 'lat' (float).
    """
    record = {}
    record['id'] = int(values[0])
    # The parsed datetime is naive, so timestamp() interprets it in the local
    # timezone of the machine executing this function.
    moment = datetime.strptime(values[1], '%Y-%m-%d %H:%M:%S')
    record['timestamp'] = moment.timestamp()
    record['lng'] = float(values[2])
    record['lat'] = float(values[3])
    return record

# Replace each list of string fields with the typed dict built by normalize().
rdd = rdd.map(normalize)

In [27]:
rdd.take(1)

[{'id': 1, 'lat': 39.92123, 'lng': 116.51172, 'timestamp': 1201966568.0}]

In [28]:
# Order the records chronologically by their numeric timestamp.
rdd = rdd.sortBy(lambda record: record['timestamp'])

In [29]:
# Base map that the heat layer is added to later. The centre is [lat, lng];
# its longitude (115.84) lies west of the longitudes in the loaded data (~116.5).
beijing_map = folium.Map(location=[39.9375346, 115.837023], zoom_start=9)

def filter_by_period(date_a, date_b):
    """Build a predicate selecting records whose timestamp is in [date_a, date_b).

    Args:
        date_a: start of the period as 'YYYY-MM-DD HH:MM:SS' (inclusive).
        date_b: end of the period in the same format (exclusive).

    Returns:
        A function that takes a record dict with a 'timestamp' key and returns
        True when that timestamp falls inside the period.

    Raises:
        ValueError: if either date does not match the expected format.
    """
    # Convert the bounds once, so the returned predicate only compares floats.
    timestamp_a = datetime.strptime(date_a, '%Y-%m-%d %H:%M:%S').timestamp()
    timestamp_b = datetime.strptime(date_b, '%Y-%m-%d %H:%M:%S').timestamp()
    # Half-open interval: a record stamped exactly at date_a is kept and one at
    # date_b is not, so back-to-back periods neither drop nor double-count it.
    return lambda values: (timestamp_a <= values['timestamp'] < timestamp_b)

def prepare_coords(values):
    """Return the position of a record as a two-element [lat, lng] list."""
    latitude = values['lat']
    longitude = values['lng']
    return [latitude, longitude]

# Keep only the records that fall inside the chosen time window.
in_period = filter_by_period('2008-02-01 00:00:00', '2008-02-09 00:00:00')
heatmap_rdd = rdd.filter(in_period)
heatmap_rdd.take(5)

# Reduce each remaining record to the [lat, lng] pair used for the heat layer.
heatmap_rdd = heatmap_rdd.map(prepare_coords)
heatmap_rdd.take(5)

[{'id': 1, 'lat': 39.92123, 'lng': 116.51172, 'timestamp': 1201966568.0},
 {'id': 1, 'lat': 39.93883, 'lng': 116.51135, 'timestamp': 1201967168.0},
 {'id': 1, 'lat': 39.93883, 'lng': 116.51135, 'timestamp': 1201967168.0},
 {'id': 1, 'lat': 39.91034, 'lng': 116.51627, 'timestamp': 1201967768.0},
 {'id': 1, 'lat': 39.91248, 'lng': 116.47186, 'timestamp': 1201968368.0}]

[[39.92123, 116.51172],
 [39.93883, 116.51135],
 [39.93883, 116.51135],
 [39.91034, 116.51627],
 [39.91248, 116.47186]]

In [30]:
# Number of coordinate pairs that will be drawn on the heat map.
heatmap_rdd.count()

588

In [32]:
# collect() brings every [lat, lng] pair to the driver so folium can draw them.
heatmap_points = heatmap_rdd.collect()
HeatMap(heatmap_points, radius=10).add_to(beijing_map)

beijing_map

<folium.plugins.heat_map.HeatMap at 0x7f9c57bbb438>