In [1]:
import pandas as pd

# Load the twelve monthly 2018 trip files and stack them into a single frame.
# The monthly frames are collected in a list and concatenated once at the end:
# DataFrame.append was removed in pandas 2.0, and calling it inside the loop
# copied all previously loaded rows again on every iteration.
monthly_frames = []

for month in range(1, 13):
    month_str = f'{month:02d}'  # zero-padded to match the file names ("01" .. "12")
    print('Loading month', month_str)
    current_month = pd.read_csv(f'../dataset/2018{month_str}-citibike-tripdata.csv')
    monthly_frames.append(current_month)

# The default ignore_index=False keeps each file's own row labels, exactly as
# the chained append calls did.
citibike_data = pd.concat(monthly_frames)

Loading month 01
Loading month 02
Loading month 03
Loading month 04
Loading month 05
Loading month 06
Loading month 07
Loading month 08
Loading month 09
Loading month 10
Loading month 11
Loading month 12


In [2]:
import numpy as np

# Discard every trip that has a missing value in any column.
citibike_data = citibike_data.dropna()

# Parse both timestamp columns so they can be subtracted from each other.
for time_column in ('starttime', 'stoptime'):
    citibike_data[time_column] = pd.to_datetime(citibike_data[time_column])

# Recompute the duration from the timestamps, as whole seconds rounded down.
elapsed = citibike_data['stoptime'] - citibike_data['starttime']
citibike_data['tripduration'] = np.floor(elapsed.dt.total_seconds()).astype(int)

# Only the station ids, station coordinates and the duration are kept.
filter_columns = [
    'start station id',
    'start station latitude',
    'start station longitude',
    'end station id',
    'end station latitude',
    'end station longitude',
    'tripduration'
]

# Keep trips lasting at most 20 days (expressed in seconds).
# NOTE(review): this cell consumes 'starttime'/'stoptime', so re-running it
# without reloading the data raises a KeyError.
longest_allowed = 20*24*60*60
citibike_data = citibike_data.loc[
    citibike_data['tripduration'] <= longest_allowed,
    filter_columns
]

In [3]:
# Number of trips that started at each station, keyed by station id.
count_start_station = citibike_data.groupby('start station id')['start station id'].count()

# One row per distinct (id, latitude, longitude) combination seen as a trip
# start, indexed by station id and carrying that station's trip count.
# The frame is built in a single chain so no column is ever written onto a
# slice of citibike_data (the previous in-place column assignment on the
# filtered frame triggers SettingWithCopyWarning on pandas < 3).
start_station = (
    citibike_data[[
        'start station id',
        'start station latitude',
        'start station longitude',
    ]]
    # drop=False keeps the id available as a regular column as well.
    .set_index('start station id', drop=False)
    .drop_duplicates()
    # The counts are aligned to the rows through the station-id index.
    .assign(**{'trip count': count_start_station})
)

# NOTE(review): a station id that occurs with more than one coordinate pair
# produces several rows that each carry the full count, which would inflate
# this total -- confirm the data has one coordinate pair per station.
trip_count_sum = start_station['trip count'].sum()

In [4]:
# Show the start-station table; as the cell's last expression it is rendered as the cell output.
start_station

Unnamed: 0_level_0,start station id,start station latitude,start station longitude,trip count
start station id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
72.0,72.0,40.767272,-73.993929,38405
79.0,79.0,40.719116,-74.006667,27198
82.0,82.0,40.711174,-74.000165,10217
83.0,83.0,40.683826,-73.976323,13132
119.0,119.0,40.696089,-73.978034,3240
...,...,...,...,...
530.0,530.0,40.771497,-73.990460,45473
3718.0,3718.0,40.727464,-73.979504,2014
3719.0,3719.0,40.683223,-73.973812,222
3721.0,3721.0,40.767549,-73.920933,238


In [5]:
import folium
from matplotlib.colors import Normalize, rgb2hex
import matplotlib.cm as cm

# Draw one circle per start station on a dark base map. The colour encodes the
# station's trip count (YlOrRd colormap, scaled between the smallest and the
# largest count) and the radius encodes its share of all trips.
start_trip_counts = start_station['trip count']
norm = Normalize(start_trip_counts.min(), start_trip_counts.max())
# Take the total from the frame being drawn rather than from the shared
# trip_count_sum variable, which the end-station cell overwrites; the radii
# then stay correct even when cells are re-run out of order.
start_trip_total = start_trip_counts.sum()
tileset = r'http://{s}.basemaps.cartocdn.com/dark_all/{z}/{x}/{y}.png'

start_station_map = folium.Map(
    location=[40.73, -73.93],
    width=1024,
    height=786,
    tiles=tileset,
    attr='© OpenStreetMap contributors, © CartoDB',
    zoom_start=12
)

# The former "index % 100000" progress print was dropped: the index holds
# station ids, not a running row number, so it never reported progress.
for index, row in start_station.iterrows():
    color = rgb2hex(cm.YlOrRd(norm(row['trip count'])))
    folium.CircleMarker(
        location=[
            row['start station latitude'],
            row['start station longitude']
        ],
        popup=row['start station id'],
        radius=row['trip count'] * 2000 / start_trip_total,
        # `color` is the stroke option CircleMarker understands; the previous
        # `line_color` keyword is not one of its path options, so the outline
        # was left in the default colour.
        color=color,
        fill_color=color
    ).add_to(start_station_map)

start_station_map.save(outfile='start_station_map.html')

In [6]:
# Number of trips that ended at each station, keyed by station id.
count_end_station = citibike_data.groupby('end station id')['end station id'].count()

# One row per distinct (id, latitude, longitude) combination seen as a trip
# end, indexed by station id and carrying that station's trip count.
# The frame is built in a single chain so no column is ever written onto a
# slice of citibike_data (the previous in-place column assignment on the
# filtered frame triggers SettingWithCopyWarning on pandas < 3).
end_station = (
    citibike_data[[
        'end station id',
        'end station latitude',
        'end station longitude',
    ]]
    # drop=False keeps the id available as a regular column as well.
    .set_index('end station id', drop=False)
    .drop_duplicates()
    # The counts are aligned to the rows through the station-id index.
    .assign(**{'trip count': count_end_station})
)

# NOTE(review): a station id that occurs with more than one coordinate pair
# produces several rows that each carry the full count, which would inflate
# this total -- confirm the data has one coordinate pair per station.
trip_count_sum = end_station['trip count'].sum()

In [7]:
# Show the end-station table; as the cell's last expression it is rendered as the cell output.
end_station

Unnamed: 0_level_0,end station id,end station latitude,end station longitude,trip count
end station id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
505.0,505.0,40.749013,-73.988484,75919
3255.0,3255.0,40.750585,-73.994685,81672
525.0,525.0,40.755942,-74.002116,39286
447.0,447.0,40.763707,-73.985162,45038
3356.0,3356.0,40.774667,-73.984706,16343
...,...,...,...,...
530.0,530.0,40.771497,-73.990460,45169
3718.0,3718.0,40.727464,-73.979504,1854
3719.0,3719.0,40.683223,-73.973812,257
3721.0,3721.0,40.767549,-73.920933,240


In [8]:
# Draw one circle per end station on a dark base map. The colour encodes the
# station's trip count (YlOrRd colormap, scaled between the smallest and the
# largest count) and the radius encodes its share of all trips.
end_trip_counts = end_station['trip count']
norm = Normalize(end_trip_counts.min(), end_trip_counts.max())
# Take the total from the frame being drawn rather than from the shared
# trip_count_sum variable, which the start-station cells overwrite; the radii
# then stay correct even when cells are re-run out of order.
end_trip_total = end_trip_counts.sum()
tileset = r'http://{s}.basemaps.cartocdn.com/dark_all/{z}/{x}/{y}.png'

end_station_map = folium.Map(
    location=[40.73, -73.93],
    width=1024,
    height=786,
    tiles=tileset,
    attr='© OpenStreetMap contributors, © CartoDB',
    zoom_start=12
)

# The former "index % 100000" progress print was dropped: the index holds
# station ids, not a running row number, so it never reported progress.
for index, row in end_station.iterrows():
    color = rgb2hex(cm.YlOrRd(norm(row['trip count'])))
    folium.CircleMarker(
        location=[
            row['end station latitude'],
            row['end station longitude']
        ],
        popup=row['end station id'],
        radius=row['trip count'] * 2000 / end_trip_total,
        # `color` is the stroke option CircleMarker understands; the previous
        # `line_color` keyword is not one of its path options, so the outline
        # was left in the default colour.
        color=color,
        fill_color=color
    ).add_to(end_station_map)

end_station_map.save(outfile='end_station_map.html')

In [9]:
# Remove trips whose start or end station lies outside a bounding box taken
# from https://boundingbox.klokantech.com/:
#   longitude -74.40 .. -73.19, latitude 40.12 .. 41.13 (bounds inclusive).
# All eight comparisons are combined into a single row mask.
inside_box = (
    citibike_data['start station latitude'].between(40.12, 41.13)
    & citibike_data['end station latitude'].between(40.12, 41.13)
    & citibike_data['start station longitude'].between(-74.40, -73.19)
    & citibike_data['end station longitude'].between(-74.40, -73.19)
)
citibike_data = citibike_data[inside_box]
citibike_data

Unnamed: 0,start station id,start station latitude,start station longitude,end station id,end station latitude,end station longitude,tripduration
0,72.0,40.767272,-73.993929,505.0,40.749013,-73.988484,970
1,72.0,40.767272,-73.993929,3255.0,40.750585,-73.994685,723
2,72.0,40.767272,-73.993929,525.0,40.755942,-74.002116,496
3,72.0,40.767272,-73.993929,447.0,40.763707,-73.985162,306
4,72.0,40.767272,-73.993929,3356.0,40.774667,-73.984706,306
...,...,...,...,...,...,...,...
1016500,336.0,40.730477,-73.999061,379.0,40.749156,-73.991600,734
1016501,3320.0,40.794067,-73.962868,2006.0,40.765909,-73.976342,2551
1016502,427.0,40.701907,-74.013942,259.0,40.701221,-74.012342,2376
1016503,3320.0,40.794067,-73.962868,281.0,40.764397,-73.973715,2758


In [10]:
# Number of trips that started at each station, keyed by station id, computed
# from whatever citibike_data holds when this cell runs.
count_start_station = citibike_data.groupby('start station id')['start station id'].count()

# One row per distinct (id, latitude, longitude) combination seen as a trip
# start, indexed by station id and carrying that station's trip count.
# The frame is built in a single chain so no column is ever written onto a
# slice of citibike_data (the previous in-place column assignment on the
# filtered frame triggers SettingWithCopyWarning on pandas < 3).
start_station = (
    citibike_data[[
        'start station id',
        'start station latitude',
        'start station longitude',
    ]]
    # drop=False keeps the id available as a regular column as well.
    .set_index('start station id', drop=False)
    .drop_duplicates()
    # The counts are aligned to the rows through the station-id index.
    .assign(**{'trip count': count_start_station})
)

# NOTE(review): a station id that occurs with more than one coordinate pair
# produces several rows that each carry the full count, which would inflate
# this total -- confirm the data has one coordinate pair per station.
trip_count_sum = start_station['trip count'].sum()

In [11]:
# Draw one circle per start station on a dark base map and save it under the
# "outlier removed" file name. The colour encodes the station's trip count
# (YlOrRd colormap, scaled between the smallest and the largest count) and the
# radius encodes its share of all trips.
start_trip_counts = start_station['trip count']
norm = Normalize(start_trip_counts.min(), start_trip_counts.max())
# Take the total from the frame being drawn rather than from the shared
# trip_count_sum variable, which the end-station cell overwrites; the radii
# then stay correct even when cells are re-run out of order.
start_trip_total = start_trip_counts.sum()
tileset = r'http://{s}.basemaps.cartocdn.com/dark_all/{z}/{x}/{y}.png'

start_station_map = folium.Map(
    location=[40.73, -73.93],
    width=1024,
    height=786,
    tiles=tileset,
    attr='© OpenStreetMap contributors, © CartoDB',
    zoom_start=12
)

# The former "index % 100000" progress print was dropped: the index holds
# station ids, not a running row number, so it never reported progress.
for index, row in start_station.iterrows():
    color = rgb2hex(cm.YlOrRd(norm(row['trip count'])))
    folium.CircleMarker(
        location=[
            row['start station latitude'],
            row['start station longitude']
        ],
        popup=row['start station id'],
        radius=row['trip count'] * 2000 / start_trip_total,
        # `color` is the stroke option CircleMarker understands; the previous
        # `line_color` keyword is not one of its path options, so the outline
        # was left in the default colour.
        color=color,
        fill_color=color
    ).add_to(start_station_map)

start_station_map.save(outfile='start_station_map_outlier_removed.html')