In [1]:
#import the necessary packages
import pandas as pd
import os
from bs4 import BeautifulSoup
import requests
import lxml
import zipfile
import folium
from folium.plugins import HeatMap
from folium.plugins import HeatMapWithTime
from datetime import datetime

In [2]:
# function to generate base map, has default values for zoom and tiles
def generateBaseMap(loc, zoom=12, tiles='Stamen Toner', crs='ESPG2263'):
    '''
    Function that generates a Folium base map
    Input location lat/long
    Zoom level default 12
    Tiles default to Stamen Toner
    CRS default 2263 for NYC
    '''
    return folium.Map(location=loc, 
                      control_scale=True, 
                      zoom_start=zoom,
                      tiles=tiles)

In [3]:
nyc = [40.7400, -73.985880] # generic nyc lat/lon in list format
base_map = generateBaseMap(nyc) # pass lat/lon to function
base_map


In [4]:
# Define path
path = r'/Users/azadeh/Documents/CitiBike Data Quest/'

In [5]:
# Import Data
df = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'citibike.csv'), index_col = False)

In [6]:
df.head()

Unnamed: 0,trip_id,bike_id,weekday,start_hour,start_time,start_station_id,start_station_name,start_station_latitude,start_station_longitude,end_time,end_station_id,end_station_name,end_station_latitude,end_station_longitude,trip_duration,subscriber,birth_year,gender
0,LnQzQk,16013,Mon,18,2013-09-09 18:18:55,523,W 38 St & 8 Ave,40.754666,-73.991382,2013-09-09 18:35:28,334,W 20 St & 7 Ave,40.742388,-73.997262,993,Subscriber,1968.0,2
1,IL9boN,15230,Thu,18,2013-09-12 18:38:53,257,Lispenard St & Broadway,40.719392,-74.002472,2013-09-12 18:48:34,236,St Marks Pl & 2 Ave,40.728419,-73.98714,581,Subscriber,1983.0,1
2,46clGB,17942,Wed,19,2013-09-18 19:44:04,479,9 Ave & W 45 St,40.760193,-73.991255,2013-09-18 19:50:05,513,W 56 St & 10 Ave,40.768254,-73.988639,361,Subscriber,1989.0,1
3,v7vdFt,19683,Sat,11,2013-09-28 11:54:37,527,E 33 St & 1 Ave,40.743156,-73.974347,2013-09-28 12:03:58,441,E 52 St & 2 Ave,40.756014,-73.967416,561,Subscriber,1988.0,2
4,VGBsb5,18024,Sat,18,2013-09-07 18:08:22,521,8 Ave & W 31 St,40.75045,-73.994811,2013-09-07 18:46:38,476,E 31 St & 3 Ave,40.743943,-73.979661,2296,Non-Subscriber,,0


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 18 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   trip_id                  50000 non-null  object 
 1   bike_id                  50000 non-null  int64  
 2   weekday                  50000 non-null  object 
 3   start_hour               50000 non-null  int64  
 4   start_time               50000 non-null  object 
 5   start_station_id         50000 non-null  int64  
 6   start_station_name       50000 non-null  object 
 7   start_station_latitude   50000 non-null  float64
 8   start_station_longitude  50000 non-null  float64
 9   end_time                 50000 non-null  object 
 10  end_station_id           50000 non-null  int64  
 11  end_station_name         50000 non-null  object 
 12  end_station_latitude     50000 non-null  float64
 13  end_station_longitude    50000 non-null  float64
 14  trip_duration         

In [8]:
# look at count per hour
df['start_hour'].value_counts()

start_hour
17    5070
18    4871
16    3791
19    3303
8     3284
15    3129
14    3003
13    2979
12    2900
9     2811
11    2277
20    2208
10    2166
7     2043
21    1536
22    1323
23     932
6      911
0      522
1      330
5      208
2      194
3      120
4       89
Name: count, dtype: int64



Add a count column to count how many of rides during each hour were taken from a given station.

In [9]:
df['count'] = 1

Create new df with groupby `start_station_id`, `start_station_latitude`, `start_station_longitude` and sum up `count` column.

In [10]:
df2 = pd.DataFrame(df.groupby(['start_station_id', 'start_station_latitude', 'start_station_longitude'])['count']\
                        .sum().sort_values(ascending=False))

df2.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count
start_station_id,start_station_latitude,start_station_longitude,Unnamed: 3_level_1
459,40.746745,-74.007756,536
293,40.730287,-73.990765,505
519,40.751884,-73.977702,503
497,40.73705,-73.990093,496
426,40.717548,-74.013221,465


In [11]:
# create list of lat/long and count (as weight)
lst = df2.groupby(['start_station_latitude', 'start_station_longitude']).sum().reset_index().values.tolist()

## Create Heat Map

In [12]:
# Specify the directory path
directory = './images'

if not os.path.exists(directory):
    os.makedirs(directory)

In [13]:
# add data to basemap 
HeatMap(data=lst, radius=12).add_to(base_map);

# save base map as .html
base_map.save('./images/bike_station_HeatMap.html')

# call map 
base_map

## Create Heat Map with Time

The data passed to HeatMapWithTime, needs to be lists within lists, with each list representing an hour. 

In [14]:
df_hour_list = [] # create blank list

# loop through each hour
for hour in df['start_hour'].sort_values().unique(): 
    df_hour_list.append(df.loc[df['start_hour'] == hour, # for each hour append to list  
    ['start_station_latitude', 'start_station_longitude', 'count']]\
    .groupby(['start_station_latitude', 'start_station_longitude'])\
    .sum().reset_index().values.tolist()) # sum totals per station, reset index and create list
    
# preview output
df_hour_list

[[[40.680342423, -73.9557689392, 1.0],
  [40.68312489, -73.97895137, 2.0],
  [40.68317813, -73.9659641, 1.0],
  [40.68382604, -73.97632328, 1.0],
  [40.68528172, -73.97805813, 2.0],
  [40.68683208, -73.9796772, 1.0],
  [40.68691865, -73.976682, 2.0],
  [40.68753406, -73.97265183, 2.0],
  [40.68764484, -73.96968902, 1.0],
  [40.68807003, -73.98410637, 3.0],
  [40.68864636, -73.98263429, 1.0],
  [40.68926942, -73.98912867, 7.0],
  [40.68940747, -73.96885458, 1.0],
  [40.6906495, -73.95643107, 1.0],
  [40.69089272, -73.99612349, 5.0],
  [40.69236178, -73.98631746, 3.0],
  [40.69246277, -73.98963911, 2.0],
  [40.693261, -73.968896, 1.0],
  [40.69363137, -73.96223558, 1.0],
  [40.695065, -73.987167, 1.0],
  [40.69760127, -73.99344559, 3.0],
  [40.69766564, -73.98476437, 1.0],
  [40.700469, -73.991454, 1.0],
  [40.70122128, -74.01234218, 1.0],
  [40.7014851, -73.98656928, 2.0],
  [40.70251526, -74.01427023, 1.0],
  [40.70255088, -73.98940236, 1.0],
  [40.70277159, -73.99383605, 1.0],
  [40.7

In [15]:
# create 2nd base map for heat map with time
base_map_2 = generateBaseMap(nyc)

In [16]:
# create a more meaningful index for heat map with time

start = datetime(2013,1,1,0)
end = datetime(2013,1,1,23)
daterange = pd.date_range(start=start,
                          end=end,
                         periods=24) # use pandas daterange function to generate date range object

time_index = [d.strftime("%I:%M %p") for d in daterange] # format time with AM/PM


In [17]:
# instantiate HeatMapWithTime
HeatMapWithTime(df_hour_list,radius=11,
                index=time_index,
                gradient={0.1: 'blue', 0.5: 'lime', 0.7: 'orange', 1: 'red'}, 
                min_opacity=0.4, 
                max_opacity=0.8, 
                use_local_extrema=True)\
                .add_to(base_map_2)

# save as html
base_map_2.save('./images/heatmapwithtime_bikeshare.html')

# call result
base_map_2