In [1]:
import csv
from datetime import datetime
import numpy as np
import pandas as pd

# Use packages below for Map Visualizations using Basemap and Arcgis
#from mpl_toolkits.basemap import Basemap
#import arcgis
#from arcgis.gis import GIS
import warnings
warnings.filterwarnings('ignore')
import matplotlib as mpl
mpl.use('TkAgg')
import matplotlib.pyplot as plt
import networkx as nx

# Use package below for faster and more simple visualization than Basemap
import mplleaflet

# Trip records; the two timestamp columns are parsed into datetimes on load.
trips_data = pd.read_csv(
    'LDA Data Set.csv',
    parse_dates=['Start Date', 'End Date'],
)
# First 100 trips only, kept as a small sample for quick inspection.
test_trips_data = trips_data.iloc[:100]

# Station reference table (later cells read its 'name', 'long' and 'lat' columns).
station_data = pd.read_csv('station_data.csv')

In [2]:
# Preview the first rows of the 100-trip sample to check the columns and parsed dates.
test_trips_data.head()

Unnamed: 0,Trip ID,Duration,Start Date,Start Station,Start Terminal,End Date,End Station,End Terminal,Bike #,Subscriber Type,Zip Code
0,432946,406,2014-08-31 22:31:00,Mountain View Caltrain Station,28,2014-08-31 22:38:00,Castro Street and El Camino Real,32,17,Subscriber,94040
1,432945,468,2014-08-31 22:07:00,Beale at Market,56,2014-08-31 22:15:00,Market at 4th,76,509,Customer,11231
2,432944,534,2014-08-31 22:06:00,Beale at Market,56,2014-08-31 22:15:00,Market at 4th,76,342,Customer,11231
3,432942,1041,2014-08-31 21:45:00,Embarcadero at Sansome,60,2014-08-31 22:02:00,Steuart at Market,74,603,Customer,94521
4,432941,1091,2014-08-31 21:44:00,Embarcadero at Sansome,60,2014-08-31 22:02:00,Steuart at Market,74,598,Customer,94521


In [3]:
# Collapse the individual trips into one row per distinct (start, end) station
# pair, counting how many rides were taken on each route.
weighted_data = (
    trips_data
    .groupby(['Start Station', 'End Station'])
    .size()
    .reset_index(name='Rides')
    .rename(columns={'Start Station': 'Start_Station',
                     'End Station': 'End_Station'})
)

# Mean number of rides over all observed routes.
Average_Num_Rides = weighted_data['Rides'].mean()

# Display the nine busiest routes.
weighted_data.sort_values('Rides', ascending=False).iloc[:9]


Unnamed: 0,Start_Station,End_Station,Rides
655,Harry Bridges Plaza (Ferry Building),Embarcadero at Sansome,1689
1584,Townsend at 7th,San Francisco Caltrain (Townsend at 4th),1527
1239,San Francisco Caltrain 2 (330 Townsend),Townsend at 7th,1352
521,Embarcadero at Sansome,Steuart at Market,1128
86,2nd at Townsend,Harry Bridges Plaza (Ferry Building),1109
55,2nd at South Park,Market at Sansome,1039
645,Harry Bridges Plaza (Ferry Building),2nd at Townsend,1007
771,Market at 10th,San Francisco Caltrain (Townsend at 4th),974
1514,Steuart at Market,San Francisco Caltrain (Townsend at 4th),972


In [4]:
# Print a heading, then let the notebook display the mean rides-per-route value
# computed in the aggregation cell (the bare expression is the cell's output).
print('\nAverage Number of Rides: \n') 
Average_Num_Rides


Average Number of Rides: 



102.44007155635063

In [6]:
# Combine the number of rides leaving and arriving at each station to get the
# Total_Rides occurring at each station.
#
# Produces:
#   end_table   - dict {station: rides arriving}, ordered by rides descending
#   start_table - DataFrame with Start_Station, Rides (departures), To_Rides
#                 (arrivals), Total_Rides and Station_Name, ordered by
#                 departures descending
#   rides_table - the Station_Name / Total_Rides columns of start_table

# Rides departing from / arriving at each station.
departures = weighted_data.groupby('Start_Station')['Rides'].sum()
arrivals = weighted_data.groupby('End_Station')['Rides'].sum()

end_table = arrivals.sort_values(ascending=False).to_dict()

# Use the union of both station sets so a station that only ever appears as a
# destination is not silently dropped; it gets 0 departures.
all_stations = departures.index.union(arrivals.index)
start_table = (
    departures
    .reindex(all_stations, fill_value=0)
    .rename_axis('Start_Station')
    .reset_index()
    .sort_values(['Rides'], ascending=False)
)

# A station with departures but no arrivals has no entry in end_table; treat
# that as 0 arrivals so Total_Rides does not become NaN.
start_table['To_Rides'] = start_table['Start_Station'].map(end_table).fillna(0).astype(int)
start_table['Total_Rides'] = start_table['Rides'] + start_table['To_Rides']
start_table['Station_Name'] = start_table['Start_Station']
rides_table = start_table[['Station_Name', 'Total_Rides']]
rides_table.head()

Unnamed: 0,Station_Name,Total_Rides
50,San Francisco Caltrain (Townsend at 4th),29682
24,Harry Bridges Plaza (Ferry Building),16244
18,Embarcadero at Sansome,15151
51,San Francisco Caltrain 2 (330 Townsend),14490
2,2nd at Townsend,14235


In [7]:
# total_ride_dict
#   Type:     dict
#   Key:      station name
#   Value:    total number of rides going to and from that station
# Built by pairing the two columns of rides_table row by row.
total_ride_dict = dict(zip(rides_table['Station_Name'], rides_table['Total_Rides']))
        
#total_ride_dict

In [8]:
# weighted_edges
#   Type:     list of 3-tuples
#   Element:  (start station, end station, number of rides on that route)
# One tuple per row of weighted_data, in row order.
weighted_edges = list(
    weighted_data[['Start_Station', 'End_Station', 'Rides']]
    .itertuples(index=False, name=None)
)

#weighted_edges

In [9]:
# station_dict
#   Type:     dict with tuple values
#   Key:      station name (the 'name' column of station_data)
#   Value:    (long, lat) tuple -- note the order: longitude first, then latitude
station_dict = {
    name: (longitude, latitude)
    for name, longitude, latitude in zip(
        station_data['name'], station_data['long'], station_data['lat']
    )
}

# station_name_dict
#   Type:     dict with label-name values
#   Key:      station name (from rides_table)
#   Value:    the same station name, used as the node's label text
station_name_dict = {
    station: station for station in rides_table['Station_Name']
}

In [10]:
# Create the NetworkX directed multigraph of stations and routes.
G = nx.MultiDiGraph()

# Add one node per station with known coordinates, storing the (long, lat)
# tuple from station_dict as the node's 'pos' attribute.
G.add_nodes_from(
    (station, {'pos': coordinates}) for station, coordinates in station_dict.items()
)

# Add weighted edges: each (start, end, rides) triple becomes a directed edge
# whose 'weight' is the number of rides on that route.
G.add_weighted_edges_from(weighted_edges)

# Assign total ride count and label name as node attributes.
# add_node() updates the attributes of a node that already exists, and works on
# every networkx release; the former G.node[...] accessor was removed in
# networkx 2.4 and raises AttributeError there.
for station, number in total_ride_dict.items():
    G.add_node(station, rides=number)

for station, name in station_name_dict.items():
    G.add_node(station, station_name=name)

In [11]:
# Read the node attributes stored on the graph back out, one lookup per
# attribute name ('pos', 'station_name', 'rides'). Nodes that were never given
# a particular attribute are simply absent from that result.
pos = nx.get_node_attributes(G, 'pos')
station_name = nx.get_node_attributes(G,'station_name')
total_rides = nx.get_node_attributes(G,'rides')

#station_name

In [12]:
# Test draw without weights: every node gets the same size and colour, and the
# edges are drawn with default widths. Node positions come from `pos`.
# NOTE(review): font_size=100 is very large for node labels -- confirm intended.
nx.draw(G, pos, with_labels=True, node_size = 150, node_color='#00437c', font_size=100)
# Show the drawing through mplleaflet (the notebook uses it in place of Basemap).
mplleaflet.show()

In [13]:
# Draw the station-name labels at the node positions in a light font colour,
# keeping whatever draw_networkx_labels returns in `labels`.
labels = nx.draw_networkx_labels(G,pos, labels=station_name, font_color='#f5f5f5', font_size=20)


In [14]:
# Scale edge weights and node ride counts down to drawable sizes. The divisors
# were chosen for viewability on the map.
EDGE_WIDTH_SCALE = 600.0   # rides per unit of edge line width
NODE_SIZE_SCALE = 20       # rides per unit of node marker size
NODE_COLOR_SCALE = 100     # rides per unit on the colour-map axis

# Edge line widths, one per edge in G.edges() order.
weights = [attrs['weight'] / EDGE_WIDTH_SCALE for _, _, attrs in G.edges(data=True)]

# Node sizes and colour values, one per node in G.nodes() order.
# A node can exist without a 'rides' attribute (e.g. a station listed in
# station_data that has no trips); treat it as 0 rides instead of raising
# KeyError so the lists stay aligned with the node order.
sizes = [attrs.get('rides', 0) / NODE_SIZE_SCALE for _, attrs in G.nodes(data=True)]
colors = [attrs.get('rides', 0) / NODE_COLOR_SCALE for _, attrs in G.nodes(data=True)]

# Display name per node; fall back to the node key itself (which is the station
# name) when no 'station_name' attribute was attached.
names = [attrs.get('station_name', node) for node, attrs in G.nodes(data=True)]

# Label mapping for drawing: every node is labelled with its own key.
label_names = {node: node for node in G.nodes()}


In [15]:
#colors

In [16]:
# Draw nodes, labels and edges, then show the result through mplleaflet.

# station_name_dict maps every station name to itself, so this relabel leaves
# the node keys unchanged; it is kept so the cell's steps stay as before.
nx.relabel_nodes(G, mapping=station_name_dict, copy=False)

# Node size and colour both scale with the station's total rides.
# `labels=` and `arrows=` were dropped from this call: they are not parameters
# of draw_networkx_nodes (labels are drawn by draw_networkx_labels below) and
# current networkx releases raise TypeError on unknown keyword arguments.
nx.draw_networkx_nodes(G, pos, cmap=plt.get_cmap('jet'), node_size=sizes,
                       node_color=colors, vmax=300)
nx.draw_networkx_labels(G, pos, labels=label_names, font_color='r', font_size=16)
# Edge line width scales with the number of rides on the route.
nx.draw_networkx_edges(G, pos, width=weights)
mplleaflet.show()

In [None]:
#'#00437c'

In [16]:
# Draw only the station-name labels (light colour, small font) at the node
# positions, then show the result through mplleaflet.
nx.draw_networkx_labels(G, pos, labels=station_name, font_color='#f5f5f5', font_size=10)
mplleaflet.show()
## Originally intended to use Basemap, but switched to mplleaflet
#plt.figure(figsize=(20,5))
#fig, ax = plt.subplots()
#map_extent = [-122.53, 37.68, -122.35, 37.83]
#m = Basemap(llcrnrlon=map_extent[0], llcrnrlat=map_extent[1],
#             urcrnrlon=map_extent[2], urcrnrlat=map_extent[3],projection='lcc',resolution='f', epsg=4269)
#m.shadedrelief()
#m.drawcoastlines(color='gray')
#m.drawcountries(color='gray')
#m.drawstates(color='gray')
#m.arcgisimage(server='http://server.arcgisonline.com/ArcGIS', service='ESRI_StreetMap_World_2D', xpixels=1500, verbose=True)
#mplleaflet.show(fig=fig.figure)


In [None]:
# Static matplotlib rendering of the same network (no Leaflet map).
# Nodes are sized by total rides but drawn in one fixed colour.
# `labels=` and `arrows=` were dropped: they are not parameters of
# draw_networkx_nodes and current networkx releases raise TypeError on them.
# `cmap=` was dropped as well, since a colour map has no effect when
# node_color is a single colour string.
nx.draw_networkx_nodes(G, pos, node_size=sizes, node_color='#00437c')
nx.draw_networkx_labels(G, pos, labels=label_names, font_color='r', font_size=5)
# Edge line width scales with the number of rides on the route.
nx.draw_networkx_edges(G, pos, width=weights)
plt.show()


ERROR! Session/line number was not unique in database. History logging moved to new session 37
