**START**

Load in pandas and sqlalchemy's create_engine & text
Then connect to Postgres and create the engine

In [None]:
import os
from urllib.parse import quote_plus

import folium
import geopandas as gpd
import numpy
import pandas as pd
import seaborn as sns
from folium import plugins
from folium.plugins import HeatMap
from folium.plugins import MarkerCluster
from sqlalchemy import create_engine, text
# Connection settings for the Postgres instance. Every value can be overridden
# through an environment variable so real credentials never have to be saved in
# the notebook; the fallbacks reproduce the original local-development URL.
database_name = 'scooters'
db_user = os.environ.get('SCOOTERS_DB_USER', 'postgres')
db_password = os.environ.get('SCOOTERS_DB_PASSWORD', 'postgres')
db_host = os.environ.get('SCOOTERS_DB_HOST', 'localhost')
db_port = os.environ.get('SCOOTERS_DB_PORT', '5433')

# quote_plus escapes characters such as '@' or '/' in the user name / password
# that would otherwise be misread as URL delimiters.
connection_string = (
    f"postgresql://{quote_plus(db_user)}:{quote_plus(db_password)}"
    f"@{db_host}:{db_port}/{database_name}"
)
engine = create_engine(connection_string)

**EDA**

In [None]:
# Row counts of the two tables, stacked into a single result set.
count_all_rows = '''
(SELECT
    'scooters' table,
    COUNT(*)
FROM scooters)
UNION
(SELECT
    'trips' table,
    COUNT(*)
FROM trips);
'''

# The connection is only needed while the query runs; the context manager
# releases it afterwards.
with engine.connect() as conn:
    counts = pd.read_sql(sql=text(count_all_rows), con=conn)

counts

In [None]:
# In Postgres, `<row> IS NOT NULL` is true only when every column of the row is
# non-null, so negating it keeps the rows that contain at least one NULL.
find_nulls_scooters = '''
SELECT *
FROM scooters
WHERE NOT(scooters IS NOT NULL);
'''

find_nulls_trips = '''
SELECT *
FROM trips
WHERE NOT(trips IS NOT NULL);
'''

# Each query runs on its own short-lived connection.
with engine.connect() as conn:
    nulls_scooters = pd.read_sql(sql=text(find_nulls_scooters), con=conn)

with engine.connect() as conn:
    nulls_trips = pd.read_sql(sql=text(find_nulls_trips), con=conn)

# .info() prints its report itself, so both summaries show up in the output.
nulls_scooters.info()
nulls_trips.info()

While the `.info()` part of the output is a bit counterintuitive, it shows that there are 770 null values in the scooters table and none in the trips table.

All of the null values are in the chargelevel column, and looking at the full output, they all belong to Bolt and Spin.

In [None]:
# Earliest and latest timestamp recorded in each table.
date_range = '''
(SELECT
    'scooters' table,
    MIN(pubdatetime) begin,
    MAX(pubdatetime) end
FROM scooters)
UNION
(SELECT
    'trips' table,
    MIN(pubtimestamp) begin,
    MAX(pubtimestamp) end
FROM trips);
'''

with engine.connect() as conn:
    dates = pd.read_sql(sql=text(date_range), con=conn)

dates

Both tables contain 3 months of data and both begin on May 1st, but the trips table ends a day after the scooters table. Looking at why, the trips all began before midnight on July 31st.

In [None]:
# Per-company count of trips with tripduration above 1440
# (presumably minutes, i.e. longer than 24 hours -- confirm the unit).
long_trips = '''
SELECT 
    companyname,
    COUNT(*)
FROM trips
WHERE tripduration > 1440
GROUP BY companyname;
'''

with engine.connect() as conn:
    long = pd.read_sql(sql=text(long_trips), con=conn)

long

In [None]:
# Per-company count of trips that have BOTH a tripduration below 1 and a
# non-positive tripdistance.
short_trips = '''
SELECT
    companyname,
    COUNT(*)
FROM trips
WHERE tripduration < 1
    AND tripdistance <= 0
GROUP BY companyname;
'''

with engine.connect() as conn:
    short = pd.read_sql(sql=text(short_trips), con=conn)

short

This data was supposed to have been cleaned before being submitted to the city, which includes stripping out all trips shorter than a minute or longer than 24 hours.

There are ~7,000 trips longer than 24 hours and over 9,000 trips shorter than one minute.

In [None]:
# Distinct scooter ids per company as seen in the scooters table.
available_scooters = '''
SELECT
    companyname company,
    COUNT(DISTINCT sumdid) scooters
FROM scooters
GROUP BY companyname
'''

# Distinct scooter ids per company that appear in at least one trip.
active_scooters = '''
SELECT
    companyname company,
    COUNT(DISTINCT sumdid) scooters
FROM trips
GROUP BY companyname
'''

with engine.connect() as conn:
    available = pd.read_sql(sql=text(available_scooters), con=conn)

with engine.connect() as conn:
    active = pd.read_sql(sql=text(active_scooters), con=conn)

# NOTE(review): only `available` is displayed; `active` is loaded but not shown in this cell.
available

Filter trips table:
- Remove trips under a minute
- Remove all zero-distance trips
- Remove unreasonably long trips
    - These scooters have an average top speed of 15 mph, and generally have a range of about 50 miles.
    - That said, the batteries should last on average ~3.5 hours

In [None]:
# Hex colour assigned to each scooter company, keyed by company name.
colors = {
    'Bird': "#007ACC",
    'Lyft': "#FF99CC",
    'Bolt Mobility': "#FFFF4D",
    'SPIN': "#E62E00",
    'Gotcha': "#FF9933",
    'JUMP': "#B366FF",
    'Lime': "#00CC00",
}
# Second name bound to the very same dict object (identifier spelling kept as-is).
pallette = colors

In [None]:
# Load only the trips that pass the plausibility filters:
#   * tripduration strictly between 1 and 200 (presumably minutes -- confirm)
#   * tripdistance strictly between 0 and 264000 (presumably feet; 264,000 ft
#     would be 50 miles -- confirm the unit)
trips_clean = '''
SELECT *
FROM trips
WHERE tripduration > 1.0
    AND tripduration < 200.0
    AND tripdistance > 0
    AND tripdistance < 264000
'''

with engine.connect() as connection:
    trips = pd.read_sql(text(trips_clean), con = connection)

# A cell only renders its last expression, so the summary statistics are shown
# explicitly; otherwise the describe() result would be silently discarded.
display(trips.describe())

trips.head()

In [None]:
# Average number of uses per scooter per day, by company.
#
# The CTE yields one row per scooter per calendar day. The date has to be part
# of the grouping: de-duplicating on (sumdid, company, uses_per_day) alone would
# merge different days on which a scooter happened to have the same number of
# trips and bias the average.
# COUNT(DISTINCT pubtimestamp) counts trips that share an identical timestamp
# for the same scooter only once.
# The result column is explicitly named `avg`, the name Postgres gives an
# unaliased AVG(...) expression.
daily_use = '''
WITH daily_use AS
    (SELECT
        sumdid,
        companyname AS company,
        DATE(pubtimestamp) AS use_date,
        COUNT(DISTINCT pubtimestamp) AS uses_per_day
    FROM trips
    WHERE tripduration > 1.0
        AND tripduration < 200.0
        AND tripdistance > 0
        AND tripdistance < 264000
    GROUP BY sumdid, companyname, DATE(pubtimestamp))
SELECT
    company,
    AVG(uses_per_day) AS avg
FROM daily_use
GROUP BY company
'''

with engine.connect() as connection:
    usage = pd.read_sql(text(daily_use), con = connection)

usage

In [None]:
# Bar chart of the per-company averages held in `usage`, which carries a
# 'company' column and the 'avg' column produced by AVG(...) in the daily_use query.
ax = sns.barplot(data=usage, x='company', y='avg')
ax.set(
    xlabel='Company',
    ylabel='Average uses per scooter per day',
    title='Average daily scooter use by company',
);

**MAPPING**

In [None]:
# Zip code polygons, reduced to the identifying columns plus the geometry.
zipcodes = gpd.read_file('../data/zipcodes.geojson')[['zip', 'po_name', 'geometry']]

# Point layer of trip start locations, tagged with the CRS of the zip layer.
scooter_start = trips[['companyname', 'triprecordnum', 'sumdid', 'startlongitude', 'startlatitude']]
scooter_start_geo = gpd.GeoDataFrame(
    scooter_start,
    geometry=gpd.points_from_xy(scooter_start['startlongitude'], scooter_start['startlatitude']),
    crs=zipcodes.crs,
)

# Point layer of trip end locations, built the same way.
scooter_end = trips[['companyname', 'triprecordnum', 'sumdid', 'endlongitude', 'endlatitude']]
scooter_end_geo = gpd.GeoDataFrame(
    scooter_end,
    geometry=gpd.points_from_xy(scooter_end['endlongitude'], scooter_end['endlatitude']),
    crs=zipcodes.crs,
)

In [None]:
# Attach zip code attributes to every start / end point that lies within a zip polygon
# (default inner join, so points outside all polygons are dropped).
starting_zip = scooter_start_geo.sjoin(zipcodes, predicate='within')
ending_zip = scooter_end_geo.sjoin(zipcodes, predicate='within')

In [None]:
# Peek at the first five start points with their matched zip code.
starting_zip.head(n=5)

In [None]:
# Peek at the first five end points with their matched zip code.
ending_zip.head(n=5)

Now let's look at where scooter trips are taking people. Counting up the number of trips that began in each zip code and comparing them with the number of trips ending in each zip code, we can see which areas people are more likely to come from and go to.

In [None]:
# Number of trips starting / ending in each zip code, as one-column frames indexed by zip.
start_by_zip = starting_zip['zip'].value_counts().to_frame()
end_by_zip = ending_zip['zip'].value_counts().to_frame()
def delta(start_by_zip, end_by_zip):
    """Return the net trip flow per zip code (trips ended minus trips started).

    Both arguments are frames indexed by zip code. A zip code that appears in
    only one of them is treated as having a count of 0 in the other, so it
    still gets a numeric net flow instead of NaN.
    """
    return end_by_zip.sub(start_by_zip, fill_value=0)

In [None]:
# Zip codes ordered from the largest net gain of trips downwards; show the top three.
net_flow = delta(start_by_zip, end_by_zip)
destinations = net_flow.sort_values('count', ascending=False)
destinations.head(n=3)

In [None]:
# Zip codes ordered from the largest net loss of trips upwards; show the top three.
net_flow = delta(start_by_zip, end_by_zip)
origins = net_flow.sort_values('count')  # ascending order is the default
origins.head(n=3)

More trips ended in 37201, 37207, and 37209 than started there, while more trips began in 37204, 37219, and 37203 than ended there. Now let's look into trips to/from these zip areas.

In [None]:
# Trip records with both end points, turned into two point layers:
# one positioned at the start coordinates, one at the end coordinates.
origin_destination_trips = trips[[
    'companyname', 'triprecordnum', 'sumdid',
    'startlongitude', 'startlatitude',
    'endlongitude', 'endlatitude',
    'triproute',
]]
origin_trips_geo = gpd.GeoDataFrame(
    origin_destination_trips,
    geometry=gpd.points_from_xy(
        origin_destination_trips['startlongitude'],
        origin_destination_trips['startlatitude'],
    ),
    crs=zipcodes.crs,
)
dest_trips_geo = gpd.GeoDataFrame(
    origin_destination_trips,
    geometry=gpd.points_from_xy(
        origin_destination_trips['endlongitude'],
        origin_destination_trips['endlatitude'],
    ),
    crs=zipcodes.crs,
)

In [None]:
# Tag each trip with the zip code containing its start point / its end point.
origin_zip = origin_trips_geo.sjoin(zipcodes, predicate='within')
destination_zip = dest_trips_geo.sjoin(zipcodes, predicate='within')

In [None]:
# Trips per starting zip code, most frequent first.
origin_zip.loc[:, 'zip'].value_counts()

In [None]:
# Trips per ending zip code, most frequent first.
destination_zip.loc[:, 'zip'].value_counts()

In [None]:
# Trips whose start point lies in zip code 37204.
starts_in_37204 = origin_zip['zip'] == '37204'
origin_37204 = gpd.GeoDataFrame(origin_zip[starts_in_37204])

In [None]:
# First five trips that started in 37204.
origin_37204.head(n=5)

In [None]:
# Trips whose end point lies in zip code 37201.
ends_in_37201 = destination_zip['zip'] == '37201'
destination_37201 = gpd.GeoDataFrame(destination_zip[ends_in_37201])

In [None]:
# First five trips that ended in 37201.
destination_37201.head(n=5)

In [None]:
# County outline; its centroid is used as the initial centre of the folium maps.
county = gpd.read_file('../data/Davidson County Border (GIS).geojson')

# Compute the centroids once and take the row labelled 0. (A bare
# `county.geometry.centroid` statement in the middle of a cell is never
# displayed, so it only repeated the computation.)
center = county.geometry.centroid[0]
print(center)

# folium expects [latitude, longitude], i.e. [y, x] of the point.
map_center = [center.y, center.x]

In [None]:
# Reload the zip layer from disk (all columns) and display the rows of the two
# zip codes that the maps focus on.
zipcodes = gpd.read_file('../data/zipcodes.geojson')
zips_of_interest = ['37201', '37204']
zipcodes[zipcodes['zip'].isin(zips_of_interest)]

Set the color palette for markers by company to match everyone's graphs.

For future reference on the maps, 37201 is ID #36 and 37204 is ID #2

In [None]:
# Map of the start locations of every trip that ended in zip code 37201.
scooters_37201 = folium.Map(location = map_center, tiles="Cartodb Positron", zoom_start = 12)
marker_cluster = MarkerCluster().add_to(scooters_37201)

# Pick the outline by zip value rather than by a hard-coded row label, so the
# correct polygon is drawn even if the row order of the zip file changes.
outline_37201 = zipcodes.loc[zipcodes['zip'] == '37201', 'geometry'].iloc[0]
folium.GeoJson(
    outline_37201,
    style_function=lambda feature: {"color": "black", "weight": 2, "dashArray": "10, 5","fillOpacity":0.5},
).add_to(scooters_37201)

# One clustered marker per trip, placed at the trip's start point. Iterating
# over the two coordinate columns avoids building a Series per row.
trip_starts = zip(destination_37201['startlatitude'], destination_37201['startlongitude'])
for lat, lng in trip_starts:
    folium.Marker(
        location=[lat, lng],
        # a fresh Icon object is created for every marker
        icon=folium.Icon(color="blue", icon="exclamation-triangle", prefix='fa'),
    ).add_to(marker_cluster)

scooters_37201.save('../maps/scooters_37201.html')

In [None]:
# Map of the end locations of every trip that started in zip code 37204.
scooters_37204 = folium.Map(location = map_center, tiles="Cartodb Positron", zoom_start = 12)
marker_cluster = MarkerCluster().add_to(scooters_37204)

# Pick the outline by zip value rather than by a hard-coded row label, so the
# correct polygon is drawn even if the row order of the zip file changes.
outline_37204 = zipcodes.loc[zipcodes['zip'] == '37204', 'geometry'].iloc[0]
folium.GeoJson(outline_37204).add_to(scooters_37204)

# One clustered marker per trip, placed at the trip's end point. Iterating
# over the two coordinate columns avoids building a Series per row.
trip_ends = zip(origin_37204['endlatitude'], origin_37204['endlongitude'])
for lat, lng in trip_ends:
    folium.Marker(
        location=[lat, lng],
        # a fresh Icon object is created for every marker
        icon=folium.Icon(color="blue", icon="exclamation-triangle", prefix='fa'),
    ).add_to(marker_cluster)

scooters_37204.save('../maps/scooters_37204.html')

Copy cleaned bus stop data set from geospatial notebook

In [None]:
# Bus stops as a point layer (lng -> x, lat -> y) in the CRS of the zip layer,
# then joined to the zip polygon each stop lies within.
bus_stops = pd.read_csv('../data/busstops_cleaned.csv')
bus_geo = gpd.GeoDataFrame(
    bus_stops,
    geometry=gpd.points_from_xy(bus_stops['lng'], bus_stops['lat']),
    crs=zipcodes.crs,
)
stops_by_zip = bus_geo.sjoin(zipcodes, predicate='within')

In [None]:
# Map combining bus stop locations with a heat map of trip start points.
trip_start_bus_stop = folium.Map(location = map_center, tiles="Cartodb Positron", zoom_start = 12)

# Bus stops are drawn directly on the map as small purple circles. No
# MarkerCluster is created here: nothing was ever added to it, and an empty
# cluster only shows up as a stray entry in the layer control.
for lat, lng in zip(stops_by_zip['lat'], stops_by_zip['lng']):
    folium.Circle(
        location=[lat, lng],
        radius=50,
        fill_color="purple",
        fill_opacity=0.8,
        color="black",
        weight=1,
    ).add_to(trip_start_bus_stop)

# [latitude, longitude] pair for every trip start, built column-wise instead of
# appending row by row.
start_trips = origin_zip[['startlatitude', 'startlongitude']].to_numpy().tolist()

# The heat map sits in its own named layer so it can be toggled in the layer control.
heat_layer = folium.FeatureGroup(name='Heat Map').add_to(trip_start_bus_stop)
HeatMap(
    start_trips,
    radius = 15,
    min_opacity = 0.8,
    gradient={.8: '#ffc2c2', .95: '#ff7970', 1: '#ff0000'},
).add_to(heat_layer)
folium.LayerControl().add_to(trip_start_bus_stop)

trip_start_bus_stop.save('../maps/trip_start_bus_stop.html')