**START**

Load pandas, geopandas, matplotlib, folium, and SQLAlchemy's `create_engine` and `text`.
Then build the connection string for the Postgres `scooters` database and create the engine.

In [1]:
import os

import folium
import geopandas as gpd
import matplotlib.pyplot as plt
import pandas as pd
from folium.plugins import MarkerCluster
from sqlalchemy import create_engine, text
# Database connection settings.
# The full connection URL can be supplied through the SCOOTERS_DB_URL
# environment variable so real credentials never have to be committed with the
# notebook. When the variable is unset, the original local development URL
# (postgres/postgres on localhost:5433) is used, so existing setups keep working.
database_name = 'scooters'
connection_string = os.environ.get(
    'SCOOTERS_DB_URL',
    f"postgresql://postgres:postgres@localhost:5433/{database_name}",
)
# Engine shared by every query cell below; connections are opened per cell.
engine = create_engine(connection_string)

**EDA**

In [2]:
# Row count of each table, labelled with the table name.
# - Column labels that are SQL keywords ("table") are written with AS and
#   double quotes; a bare keyword label is only accepted by newer PostgreSQL
#   releases, while the quoted form parses everywhere. Resulting column names
#   are unchanged ("table", "count").
# - UNION ALL is used because the two branches can never produce duplicate
#   rows (the label differs), so the de-duplication step of UNION is wasted work.
count_all_rows = '''
SELECT
    'scooters' AS "table",
    COUNT(*) AS count
FROM scooters
UNION ALL
SELECT
    'trips' AS "table",
    COUNT(*) AS count
FROM trips;
'''

with engine.connect() as connection:
    counts = pd.read_sql(text(count_all_rows), con = connection)

counts

Unnamed: 0,table,count
0,scooters,73414043
1,trips,565522


In [3]:
# Find every row that has at least one NULL column, in each table.
# In PostgreSQL, `<row> IS NOT NULL` is true only when all columns of the row
# are non-null, so negating it selects rows containing any NULL.
find_nulls_scooters = '''
SELECT *
FROM scooters
WHERE NOT(scooters IS NOT NULL);
'''

find_nulls_trips = '''
SELECT *
FROM trips
WHERE NOT(trips IS NOT NULL);
'''

# Run both checks over a single connection.
with engine.connect() as conn:
    nulls_scooters = pd.read_sql(sql=text(find_nulls_scooters), con=conn)
    nulls_trips = pd.read_sql(sql=text(find_nulls_trips), con=conn)

# .info() prints the per-column non-null counts for the offending rows.
for null_rows in (nulls_scooters, nulls_trips):
    null_rows.info()

nulls_scooters

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 770 entries, 0 to 769
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   pubdatetime  770 non-null    datetime64[ns]
 1   latitude     770 non-null    float64       
 2   longitude    770 non-null    float64       
 3   sumdid       770 non-null    object        
 4   sumdtype     770 non-null    object        
 5   chargelevel  0 non-null      object        
 6   sumdgroup    770 non-null    object        
 7   costpermin   770 non-null    float64       
 8   companyname  770 non-null    object        
dtypes: datetime64[ns](1), float64(3), object(5)
memory usage: 54.3+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 0 entries
Data columns (total 16 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   pubtimestamp    0 non-null      object
 1   companyname     0 non-null      object
 2   triprecordn

Unnamed: 0,pubdatetime,latitude,longitude,sumdid,sumdtype,chargelevel,sumdgroup,costpermin,companyname
0,2019-06-15 23:20:36,36.158512,-86.780570,Powered-751e19ec-b697-a51f-04ea-0bf9a2e125c9,Powered,,Scooter,0.15,Bolt
1,2019-06-15 23:25:37,36.158560,-86.780545,Powered-751e19ec-b697-a51f-04ea-0bf9a2e125c9,Powered,,Scooter,0.15,Bolt
2,2019-06-15 23:30:37,36.158646,-86.780541,Powered-751e19ec-b697-a51f-04ea-0bf9a2e125c9,Powered,,Scooter,0.15,Bolt
3,2019-06-15 23:35:37,36.158682,-86.780519,Powered-751e19ec-b697-a51f-04ea-0bf9a2e125c9,Powered,,Scooter,0.15,Bolt
4,2019-06-15 23:40:37,36.158683,-86.780522,Powered-751e19ec-b697-a51f-04ea-0bf9a2e125c9,Powered,,Scooter,0.15,Bolt
...,...,...,...,...,...,...,...,...,...
765,2019-06-14 18:14:32,36.150256,-86.813090,Powered-72a04621-3a01-05f2-3c9b-abae6d7387ce,Powered,,Scooter,0.15,Bolt
766,2019-06-14 18:19:32,36.150247,-86.813098,Powered-72a04621-3a01-05f2-3c9b-abae6d7387ce,Powered,,Scooter,0.15,Bolt
767,2019-06-14 18:49:33,36.150258,-86.813127,Powered-72a04621-3a01-05f2-3c9b-abae6d7387ce,Powered,,Scooter,0.15,Bolt
768,2019-06-14 18:54:33,36.150258,-86.813127,Powered-72a04621-3a01-05f2-3c9b-abae6d7387ce,Powered,,Scooter,0.15,Bolt


While the `.info()` part of the output is a bit counterintuitive, it shows that 770 rows in the scooters table contain a null value and that no rows in the trips table do.

All of the null values are in the chargelevel column, and looking at the full output, they all belong to Bolt and Spin.

In [4]:
# Earliest and latest publication timestamp in each table.
# - "table", "begin" and "end" are SQL keywords, so they are written as quoted
#   labels with AS; bare keyword labels are only accepted by newer PostgreSQL
#   releases. The resulting column names are unchanged.
# - UNION ALL avoids the pointless de-duplication pass of UNION: the two rows
#   always differ in their label.
date_range = '''
SELECT
    'scooters' AS "table",
    MIN(pubdatetime) AS "begin",
    MAX(pubdatetime) AS "end"
FROM scooters
UNION ALL
SELECT
    'trips' AS "table",
    MIN(pubtimestamp) AS "begin",
    MAX(pubtimestamp) AS "end"
FROM trips;
'''

with engine.connect() as connection:
    dates = pd.read_sql(text(date_range), con = connection)

dates

Unnamed: 0,table,begin,end
0,scooters,2019-05-01 00:01:41.247,2019-07-31 23:59:57
1,trips,2019-05-01 00:00:55.423,2019-08-01 07:04:00


In [5]:
# Pull up to 100 trips whose end date is after July 31st, to see why the trips
# table extends one day past the scooters table.
aug_first = '''
SELECT *
FROM trips
WHERE enddate > '2019-07-31'
LIMIT 100;
'''

with engine.connect() as conn:
    late = pd.read_sql(sql=text(aug_first), con=conn)

late

Unnamed: 0,pubtimestamp,companyname,triprecordnum,sumdid,tripduration,tripdistance,startdate,starttime,enddate,endtime,startlatitude,startlongitude,endlatitude,endlongitude,triproute,create_dt
0,2019-08-01 00:00:06.593,Bird,BRD1816,PoweredYSEGE,7.00000,1968.50400,2019-07-31,23:54:46.096666,2019-08-01,00:01:29.436666,36.151100,-86.783300,36.152000,-86.79110,"[(36.151103, -86.783327), (36.150872, -86.7831...",2019-08-02 05:30:21.230
1,2019-08-01 00:00:06.593,Bird,BRD1819,PoweredYTT8X,8.00000,0.00000,2019-07-31,23:52:40.086666,2019-08-01,00:00:41.923333,36.146100,-86.799500,36.151800,-86.79730,"[(36.14615, -86.79952), (36.146129, -86.799557...",2019-08-02 05:30:21.340
2,2019-08-01 00:00:06.593,Bird,BRD1818,Powered799T4,7.00000,0.00000,2019-07-31,23:53:17.616666,2019-08-01,00:00:27.170000,36.158100,-86.769200,36.158100,-86.76920,"[(36.158103, -86.769174)]",2019-08-02 05:30:21.290
3,2019-08-01 00:00:06.593,Bird,BRD1820,PoweredJF4AU,12.00000,0.00000,2019-07-31,23:48:24.240000,2019-08-01,00:00:16.660000,36.161200,-86.770900,36.162500,-86.77430,"[(36.161257, -86.770772), (36.161298, -86.7707...",2019-08-02 05:30:21.373
4,2019-08-01 00:00:06.593,Bird,BRD1817,PoweredQDRMQ,8.00000,328.08400,2019-07-31,23:53:41.416666,2019-08-01,00:01:58.773333,36.152300,-86.783900,36.145800,-86.78040,"[(36.152383, -86.783852), (36.152464, -86.7836...",2019-08-02 05:30:21.260
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73,2019-08-01 04:53:48.000,JUMP,JMP1,Poweredb8a3a269-d1ca-571f-9b2f-89b7399b5537,18.44065,7920.00000,2019-07-31,23:41:52,2019-08-01,00:00:18,36.153687,-86.784580,36.164158,-86.77690,"[('36.153685', '-86.784578'), ('36.153685', '-...",2019-08-02 08:21:26.803
74,2019-08-01 07:04:00.000,JUMP,JMP35,Poweredf077a919-d569-5e70-8ca7-71d179ffacf9,142.34561,20433.60000,2019-07-31,23:26:15,2019-08-01,01:48:35,36.155735,-86.775185,36.173904,-86.78545,"[('36.155736', '-86.775181'), ('36.155531', '-...",2019-08-02 08:24:21.967
75,2019-08-01 07:04:00.000,JUMP,JMP34,Powered784fcc92-bd1a-5f6a-8314-b02aa21b4bfa,209.40965,14889.60000,2019-07-31,22:19:08,2019-08-01,01:48:33,36.160050,-86.776720,36.163410,-86.78210,"[('36.160051', '-86.77672'), ('36.160051', '-8...",2019-08-02 08:24:22.000
76,2019-08-01 00:01:02.110,Lyft,LFT2,Powered318477,3.22660,790.68244,2019-07-31,23:57:48.396666,2019-08-01,00:01:01.993333,36.157490,-86.777140,36.157950,-86.77603,"[(36.15749, -86.77714), (36.15748, -86.7772), ...",2019-08-02 10:27:50.310


Both tables contain 3 months of data and both begin on May 1st, but the trips table ends a day after the scooters table. Looking at why, the trips all began before midnight on July 31st.

In [6]:
# Per-company count of trips with tripduration above 1440
# (24 hours, if the column is in minutes as the surrounding notes imply).
long_trips = '''
SELECT 
    companyname,
    COUNT(*)
FROM trips
WHERE tripduration > 1440
GROUP BY companyname;
'''

with engine.connect() as conn:
    long = pd.read_sql(sql=text(long_trips), con=conn)

long

Unnamed: 0,companyname,count
0,Bolt Mobility,6908
1,Lyft,2
2,SPIN,28


In [7]:
# Per-company count of trips that are both under one duration unit
# (tripduration < 1) and cover no distance (tripdistance <= 0).
short_trips = '''
SELECT
    companyname,
    COUNT(*)
FROM trips
WHERE tripduration < 1
    AND tripdistance <= 0
GROUP BY companyname;
'''

with engine.connect() as conn:
    short = pd.read_sql(sql=text(short_trips), con=conn)

short

Unnamed: 0,companyname,count
0,Bird,3928
1,Lime,377
2,Lyft,3405


This data was supposed to have been cleaned before being submitted to the city, which includes stripping out all trips shorter than a minute or longer than 24 hours.

There are ~7,000 trips longer than 24 hours and over 9,000 trips shorter than one minute; the query above, which also requires zero distance, accounts for ~7,700 of the short trips.

In [8]:
# Distinct scooter IDs per company, counted two ways:
#   available -> IDs that appear in the scooters table
#   active    -> IDs that appear in the trips table
available_scooters = '''
SELECT
    companyname company,
    COUNT(DISTINCT sumdid) scooters
FROM scooters
GROUP BY companyname
'''

active_scooters = '''
SELECT
    companyname company,
    COUNT(DISTINCT sumdid) scooters
FROM trips
GROUP BY companyname
'''

# Both aggregates are fetched over one connection.
with engine.connect() as conn:
    available = pd.read_sql(sql=text(available_scooters), con=conn)
    active = pd.read_sql(sql=text(active_scooters), con=conn)

available

In [None]:
# Show the per-company count of distinct scooter IDs found in the trips table.
active

In [None]:
# Per-company count of distinct scooters that appear in the scooters table but
# never appear in the trips table (i.e. scooters that were never ridden).
#
# NOT EXISTS is used instead of NOT IN (subquery): NOT IN evaluates to NULL for
# every row as soon as the subquery yields a single NULL sumdid, which would
# silently return an empty result. NOT EXISTS has no such trap and lets the
# planner use an anti-join.
#
# Matching is on sumdid only; company names are deliberately not compared
# because the outputs above show the two tables spell them differently
# (e.g. "Bolt" in scooters vs "Bolt Mobility" in trips).
availability = '''
SELECT
    s.companyname AS company,
    COUNT(DISTINCT s.sumdid) AS total_scooters
FROM scooters s
WHERE NOT EXISTS
    (SELECT 1
    FROM trips t
    WHERE t.sumdid = s.sumdid)
GROUP BY s.companyname;
'''

with engine.connect() as connection:
    unavailable = pd.read_sql(text(availability), con = connection)

unavailable

In [None]:
# Average number of trips per day for each scooter.
#
# The inner query counts trips per scooter per calendar day (by pubtimestamp);
# the outer query averages those daily counts per scooter. Only days on which a
# scooter made at least one trip contribute to its average.
#
# The previous version grouped by the full pubtimestamp, so each count was
# (almost always) 1, and the window average was taken within a single day --
# it never produced a per-scooter daily average and could return several
# rows per scooter.
daily_use = '''
SELECT
    sumdid,
    company,
    ROUND(AVG(trips_per_day), 2) AS avg_daily_usage
FROM
    (SELECT
        sumdid,
        companyname AS company,
        DATE(pubtimestamp) AS trip_date,
        COUNT(*) AS trips_per_day
    FROM trips
    GROUP BY sumdid, companyname, DATE(pubtimestamp)) daily
GROUP BY sumdid, company
'''

with engine.connect() as connection:
    usage = pd.read_sql(text(daily_use), con = connection)

# Scooters that average more than one trip on the days they are used.
multiple_uses = usage[usage['avg_daily_usage'] > 1.00]
multiple_uses

Filter trips table:
- Remove trips under a minute
- Remove all zero-distance trips
- Remove unreasonably long trips
    - These scooters have an average top speed of 15 mph, and generally have a range of about 50 miles.
    - That said, the batteries should last on average ~3.5 hours

In [None]:
# Load only plausible trips into pandas:
#   - tripduration strictly between 1 and 200 (minutes, going by the 1440 = 24h
#     threshold used earlier; 200 is just under the ~3.5 hour battery life)
#   - tripdistance strictly between 0 and 264000 (presumably feet:
#     264000 = 50 * 5280, i.e. the ~50 mile range -- TODO confirm units)
trips_clean = '''
SELECT *
FROM trips
WHERE tripduration > 1.0
    AND tripduration < 200.0
    AND tripdistance > 0
    AND tripdistance < 264000
'''

with engine.connect() as connection:
    trips = pd.read_sql(text(trips_clean), con = connection)

# A notebook cell only renders its final expression, so the summary statistics
# must be displayed explicitly; a bare `trips.describe()` here is discarded.
# `display` is provided by the IPython/Jupyter kernel.
display(trips.describe())

trips.head()

**MAPPING**

In [None]:
# Zip code polygons, trimmed to the columns needed for the spatial joins.
zipcodes = gpd.read_file('../data/zipcodes.geojson')[['zip', 'po_name', 'geometry']]


def _endpoint_frames(lon_col, lat_col):
    """Return (plain, geo) frames for one trip endpoint.

    `plain` holds the trip identifiers plus the given longitude/latitude
    columns taken from `trips`; `geo` is the same data as a GeoDataFrame with
    point geometry built from those coordinates and tagged with the zipcodes CRS.
    """
    plain = trips[['companyname', 'triprecordnum', 'sumdid', lon_col, lat_col]]
    geo = gpd.GeoDataFrame(
        plain,
        crs=zipcodes.crs,
        geometry=gpd.points_from_xy(plain[lon_col], plain[lat_col]),
    )
    return plain, geo


# Separate tables for start and end location data.
scooter_start, scooter_start_geo = _endpoint_frames('startlongitude', 'startlatitude')
scooter_end, scooter_end_geo = _endpoint_frames('endlongitude', 'endlatitude')

In [None]:
# Attach zip code attributes to each trip start / end point: a point is kept
# only when it lies within one of the zip code polygons.
starting_zip, ending_zip = (
    gpd.sjoin(points, zipcodes, predicate='within')
    for points in (scooter_start_geo, scooter_end_geo)
)

In [None]:
# Preview the trip start points after the spatial join with the zip codes.
starting_zip.head()

In [None]:
# Preview the trip end points after the spatial join with the zip codes.
ending_zip.head()

In [None]:
# Number of trips starting / ending in each zip code (index = zip code).
start_by_zip = pd.DataFrame(starting_zip['zip'].value_counts())
end_by_zip = pd.DataFrame(ending_zip['zip'].value_counts())

# Combine both counts into one table. Previously the merge result was thrown
# away and only the start counts were displayed.
# Joining on the index (the zip code) works regardless of how the pandas
# version names the value_counts() column; zips present on only one side get 0.
trips_by_zip = (
    start_by_zip
    .merge(
        end_by_zip,
        how='outer',
        left_index=True,
        right_index=True,
        suffixes=('_start', '_end'),
    )
    .fillna(0)
    .astype(int)
    .set_axis(['trip_starts', 'trip_ends'], axis=1)
    .rename_axis('zip')
)
trips_by_zip.head()

In [None]:
# Get centroid of Davidson County
#county = gpd.read_file('../data/Davidson County Border (GIS).geojson')
#county.geometry.centroid
# Use ID #0
#center = county.geometry.centroid[0]
#map_center = [center.y, center.x]

# Draw map of starting positions for each trip
#scooter_start_map = folium.Map(location = map_center, zoom_start = 10)
# Add marker cluster
#marker_cluster = MarkerCluster().add_to(scooter_start_map)
# Add zipcodes to map
#folium.GeoJson(zipcodes).add_to(scooter_start_map)
# Use a for loop to add projects
#for row_index, row_values in starting_zip.iterrows():
#    loc = [row_values['startlatitude'], row_values['startlongitude']]
#    pop = str(row_values['companyname'])
#    icon=folium.Icon(color="blue",icon="exclamation-triangle", prefix='fa')
#    
#    marker = folium.Marker(
#        location = loc, 
#        popup = pop,
#    icon = icon) 
#    
#    marker.add_to(marker_cluster)
#
#scooter_start_map.save('../maps/starting_location.html')
# Display the map
#scooter_start_map