In [1]:
import os
from urllib.parse import quote_plus

from sqlalchemy import create_engine, text
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import folium
from folium.plugins import MarkerCluster
from folium.plugins import FastMarkerCluster

Are there any null values in any columns in either table?


In [2]:
database_name = 'nashville_scooters'    # Fill this in with your database name

# Read the database user and password from the standard PostgreSQL environment
# variables when they are set, so real credentials never have to be saved in
# the notebook. The fallbacks keep the original local-development defaults.
# Both values are URL-escaped because characters such as '@', ':' or '/'
# would otherwise break the connection URL.
db_user = quote_plus(os.environ.get("PGUSER", "postgres"))
db_password = quote_plus(os.environ.get("PGPASSWORD", "postgres"))

connection_string = f"postgresql://{db_user}:{db_password}@localhost:5432/{database_name}"

In [3]:
# Create the SQLAlchemy engine from the connection string; the query cells
# below open their connections from this engine.
engine = create_engine(connection_string)

In [26]:
# Null audit for the scooters table: one "<column> IS NULL" test per column,
# OR-ed together, so every row with at least one missing value is selected.
scooter_columns = (
    'pubdatetime',
    'latitude',
    'longitude',
    'sumdid',
    'sumdtype',
    'chargelevel',
    'sumdgroup',
    'costpermin',
    'companyname',
)
scooter_null_checks = '\n    OR '.join(f'{column} IS NULL' for column in scooter_columns)
query = f'\nSELECT *\nFROM scooters\nWHERE {scooter_null_checks}\n'


In [27]:
# Execute the current `query` and load the result set into a DataFrame.
# The with-block scopes the connection to this single read.
with engine.connect() as conn:
    scooter_nulls = pd.read_sql(sql=text(query), con=conn)

In [28]:
# Display the result of the scooters null-audit query.
scooter_nulls

Unnamed: 0,pubdatetime,latitude,longitude,sumdid,sumdtype,chargelevel,sumdgroup,costpermin,companyname
0,2019-05-22 11:39:13.097,36.147647,-86.740524,Powered4233824,Powered,,Scooter,0.23,Spin
1,2019-05-22 11:44:13.317,36.147647,-86.740524,Powered4233824,Powered,,Scooter,0.23,Spin
2,2019-05-22 11:49:13.557,36.147647,-86.740524,Powered4233824,Powered,,Scooter,0.23,Spin
3,2019-05-22 11:54:13.720,36.147647,-86.740524,Powered4233824,Powered,,Scooter,0.23,Spin
4,2019-05-22 11:59:13.800,36.147647,-86.740524,Powered4233824,Powered,,Scooter,0.23,Spin
...,...,...,...,...,...,...,...,...,...
765,2019-07-28 23:28:05.000,36.149802,-86.807646,Powered-55a7ac02-7e17-bb3b-130d-b160d9776405,Powered,,Scooter,0.30,Bolt
766,2019-07-28 23:33:06.000,36.149802,-86.807646,Powered-55a7ac02-7e17-bb3b-130d-b160d9776405,Powered,,Scooter,0.30,Bolt
767,2019-07-28 23:38:06.000,36.149802,-86.807646,Powered-55a7ac02-7e17-bb3b-130d-b160d9776405,Powered,,Scooter,0.30,Bolt
768,2019-07-28 23:43:06.000,36.149802,-86.807646,Powered-55a7ac02-7e17-bb3b-130d-b160d9776405,Powered,,Scooter,0.30,Bolt


In [12]:
# Note: in every row displayed above, chargelevel is the only column that is NULL, so it appears to be the source of the nulls in scooters.

In [14]:
# Null audit for the trips table: select every row in which at least one of
# the listed columns is NULL.
trip_columns = (
    'pubtimestamp',
    'companyname',
    'triprecordnum',
    'sumdid',
    'tripduration',
    'tripdistance',
    'startdate',
    'starttime',
    'enddate',
    'endtime',
    'startlatitude',
    'startlongitude',
    'endlatitude',
    'endlongitude',
    'triproute',
    'create_dt',
)
trip_null_checks = '\n    OR '.join(f'{column} IS NULL' for column in trip_columns)
query = f'\nSELECT *\nFROM trips\nWHERE {trip_null_checks}\n'

# Run the audit and keep the matching rows (if any) in a DataFrame.
with engine.connect() as conn:
    trips_nulls = pd.read_sql(sql=text(query), con=conn)

trips_nulls

Unnamed: 0,pubtimestamp,companyname,triprecordnum,sumdid,tripduration,tripdistance,startdate,starttime,enddate,endtime,startlatitude,startlongitude,endlatitude,endlongitude,triproute,create_dt


In [15]:
# No column in trips contains NULL values (the query above returned no rows). An alternative check would be to load the trips table into a DataFrame and use the isna() method on it.

What date range is represented in each of the date columns? Investigate any values that seem odd.


In [16]:
# Date range of the scooters table: latest and earliest pubdatetime.
query = '''
SELECT MAX(pubdatetime) as max_date,
       MIN(pubdatetime) as min_date
FROM scooters;
'''

# One-row result: max_date / min_date.
with engine.connect() as conn:
    scooter_dates = pd.read_sql(sql=text(query), con=conn)

scooter_dates

Unnamed: 0,max_date,min_date
0,2019-07-31 23:59:57,2019-05-01 00:01:41.247


In [17]:
# Date range of the trips table by publication time: latest and earliest
# pubtimestamp.
query = '''
SELECT MAX(pubtimestamp) as max_date,
       MIN(pubtimestamp) as min_date
FROM trips;
'''

# One-row result: max_date / min_date.
with engine.connect() as conn:
    trips_pubdates = pd.read_sql(sql=text(query), con=conn)

trips_pubdates

Unnamed: 0,max_date,min_date
0,2019-08-01 07:04:00,2019-05-01 00:00:55.423


In [18]:
# Max/min of every date column in trips (pubtimestamp, startdate, enddate)
# gathered in a single one-row result so the ranges can be compared side by side.
query = '''
SELECT MAX(pubtimestamp) as max_pub_date,
       MIN(pubtimestamp) as min_pub_date,
       MAX(startdate) as max_start_date,
       MIN(startdate) as min_start_date,
       MAX(enddate) as max_end_date,
       MIN(enddate) as min_end_date
FROM trips;
'''

with engine.connect() as conn:
    trips_dates_max_min = pd.read_sql(sql=text(query), con=conn)

trips_dates_max_min
       

Unnamed: 0,max_pub_date,min_pub_date,max_start_date,min_start_date,max_end_date,min_end_date
0,2019-08-01 07:04:00,2019-05-01 00:00:55.423,2019-07-31,2019-05-01,2019-08-01,2019-05-01


In [29]:
# TODO: pull the rows with the max pubtimestamp and the max enddate to see what time of day they occurred.

In [19]:
# List the distinct sumdgroup values stored in scooters. The recorded output
# shows both 'scooter' and 'Scooter', i.e. the casing is inconsistent.
query = '''
SELECT DISTINCT sumdgroup
FROM scooters
'''

with engine.connect() as conn:
    sumdgroup_distinct = pd.read_sql(sql=text(query), con=conn)

sumdgroup_distinct

Unnamed: 0,sumdgroup
0,bicycle
1,scooter
2,Scooter


In [20]:
# List the distinct company names that appear in the scooters table.
query = '''
SELECT DISTINCT companyname
FROM scooters
'''

with engine.connect() as conn:
    companyname_scooter = pd.read_sql(sql=text(query), con=conn)

companyname_scooter

Unnamed: 0,companyname
0,Bird
1,Bolt
2,Gotcha
3,Jump
4,Lime
5,Lyft
6,Spin


In [21]:
# List the distinct company names that appear in the trips table, for
# comparison with the names used in scooters.
query = '''
SELECT DISTINCT companyname
FROM trips
'''

with engine.connect() as conn:
    companyname_trips = pd.read_sql(sql=text(query), con=conn)

companyname_trips

Unnamed: 0,companyname
0,Bird
1,Bolt Mobility
2,Gotcha
3,JUMP
4,Lime
5,Lyft
6,SPIN


In [None]:
# Note: company names are not consistent between the two tables. scooters uses 'Bolt', 'Jump' and 'Spin', while trips uses 'Bolt Mobility', 'JUMP' and 'SPIN'.

In [22]:
# Count the distinct sumdid values recorded for Bird in the scooters table;
# the count is reported as scooter_count.
query ='''
SELECT companyname,
       COUNT(DISTINCT sumdid) AS scooter_count
FROM scooters
WHERE companyname = 'Bird'
GROUP BY companyname
'''

with engine.connect() as conn:
    companyname_scooters = pd.read_sql(sql=text(query), con=conn)

companyname_scooters


Unnamed: 0,companyname,scooter_count
0,Bird,3860


In [None]:
SELECT *
