**START**

Load in pandas and sqlalchemy's create_engine & text
Then connect to Postgres and create the engine

In [1]:
from sqlalchemy import create_engine, text
import pandas as pd
database_name = 'scooters'
connection_string = f"postgresql://postgres:postgres@localhost:5433/{database_name}"
engine = create_engine(connection_string)

**EDA**

In [2]:
count_all_rows = '''
(SELECT
    'scooters' table,
    COUNT(*)
FROM scooters)
UNION
(SELECT
    'trips' table,
    COUNT(*)
FROM trips);
'''

with engine.connect() as connection:
    counts = pd.read_sql(text(count_all_rows), con = connection)

counts

Unnamed: 0,table,count
0,scooters,73414043
1,trips,565522


In [3]:
find_nulls_scooters = '''
SELECT *
FROM scooters
WHERE NOT(scooters IS NOT NULL);
'''

with engine.connect() as connection:
    nulls_scooters = pd.read_sql(text(find_nulls_scooters), con = connection)

find_nulls_trips = '''
SELECT *
FROM trips
WHERE NOT(trips IS NOT NULL);
'''

with engine.connect() as connection:
    nulls_trips = pd.read_sql(text(find_nulls_trips), con = connection)

nulls_scooters.info()
nulls_trips.info()

nulls_scooters

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 770 entries, 0 to 769
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   pubdatetime  770 non-null    datetime64[ns]
 1   latitude     770 non-null    float64       
 2   longitude    770 non-null    float64       
 3   sumdid       770 non-null    object        
 4   sumdtype     770 non-null    object        
 5   chargelevel  0 non-null      object        
 6   sumdgroup    770 non-null    object        
 7   costpermin   770 non-null    float64       
 8   companyname  770 non-null    object        
dtypes: datetime64[ns](1), float64(3), object(5)
memory usage: 54.3+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 0 entries
Data columns (total 16 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   pubtimestamp    0 non-null      object
 1   companyname     0 non-null      object
 2   triprecordn

Unnamed: 0,pubdatetime,latitude,longitude,sumdid,sumdtype,chargelevel,sumdgroup,costpermin,companyname
0,2019-06-10 13:15:51,36.160504,-86.771362,Powered-9730f8bb-1322-08b6-6a73-cc2ee50a33a8,Powered,,Scooter,0.15,Bolt
1,2019-06-10 13:20:51,36.160486,-86.771356,Powered-9730f8bb-1322-08b6-6a73-cc2ee50a33a8,Powered,,Scooter,0.15,Bolt
2,2019-06-10 13:25:51,36.160486,-86.771356,Powered-9730f8bb-1322-08b6-6a73-cc2ee50a33a8,Powered,,Scooter,0.15,Bolt
3,2019-06-10 13:30:51,36.160469,-86.771343,Powered-9730f8bb-1322-08b6-6a73-cc2ee50a33a8,Powered,,Scooter,0.15,Bolt
4,2019-06-10 13:35:51,36.160478,-86.771336,Powered-9730f8bb-1322-08b6-6a73-cc2ee50a33a8,Powered,,Scooter,0.15,Bolt
...,...,...,...,...,...,...,...,...,...
765,2019-06-04 12:35:12,36.166789,-86.777987,Powered-d4042dd2-f268-1af8-467c-1d8301ebf16f,Powered,,Scooter,0.15,Bolt
766,2019-06-04 12:45:13,36.166891,-86.778037,Powered-d4042dd2-f268-1af8-467c-1d8301ebf16f,Powered,,Scooter,0.15,Bolt
767,2019-06-04 12:50:13,36.166793,-86.778859,Powered-d4042dd2-f268-1af8-467c-1d8301ebf16f,Powered,,Scooter,0.15,Bolt
768,2019-06-04 12:55:13,36.166721,-86.779158,Powered-d4042dd2-f268-1af8-467c-1d8301ebf16f,Powered,,Scooter,0.15,Bolt


While the .info() part of the output is a bit counter-intuitave, it shows that there are 770 null values in the scooters table and none in the trips table.

All of the null values are in the chargelevel column, and looking at the full output, they all belong to Bolt and Spin.

In [4]:
date_range = '''
(SELECT
    'scooters' table,
    MIN(pubdatetime) begin,
    MAX(pubdatetime) end
FROM scooters)
UNION
(SELECT
    'trips' table,
    MIN(pubtimestamp) begin,
    MAX(pubtimestamp) end
FROM trips);
'''

with engine.connect() as connection:
    dates = pd.read_sql(text(date_range), con = connection)

dates

Unnamed: 0,table,begin,end
0,scooters,2019-05-01 00:01:41.247,2019-07-31 23:59:57
1,trips,2019-05-01 00:00:55.423,2019-08-01 07:04:00


In [5]:
aug_first = '''
SELECT *
FROM trips
WHERE enddate > '2019-07-31'
LIMIT 100;
'''

with engine.connect() as connection:
    late = pd.read_sql(text(aug_first), con = connection)

late

Unnamed: 0,pubtimestamp,companyname,triprecordnum,sumdid,tripduration,tripdistance,startdate,starttime,enddate,endtime,startlatitude,startlongitude,endlatitude,endlongitude,triproute,create_dt
0,2019-08-01 00:00:06.593,Bird,BRD1815,Powered3IFAH,3.000000,984.25200,2019-07-31,23:58:09.780000,2019-08-01,00:00:55.853333,36.15720,-86.77060,36.15600,-86.77390,"[(36.157364, -86.770644), (36.157206, -86.7708...",2019-08-02 05:30:21.197
1,2019-08-01 00:00:42.890,Lyft,LFT1,Powered011320,7.949683,2742.78224,2019-07-31,23:52:45.780000,2019-08-01,00:00:42.763333,36.14611,-86.79954,36.15189,-86.79720,"[(36.14611, -86.79954), (36.14616, -86.7995), ...",2019-08-02 10:27:50.277
2,2019-08-01 00:03:30.973,Lyft,LFT12,Powered599249,4.203000,790.68244,2019-07-31,23:59:18.720000,2019-08-01,00:03:30.900000,36.16479,-86.77984,36.16580,-86.77783,"[(36.16479, -86.77984), (36.16487, -86.77991),...",2019-08-02 10:27:50.637
3,2019-08-01 00:00:06.593,Bird,BRD1816,PoweredYSEGE,7.000000,1968.50400,2019-07-31,23:54:46.096666,2019-08-01,00:01:29.436666,36.15110,-86.78330,36.15200,-86.79110,"[(36.151103, -86.783327), (36.150872, -86.7831...",2019-08-02 05:30:21.230
4,2019-08-01 00:00:06.593,Bird,BRD1819,PoweredYTT8X,8.000000,0.00000,2019-07-31,23:52:40.086666,2019-08-01,00:00:41.923333,36.14610,-86.79950,36.15180,-86.79730,"[(36.14615, -86.79952), (36.146129, -86.799557...",2019-08-02 05:30:21.340
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73,2019-08-01 00:01:31.373,Lyft,LFT4,Powered486812,20.457950,8024.93464,2019-07-31,23:41:03.780000,2019-08-01,00:01:31.256666,36.15253,-86.80295,36.16406,-86.80535,"[(36.15253, -86.80295), (36.15251, -86.8029), ...",2019-08-02 10:27:50.377
74,2019-08-01 00:01:41.190,Lyft,LFT5,Powered213391,31.235300,15469.16060,2019-07-31,23:30:26.966666,2019-08-01,00:01:41.083333,36.15385,-86.78404,36.14935,-86.81410,"[(36.15385, -86.78404), (36.15385, -86.7841), ...",2019-08-02 10:27:50.410
75,2019-08-01 00:01:48.653,Lyft,LFT6,Powered599718,4.983000,3323.49092,2019-07-31,23:56:49.606666,2019-08-01,00:01:48.586666,36.15139,-86.78425,36.14577,-86.78030,"[(36.15139, -86.78425), (36.15121, -86.78428),...",2019-08-02 10:27:50.443
76,2019-08-01 00:02:16.520,Lyft,LFT7,Powered382538,7.814250,3270.99748,2019-07-31,23:54:27.573333,2019-08-01,00:02:16.426666,36.17766,-86.75011,36.16961,-86.75091,"[(36.17766, -86.75011), (36.17754, -86.75013),...",2019-08-02 10:27:50.473


Both tables contain 3 months of data and both begin on May 1st, but the trips table ends a day after the scooters table. Looking at why, the trips all began before midnight on July 31st.

In [6]:
long_trips = '''
SELECT 
    companyname,
    COUNT(*)
FROM trips
WHERE tripduration > 1440
GROUP BY companyname;
'''

with engine.connect() as connection:
    long = pd.read_sql(text(long_trips), con = connection)

long

Unnamed: 0,companyname,count
0,Bolt Mobility,6908
1,Lyft,2
2,SPIN,28


In [7]:
short_trips = '''
SELECT
    companyname,
    COUNT(*)
FROM trips
WHERE tripduration < 1
GROUP BY companyname;
'''

with engine.connect() as connection:
    short = pd.read_sql(text(short_trips), con = connection)

short

Unnamed: 0,companyname,count
0,Bird,3963
1,Lime,661
2,Lyft,4530


This data was supposed to have been cleaned before being submitted to the city, which includes stripping out all trips shorter than a minute or longer than 24 hours.

There are ~7,000 trips longer than 24 hours and over 9,000 trips shorter than one minute.

In [8]:
unused_scooters = '''
SELECT
    companyname company,
    COUNT(DISTINCT sumdid) scooters
FROM scooters
GROUP BY companyname
'''

with engine.connect() as connection:
    unused = pd.read_sql(text(unused_scooters), con = connection)

active_scooters = '''
SELECT
    companyname company,
    COUNT(DISTINCT sumdid) scooters
FROM trips
GROUP BY companyname
'''

with engine.connect() as connection:
    active = pd.read_sql(text(active_scooters), con = connection)

unused

Unnamed: 0,company,scooters
0,Bird,3860
1,Bolt,360
2,Gotcha,224
3,Jump,1210
4,Lime,1824
5,Lyft,1735
6,Spin,805


In [9]:
active

Unnamed: 0,company,scooters
0,Bird,3766
1,Bolt Mobility,356
2,Gotcha,166
3,JUMP,450
4,Lime,1788
5,Lyft,1725
6,SPIN,754


In [12]:
availability = '''
SELECT
    companyname company,
    COUNT(DISTINCT sumdid) total_scooters
FROM scooters
WHERE sumdid NOT IN
    (SELECT
        DISTINCT sumdid
    FROM trips)
GROUP BY companyname;
'''

with engine.connect() as connection:
    result = pd.read_sql(text(availability), con = connection)
    
result

Unnamed: 0,company,total_scooters
0,Bird,105
1,Bolt,4
2,Gotcha,58
3,Jump,761
4,Lime,73
5,Lyft,12
6,Spin,51


In [13]:
daily_use = '''
SELECT 
    DISTINCT sumdid,
    companyname company,
    DATE(pubtimestamp) date,
    COUNT(sumdid) OVER (PARTITION BY sumdid, DATE(pubtimestamp)) uses_per_day
FROM trips
GROUP BY sumdid, company, pubtimestamp
ORDER BY date
'''

with engine.connect() as connection:
    result = pd.read_sql(text(daily_use), con = connection)

result

Unnamed: 0,sumdid,company,date,uses_per_day
0,Powered003176,Lyft,2019-05-01,1
1,Powered005832,Lyft,2019-05-01,1
2,Powered009634,Lyft,2019-05-01,1
3,Powered013719,Lyft,2019-05-01,1
4,Powered020762,Lyft,2019-05-01,2
...,...,...,...,...
202384,PoweredT6HXRJMWD7AQM,Lime,2019-08-01,1
202385,PoweredV152F,Bird,2019-08-01,1
202386,PoweredVGUY6YG2BULGH,Lime,2019-08-01,1
202387,PoweredYSEGE,Bird,2019-08-01,1
