In [1]:
import os
from urllib.parse import quote_plus

from sqlalchemy import create_engine, text

In [2]:
database_name = 'scooters'

# Credentials and host come from the standard libpq environment variables so
# that real secrets never have to be committed with the notebook. Each default
# reproduces the value that was previously hardcoded in the URL.
db_user = os.environ.get('PGUSER', 'postgres')
db_password = os.environ.get('PGPASSWORD', 'postgres')
db_host = os.environ.get('PGHOST', 'localhost')
db_port = os.environ.get('PGPORT', '5432')

# quote_plus keeps characters such as '@' or '/' inside the credentials from
# being parsed as URL delimiters.
connection_string = (
    f"postgresql://{quote_plus(db_user)}:{quote_plus(db_password)}"
    f"@{db_host}:{db_port}/{database_name}"
)

In [3]:
# Build the SQLAlchemy engine; the query cells in this notebook open their
# connections from it via engine.connect().
engine = create_engine(connection_string)

In [4]:
import pandas as pd

In [5]:
# Select every scooters row in which at least one of the nine columns is NULL.
query = """
SELECT *
FROM scooters
WHERE pubdatetime IS NULL OR latitude IS NULL OR longitude IS NULL OR sumdid IS NULL OR  sumdtype IS NULL OR  
chargelevel IS NULL OR  sumdgroup IS NULL OR  costpermin IS NULL OR companyname IS NULL;
"""


In [6]:
# Run the current query and load the returned rows into a DataFrame.
with engine.connect() as conn:
    null_scooters = pd.read_sql(text(query), con=conn)

In [7]:
# List the columns of the null-row result.
null_scooters.columns

Index(['pubdatetime', 'latitude', 'longitude', 'sumdid', 'sumdtype',
       'chargelevel', 'sumdgroup', 'costpermin', 'companyname'],
      dtype='object')

Are there any null values in any columns in either table? Yes — 770 nulls found.

What date range is represented in each of the date columns? Investigate any values that seem odd. Min date: 2019-05-01 00:01:41.247; max date: 2019-07-31 23:59:57. The min date includes milliseconds (.247), whereas the max date is shown in %Y-%m-%d %H:%M:%S format with no fractional seconds.

In [8]:
# Earliest and latest pubdatetime recorded in the scooters table.
query = """
SELECT MIN(pubdatetime) AS min_date, MAX(pubdatetime) AS max_date
FROM scooters
"""


In [9]:
# Run the min/max date query and keep the one-row result.
with engine.connect() as conn:
    dates_range = pd.read_sql(text(query), con=conn)

In [10]:
# Show the earliest and latest pubdatetime values.
dates_range

Unnamed: 0,min_date,max_date
0,2019-05-01 00:01:41.247,2019-07-31 23:59:57


Is time represented with am/pm or using 24 hour values in each of the columns that include time? Time is represented using 24-hour values.

In [11]:
# Pull every distinct pubdatetime so the time-of-day format can be inspected.
# NOTE(review): this returns all distinct timestamps (about 9.2 million rows in
# the output shown for this notebook), and the name `time` would shadow the
# standard-library `time` module if that is ever imported here.
query = """
SELECT DISTINCT pubdatetime
FROM scooters
"""
with engine.connect() as conn:
    time = pd.read_sql(text(query), con=conn)

In [12]:
# Display the distinct timestamps to inspect their time format.
time

Unnamed: 0,pubdatetime
0,2019-05-01 00:01:41.247
1,2019-05-01 00:02:25.383
2,2019-05-01 00:02:34.753
3,2019-05-01 00:02:48.740
4,2019-05-01 00:02:59.247
...,...
9175721,2019-07-31 23:58:18.267
9175722,2019-07-31 23:58:32.950
9175723,2019-07-31 23:59:15.753
9175724,2019-07-31 23:59:43.000


What values are there in the sumdgroup column? `scooter`, `Scooter` (a capitalization variant), and `bicycle`.
Are there any that are not of interest for this project? Yes — `bicycle`.

In [13]:
# Distinct vehicle groups present in the scooters table.
query = """
SELECT DISTINCT sumdgroup
FROM scooters
"""
with engine.connect() as conn:
    sumdgroup = pd.read_sql(text(query), con=conn)

In [14]:
# Show the distinct sumdgroup values.
sumdgroup

Unnamed: 0,sumdgroup
0,bicycle
1,scooter
2,Scooter


What are the minimum and maximum values for all the latitude and longitude columns? Do these ranges make sense, or is there anything surprising?

In [15]:
# Smallest and largest latitude recorded in the scooters table.
query = """
SELECT MIN(latitude) AS min_latitude, MAX(latitude) AS max_latitude
FROM scooters
"""
with engine.connect() as conn:
    lat = pd.read_sql(text(query), con=conn)

In [16]:
# Show the latitude range.
lat

Unnamed: 0,min_latitude,max_latitude
0,0.0,3609874.0


In [17]:
# Smallest and largest longitude recorded in the scooters table.
query = """
SELECT MIN(longitude) AS min_longitude, MAX(longitude) AS max_longitude
FROM scooters
"""
with engine.connect() as conn:
    lon = pd.read_sql(text(query), con=conn)

In [18]:
# Show the longitude range.
lon

Unnamed: 0,min_longitude,max_longitude
0,-97.443879,0.0


What is the range of values for trip duration and trip distance? Do these values make sense? 
Explore values that might seem questionable.

In [19]:
# Extremes of trip duration and trip distance across the trips table.
query = """
SELECT MIN(tripduration) AS trip_duration_min, MAX(tripduration) AS trip_duration_max , MIN(tripdistance) AS trip_distance_min,
MAX(tripdistance) AS trip_distance_max
FROM trips
"""
with engine.connect() as conn:
    time_distance = pd.read_sql(text(query), con=conn)

In [20]:
# Show the trip duration and distance extremes.
time_distance

Unnamed: 0,trip_duration_min,trip_duration_max,trip_distance_min,trip_distance_max
0,-19.358267,512619.0,-20324803.8,31884480.0


In [21]:
# Load the trips table for a closer look at the questionable values.
# NOTE(review): despite the name `min_duration`, the query has no filter, so
# the whole trips table is read into memory — confirm this is intended.
query = """
SELECT *
FROM trips
 ;
"""
with engine.connect() as conn:
    min_duration = pd.read_sql(text(query), con=conn)

In [22]:
  # Display the rows loaded from the trips table.
  min_duration

Unnamed: 0,pubtimestamp,companyname,triprecordnum,sumdid,tripduration,tripdistance,startdate,starttime,enddate,endtime,startlatitude,startlongitude,endlatitude,endlongitude,triproute,create_dt
0,2019-06-01 01:50:26.473,Lime,LIM567,PoweredOWWGOR5KA6ZLV,4.400000,4727.6328,2019-06-01,01:45:15,2019-06-01,01:49:39,36.130110,-86.779953,36.126255,-86.789196,"[(36.13011, -86.77995), (36.129850000000005, -...",2019-06-02 07:34:23.380
1,2019-06-01 01:50:26.477,Lime,LIM570,PoweredAHZXFQBLMHONL,4.466667,2381.8608,2019-06-01,01:45:24,2019-06-01,01:49:52,36.154634,-86.777414,36.155203,-86.781702,"[(36.15463, -86.77741), (36.15461, -86.77742),...",2019-06-02 07:34:23.497
2,2019-06-01 01:50:26.477,Lime,LIM569,PoweredM2D35UKHQFSNS,31.950000,7998.5904,2019-06-01,01:17:53,2019-06-01,01:49:50,36.160530,-86.778276,36.145375,-86.786426,"[(36.16053, -86.77828), (36.16042, -86.77825),...",2019-06-02 07:34:23.450
3,2019-06-01 01:50:51.620,Bird,BRD4234,PoweredZAPK8,4.000000,0.0000,2019-06-01,01:47:59.340000,2019-06-01,01:52:23.456666,36.178500,-86.749700,36.178400,-86.749700,"[(36.178469, -86.749623)]",2019-06-02 05:30:41.097
4,2019-06-01 01:50:51.620,Bird,BRD4237,PoweredH3CK9,19.000000,5577.4280,2019-06-01,01:31:59.026666,2019-06-01,01:50:55.390000,36.178200,-86.784600,36.174900,-86.789200,"[(36.178491, -86.784713), (36.176363, -86.7854...",2019-06-02 05:30:41.197
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
565517,2019-06-01 01:50:26.467,Lime,LIM562,PoweredP5AZXXSG7MVR6,21.533333,7365.3960,2019-06-01,01:27:12,2019-06-01,01:48:44,36.175221,-86.761522,36.174878,-86.761110,"[(36.17522, -86.76152), (36.17521, -86.76147),...",2019-06-02 07:34:23.170
565518,2019-06-01 01:50:26.470,Lime,LIM565,PoweredQJ4AYR4HCIOVR,3.400000,994.0824,2019-06-01,01:45:59,2019-06-01,01:49:23,36.165564,-86.778749,36.162013,-86.776050,"[(36.16432, -86.77785), (36.16298999999999, -8...",2019-06-02 07:34:23.300
565519,2019-06-01 01:50:26.470,Lime,LIM564,PoweredL4XYY53LZM76E,6.500000,4950.7272,2019-06-01,01:42:43,2019-06-01,01:49:13,36.182690,-86.796986,36.174926,-86.788878,"[(36.18269, -86.79699), (36.18273, -86.79709),...",2019-06-02 07:34:23.260
565520,2019-06-01 01:50:26.470,Lime,LIM566,PoweredBP56VKWQKOKLK,3.466667,3080.6712,2019-06-01,01:46:06,2019-06-01,01:49:34,36.149729,-86.796922,36.153064,-86.789877,"[(36.14973, -86.79692), (36.1494, -86.79756), ...",2019-06-02 07:34:23.340


Check out how the values for the company name column in the scooters table compare to those of the trips table. 
What do you notice? The capitalization differs (Jump and Spin in the scooters table vs. JUMP and SPIN in the trips table), and Bolt is called Bolt Mobility in the trips table.


In [23]:
# Distinct company names as spelled in the scooters table.
query = """
SELECT DISTINCT companyname
FROM scooters
"""
with engine.connect() as conn:
    scooters_company_name = pd.read_sql(text(query), con=conn)

In [24]:
# Company names as they appear in the scooters table.
scooters_company_name

Unnamed: 0,companyname
0,Bird
1,Bolt
2,Gotcha
3,Jump
4,Lime
5,Lyft
6,Spin


In [25]:
# Distinct company names as spelled in the trips table.
query = """
SELECT DISTINCT companyname
FROM trips
"""
with engine.connect() as conn:
    trips_company_name = pd.read_sql(text(query), con=conn)

In [26]:
 # Company names as they appear in the trips table.
 trips_company_name

Unnamed: 0,companyname
0,Bird
1,Bolt Mobility
2,Gotcha
3,JUMP
4,Lime
5,Lyft
6,SPIN


In [27]:
# Sample of up to 1000 scooter records from June 2019.
# pubdatetime is a timestamp, so the previous bound `<= '2019-06-30'` was
# compared against 2019-06-30 00:00:00 and excluded almost all of June 30.
# The half-open bound `< '2019-07-01'` covers the whole month.
# NOTE: LIMIT without ORDER BY returns an arbitrary subset of matching rows.
query = '''
SELECT *
FROM scooters
WHERE pubdatetime >= '2019-06-01' AND pubdatetime < '2019-07-01'
LIMIT 1000;
'''
with engine.connect() as connection:
    june_data = pd.read_sql(text(query), con=connection)

In [28]:
# Preview the June scooter sample.
june_data

Unnamed: 0,pubdatetime,latitude,longitude,sumdid,sumdtype,chargelevel,sumdgroup,costpermin,companyname
0,2019-06-01 00:00:12.000,36.119900,-86.753200,Poweredfc4d4f74-2113-5b1d-a2c4-6ae96dc1be8d,Powered,1.0,scooter,0.06,Jump
1,2019-06-01 00:00:12.000,36.149000,-86.814500,Poweredf8e596d6-e526-5ca0-a026-f04d5e36a2aa,Powered,83.0,scooter,0.06,Jump
2,2019-06-01 00:00:12.000,36.119900,-86.753200,Poweredf893e721-f459-5302-8f4a-d61622a51a01,Powered,86.0,scooter,0.06,Jump
3,2019-06-01 00:00:12.000,36.177300,-86.787800,Powered2b58bb7b-e22b-5b1b-8968-b4db10f560fe,Powered,43.0,scooter,0.06,Jump
4,2019-06-01 00:00:12.000,36.119900,-86.753100,Poweredead70d58-ed67-5a97-bf0a-5bcba02413de,Powered,0.0,scooter,0.06,Jump
...,...,...,...,...,...,...,...,...,...
995,2019-06-01 00:00:21.133,36.150323,-86.796046,Powered994B9,Powered,52.0,scooter,0.15,Bird
996,2019-06-01 00:00:21.133,36.176363,-86.785677,Powered7FVUZ,Powered,40.0,scooter,0.15,Bird
997,2019-06-01 00:00:21.133,36.145999,-86.811382,Powered7ZL3W,Powered,100.0,scooter,0.15,Bird
998,2019-06-01 00:00:21.133,36.162597,-86.776067,PoweredRJY1U,Powered,77.0,scooter,0.15,Bird


In [29]:
import pandas as pd
import numpy as np
# pyplot is the supported plotting entry point; matplotlib.pylab is discouraged
# because it also dumps the numpy namespace into `plt`.
import matplotlib.pyplot as plt
import seaborn as sns

# Use the ggplot style sheet for every figure created after this cell.
plt.style.use('ggplot')


In [30]:
# Check the column dtypes of the June scooter sample.
june_data.dtypes

pubdatetime    datetime64[ns]
latitude              float64
longitude             float64
sumdid                 object
sumdtype               object
chargelevel           float64
sumdgroup              object
costpermin            float64
companyname            object
dtype: object

In [31]:
# Give the timestamp, device id and company columns shorter names.
june_data = june_data.rename(
    columns={
        'pubdatetime': 'datetime',
        'sumdid': 'id',
        'companyname': 'company_name',
    }
)

In [32]:
# Remove the sumdtype column from the sample.
june_data = june_data.drop(columns=['sumdtype'])

In [33]:
# Remove the costpermin column from the sample.
june_data = june_data.drop(columns=['costpermin'])

In [34]:
# Display the sample with its current set of columns.
june_data

Unnamed: 0,datetime,latitude,longitude,id,chargelevel,sumdgroup,company_name
0,2019-06-01 00:00:12.000,36.119900,-86.753200,Poweredfc4d4f74-2113-5b1d-a2c4-6ae96dc1be8d,1.0,scooter,Jump
1,2019-06-01 00:00:12.000,36.149000,-86.814500,Poweredf8e596d6-e526-5ca0-a026-f04d5e36a2aa,83.0,scooter,Jump
2,2019-06-01 00:00:12.000,36.119900,-86.753200,Poweredf893e721-f459-5302-8f4a-d61622a51a01,86.0,scooter,Jump
3,2019-06-01 00:00:12.000,36.177300,-86.787800,Powered2b58bb7b-e22b-5b1b-8968-b4db10f560fe,43.0,scooter,Jump
4,2019-06-01 00:00:12.000,36.119900,-86.753100,Poweredead70d58-ed67-5a97-bf0a-5bcba02413de,0.0,scooter,Jump
...,...,...,...,...,...,...,...
995,2019-06-01 00:00:21.133,36.150323,-86.796046,Powered994B9,52.0,scooter,Bird
996,2019-06-01 00:00:21.133,36.176363,-86.785677,Powered7FVUZ,40.0,scooter,Bird
997,2019-06-01 00:00:21.133,36.145999,-86.811382,Powered7ZL3W,100.0,scooter,Bird
998,2019-06-01 00:00:21.133,36.162597,-86.776067,PoweredRJY1U,77.0,scooter,Bird


In [35]:
# Count missing values per column in the June scooter sample.
june_data.isna().sum()

datetime        0
latitude        0
longitude       0
id              0
chargelevel     0
sumdgroup       0
company_name    0
dtype: int64

In [36]:
# Show the sample with repeated (datetime, latitude, longitude, id) rows
# collapsed to their first occurrence; the result is displayed, not stored.
june_data.drop_duplicates(subset=['datetime', 'latitude', 'longitude', 'id'])

Unnamed: 0,datetime,latitude,longitude,id,chargelevel,sumdgroup,company_name
0,2019-06-01 00:00:12.000,36.119900,-86.753200,Poweredfc4d4f74-2113-5b1d-a2c4-6ae96dc1be8d,1.0,scooter,Jump
1,2019-06-01 00:00:12.000,36.149000,-86.814500,Poweredf8e596d6-e526-5ca0-a026-f04d5e36a2aa,83.0,scooter,Jump
2,2019-06-01 00:00:12.000,36.119900,-86.753200,Poweredf893e721-f459-5302-8f4a-d61622a51a01,86.0,scooter,Jump
3,2019-06-01 00:00:12.000,36.177300,-86.787800,Powered2b58bb7b-e22b-5b1b-8968-b4db10f560fe,43.0,scooter,Jump
4,2019-06-01 00:00:12.000,36.119900,-86.753100,Poweredead70d58-ed67-5a97-bf0a-5bcba02413de,0.0,scooter,Jump
...,...,...,...,...,...,...,...
995,2019-06-01 00:00:21.133,36.150323,-86.796046,Powered994B9,52.0,scooter,Bird
996,2019-06-01 00:00:21.133,36.176363,-86.785677,Powered7FVUZ,40.0,scooter,Bird
997,2019-06-01 00:00:21.133,36.145999,-86.811382,Powered7ZL3W,100.0,scooter,Bird
998,2019-06-01 00:00:21.133,36.162597,-86.776067,PoweredRJY1U,77.0,scooter,Bird


In [37]:
# Sample of up to 1000 trip records published in June 2019.
# pubtimestamp is a timestamp, so the previous bound `<= '2019-06-30'` was
# compared against 2019-06-30 00:00:00 and excluded almost all of June 30.
# The half-open bound `< '2019-07-01'` covers the whole month.
# NOTE: LIMIT without ORDER BY returns an arbitrary subset of matching rows.
query = '''
SELECT *
FROM trips
WHERE pubtimestamp >= '2019-06-01' AND pubtimestamp < '2019-07-01'
LIMIT 1000;
'''
with engine.connect() as connection:
    june_trips = pd.read_sql(text(query), con=connection)

In [38]:
# Preview the June trips sample.
june_trips

Unnamed: 0,pubtimestamp,companyname,triprecordnum,sumdid,tripduration,tripdistance,startdate,starttime,enddate,endtime,startlatitude,startlongitude,endlatitude,endlongitude,triproute,create_dt
0,2019-06-01 01:50:26.473,Lime,LIM567,PoweredOWWGOR5KA6ZLV,4.400000,4727.63280,2019-06-01,01:45:15,2019-06-01,01:49:39,36.130110,-86.779953,36.126255,-86.789196,"[(36.13011, -86.77995), (36.129850000000005, -...",2019-06-02 07:34:23.380
1,2019-06-01 01:50:26.477,Lime,LIM570,PoweredAHZXFQBLMHONL,4.466667,2381.86080,2019-06-01,01:45:24,2019-06-01,01:49:52,36.154634,-86.777414,36.155203,-86.781702,"[(36.15463, -86.77741), (36.15461, -86.77742),...",2019-06-02 07:34:23.497
2,2019-06-01 01:50:26.477,Lime,LIM569,PoweredM2D35UKHQFSNS,31.950000,7998.59040,2019-06-01,01:17:53,2019-06-01,01:49:50,36.160530,-86.778276,36.145375,-86.786426,"[(36.16053, -86.77828), (36.16042, -86.77825),...",2019-06-02 07:34:23.450
3,2019-06-01 01:50:51.620,Bird,BRD4234,PoweredZAPK8,4.000000,0.00000,2019-06-01,01:47:59.340000,2019-06-01,01:52:23.456666,36.178500,-86.749700,36.178400,-86.749700,"[(36.178469, -86.749623)]",2019-06-02 05:30:41.097
4,2019-06-01 01:50:51.620,Bird,BRD4237,PoweredH3CK9,19.000000,5577.42800,2019-06-01,01:31:59.026666,2019-06-01,01:50:55.390000,36.178200,-86.784600,36.174900,-86.789200,"[(36.178491, -86.784713), (36.176363, -86.7854...",2019-06-02 05:30:41.197
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,2019-06-01 03:50:27.710,Lime,LIM950,PoweredXTNMEBOS5T2ZK,15.666667,2562.30480,2019-06-01,03:13:12,2019-06-01,03:28:52,36.147140,-86.808641,36.147053,-86.808691,"[(36.14723, -86.80867), (36.14727, -86.8087700...",2019-06-02 07:35:32.460
996,2019-06-01 03:50:27.710,Lime,LIM947,PoweredR64AXPVBCU44L,5.833333,1351.68960,2019-06-01,03:22:16,2019-06-01,03:28:06,36.158780,-86.777015,36.160599,-86.777875,"[(36.15878, -86.77701), (36.15862, -86.77706),...",2019-06-02 07:35:32.367
997,2019-06-01 03:51:55.973,Lyft,LFT572,Powered290936,2.653567,580.70868,2019-06-01,03:49:16.636666,2019-06-01,03:51:55.853333,36.161870,-86.779310,36.160850,-86.778650,"[(36.16187, -86.77931), (36.16183, -86.77935),...",2019-06-02 08:34:49.377
998,2019-06-01 16:46:34.147,Bird,BRD1135,PoweredUK6T7,40.000000,20013.12400,2019-06-01,16:07:21.036666,2019-06-01,16:47:18.433333,36.165600,-86.780200,36.161600,-86.775800,"[(36.165601, -86.7801), (36.161049, -86.777058...",2019-06-02 05:31:08.910


In [39]:
# Give the timestamp, device id, company and distance columns new names.
june_trips = june_trips.rename(
    columns={
        'pubtimestamp': 'datetime',
        'sumdid': 'id',
        'companyname': 'company_name',
        'tripdistance': 'distance_traveled',
    }
)

In [40]:
# Display the trips sample with its current column names.
june_trips 

Unnamed: 0,datetime,company_name,triprecordnum,id,tripduration,distance_traveled,startdate,starttime,enddate,endtime,startlatitude,startlongitude,endlatitude,endlongitude,triproute,create_dt
0,2019-06-01 01:50:26.473,Lime,LIM567,PoweredOWWGOR5KA6ZLV,4.400000,4727.63280,2019-06-01,01:45:15,2019-06-01,01:49:39,36.130110,-86.779953,36.126255,-86.789196,"[(36.13011, -86.77995), (36.129850000000005, -...",2019-06-02 07:34:23.380
1,2019-06-01 01:50:26.477,Lime,LIM570,PoweredAHZXFQBLMHONL,4.466667,2381.86080,2019-06-01,01:45:24,2019-06-01,01:49:52,36.154634,-86.777414,36.155203,-86.781702,"[(36.15463, -86.77741), (36.15461, -86.77742),...",2019-06-02 07:34:23.497
2,2019-06-01 01:50:26.477,Lime,LIM569,PoweredM2D35UKHQFSNS,31.950000,7998.59040,2019-06-01,01:17:53,2019-06-01,01:49:50,36.160530,-86.778276,36.145375,-86.786426,"[(36.16053, -86.77828), (36.16042, -86.77825),...",2019-06-02 07:34:23.450
3,2019-06-01 01:50:51.620,Bird,BRD4234,PoweredZAPK8,4.000000,0.00000,2019-06-01,01:47:59.340000,2019-06-01,01:52:23.456666,36.178500,-86.749700,36.178400,-86.749700,"[(36.178469, -86.749623)]",2019-06-02 05:30:41.097
4,2019-06-01 01:50:51.620,Bird,BRD4237,PoweredH3CK9,19.000000,5577.42800,2019-06-01,01:31:59.026666,2019-06-01,01:50:55.390000,36.178200,-86.784600,36.174900,-86.789200,"[(36.178491, -86.784713), (36.176363, -86.7854...",2019-06-02 05:30:41.197
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,2019-06-01 03:50:27.710,Lime,LIM950,PoweredXTNMEBOS5T2ZK,15.666667,2562.30480,2019-06-01,03:13:12,2019-06-01,03:28:52,36.147140,-86.808641,36.147053,-86.808691,"[(36.14723, -86.80867), (36.14727, -86.8087700...",2019-06-02 07:35:32.460
996,2019-06-01 03:50:27.710,Lime,LIM947,PoweredR64AXPVBCU44L,5.833333,1351.68960,2019-06-01,03:22:16,2019-06-01,03:28:06,36.158780,-86.777015,36.160599,-86.777875,"[(36.15878, -86.77701), (36.15862, -86.77706),...",2019-06-02 07:35:32.367
997,2019-06-01 03:51:55.973,Lyft,LFT572,Powered290936,2.653567,580.70868,2019-06-01,03:49:16.636666,2019-06-01,03:51:55.853333,36.161870,-86.779310,36.160850,-86.778650,"[(36.16187, -86.77931), (36.16183, -86.77935),...",2019-06-02 08:34:49.377
998,2019-06-01 16:46:34.147,Bird,BRD1135,PoweredUK6T7,40.000000,20013.12400,2019-06-01,16:07:21.036666,2019-06-01,16:47:18.433333,36.165600,-86.780200,36.161600,-86.775800,"[(36.165601, -86.7801), (36.161049, -86.777058...",2019-06-02 05:31:08.910


In [41]:
# Check the column dtypes of the June trips sample.
june_trips.dtypes


datetime             datetime64[ns]
company_name                 object
triprecordnum                object
id                           object
tripduration                float64
distance_traveled           float64
startdate                    object
starttime                    object
enddate                      object
endtime                      object
startlatitude               float64
startlongitude              float64
endlatitude                 float64
endlongitude                float64
triproute                    object
create_dt            datetime64[ns]
dtype: object

In [42]:
# List the column names of the June trips sample.
june_trips.columns

Index(['datetime', 'company_name', 'triprecordnum', 'id', 'tripduration',
       'distance_traveled', 'startdate', 'starttime', 'enddate', 'endtime',
       'startlatitude', 'startlongitude', 'endlatitude', 'endlongitude',
       'triproute', 'create_dt'],
      dtype='object')

In [43]:
# Count missing values per column in the June trips sample.
# Result: no nulls found (every count in the output is 0).
june_trips.isna().sum()

datetime             0
company_name         0
triprecordnum        0
id                   0
tripduration         0
distance_traveled    0
startdate            0
starttime            0
enddate              0
endtime              0
startlatitude        0
startlongitude       0
endlatitude          0
endlongitude         0
triproute            0
create_dt            0
dtype: int64