In this notebook, you'll see how to connect to a Postgres database using the sqlalchemy library.

For this notebook, you'll need both the `sqlalchemy` and `psycopg2` libraries installed.

In [3]:
from sqlalchemy import create_engine

First, we need to create a connection string. The format is

 ```<dialect(+driver)>://<username>:<password>@<hostname>:<port>/<database>```

To connect to the scooters database, you can use the following connection string.

In [4]:
import os
from urllib.parse import quote_plus

database_name = 'scooters'    # Fill this in with your scooter database name

# Connection settings are read from the PG* environment variables when they
# are set, so real credentials never have to be saved inside the notebook.
# The fallbacks reproduce the original local-development values.
db_user = os.environ.get("PGUSER", "postgres")
db_password = os.environ.get("PGPASSWORD", "postgres")
db_host = os.environ.get("PGHOST", "localhost")
db_port = os.environ.get("PGPORT", "5432")

# quote_plus escapes characters such as '@' or '/' in the user name/password
# that would otherwise be misread as URL delimiters.
connection_string = (
    f"postgresql://{quote_plus(db_user)}:{quote_plus(db_password)}"
    f"@{db_host}:{db_port}/{database_name}"
)

Now, we need to create an engine and use it to connect.

In [5]:
# Build the SQLAlchemy engine from the connection string; every later query
# in this notebook goes through this object.
engine = create_engine(connection_string)

Now, we can create our query and pass it into the engine's `.execute()` method.

In [6]:
# Look at difference in run time for this:
# Requests the latitude column for every row of scooters, i.e. the full
# result set rather than a single aggregate value.
# NOTE(review): Engine.execute() is legacy API (deprecated in SQLAlchemy 1.4,
# removed in 2.0) - confirm the installed version before re-running.
query = '''
SELECT latitude
FROM scooters;
'''

result = engine.execute(query)

In [7]:
# Vs this:
# Asks the database for one aggregate value (the number of non-NULL latitude
# entries in scooters), so only a single row comes back.
# NOTE(review): Engine.execute() is legacy API (deprecated in SQLAlchemy 1.4,
# removed in 2.0) - confirm the installed version before re-running.
query = '''
SELECT COUNT(latitude)
FROM scooters;
'''

result = engine.execute(query)

You can then fetch the results as tuples using either `fetchone` or `fetchall`:

In [8]:
# Fetch the next row of `result` as a tuple; with the COUNT query above this
# is expected to be its single row.
result.fetchone()

(73414043,)

On the other hand, sqlalchemy plays nicely with pandas.

In [9]:
import pandas as pd

In [10]:
# Run the current `query` through pandas and preview the resulting frame.
# NOTE(review): despite its name, `lat` holds whatever `query` returns at this
# point (the recorded output is a single count), not latitude values.
lat = pd.read_sql(sql=query, con=engine)
lat.head()

Unnamed: 0,count
0,73414043


Are there any null values in any columns in either table?

In [11]:
# Count the rows of scooters whose pubdatetime is NULL.
query = '''

SELECT COUNT(*)
FROM scooters
WHERE pubdatetime IS NULL
'''

In [13]:
# Execute the current `query` via pandas and print the resulting frame.
pubdatetimenulls = pd.read_sql(sql=query, con=engine)
print(pubdatetimenulls)

   count
0      0


In [14]:
# Count the rows of scooters whose chargelevel is NULL.
query = '''

SELECT COUNT(*)
FROM scooters
WHERE chargelevel IS NULL

'''

In [15]:
# Execute the current `query` via pandas and print the resulting frame.
chargelevel_nulls = pd.read_sql(sql=query, con=engine)
print(chargelevel_nulls)

   count
0    770


In [16]:
# Alternative NULL count for scooters.chargelevel: instead of filtering with
# WHERE, sum a 1/0 flag per row. The result column is aliased charge_nulls.
query= '''
SELECT SUM(CASE WHEN chargelevel is null THEN 1 ELSE 0 END) 
AS charge_nulls 
FROM scooters
'''

In [18]:
# Execute the current `query` via pandas and print the resulting frame.
charge_nulls = pd.read_sql(sql=query, con=engine)
print(charge_nulls)

   charge_nulls
0           770


In [20]:
# Count the rows of trips whose enddate is NULL.
query = '''

SELECT COUNT(*)
FROM trips
WHERE enddate IS NULL

'''


In [22]:
# Execute the current `query` via pandas and print the resulting frame.
enddate_nulls = pd.read_sql(sql=query, con=engine)
print(enddate_nulls)

   count
0      0


In [23]:
# Count the rows of trips whose endtime is NULL.
query = '''

SELECT COUNT(*)
FROM trips
WHERE endtime IS NULL

'''

In [24]:
# Execute the current `query` via pandas and print the resulting frame.
endtime_nulls = pd.read_sql(sql=query, con=engine)
print(endtime_nulls)

   count
0      0


In [25]:
# Count the rows of trips whose startlatitude is NULL.
query = '''

SELECT COUNT(*)
FROM trips
WHERE startlatitude IS NULL

'''

In [26]:
# Execute the current `query` via pandas and print the resulting frame.
startlatitude_nulls = pd.read_sql(sql=query, con=engine)
print(startlatitude_nulls)

   count
0      0


In [27]:
# Count the rows of trips whose startlongitude is NULL.
query = '''

SELECT COUNT(*)
FROM trips
WHERE startlongitude IS NULL

'''

In [28]:
# Execute the current `query` via pandas and print the resulting frame.
startlongitude_nulls = pd.read_sql(sql=query, con=engine)
print(startlongitude_nulls)

   count
0      0


In [31]:
# Count the rows of trips whose endlatitude is NULL.
query = '''

SELECT COUNT(*)
FROM trips
WHERE endlatitude IS NULL

'''

In [32]:
# Execute the current `query` via pandas and print the resulting frame.
endlatitude_nulls = pd.read_sql(sql=query, con=engine)
print(endlatitude_nulls)

   count
0      0


In [33]:
# Count the rows of trips whose endlongitude is NULL.
query = '''

SELECT COUNT(*)
FROM trips
WHERE endlongitude IS NULL

'''

In [34]:
# Execute the current `query` via pandas and print the resulting frame.
endlongitude_nulls = pd.read_sql(sql=query, con=engine)
print(endlongitude_nulls)

   count
0      0


In [35]:
# Count the rows of trips whose triproute is NULL.
query = '''

SELECT COUNT(*)
FROM trips
WHERE triproute IS NULL

'''

In [36]:
# Execute the current `query` via pandas and print the resulting frame.
triproute_nulls = pd.read_sql(sql=query, con=engine)
print(triproute_nulls)

   count
0      0


In [37]:
# Count the rows of trips whose create_dt is NULL.
query = '''

SELECT COUNT(*)
FROM trips
WHERE create_dt IS NULL

'''

In [38]:
# Execute the current `query` via pandas and print the resulting frame.
create_dt_nulls = pd.read_sql(sql=query, con=engine)
print(create_dt_nulls)

   count
0      0


In [39]:
# Count the rows of trips whose sumdid is NULL.
query = '''

SELECT COUNT(*)
FROM trips
WHERE sumdid IS NULL

'''

In [40]:
# Execute the current `query` via pandas and print the resulting frame.
sumdid_nulls = pd.read_sql(sql=query, con=engine)
print(sumdid_nulls)

   count
0      0


In [41]:
# Count the rows of trips whose tripdistance is NULL.
query = '''

SELECT COUNT(*)
FROM trips
WHERE tripdistance IS NULL

'''

In [42]:
# Execute the current `query` via pandas and print the resulting frame.
tripdistance_nulls = pd.read_sql(sql=query, con=engine)
print(tripdistance_nulls)

   count
0      0


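Answer: Of the columns checked above, only `chargelevel` in the `scooters` table contains nulls (770 rows); every other column checked in `scooters` and `trips` returned a null count of 0.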


What date range is represented in each of the date columns? Investigate any values that seem odd.

In [50]:
# Earliest pubdatetime value in scooters.
query = '''SELECT MIN(pubdatetime)
FROM scooters
'''

In [51]:
# Execute the current `query` via pandas and print the resulting frame.
min_scooter_pubdatetime = pd.read_sql(sql=query, con=engine)
print(min_scooter_pubdatetime)

                      min
0 2019-05-01 00:01:41.247


In [54]:
# Latest pubdatetime value in scooters.
query = '''SELECT MAX(pubdatetime)
FROM scooters
'''

In [55]:
# Execute the current `query` via pandas and print the resulting frame.
max_scooter_pubdatetime = pd.read_sql(sql=query, con=engine)
print(max_scooter_pubdatetime)

                  max
0 2019-07-31 23:59:57


In [56]:
# Earliest pubtimestamp value in trips.
query = '''SELECT MIN(pubtimestamp)
FROM trips
'''

In [53]:
# Execute the current `query` via pandas and print the resulting frame.
# NOTE(review): this cell's recorded execution count (53) is lower than that
# of the query cell directly above it (56), so the saved output may come from
# an earlier definition of `query`; re-run top to bottom to confirm.
min_trips_pubtimestamp = pd.read_sql(sql=query, con=engine)
print(min_trips_pubtimestamp)

                      min
0 2019-05-01 00:00:55.423


In [57]:
# Latest pubtimestamp value in trips.
query = '''SELECT MAX(pubtimestamp)
FROM trips
'''

In [58]:
# Execute the current `query` via pandas and print the resulting frame.
# NOTE(review): the recorded maximum (2019-08-01 07:04:00) falls in August,
# while the recorded minimum is in May - worth checking whether an August
# timestamp is expected.
max_trips_pubtimestamp = pd.read_sql(sql=query, con=engine)
print(max_trips_pubtimestamp)

                  max
0 2019-08-01 07:04:00


In [59]:
# List each distinct sumdgroup value that appears in scooters.
query = '''SELECT DISTINCT sumdgroup
FROM scooters
'''


In [60]:
# Execute the current `query` via pandas and print the resulting frame.
# NOTE(review): the recorded output lists both 'scooter' and 'Scooter', so the
# values differ only by capitalisation - normalise case before grouping.
distinct_sumdgroup = pd.read_sql(sql=query, con=engine)
print(distinct_sumdgroup)

  sumdgroup
0   bicycle
1   scooter
2   Scooter


What are the minimum and maximum values for all the latitude and longitude columns? Do these ranges make sense, or is there anything surprising?

In [61]:
# Smallest startlatitude value in trips.
query = '''SELECT MIN(startlatitude)
FROM trips
'''

In [62]:
# Execute the current `query` via pandas and print the resulting frame.
min_start_latitude = pd.read_sql(sql=query, con=engine)
print(min_start_latitude)

       min
0  35.8532


In [63]:
# Largest startlatitude value in trips.
query = '''SELECT MAX(startlatitude)
FROM trips
'''

In [64]:
# Execute the current `query` via pandas and print the resulting frame.
max_start_latitude = pd.read_sql(sql=query, con=engine)
print(max_start_latitude)

         max
0  36.300029


In [65]:
# Smallest startlongitude value in trips.
query = '''SELECT MIN(startlongitude)
FROM trips
'''

In [66]:
# Execute the current `query` via pandas and print the resulting frame.
min_start_longitude = pd.read_sql(sql=query, con=engine)
print(min_start_longitude)

         min
0 -86.918008


In [67]:
# Largest startlongitude value in trips.
query = '''SELECT MAX(startlongitude)
FROM trips
'''

In [68]:
# Execute the current `query` via pandas and print the resulting frame.
max_start_longitude = pd.read_sql(sql=query, con=engine)
print(max_start_longitude)

       max
0 -86.3662


In [69]:
# Largest tripduration value in trips.
query = '''SELECT MAX(tripduration)
FROM trips
'''

In [70]:
# Execute the current `query` via pandas and print the resulting frame.
# NOTE(review): the recorded maximum is 512619.0 (units are not shown in this
# notebook) - confirm the unit and investigate whether this is an outlier.
max_trip_duration = pd.read_sql(sql=query, con=engine)
print(max_trip_duration)

        max
0  512619.0


In [71]:
# Smallest tripduration value in trips.
query = '''SELECT MIN(tripduration)
FROM trips
'''

In [72]:
# Execute the current `query` via pandas and print the resulting frame.
# NOTE(review): the recorded minimum is negative (-19.358267), which cannot be
# a real duration - these rows need investigating before any analysis.
min_trip_duration = pd.read_sql(sql=query, con=engine)
print(min_trip_duration)

         min
0 -19.358267
