In this notebook, you'll see how to connect to a Postgres database using the sqlalchemy library.

For this notebook, you'll need both the `sqlalchemy` and `psycopg2` libraries installed.

In [4]:
from sqlalchemy import create_engine, text

First, we need to create a connection string. The format is

 ```<dialect(+driver)>://<username>:<password>@<hostname>:<port>/<database>```

To connect to the Lahman baseball database, you can use the following connection string.

In [5]:
database_name = 'scooters'    

connection_string = f"postgresql://postgres:postgres@localhost:5432/{database_name}"

Now, we need to create an engine and use it to connect.

In [6]:
engine = create_engine(connection_string)

Now, we can create our query and pass it into the `.query()` method.

In [7]:
query = '''
SELECT *
FROM scooters
LIMIT 100;
'''

with engine.connect() as connection:
    result = connection.execute(text(query))

You can then fetch the results as tuples using either `fetchone` or `fetchall`:

In [8]:
result.fetchone()

(datetime.datetime(2019, 5, 1, 0, 1, 41, 247000), Decimal('36.136822'), Decimal('-86.799877'), 'PoweredLIRL1', 'Powered', Decimal('93.00'), 'scooter', Decimal('0.00'), 'Bird')

In [9]:
result.fetchall()

[(datetime.datetime(2019, 5, 1, 0, 1, 41, 247000), Decimal('36.191252'), Decimal('-86.772945'), 'PoweredXWRWC', 'Powered', Decimal('35.00'), 'scooter', Decimal('0.00'), 'Bird'),
 (datetime.datetime(2019, 5, 1, 0, 1, 41, 247000), Decimal('36.144752'), Decimal('-86.806293'), 'PoweredMEJEH', 'Powered', Decimal('90.00'), 'scooter', Decimal('0.00'), 'Bird'),
 (datetime.datetime(2019, 5, 1, 0, 1, 41, 247000), Decimal('36.162056'), Decimal('-86.774688'), 'Powered1A7TC', 'Powered', Decimal('88.00'), 'scooter', Decimal('0.00'), 'Bird'),
 (datetime.datetime(2019, 5, 1, 0, 1, 41, 247000), Decimal('36.150973'), Decimal('-86.783109'), 'Powered2TYEF', 'Powered', Decimal('98.00'), 'scooter', Decimal('0.00'), 'Bird'),
 (datetime.datetime(2019, 5, 1, 0, 1, 41, 247000), Decimal('36.157188'), Decimal('-86.769978'), 'Powered3F3VK', 'Powered', Decimal('82.00'), 'scooter', Decimal('0.00'), 'Bird'),
 (datetime.datetime(2019, 5, 1, 0, 1, 41, 247000), Decimal('36.154348'), Decimal('-86.784765'), 'PoweredVL7YG'

On the other hand, sqlalchemy plays nicely with pandas.

In [10]:
from sqlalchemy import create_engine, text
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as pyplot

In [11]:
with engine.connect() as connection:
    scooters = pd.read_sql(text(query), con = connection)

scooters.head()

Unnamed: 0,pubdatetime,latitude,longitude,sumdid,sumdtype,chargelevel,sumdgroup,costpermin,companyname
0,2019-05-01 00:01:41.247,36.136822,-86.799877,PoweredLIRL1,Powered,93.0,scooter,0.0,Bird
1,2019-05-01 00:01:41.247,36.191252,-86.772945,PoweredXWRWC,Powered,35.0,scooter,0.0,Bird
2,2019-05-01 00:01:41.247,36.144752,-86.806293,PoweredMEJEH,Powered,90.0,scooter,0.0,Bird
3,2019-05-01 00:01:41.247,36.162056,-86.774688,Powered1A7TC,Powered,88.0,scooter,0.0,Bird
4,2019-05-01 00:01:41.247,36.150973,-86.783109,Powered2TYEF,Powered,98.0,scooter,0.0,Bird


For much more information about SQLAlchemy and to see a more “Pythonic” way to execute queries, see Introduction to Databases in Python: https://www.datacamp.com/courses/introduction-to-relational-databases-in-python

In [12]:
query = '''
SELECT *
FROM trips
LIMIT 100;
'''

with engine.connect() as connection:
    result = connection.execute(text(query))

In [13]:
result.fetchone()

(datetime.datetime(2019, 5, 1, 0, 0, 55, 423000), 'Bird', 'BRD2134', 'Powered9EAJL', Decimal('3.0'), Decimal('958.00528'), datetime.date(2019, 5, 1), datetime.time(0, 0, 20, 460000), datetime.date(2019, 5, 1), datetime.time(0, 2, 52, 346666), Decimal('36.1571'), Decimal('-86.8036'), Decimal('36.1566'), Decimal('-86.8067'), '[(36.157235, -86.803612), (36.157235, -86.80362), (36.157226, -86.803642), (36.157226, -86.803665), (36.157226, -86.803665), (36.157226, -86.803665), ... (2204 characters truncated) ... (36.155941, -86.809517), (36.155941, -86.809517), (36.155941, -86.809517), (36.155918, -86.809671), (36.155918, -86.809671), (36.155918, -86.809671)]', datetime.datetime(2019, 5, 2, 5, 30, 23, 780000))

In [14]:
result.fetchall()

[(datetime.datetime(2019, 5, 1, 0, 3, 33, 147000), 'Lyft', 'LFT5', 'Powered296631', Decimal('1.7156'), Decimal('1371.39112'), datetime.date(2019, 5, 1), datetime.time(0, 1, 50, 90000), datetime.date(2019, 5, 1), datetime.time(0, 3, 33, 26666), Decimal('36.15797'), Decimal('-86.77896'), Decimal('36.16054'), Decimal('-86.77689'), '[(36.15797, -86.77896), (36.15795, -86.77873), (36.15798, -86.77859), (36.15809, -86.77822), (36.15825, -86.77785), (36.1583, -86.77768), (36.15838,  ... (92 characters truncated) ... -86.77703), (36.15963, -86.77678), (36.15977, -86.77665), (36.15994, -86.77654), (36.16024, -86.77673), (36.16053, -86.77694), (36.16054, -86.77689)]', datetime.datetime(2019, 5, 2, 7, 20, 32, 757000)),
 (datetime.datetime(2019, 5, 1, 0, 5, 55, 570000), 'Bird', 'BRD2168', 'Powered7S2UU', Decimal('3.0'), Decimal('2296.588'), datetime.date(2019, 5, 1), datetime.time(0, 3, 47, 363333), datetime.date(2019, 5, 1), datetime.time(0, 7, 13, 596666), Decimal('36.1547'), Decimal('-86.7818')

In [15]:
with engine.connect() as connection:
    trips = pd.read_sql(text(query), con = connection)

trips.head()

Unnamed: 0,pubtimestamp,companyname,triprecordnum,sumdid,tripduration,tripdistance,startdate,starttime,enddate,endtime,startlatitude,startlongitude,endlatitude,endlongitude,triproute,create_dt
0,2019-05-01 00:00:55.423,Bird,BRD2134,Powered9EAJL,3.0,958.00528,2019-05-01,00:00:20.460000,2019-05-01,00:02:52.346666,36.1571,-86.8036,36.1566,-86.8067,"[(36.157235, -86.803612), (36.157235, -86.8036...",2019-05-02 05:30:23.780
1,2019-05-01 00:03:33.147,Lyft,LFT5,Powered296631,1.7156,1371.39112,2019-05-01,00:01:50.090000,2019-05-01,00:03:33.026666,36.15797,-86.77896,36.16054,-86.77689,"[(36.15797, -86.77896), (36.15795, -86.77873),...",2019-05-02 07:20:32.757
2,2019-05-01 00:05:55.570,Bird,BRD2168,Powered7S2UU,3.0,2296.588,2019-05-01,00:03:47.363333,2019-05-01,00:07:13.596666,36.1547,-86.7818,36.1565,-86.7868,"[(36.155068, -86.782124), (36.156597, -86.78675)]",2019-05-02 05:30:24.530
3,2019-05-01 00:05:55.570,Bird,BRD2166,PoweredZIIVX,3.0,1200.78744,2019-05-01,00:04:21.386666,2019-05-01,00:06:59.176666,36.1494,-86.7795,36.1531,-86.7796,"[(36.149741, -86.779344), (36.149741, -86.7793...",2019-05-02 05:30:24.237
4,2019-05-01 00:05:55.570,Bird,BRD2165,PoweredJ7MB3,2.0,351.04988,2019-05-01,00:04:27.796666,2019-05-01,00:06:23.150000,36.1778,-86.7866,36.1774,-86.7876,"[(36.177699, -86.786477), (36.177711, -86.7864...",2019-05-02 05:30:24.207


In [16]:
query='''
SELECT
     COUNT(sumdid)
FROM scooters;
'''
with engine.connect() as connection:
        scooters = pd.read_sql(text(query), con = connection)

scooters.head()


Unnamed: 0,count
0,73414043


In [17]:
  # Are there any null values in any columns in either table? 

In [18]:
scooters.isna().any()

count    False
dtype: bool

In [19]:
trips.isna().any()

pubtimestamp      False
companyname       False
triprecordnum     False
sumdid            False
tripduration      False
tripdistance      False
startdate         False
starttime         False
enddate           False
endtime           False
startlatitude     False
startlongitude    False
endlatitude       False
endlongitude      False
triproute         False
create_dt         False
dtype: bool

In [20]:
# What date range is represented in each of the date columns? Investigate any values that seem odd.

In [21]:
query='''
SELECT
     DISTINCT sumdid
FROM scooters;
'''
with engine.connect() as connection:
        scooters = pd.read_sql(text(query), con = connection)

scooters.head()



Unnamed: 0,sumdid
0,Powered-017d3133-f14a-2b83-ee4f-d777e7c5b619
1,Powered-01a24436-0315-e1bb-7ce0-d081d05dff7d
2,Powered-03be23ca-d43b-222f-be54-e44b5b4690df
3,Powered-046201fb-6532-1f37-6334-3612fb1e61f7
4,Powered-0479bb84-afbd-0426-f1c4-df628542a88c


In [22]:
query = '''
SELECT min(pubdatetime), max(pubdatetime)
FROM scooters;
'''

In [23]:
with engine.connect() as connection:
    test = pd.read_sql(query, con = connection)
test

Unnamed: 0,min,max
0,2019-05-01 00:01:41.247,2019-07-31 23:59:57


In [24]:
query='''
SELECT
     COUNT(sumdid)
FROM trips;
'''
with engine.connect() as connection:
        trips = pd.read_sql(text(query), con = connection)

trips.head()


Unnamed: 0,count
0,565522


In [25]:
query = '''
SELECT min(pubtimestamp), max(pubtimestamp)
FROM trips;
'''

In [26]:
with engine.connect() as connection:
    test = pd.read_sql(query, con = connection)
test

Unnamed: 0,min,max
0,2019-05-01 00:00:55.423,2019-08-01 07:04:00


In [27]:
query = '''
SELECT MIN (startdate) AS min_startdate, 
MAX (startdate) AS max_startdate,
MIN (enddate) AS min_enddate,
MAX (enddate) AS max_enddate
FROM trips;
'''

In [28]:
with engine.connect() as connection:
    test = pd.read_sql(query, con = connection)
test

Unnamed: 0,min_startdate,max_startdate,min_enddate,max_enddate
0,2019-05-01,2019-07-31,2019-05-01,2019-08-01


In [29]:
# Is time represented with am/pm or using 24 hour values in each of the columns that include time?

In [30]:
 # What values are there in the sumdgroup column? Are there any that are not of interest for this project?

In [31]:
query='''
SELECT distinct sumdgroup
FROM scooters
'''

In [32]:
with engine.connect() as connection:
    test = pd.read_sql(query, con = connection)
test

Unnamed: 0,sumdgroup
0,bicycle
1,scooter
2,Scooter


 we don't need bicycle

In [33]:
# What are the minimum and maximum values for all the latitude and longitude columns? Do these ranges make sense, or is there anything surprising?

In [34]:
query = '''
SELECT MIN (startlatitude) AS min_startlatitude, 
 MAX (startlatitude) AS max_startlatitude, 
 MIN (startlongitude) AS min_startlongitude, 
 MAX (startlongitude) AS max_startlongitude,
 MIN (endlatitude) AS min_endlatitude,
 MAX (endlatitude) AS max_endlatitude, 
 MIN (endlongitude) AS min_endlongitude,
 MAX (endlatitude) AS max_endlatitude
FROM trips
'''

In [35]:
with engine.connect() as connection:
    test = pd.read_sql(query, con = connection)
test

Unnamed: 0,min_startlatitude,max_startlatitude,min_startlongitude,max_startlongitude,min_endlatitude,max_endlatitude,min_endlongitude,max_endlatitude.1
0,35.8532,36.300029,-86.918008,-86.3662,-36.850405,51.045409,-122.673729,51.045409


In [36]:
query = '''
SELECT min(latitude) as min_latitude,
max(latitude) as max_latitude,
min(longitude) as min_longitude,
max(longitude) as max_longitude
FROM scooters;
'''

In [37]:
with engine.connect() as connection:
    test1 = pd.read_sql(query, con = connection)
test1

Unnamed: 0,min_latitude,max_latitude,min_longitude,max_longitude
0,0.0,3609874.0,-97.443879,0.0


In [38]:
# What is the range of values for trip duration and trip distance? Do these values make sense? Explore values that might seem questionable.

In [39]:
query = '''
SELECT MIN (tripduration) AS min_tripduration, 
 MAX (tripduration) AS max_tripduration, 
 MIN (tripdistance) AS min_tripdistance, 
 MAX (tripdistance) AS max_tripdistance
FROM trips
'''

In [40]:
with engine.connect() as connection:
    test2 = pd.read_sql(query, con = connection)
test2

Unnamed: 0,min_tripduration,max_tripduration,min_tripdistance,max_tripdistance
0,-19.358267,512619.0,-20324803.8,31884480.0


In [41]:
 # Check out how the values for the company name column in the scooters table compare to those of the trips table. What do you notice? 