Connect to Postgres database using the sqlalchemy library

Install `sqlalchemy` and `psycopg2` libraries 

In [64]:
import pandas as pd
import psycopg2
from sqlalchemy import MetaData, Table, create_engine, text

First, we need to create a connection string. The format is

 ```<dialect(+driver)>://<username>:<password>@<hostname>:<port>/<database>```

To connect to the scooters database, you can use the following connection string.

In [65]:
database_name = 'scooters'    # Fill this in with your scooter database name

# Interpolate database_name so that editing it above actually changes which
# database the connection string points at.
connection_string = f"postgresql://postgres:postgres@localhost:5432/{database_name}"

Now, we need to create an engine and use it to connect.

In [66]:
# Create the SQLAlchemy engine for the database described by connection_string;
# the cells below pass it to Table reflection and to pd.read_sql.
engine = create_engine(connection_string)

In [67]:
# Reflect the scooters and trips tables from the database and print their
# column definitions.
# autoload_with alone triggers reflection; the separate autoload=True flag is
# deprecated in SQLAlchemy 1.4 and removed in 2.0.
metadata = MetaData()
scooters = Table('scooters', metadata, autoload_with=engine)
print(repr(scooters))
trips = Table('trips', metadata, autoload_with=engine)
print(repr(trips))

Table('scooters', MetaData(), Column('pubdatetime', TIMESTAMP(), table=<scooters>), Column('latitude', NUMERIC(), table=<scooters>), Column('longitude', NUMERIC(), table=<scooters>), Column('sumdid', TEXT(), table=<scooters>), Column('sumdtype', TEXT(), table=<scooters>), Column('chargelevel', NUMERIC(), table=<scooters>), Column('sumdgroup', TEXT(), table=<scooters>), Column('costpermin', NUMERIC(), table=<scooters>), Column('companyname', TEXT(), table=<scooters>), schema=None)
Table('trips', MetaData(), Column('pubtimestamp', TIMESTAMP(), table=<trips>), Column('companyname', TEXT(), table=<trips>), Column('triprecordnum', TEXT(), table=<trips>), Column('sumdid', TEXT(), table=<trips>), Column('tripduration', NUMERIC(), table=<trips>), Column('tripdistance', NUMERIC(), table=<trips>), Column('startdate', DATE(), table=<trips>), Column('starttime', TIME(), table=<trips>), Column('enddate', DATE(), table=<trips>), Column('endtime', TIME(), table=<trips>), Column('startlatitude', NUMER

Now, we can create our query and pass it into the `.execute()` method.

In [68]:
# Earliest and latest publication timestamps in the scooters table.
query = '''
SELECT MIN(pubdatetime), MAX(pubdatetime)
FROM scooters;
'''

# Run the query on a short-lived connection and fetch the rows immediately, so
# the connection goes back to the pool instead of staying checked out behind an
# unread result. engine.execute() is deprecated in SQLAlchemy 1.4 and removed
# in 2.0; Connection.execute() with text() works across versions.
with engine.connect() as connection:
    result = connection.execute(text(query)).fetchall()

In [69]:
# Load the MIN/MAX pubdatetime query result into a DataFrame and display it.
date_range = pd.read_sql(sql=query, con=engine)
date_range.head()

Unnamed: 0,min,max
0,2019-05-01 00:01:41.247,2019-07-31 23:59:57


In [70]:
# Looking for null values: count the trips whose enddate is NULL.
# TODO: find a way to check every column without writing one query per column.
query1 = '''
SELECT COUNT(*)
FROM trips
WHERE enddate IS NULL
'''

# Run the query on a short-lived connection and fetch the rows immediately, so
# the connection goes back to the pool instead of staying checked out behind an
# unread result. engine.execute() is deprecated in SQLAlchemy 1.4 and removed
# in 2.0; Connection.execute() with text() works across versions.
with engine.connect() as connection:
    result1 = connection.execute(text(query1)).fetchall()

For much more information about SQLAlchemy and to see a more “Pythonic” way to execute queries, see Introduction to Databases in Python: https://www.datacamp.com/courses/introduction-to-relational-databases-in-python

In [71]:
# Load the NULL-enddate count into a DataFrame and display it.
null_trips = pd.read_sql(sql=query1, con=engine)
null_trips.head()

Unnamed: 0,count
0,0


In [72]:
# Is time represented with am/pm or using 24 hour values in each of the columns
# that include time? Sample a few distinct pubdatetime values to inspect.
query2 = '''
SELECT DISTINCT pubdatetime
FROM scooters
LIMIT 5
'''

# Run the query on a short-lived connection and fetch the rows immediately, so
# the connection goes back to the pool instead of staying checked out behind an
# unread result. engine.execute() is deprecated in SQLAlchemy 1.4 and removed
# in 2.0; Connection.execute() with text() works across versions.
with engine.connect() as connection:
    result2 = connection.execute(text(query2)).fetchall()

In [None]:
# Load the sampled pubdatetime values into a DataFrame and display them.
time_values = pd.read_sql(sql=query2, con=engine)
time_values.head()