Connect to Postgres database using the sqlalchemy library

Install `sqlalchemy` and `psycopg2` libraries 

In [57]:
from sqlalchemy import create_engine, MetaData, Table 
import pandas as pd
import geopandas as gpd
import folium
from matplotlib import pyplot as plt
import seaborn as sns



First, we need to create a connection string. The format is

 ```<dialect(+driver)>://<username>:<password>@<hostname>:<port>/<database>```

To connect to the scooters database, you can use the following connection string.

In [58]:
database_name = 'scooters'    # Fill this in with your scooter database name

# Format: <dialect>://<username>:<password>@<hostname>:<port>/<database>
# The database name is interpolated so that changing `database_name` above
# actually changes where we connect.
# NOTE(review): username/password are hard-coded local defaults; move them to
# environment variables before sharing this notebook outside a local setup.
connection_string = f"postgresql://postgres:postgres@localhost:5432/{database_name}"

Now, we need to create an engine and use it to connect.

In [59]:
# Build the SQLAlchemy engine from the connection string; every query below
# is issued through this object.
engine = create_engine(connection_string)

In [60]:
# Get column info on the `scooters` and `trips` tables by reflecting them from
# the database. Passing `autoload_with=engine` is what triggers reflection; the
# separate `autoload=True` flag is deprecated in SQLAlchemy 1.4 and no longer
# accepted in 2.0, so it is not passed here.
metadata = MetaData()
scooters = Table('scooters', metadata, autoload_with=engine)
print(repr(scooters))
trips = Table('trips', metadata, autoload_with=engine)
print(repr(trips))

Table('scooters', MetaData(), Column('pubdatetime', TIMESTAMP(), table=<scooters>), Column('latitude', NUMERIC(), table=<scooters>), Column('longitude', NUMERIC(), table=<scooters>), Column('sumdid', TEXT(), table=<scooters>), Column('sumdtype', TEXT(), table=<scooters>), Column('chargelevel', NUMERIC(), table=<scooters>), Column('sumdgroup', TEXT(), table=<scooters>), Column('costpermin', NUMERIC(), table=<scooters>), Column('companyname', TEXT(), table=<scooters>), schema=None)
Table('trips', MetaData(), Column('pubtimestamp', TIMESTAMP(), table=<trips>), Column('companyname', TEXT(), table=<trips>), Column('triprecordnum', TEXT(), table=<trips>), Column('sumdid', TEXT(), table=<trips>), Column('tripduration', NUMERIC(), table=<trips>), Column('tripdistance', NUMERIC(), table=<trips>), Column('startdate', DATE(), table=<trips>), Column('starttime', TIME(), table=<trips>), Column('enddate', DATE(), table=<trips>), Column('endtime', TIME(), table=<trips>), Column('startlatitude', NUMER

Now, we can write our query and run it against the database.

In [61]:
# Earliest and latest publication timestamps in the `scooters` table.
# Only the SQL text is defined here; it is executed once, when it is handed to
# pandas, so the table is not scanned twice and no unread result is left open.
query = '''
SELECT MIN(pubdatetime), MAX(pubdatetime)
FROM scooters;
'''

In [62]:
# Run the date-range query through pandas and show the resulting frame.
date_range = pd.read_sql(sql=query, con=engine)
date_range.head()

Unnamed: 0,min,max
0,2019-05-01 00:01:41.247,2019-07-31 23:59:57


In [63]:
# Looking for null values: count the trips whose `enddate` is missing.
# TODO: this checks a single column; the other columns still need checking.
# Only the SQL text is defined here; it is executed once, when it is handed to
# pandas.
query1 = '''
SELECT COUNT(*)
FROM trips
WHERE enddate IS NULL
'''

For much more information about SQLAlchemy and to see a more “Pythonic” way to execute queries, see Introduction to Databases in Python: https://www.datacamp.com/courses/introduction-to-relational-databases-in-python

In [64]:
# Run the null-check query through pandas and show the resulting count.
null_trips = pd.read_sql(sql=query1, con=engine)
null_trips.head()

Unnamed: 0,count
0,0


What is the range of values for trip duration and trip distance? Do these values make sense? Explore values that might seem questionable.

In [65]:
# Smallest and largest trip duration recorded in `trips`.
# Only the SQL text is defined here; it is executed once, when it is handed to
# pandas.
query2 = '''
SELECT MIN(tripduration) as min_duration, MAX(tripduration) as max_duration
FROM trips;
'''


In [66]:
# Run the duration-range query through pandas and show the resulting frame.
trip_range = pd.read_sql(sql=query2, con=engine)
trip_range.head()

Unnamed: 0,min_duration,max_duration
0,-19.358267,512619.0


1. During this period, seven companies offered scooters.
How many scooters did each company have in this time frame? Did the number
for each company change over time? Did scooter usage vary by company?

In [1]:
# Number of scooters per company over the whole period.
# COUNT(DISTINCT sumdid) counts each device id once; a plain COUNT(sumdid)
# would count every row of `scooters` for that company instead.
# ORDER BY keeps the displayed row order deterministic.
num_of_scooters = '''
SELECT companyname, COUNT(DISTINCT sumdid)
FROM scooters
GROUP BY companyname
ORDER BY companyname;
'''

# The query is executed once, by pandas.
count_each_company = pd.read_sql(num_of_scooters, con = engine)
count_each_company.head(7)

NameError: name 'engine' is not defined

In [None]:
# Number of scooters per company for each calendar month.
# COUNT(DISTINCT sumdid) counts each device id once per (month, company) group;
# a plain COUNT(sumdid) would count every row in the group instead.
# ORDER BY keeps the row order deterministic.
num_of_scooters_over_time = '''
SELECT EXTRACT(MONTH FROM pubdatetime) as month, companyname, COUNT(DISTINCT sumdid) as scooter_count
FROM scooters
GROUP BY month, companyname
ORDER BY month, companyname;
'''

# The query is executed once, by pandas.
count_each_company_time = pd.read_sql(num_of_scooters_over_time, con = engine)
count_each_company_time.head(21)

In [None]:
# Distinct Bird scooters per calendar month.
# ORDER BY month makes the row order deterministic, so a line drawn through
# these rows runs left to right.
q = '''
SELECT EXTRACT(MONTH FROM pubdatetime) as month, COUNT(DISTINCT sumdid) as scooter_count
FROM scooters
WHERE companyname = 'Bird'
GROUP BY month
ORDER BY month;
'''

# The query is executed once, by pandas.
bird = pd.read_sql(q, con = engine)
bird.head()

In [None]:
# Distinct Bolt scooters per calendar month.
# ORDER BY month makes the row order deterministic, so a line drawn through
# these rows runs left to right.
q1 = '''
SELECT EXTRACT(MONTH FROM pubdatetime) as month, COUNT(DISTINCT sumdid) as scooter_count
FROM scooters
WHERE companyname = 'Bolt'
GROUP BY month
ORDER BY month;
'''

# The query is executed once, by pandas.
bolt = pd.read_sql(q1, con = engine)
bolt.head()

In [None]:
# Distinct Gotcha scooters per calendar month.
# ORDER BY month makes the row order deterministic, so a line drawn through
# these rows runs left to right.
q2 = '''
SELECT EXTRACT(MONTH FROM pubdatetime) as month, COUNT(DISTINCT sumdid) as scooter_count
FROM scooters
WHERE companyname = 'Gotcha'
GROUP BY month
ORDER BY month;
'''

# The query is executed once, by pandas.
gotcha = pd.read_sql(q2, con = engine)
gotcha.head()

In [None]:
# Distinct Jump scooters per calendar month.
# ORDER BY month makes the row order deterministic, so a line drawn through
# these rows runs left to right.
q3 = '''
SELECT EXTRACT(MONTH FROM pubdatetime) as month, COUNT(DISTINCT sumdid) as scooter_count
FROM scooters
WHERE companyname = 'Jump'
GROUP BY month
ORDER BY month;
'''

# The query is executed once, by pandas.
jump = pd.read_sql(q3, con = engine)
jump.head()

In [None]:
# Distinct Lime scooters per calendar month.
# ORDER BY month makes the row order deterministic, so a line drawn through
# these rows runs left to right.
q4 = '''
SELECT EXTRACT(MONTH FROM pubdatetime) as month, COUNT(DISTINCT sumdid) as scooter_count
FROM scooters
WHERE companyname = 'Lime'
GROUP BY month
ORDER BY month;
'''

# The query is executed once, by pandas.
lime = pd.read_sql(q4, con = engine)
lime.head()

In [None]:
# Distinct Spin scooters per calendar month.
# ORDER BY month makes the row order deterministic, so a line drawn through
# these rows runs left to right.
q5 = '''
SELECT EXTRACT(MONTH FROM pubdatetime) as month, COUNT(DISTINCT sumdid) as scooter_count
FROM scooters
WHERE companyname = 'Spin'
GROUP BY month
ORDER BY month;
'''

# The query is executed once, by pandas.
spin = pd.read_sql(q5, con = engine)
spin.head()

In [None]:
# Distinct Lyft scooters per calendar month.
# ORDER BY month makes the row order deterministic, so a line drawn through
# these rows runs left to right.
q6 = '''
SELECT EXTRACT(MONTH FROM pubdatetime) as month, COUNT(DISTINCT sumdid) as scooter_count
FROM scooters
WHERE companyname = 'Lyft'
GROUP BY month
ORDER BY month;
'''

# The query is executed once, by pandas.
lyft = pd.read_sql(q6, con = engine)
lyft.head()

In [None]:
# Monthly scooter counts, one labelled line per company so the series can be
# told apart in the figure.
# NOTE(review): `bolt` is queried earlier in the notebook but was not part of
# this plot — confirm whether it was left out on purpose.
monthly_counts_by_company = {
    'Gotcha': gotcha,
    'Bird': bird,
    'Lime': lime,
    'Jump': jump,
    'Lyft': lyft,
    'Spin': spin,
}

fig, ax = plt.subplots()
for company, monthly_counts in monthly_counts_by_company.items():
    # Sort by month so the line is drawn left to right whatever order the
    # rows came back from the database in.
    ordered = monthly_counts.sort_values('month')
    ax.plot(ordered.month, ordered.scooter_count, label=company)

ax.set_xlabel('month')
ax.set_ylabel('scooter count')
ax.legend()
plt.show()

In [None]:
# Reshape to one row per month and one column per company. The query that
# builds `count_each_company_time` aliases its aggregate as `scooter_count`,
# so that is the column to spread; the frame has no `count` column.
pivot_df = count_each_company_time.pivot(index='month', columns='companyname', values='scooter_count')

In [None]:
sns.set_theme(style="darkgrid")
# Draw one line per column of `pivot_df` (one column per company), with the
# frame's month index on the x axis.
ax = sns.lineplot(data=pivot_df)
ax.set(xlabel="month", ylabel="scooter count", title="Scooters per company by month");