In [67]:
# import holder
from sqlalchemy import create_engine, text
import pandas as pd

#### EXPLORATION

As you know, it's important to gain an understanding of new datasets before diving headlong into analysis. Here are some suggestions for guiding the process of getting to know the data contained in these tables:
- Are there any null values in any columns in either table?
- What date range is represented in each of the date columns? Investigate any values that seem odd.
- Is time represented with am/pm or using 24 hour values in each of the columns that include time?
- What values are there in the sumdgroup column? Are there any that are not of interest for this project?
- What are the minimum and maximum values for all the latitude and longitude columns? Do these ranges make sense, or is there anything surprising?
-What is the range of values for trip duration and trip distance? Do these values make sense? Explore values that might seem questionable.
- Check out how the values for the company name column in the scooters table compare to those of the trips table. What do you notice?

In [68]:
# bring in postgres database

database_name = 'scooters'
connection_string = f"postgresql://postgres:postgres@localhost:5432/{database_name}"
engine = create_engine(connection_string)

In [69]:
# header queries

scooters_sample = '''SELECT * FROM scooters LIMIT 10'''
trips_sample = '''SELECT * FROM trips limit 10'''

with engine.connect() as connection:
    scooters = pd.read_sql(text(scooters_sample), con = connection)
    trips = pd.read_sql(text(trips_sample), con = connection)

In [70]:
scooters.head()

Unnamed: 0,pubdatetime,latitude,longitude,sumdid,sumdtype,chargelevel,sumdgroup,costpermin,companyname
0,2019-05-01 19:27:25.753,36.152178,-86.789039,PoweredX7GYW,Powered,100.0,scooter,0.0,Bird
1,2019-05-01 19:27:25.753,36.166189,-86.803752,PoweredGYWLU,Powered,84.0,scooter,0.0,Bird
2,2019-05-01 19:27:25.753,36.155278,-86.785099,PoweredH32H2,Powered,97.0,scooter,0.0,Bird
3,2019-05-01 19:27:25.753,36.168366,-86.770232,PoweredJDKSU,Powered,75.0,scooter,0.0,Bird
4,2019-05-01 19:27:25.753,36.150046,-86.805518,Powered48ZXT,Powered,100.0,scooter,0.0,Bird


In [71]:
trips.head()

Unnamed: 0,pubtimestamp,companyname,triprecordnum,sumdid,tripduration,tripdistance,startdate,starttime,enddate,endtime,startlatitude,startlongitude,endlatitude,endlongitude,triproute,create_dt
0,2019-05-01 00:00:55.423,Bird,BRD2134,Powered9EAJL,3.0,958.00528,2019-05-01,00:00:20.460000,2019-05-01,00:02:52.346666,36.1571,-86.8036,36.1566,-86.8067,"[(36.157235, -86.803612), (36.157235, -86.8036...",2019-05-02 05:30:23.780
1,2019-05-01 00:03:33.147,Lyft,LFT5,Powered296631,1.7156,1371.39112,2019-05-01,00:01:50.090000,2019-05-01,00:03:33.026666,36.15797,-86.77896,36.16054,-86.77689,"[(36.15797, -86.77896), (36.15795, -86.77873),...",2019-05-02 07:20:32.757
2,2019-05-01 00:05:55.570,Bird,BRD2168,Powered7S2UU,3.0,2296.588,2019-05-01,00:03:47.363333,2019-05-01,00:07:13.596666,36.1547,-86.7818,36.1565,-86.7868,"[(36.155068, -86.782124), (36.156597, -86.78675)]",2019-05-02 05:30:24.530
3,2019-05-01 00:05:55.570,Bird,BRD2166,PoweredZIIVX,3.0,1200.78744,2019-05-01,00:04:21.386666,2019-05-01,00:06:59.176666,36.1494,-86.7795,36.1531,-86.7796,"[(36.149741, -86.779344), (36.149741, -86.7793...",2019-05-02 05:30:24.237
4,2019-05-01 00:05:55.570,Bird,BRD2165,PoweredJ7MB3,2.0,351.04988,2019-05-01,00:04:27.796666,2019-05-01,00:06:23.150000,36.1778,-86.7866,36.1774,-86.7876,"[(36.177699, -86.786477), (36.177711, -86.7864...",2019-05-02 05:30:24.207


In [61]:
# Are there any null values in any columns in either table?

## charge level
## parse by company and use "company.info(verbose=True, show_counts=True)""

###  ERROR: ProgrammingError: (psycopg2.errors.UndefinedColumn) column "bolt" does not exist
###  LINE 1: SELECT * FROM scooters WHERE companyname LIKE "bolt"

# company queries
bird_query = '''SELECT * FROM scooters WHERE companyname ilike "bird"'''
bolt_query = '''SELECT * FROM scooters WHERE companyname ilike "bolt"'''
# gotcha_query = '''SELECT * FROM scooters WHERE companyname ilike "gotcha"'''
# jump_query = '''SELECT * FROM scooters WHERE companyname ilike "jump"'''
# lime_query = '''SELECT * FROM scooters WHERE companyname ilike "lime"'''
# lyft_query = '''SELECT * FROM scooters WHERE companyname ilike "lyft"'''
# spin_query = '''SELECT * FROM scooters WHERE companyname ilike "spin"'''

with engine.connect() as connection:
    bird = pd.read_sql(text(bolt_query), con = connection)
#     bolt = pd.read_sql(text(bolt_query), con = connection)
#     gotcha = pd.read_sql(text(gotcha_query), con = connection) 
#     jump = pd.read_sql(text(jump_query), con = connection)
#     lime = pd.read_sql(text(lime_query), con = connection)
#     lyft = pd.read_sql(text(lyft_query), con = connection)
#     spin = pd.read_sql(text(spin_query), con = connection)
    
print(bolt.info(verbose=True, show_counts=True))

ProgrammingError: (psycopg2.errors.UndefinedColumn) column "bolt" does not exist
LINE 1: SELECT * FROM scooters WHERE companyname LIKE "bolt"
                                                      ^

[SQL: SELECT * FROM scooters WHERE companyname LIKE "bolt"]
(Background on this error at: https://sqlalche.me/e/20/f405)

In [66]:
# What date range is represented in each of the date columns? Investigate any values that seem odd.
# all_unique_datetime = '''SELECT pubdatetime as datetime
#                 FROM scooters
#                 UNION
#                 SELECT pubtimestamp
#                 FROM trips'''

scoot_date_query = '''SELECT min(pubdatetime), max(pubdatetime) FROM scooters'''
trip_date_query = '''SELECT min(pubtimestamp), max(pubtimestamp) FROM trips'''
with engine.connect() as connection:
    scoot_dates = pd.read_sql(text(scoot_date_query), con = connection)
    trip_dates = pd.read_sql(text(trip_date_query), con = connection)

print(scoot_dates)

# dates = pd.to_datetime(dates.datetime)

# print(dates.dtypes)

                      min                 max
0 2019-05-01 00:01:41.247 2019-07-31 23:59:57


In [23]:
# min_date
min_date = dates.dt.date.min()
print('The earliest date is: ' + str(min_date))
# max_date
max_date = dates.dt.date.max()
print('The latest date is: ' + str(max_date))
#range
print ('The date range is: ' + str(max_date - min_date))

The earliest date is: 2019-05-01
The latest date is: 2019-08-01
The date range is: 92 days, 0:00:00


In [None]:
# Is time represented with am/pm or using 24 hour values in each of the columns that include time?


In [None]:
# What values are there in the sumdgroup column? Are there any that are not of interest for this project?

In [35]:
# What are the minimum and maximum values for all the latitude and longitude columns? Do these ranges make sense, or is there anything surprising? -What is the range of values for trip duration and trip distance? Do these values make sense? Explore values that might seem questionable.

sll_query = '''SELECT
                MIN(latitude) as min_latitude,
                MAX(latitude) as max_latitude,
                MIN(longitude) as min_longitude,
                MAX(latitude) as max_longitude
            FROM scooters'''
            
tll_query = '''SELECT
                MIN(startlatitude) as min_latitude_start,
                MAX(startlatitude) as max_latitude_start,
                MIN(startlongitude) as min_longitude_start,
                MAX(startlongitude) as max_longitude_start,
                MIN(endlatitude) as min_latitude_end,
                MAX(endlatitude) as max_latitude_end,
                MIN(endlongitude) as min_longitude_end,
                MAX(endlongitude) as max_longitude_end
            FROM trips'''

with engine.connect() as connection:
    scoot_result = pd.read_sql(text(sll_query), con = connection)
    trip_result = pd.read_sql(text(tll_query), con = connection)
    
print(scoot_result)

print(trip_result)

   min_latitude  max_latitude  min_longitude  max_longitude
0           0.0  3.609874e+06     -97.443879   3.609874e+06
   min_latitude_start  max_latitude_start  min_longitude_start  \
0             35.8532           36.300029           -86.918008   

   max_longitude_start  min_latitude_end  max_latitude_end  min_longitude_end  \
0             -86.3662        -36.850405         51.045409        -122.673729   

   max_longitude_end  
0         174.764886  


In [33]:
# Check out how the values for the company name column in the scooters table compare to those of the trips table. What do you notice?
cname_query = '''SELECT DISTINCT 
                    companyname, 
                    'scooters' as source_table
                FROM scooters
                UNION ALL
                SELECT DISTINCT 
                    companyname, 
                    'trips'
                FROM scooters
                ORDER BY companyname, source_table DESC;
                '''
with engine.connect() as connection:
    cname = pd.read_sql(text(cname_query), con = connection)
    
cname

Unnamed: 0,companyname,source_table
0,Bird,trips
1,Bird,scooters
2,Bolt,trips
3,Bolt,scooters
4,Gotcha,trips
5,Gotcha,scooters
6,Jump,trips
7,Jump,scooters
8,Lime,trips
9,Lime,scooters
