In [12]:
# Load the `sql` IPython extension; the %%sql cells later in this notebook rely on it,
# so this cell has to be executed before any of them.
%load_ext sql

In [1]:
import json
from urllib.parse import quote

import pandas as pd
from sqlalchemy import create_engine

## Import Zone Dataset

## Data
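Source: https://d37ci6vzurychx.cloudfront.net/misc/taxi+_zone_lookup.parquet

Note: the code below reads a local CSV copy of this lookup table from `../data/taxi+_zone_lookup.csv`, so the data must be available at that path in CSV format before running the notebook.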

In [6]:
# Path of the zone lookup CSV, relative to this notebook.
csv_file = '../data/taxi+_zone_lookup.csv'

# Connection settings are read from a JSON file so that credentials are not
# written into the notebook. The keys used below are user, password, host,
# port and db.
with open('./db_connection.json', 'r', encoding='utf-8') as f:
    db_conf = json.load(f)

# Build the connection URL as a plain string (it is passed both to
# create_engine and to the %%sql magic). The user name and password are
# percent-encoded with safe="" so that characters such as "@", ":" or "/"
# inside a credential are not mistaken for URL delimiters.
db_url = (
    f'postgresql://{quote(str(db_conf["user"]), safe="")}'
    f':{quote(str(db_conf["password"]), safe="")}'
    f'@{db_conf["host"]}:{db_conf["port"]}/{db_conf["db"]}'
)

In [7]:
# Read the zone lookup file into a DataFrame and open an engine for the target database.
df_zone = pd.read_csv(csv_file)
engine = create_engine(db_url)

# Write the frame to the `zones` table, replacing the table if it already exists.
# index=True is the pandas default and is spelled out here because it stores the
# DataFrame index as an extra "index" column in the table.
# The call is the last expression of the cell, so its return value is displayed.
df_zone.to_sql(
    name="zones",
    con=engine,
    if_exists="replace",
    index=True,
)

265

## Playing with SQL

In [14]:
%%sql $db_url
-- Preview the first ten rows of the zones table to confirm the load worked.
SELECT *
FROM zones
LIMIT 10

10 rows affected.


index,LocationID,Borough,Zone,service_zone
0,1,EWR,Newark Airport,EWR
1,2,Queens,Jamaica Bay,Boro Zone
2,3,Bronx,Allerton/Pelham Gardens,Boro Zone
3,4,Manhattan,Alphabet City,Yellow Zone
4,5,Staten Island,Arden Heights,Boro Zone
5,6,Staten Island,Arrochar/Fort Wadsworth,Boro Zone
6,7,Queens,Astoria,Boro Zone
7,8,Queens,Astoria Park,Boro Zone
8,9,Queens,Auburndale,Boro Zone
9,10,Queens,Baisley Park,Boro Zone


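Query 5 trips together with their actual zone names by joining the trips table to `zones` (here as an implicit join: comma-separated tables matched in the `WHERE` clause).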

In [20]:
%%sql
-- Five trips with readable pickup and drop-off locations (implicit join form).
-- The zones table is listed twice, once per location id, and matched in WHERE.
SELECT
    t.tpep_pickup_datetime,
    t.tpep_dropoff_datetime,
    t.total_amount,
    CONCAT(pickup_zone."Borough", ' / ', pickup_zone."Zone") AS pickup_loc,
    CONCAT(dropoff_zone."Borough", ' / ', dropoff_zone."Zone") AS dropoff_loc
FROM
    yellow_taxi_trips AS t,
    zones AS pickup_zone,
    zones AS dropoff_zone
WHERE
    pickup_zone."LocationID" = t."PULocationID"
    AND dropoff_zone."LocationID" = t."DOLocationID"
LIMIT 5

 * postgresql://root:***@localhost:5432/ny_taxi
5 rows affected.


tpep_pickup_datetime,tpep_dropoff_datetime,total_amount,pickup_loc,dropoff_loc
2021-01-01 00:30:10,2021-01-01 00:36:12,11.8,Manhattan / Lincoln Square East,Manhattan / Central Park
2021-01-01 00:51:20,2021-01-01 00:52:19,4.3,Manhattan / Upper West Side North,Manhattan / Manhattan Valley
2021-01-01 00:43:30,2021-01-01 01:11:06,51.95,Queens / JFK Airport,Brooklyn / Midwood
2021-01-01 00:15:48,2021-01-01 00:31:01,36.35,Queens / LaGuardia Airport,Queens / JFK Airport
2021-01-01 00:31:49,2021-01-01 00:48:21,24.36,Manhattan / East Chelsea,Brooklyn / Brooklyn Heights


Same query as above, rewritten with explicit `JOIN ... ON` clauses.

In [21]:
%%sql
-- Five trips with readable pickup and drop-off locations (explicit join form).
-- The zones table is joined twice, once for each location id on the trip.
SELECT
    t.tpep_pickup_datetime,
    t.tpep_dropoff_datetime,
    t.total_amount,
    CONCAT(pickup_zone."Borough", ' / ', pickup_zone."Zone") AS pickup_loc,
    CONCAT(dropoff_zone."Borough", ' / ', dropoff_zone."Zone") AS dropoff_loc
FROM yellow_taxi_trips AS t
INNER JOIN zones AS pickup_zone
    ON pickup_zone."LocationID" = t."PULocationID"
INNER JOIN zones AS dropoff_zone
    ON dropoff_zone."LocationID" = t."DOLocationID"
LIMIT 5

 * postgresql://root:***@localhost:5432/ny_taxi
5 rows affected.


tpep_pickup_datetime,tpep_dropoff_datetime,total_amount,pickup_loc,dropoff_loc
2021-01-01 00:30:10,2021-01-01 00:36:12,11.8,Manhattan / Lincoln Square East,Manhattan / Central Park
2021-01-01 00:51:20,2021-01-01 00:52:19,4.3,Manhattan / Upper West Side North,Manhattan / Manhattan Valley
2021-01-01 00:43:30,2021-01-01 01:11:06,51.95,Queens / JFK Airport,Brooklyn / Midwood
2021-01-01 00:15:48,2021-01-01 00:31:01,36.35,Queens / LaGuardia Airport,Queens / JFK Airport
2021-01-01 00:31:49,2021-01-01 00:48:21,24.36,Manhattan / East Chelsea,Brooklyn / Brooklyn Heights


Check whether any trips have a pickup or drop-off location that is missing from the `zones` table.

In [29]:
%%sql
-- Trips whose pickup or drop-off location has no matching row in zones.
-- Both location columns are checked, not only the pickup one.
-- NOT EXISTS is used instead of NOT IN so that NULL ids behave sensibly.
-- A trip with a NULL location id is reported as unmatched, and a NULL
-- "LocationID" inside zones cannot silently turn the result into zero rows.
select
    t.tpep_pickup_datetime,
    t.tpep_dropoff_datetime,
    t.total_amount,
    t."PULocationID",
    t."DOLocationID"
from
    yellow_taxi_trips t
where
    not exists (select 1 from zones z where z."LocationID" = t."PULocationID")
    or not exists (select 1 from zones z where z."LocationID" = t."DOLocationID")
limit 5

 * postgresql://root:***@localhost:5432/ny_taxi
0 rows affected.


tpep_pickup_datetime,tpep_dropoff_datetime,total_amount,PULocationID,DOLocationID


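Group trips by drop-off day: number of trips, maximum total amount and maximum passenger count per day, showing the 10 most recent days.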

In [36]:
%%sql
-- Per-day summary keyed on the drop-off date, newest ten days first.
-- The aggregates are left unaliased, so the result keeps its default column names.
-- GROUP BY and ORDER BY refer to the first select-list item, the "day" column.
SELECT
    CAST(t.tpep_dropoff_datetime AS DATE) AS "day",
    COUNT(1),
    MAX(t.total_amount),
    MAX(t.passenger_count)
FROM yellow_taxi_trips AS t
GROUP BY 1
ORDER BY 1 DESC
LIMIT 10

 * postgresql://root:***@localhost:5432/ny_taxi
10 rows affected.


day,count,max,max_1
2021-02-22,1,10.56,1.0
2021-02-01,122,166.55,6.0
2021-01-31,31278,420.96,6.0
2021-01-30,39228,360.96,6.0
2021-01-29,54601,400.3,8.0
2021-01-28,56385,203.8,6.0
2021-01-27,52676,831.0,6.0
2021-01-26,47821,275.8,7.0
2021-01-25,47409,715.3,6.0
2021-01-24,31093,340.67,6.0
