In [1]:
import os

import pandas as pd
from sqlalchemy import create_engine

In [2]:
# Display the installed pandas version so the run environment is on record.
pd.__version__

'2.1.4'

In [3]:
!wget https://github.com/DataTalksClub/nyc-tlc-data/releases/download/green/green_tripdata_2019-09.csv.gz -O $(pwd)/data/green_tripdata_2019-09.csv.gz
!gzip -d $(pwd)/data/green_tripdata_2019-09.csv.gz
!wget https://s3.amazonaws.com/nyc-tlc/misc/taxi+_zone_lookup.csv -O $(pwd)/data/taxi_zone_lookup.csv

--2024-01-28 02:55:59--  https://github.com/DataTalksClub/nyc-tlc-data/releases/download/green/green_tripdata_2019-09.csv.gz
Resolving github.com (github.com)... 140.82.113.4
Connecting to github.com (github.com)|140.82.113.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://objects.githubusercontent.com/github-production-release-asset-2e65be/513814948/b5af7693-2f26-4bd5-8854-75edeb650bae?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAVCODYLSA53PQK4ZA%2F20240128%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20240128T025559Z&X-Amz-Expires=300&X-Amz-Signature=1cb17c7a6a3be22a233bf7371ee9eceada031d8fa37e56a44d71edc1b37fd472&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=513814948&response-content-disposition=attachment%3B%20filename%3Dgreen_tripdata_2019-09.csv.gz&response-content-type=application%2Foctet-stream [following]
--2024-01-28 02:55:59--  https://objects.githubusercontent.com/github-production-release-asset-2e65be/513814948/b5af7693

In [4]:
# Build the SQLAlchemy engine for the target Postgres database.
# The connection URL can be supplied through the NY_TAXI_DB_URL environment
# variable so that real credentials never have to be written into the
# notebook; when it is unset, the original localhost URL is used.
engine = create_engine(
    os.environ.get('NY_TAXI_DB_URL', 'postgresql://root:root@localhost:5432/ny_taxi')
)

In [5]:
# Load only the first 100 rows of the trip file as a small sample.
df = pd.read_csv(
    'data/green_tripdata_2019-09.csv',
    nrows=100,
)

In [6]:
# Replace the pickup and dropoff columns with parsed datetime values; every
# other column is carried over unchanged into the new frame.
df = df.assign(
    **{
        column: pd.to_datetime(df[column])
        for column in ('lpep_pickup_datetime', 'lpep_dropoff_datetime')
    }
)

In [7]:
# Print the CREATE TABLE statement pandas derives from the sample frame for
# the connected database (print keeps the multi-line DDL readable).
print(
    pd.io.sql.get_schema(
        df,
        name='green_taxi_data',
        con=engine,
    )
)


CREATE TABLE green_taxi_data (
	"VendorID" BIGINT, 
	lpep_pickup_datetime TIMESTAMP WITHOUT TIME ZONE, 
	lpep_dropoff_datetime TIMESTAMP WITHOUT TIME ZONE, 
	store_and_fwd_flag TEXT, 
	"RatecodeID" BIGINT, 
	"PULocationID" BIGINT, 
	"DOLocationID" BIGINT, 
	passenger_count BIGINT, 
	trip_distance FLOAT(53), 
	fare_amount FLOAT(53), 
	extra FLOAT(53), 
	mta_tax FLOAT(53), 
	tip_amount FLOAT(53), 
	tolls_amount FLOAT(53), 
	ehail_fee FLOAT(53), 
	improvement_surcharge FLOAT(53), 
	total_amount FLOAT(53), 
	payment_type BIGINT, 
	trip_type BIGINT, 
	congestion_surcharge FLOAT(53)
)




In [8]:
# Open the trip file as a chunked reader that hands back 100,000 rows per
# step rather than loading the whole CSV at once.
df_iter = pd.read_csv(
    'data/green_tripdata_2019-09.csv',
    chunksize=100_000,
    iterator=True,
)

In [9]:
# Take the first chunk from the chunked reader.
df = next(df_iter)

# Convert both trip timestamp columns to datetime values in this chunk.
df['lpep_pickup_datetime'] = pd.to_datetime(df['lpep_pickup_datetime'])
df['lpep_dropoff_datetime'] = pd.to_datetime(df['lpep_dropoff_datetime'])

In [10]:
# Number of rows held in the current chunk.
df.shape[0]

100000

In [11]:
# (Re)create the table from a zero-row slice of the chunk: the slice carries
# the column names and dtypes, so the table is defined without inserting data.
df.iloc[:0].to_sql(
    name='green_taxi_data',
    con=engine,
    if_exists='replace',
)

0

In [12]:
%time df.to_sql(name='green_taxi_data', con=engine, if_exists='append')

CPU times: user 6.97 s, sys: 92 ms, total: 7.06 s
Wall time: 11.2 s


1000

In [13]:
from time import time

In [14]:
# Stream every remaining chunk into the table. Each reported duration covers
# reading the chunk from the CSV, converting its timestamps, and the insert.
t_start = time()
for df in df_iter:
    df['lpep_pickup_datetime'] = pd.to_datetime(df['lpep_pickup_datetime'])
    df['lpep_dropoff_datetime'] = pd.to_datetime(df['lpep_dropoff_datetime'])

    df.to_sql(name='green_taxi_data', con=engine, if_exists='append')

    t_end = time()
    print('inserted another chunk, took %.3f second' % (t_end - t_start))

    # Restart the clock here so the next measurement also includes the read
    # that the loop header performs when fetching the following chunk.
    t_start = time()

inserted another chunk, took 11.258 second
inserted another chunk, took 11.299 second


  df = next(df_iter)


inserted another chunk, took 11.650 second
inserted another chunk, took 4.695 second


In [15]:
# Load the taxi zone lookup file downloaded earlier into its own frame.
df_zones = pd.read_csv(
    'data/taxi_zone_lookup.csv',
)

In [16]:
# Preview the first five rows of the zone lookup.
df_zones.head(n=5)

Unnamed: 0,LocationID,Borough,Zone,service_zone
0,1,EWR,Newark Airport,EWR
1,2,Queens,Jamaica Bay,Boro Zone
2,3,Bronx,Allerton/Pelham Gardens,Boro Zone
3,4,Manhattan,Alphabet City,Yellow Zone
4,5,Staten Island,Arden Heights,Boro Zone


In [17]:
# Write the zone lookup to the `zones` table, replacing any existing copy.
df_zones.to_sql(
    name='zones',
    con=engine,
    if_exists='replace',
)

265