In [29]:
import pandas as pd
from sqlalchemy import create_engine
from time import time

In [30]:
df = pd.read_csv('yellow_tripdata_2021-01.csv', nrows=100)

In [31]:
df.tpep_pickup_datetime = pd.to_datetime(df.tpep_pickup_datetime)
df.tpep_dropoff_datetime = pd.to_datetime(df.tpep_dropoff_datetime)

In [32]:
engine = create_engine('postgresql://root:root@localhost:5432/ny_taxi')

In [33]:
print(pd.io.sql.get_schema(df, name='yellow_taxi_data', con=engine))


CREATE TABLE yellow_taxi_data (
	"VendorID" BIGINT, 
	tpep_pickup_datetime TIMESTAMP WITHOUT TIME ZONE, 
	tpep_dropoff_datetime TIMESTAMP WITHOUT TIME ZONE, 
	passenger_count BIGINT, 
	trip_distance FLOAT(53), 
	"RatecodeID" BIGINT, 
	store_and_fwd_flag TEXT, 
	"PULocationID" BIGINT, 
	"DOLocationID" BIGINT, 
	payment_type BIGINT, 
	fare_amount FLOAT(53), 
	extra FLOAT(53), 
	mta_tax FLOAT(53), 
	tip_amount FLOAT(53), 
	tolls_amount FLOAT(53), 
	improvement_surcharge FLOAT(53), 
	total_amount FLOAT(53), 
	congestion_surcharge FLOAT(53)
)




In [45]:
df_iter = pd.read_csv('yellow_tripdata_2021-01.csv', 
                      iterator=True, 
                      chunksize=100000, 
                      dtype={
                            "Unnamed: 0": 'int', 
                            "VendorID": 'int', 
                            'tpep_pickup_datetime': 'str', 
                            'tpep_dropoff_datetime': 'str', 
                            'passenger_count': 'float', 
                            'trip_distance': 'float', 
                            "RatecodeID": 'float', 
                            'store_and_fwd_flag': 'str', 
                            "PULocationID": 'int', 
                            "DOLocationID": 'int', 
                            'payment_type': 'int', 
                            'fare_amount': 'float', 
                            'extra': 'float', 
                            'mta_tax': 'float', 
                            'tip_amount': 'float',
                            'tolls_amount': 'float', 
                            'improvement_surcharge': 'float', 
                            'total_amount': 'float', 
                            'congestion_surcharge': 'float', 
                            'airport_fee': 'float'
                      },
                      parse_dates=['tpep_pickup_datetime', 'tpep_dropoff_datetime'])

In [46]:
df = next(df_iter)

In [47]:
df.head(n=0).to_sql(name='yellow_taxi_data', con=engine, if_exists='replace')

0

In [48]:
%time df.to_sql(name='yellow_taxi_data', con=engine, if_exists='append')

CPU times: user 4.32 s, sys: 25.3 ms, total: 4.34 s
Wall time: 7.19 s


1000

In [49]:
while True: 
    t_start = time()

    df = next(df_iter)

    df.tpep_pickup_datetime = pd.to_datetime(df.tpep_pickup_datetime)
    df.tpep_dropoff_datetime = pd.to_datetime(df.tpep_dropoff_datetime)
    
    df.to_sql(name='yellow_taxi_data', con=engine, if_exists='append')

    t_end = time()

    print('inserted another chunk, took %.3f second' % (t_end - t_start))

inserted another chunk, took 7.434 second
inserted another chunk, took 7.228 second
inserted another chunk, took 7.946 second
inserted another chunk, took 8.020 second
inserted another chunk, took 7.229 second
inserted another chunk, took 7.596 second
inserted another chunk, took 7.392 second
inserted another chunk, took 7.547 second
inserted another chunk, took 8.577 second
inserted another chunk, took 7.251 second
inserted another chunk, took 7.155 second


ValueError: Integer column has NA values in column 0

In [None]:
#!wget https://s3.amazonaws.com/nyc-tlc/misc/taxi+_zone_lookup.csv

In [None]:
df_zones = pd.read_csv('taxi+_zone_lookup.csv')

In [None]:
df_zones.head()

In [None]:
df_zones.to_sql(name='zones', con=engine, if_exists='replace')