In [2]:
import pandas as pd
import pyarrow.parquet as pq

In [3]:
raw_df = pd.read_parquet('yellow_tripdata_2024-01.parquet')

In [4]:
len(raw_df)

2964624

In [5]:
df = raw_df.iloc[:100]

In [6]:
print(pd.io.sql.get_schema(df, name='yellow_taxi_data'))

CREATE TABLE "yellow_taxi_data" (
"VendorID" INTEGER,
  "tpep_pickup_datetime" TIMESTAMP,
  "tpep_dropoff_datetime" TIMESTAMP,
  "passenger_count" REAL,
  "trip_distance" REAL,
  "RatecodeID" REAL,
  "store_and_fwd_flag" TEXT,
  "PULocationID" INTEGER,
  "DOLocationID" INTEGER,
  "payment_type" INTEGER,
  "fare_amount" REAL,
  "extra" REAL,
  "mta_tax" REAL,
  "tip_amount" REAL,
  "tolls_amount" REAL,
  "improvement_surcharge" REAL,
  "total_amount" REAL,
  "congestion_surcharge" REAL,
  "Airport_fee" REAL
)


In [7]:
from sqlalchemy import create_engine

In [8]:
engine = create_engine('postgresql://root:root@localhost:5432/ny_taxi')

In [9]:
engine.connect()

<sqlalchemy.engine.base.Connection at 0x1d4df3675f0>

In [10]:
print(pd.io.sql.get_schema(df, name='yellow_taxi_data', con=engine))


CREATE TABLE yellow_taxi_data (
	"VendorID" INTEGER, 
	tpep_pickup_datetime TIMESTAMP WITHOUT TIME ZONE, 
	tpep_dropoff_datetime TIMESTAMP WITHOUT TIME ZONE, 
	passenger_count FLOAT(53), 
	trip_distance FLOAT(53), 
	"RatecodeID" FLOAT(53), 
	store_and_fwd_flag TEXT, 
	"PULocationID" INTEGER, 
	"DOLocationID" INTEGER, 
	payment_type BIGINT, 
	fare_amount FLOAT(53), 
	extra FLOAT(53), 
	mta_tax FLOAT(53), 
	tip_amount FLOAT(53), 
	tolls_amount FLOAT(53), 
	improvement_surcharge FLOAT(53), 
	total_amount FLOAT(53), 
	congestion_surcharge FLOAT(53), 
	"Airport_fee" FLOAT(53)
)




The above is what Pandas will execute when it wants to create a table in the database

In [11]:
# Open the Parquet file
parquet_file = pq.ParquetFile('yellow_tripdata_2024-01.parquet')

# Create an iterator from the batches
batch_iterator = parquet_file.iter_batches(batch_size=100000)

# Manually get the next batch using next()
try:
    while True:
        batch = next(batch_iterator)  # Get the next batch
        df_batch = batch.to_pandas()  # Convert to a Pandas DataFrame
        
        # Process the batch as needed
        print(df_batch.head())  # For example, print the first few rows
except StopIteration:
    print("No more batches to process.")


   VendorID tpep_pickup_datetime tpep_dropoff_datetime  passenger_count  \
0         2  2024-01-01 00:57:55   2024-01-01 01:17:43                1   
1         1  2024-01-01 00:03:00   2024-01-01 00:09:36                1   
2         1  2024-01-01 00:17:06   2024-01-01 00:35:01                1   
3         1  2024-01-01 00:36:38   2024-01-01 00:44:56                1   
4         1  2024-01-01 00:46:51   2024-01-01 00:52:57                1   

   trip_distance  RatecodeID store_and_fwd_flag  PULocationID  DOLocationID  \
0           1.72           1                  N           186            79   
1           1.80           1                  N           140           236   
2           4.70           1                  N           236            79   
3           1.40           1                  N            79           211   
4           0.80           1                  N           211           148   

   payment_type  fare_amount  extra  mta_tax  tip_amount  tolls_amount  \


In [12]:
df_batch

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee
0,1,2024-01-18 07:47:58,2024-01-18 08:17:46,,0.00,,,188,97,0,34.04,0.00,0.5,0.00,0.00,1.0,35.54,,
1,1,2024-01-18 07:14:47,2024-01-18 07:27:51,,0.00,,,238,246,0,22.93,0.00,0.5,0.00,0.00,1.0,26.93,,
2,2,2024-01-18 07:02:12,2024-01-18 07:11:12,,1.61,,,143,237,0,11.74,0.00,0.5,2.83,0.00,1.0,18.57,,
3,2,2024-01-18 07:51:17,2024-01-18 08:22:04,,5.27,,,80,107,0,29.37,0.00,0.5,0.00,0.00,1.0,33.37,,
4,1,2024-01-18 07:28:23,2024-01-18 07:37:54,,3.20,,,231,230,0,14.90,0.00,0.5,2.83,0.00,1.0,21.73,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64619,2,2024-01-31 23:45:59,2024-01-31 23:54:36,,3.18,,,107,263,0,15.77,0.00,0.5,2.00,0.00,1.0,21.77,,
64620,1,2024-01-31 23:13:07,2024-01-31 23:27:52,,4.00,,,114,236,0,18.40,1.00,0.5,2.34,0.00,1.0,25.74,,
64621,2,2024-01-31 23:19:00,2024-01-31 23:38:00,,3.33,,,211,25,0,19.97,0.00,0.5,0.00,0.00,1.0,23.97,,
64622,2,2024-01-31 23:07:23,2024-01-31 23:25:14,,3.06,,,107,13,0,23.88,0.00,0.5,5.58,0.00,1.0,33.46,,


In [14]:
# ## Test
# next(batch_iterator)

In [15]:
batch_iterator = parquet_file.iter_batches(batch_size=100000)

In [16]:
batch = next(batch_iterator)  # Get the next batch
df_batch = batch.to_pandas()

In [17]:
df_batch

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee
0,2,2024-01-01 00:57:55,2024-01-01 01:17:43,1,1.72,1,N,186,79,2,17.7,1.0,0.5,0.00,0.0,1.0,22.70,2.5,0.0
1,1,2024-01-01 00:03:00,2024-01-01 00:09:36,1,1.80,1,N,140,236,1,10.0,3.5,0.5,3.75,0.0,1.0,18.75,2.5,0.0
2,1,2024-01-01 00:17:06,2024-01-01 00:35:01,1,4.70,1,N,236,79,1,23.3,3.5,0.5,3.00,0.0,1.0,31.30,2.5,0.0
3,1,2024-01-01 00:36:38,2024-01-01 00:44:56,1,1.40,1,N,79,211,1,10.0,3.5,0.5,2.00,0.0,1.0,17.00,2.5,0.0
4,1,2024-01-01 00:46:51,2024-01-01 00:52:57,1,0.80,1,N,211,148,1,7.9,3.5,0.5,3.20,0.0,1.0,16.10,2.5,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,1,2024-01-02 13:34:18,2024-01-02 13:38:13,1,0.70,1,N,263,75,1,5.8,2.5,0.5,2.00,0.0,1.0,11.80,2.5,0.0
99996,1,2024-01-02 13:47:42,2024-01-02 13:59:26,1,1.70,1,N,236,238,1,11.4,2.5,0.5,4.60,0.0,1.0,20.00,2.5,0.0
99997,2,2024-01-02 13:23:51,2024-01-02 13:36:29,1,1.87,1,N,236,162,1,12.8,0.0,0.5,3.36,0.0,1.0,20.16,2.5,0.0
99998,2,2024-01-02 13:42:04,2024-01-02 13:48:39,1,1.58,1,N,229,263,2,9.3,0.0,0.5,0.00,0.0,1.0,13.30,2.5,0.0


In [18]:
# First batch of data
df_batch.to_sql(name='yellow_taxi_data', con=engine, if_exists='replace')

1000

In [19]:
from time import time

In [20]:
try:
    while True:
        start_time = time()

        batch = next(batch_iterator)  # Get the next batch
        df_batch = batch.to_pandas()
        
        df_batch.to_sql(name='yellow_taxi_data', con=engine, if_exists='append')
        end_time = time()
        print("next batch inserted ..., took %.3f seconds" % (end_time-start_time))
except StopIteration:
    print("reached the end of last batch")

next batch inserted ..., took 61.967 seconds
next batch inserted ..., took 48.220 seconds
next batch inserted ..., took 96.969 seconds
next batch inserted ..., took 110.498 seconds
next batch inserted ..., took 81.154 seconds
next batch inserted ..., took 63.863 seconds
next batch inserted ..., took 44.656 seconds
next batch inserted ..., took 65.447 seconds
next batch inserted ..., took 91.752 seconds
next batch inserted ..., took 75.833 seconds
next batch inserted ..., took 59.201 seconds
next batch inserted ..., took 54.892 seconds
next batch inserted ..., took 51.377 seconds
next batch inserted ..., took 61.372 seconds
next batch inserted ..., took 69.730 seconds
next batch inserted ..., took 68.013 seconds
next batch inserted ..., took 60.796 seconds
next batch inserted ..., took 60.618 seconds
next batch inserted ..., took 76.659 seconds
next batch inserted ..., took 76.159 seconds
next batch inserted ..., took 62.803 seconds
next batch inserted ..., took 55.604 seconds
next batc