In [1]:
# Print the version of the `python` executable that the shell resolves.
! python -V

Python 3.11.5


In [2]:
import os
import sys
from time import time
import pandas as pd
import pyarrow as pq
from sqlalchemy import create_engine

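<h3>Download Data Files</h3>
! wget https://github.com/DataTalksClub/nyc-tlc-data/releases/download/green/green_tripdata_2019-10.csv.gz
<br>
Decompress the trip data before loading it, because the cells below read <code>green_tripdata_2019-10.csv</code>:
<br>
! gunzip green_tripdata_2019-10.csv.gz
<br>
! wget https://github.com/DataTalksClub/nyc-tlc-data/releases/download/misc/taxi_zone_lookup.csv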


In [3]:
# Build the Postgres connection URL from environment variables so that
# credentials are not baked into the notebook. The defaults reproduce the
# original local setup (root/root on localhost:5432, database ny_taxi).
# Set PG_HOST=pgdatabase when running inside the Jupyter container.
# NOTE(review): a password containing URL-special characters would need to be
# percent-encoded before being placed in the URL.
pg_user = os.getenv('PG_USER', 'root')
pg_password = os.getenv('PG_PASSWORD', 'root')
pg_host = os.getenv('PG_HOST', 'localhost')
pg_port = os.getenv('PG_PORT', '5432')
pg_db = os.getenv('PG_DB', 'ny_taxi')
engine = create_engine(f'postgresql://{pg_user}:{pg_password}@{pg_host}:{pg_port}/{pg_db}')

In [4]:
# Smoke test: a trivial single-row query confirms the engine can reach the
# database before any real work is attempted.
query = """
SELECT 1 as number;
"""

pd.read_sql(sql=query, con=engine)

Unnamed: 0,number
0,1


In [5]:
# Read every row of the existing `cars` table into a DataFrame for display.
query = """
SELECT * from cars;
"""

pd.read_sql(sql=query, con=engine)

Unnamed: 0,brand,model,year
0,Ford,Mustang,1964
1,Tata,Punch,2020


In [8]:
# Load only the first 100 rows: enough to inspect the columns and derive a
# table schema without reading the whole file.
df = pd.read_csv(
    filepath_or_buffer='green_tripdata_2019-10.csv',
    nrows=100,
)

In [9]:
# Display the sample frame to eyeball the column names and values.
df

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge
0,2,2019-10-01 00:26:02,2019-10-01 00:39:58,N,1,112,196,1,5.88,18.0,0.50,0.5,0.00,0.0,,0.3,19.30,2,1,0.0
1,1,2019-10-01 00:18:11,2019-10-01 00:22:38,N,1,43,263,1,0.80,5.0,3.25,0.5,0.00,0.0,,0.3,9.05,2,1,0.0
2,1,2019-10-01 00:09:31,2019-10-01 00:24:47,N,1,255,228,2,7.50,21.5,0.50,0.5,0.00,0.0,,0.3,22.80,2,1,0.0
3,1,2019-10-01 00:37:40,2019-10-01 00:41:49,N,1,181,181,1,0.90,5.5,0.50,0.5,0.00,0.0,,0.3,6.80,2,1,0.0
4,2,2019-10-01 00:08:13,2019-10-01 00:17:56,N,1,97,188,1,2.52,10.0,0.50,0.5,2.26,0.0,,0.3,13.56,1,1,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,2,2019-10-01 00:02:53,2019-10-01 00:14:32,N,1,126,74,1,3.10,12.0,0.50,0.5,0.00,0.0,,0.3,13.30,1,1,0.0
96,2,2019-10-01 00:18:45,2019-10-01 00:29:23,N,1,42,74,1,1.64,9.5,0.50,0.5,0.00,0.0,,0.3,10.80,2,1,0.0
97,2,2019-10-01 00:41:32,2019-10-01 00:52:51,N,1,75,42,1,3.17,11.5,0.50,0.5,1.50,0.0,,0.3,14.30,1,1,0.0
98,2,2019-10-01 00:36:54,2019-10-01 00:54:20,N,1,92,179,1,5.48,19.5,0.50,0.5,0.00,0.0,,0.3,20.80,2,1,0.0


In [10]:
# Parse the pickup/dropoff columns into real datetimes so the generated
# schema uses a timestamp type for them.
df['lpep_pickup_datetime'] = pd.to_datetime(df['lpep_pickup_datetime'])
df['lpep_dropoff_datetime'] = pd.to_datetime(df['lpep_dropoff_datetime'])

In [11]:
# Preview the CREATE TABLE statement pandas would generate for this frame on
# the connected database; nothing is executed against the database here.
ddl_preview = pd.io.sql.get_schema(df, name='yellow_taxi_data', con=engine)
print(ddl_preview)


CREATE TABLE yellow_taxi_data (
	"VendorID" BIGINT, 
	lpep_pickup_datetime TIMESTAMP WITHOUT TIME ZONE, 
	lpep_dropoff_datetime TIMESTAMP WITHOUT TIME ZONE, 
	store_and_fwd_flag TEXT, 
	"RatecodeID" BIGINT, 
	"PULocationID" BIGINT, 
	"DOLocationID" BIGINT, 
	passenger_count BIGINT, 
	trip_distance FLOAT(53), 
	fare_amount FLOAT(53), 
	extra FLOAT(53), 
	mta_tax FLOAT(53), 
	tip_amount FLOAT(53), 
	tolls_amount FLOAT(53), 
	ehail_fee FLOAT(53), 
	improvement_surcharge FLOAT(53), 
	total_amount FLOAT(53), 
	payment_type BIGINT, 
	trip_type BIGINT, 
	congestion_surcharge FLOAT(53)
)




In [16]:
# Source CSV and destination table for the trip-data ingestion below.
# NOTE(review): the table is named "yellow" although the file holds green-taxi
# trips; kept as-is because later queries reference this table name.
file_name = 'green_tripdata_2019-10.csv'
table_name = 'yellow_taxi_data'

In [22]:
# Create (or recreate) an empty table whose columns mirror the DataFrame.
# head(0) keeps the column names and dtypes but carries no rows.
# index=False stops pandas from adding an extra "index" column, which the
# index=False batch appends would otherwise leave NULL on every row.
df.head(0).to_sql(name=table_name, con=engine, if_exists='replace', index=False)
print(f'Table {table_name} created successfully.\n')

Table yellow_taxi_data created successfully.



In [23]:
# Stream the CSV in 100,000-row chunks instead of loading it all at once.
# The reader is a one-shot iterator: re-run this cell before inserting again.
df_iter = pd.read_csv(
    file_name,
    chunksize=100000,
    iterator=True,
)

In [24]:
def insert_batches(engine, table_name, df_iter,
                   datetime_columns=('lpep_pickup_datetime', 'lpep_dropoff_datetime')):
    """Append every chunk from ``df_iter`` to ``table_name``, printing timings.

    Args:
        engine: Connection or engine passed to ``DataFrame.to_sql`` as ``con``.
        table_name: Name of the destination table; rows are appended to it.
        df_iter: Iterable of DataFrames, e.g. a chunked ``pd.read_csv`` reader.
        datetime_columns: Names of columns to parse with ``pd.to_datetime``
            before each insert. Defaults to the two trip timestamp columns;
            pass ``()`` for data that has no timestamp columns.

    Returns:
        None. Progress and timing information is printed to stdout.
    """
    t_start = time()
    batch_count = 0

    for batch_df in df_iter:
        batch_count += 1
        print(f'Inserting batch {batch_count}...')
        b_start = time()
        # CSV chunks arrive with timestamps as plain strings; convert them so
        # they are stored with a datetime type rather than as text.
        for column in datetime_columns:
            batch_df[column] = pd.to_datetime(batch_df[column])
        # index=False: the DataFrame's positional index is not a table column.
        batch_df.to_sql(name=table_name, con=engine, if_exists='append', index=False)
        b_end = time()
        print(f'Batch {batch_count} inserted in {b_end - b_start:.3f} seconds.\n')

    t_end = time()
    print(f'Insertion completed! Total time: {t_end - t_start:.3f} seconds for {batch_count} batches.')

In [25]:
# Stream every CSV chunk into the destination table, one batch at a time.
insert_batches(engine=engine, table_name=table_name, df_iter=df_iter)

Inserting batch 1...
Batch 1 inserted in 7.287 seconds.

Inserting batch 2...
Batch 2 inserted in 7.162 seconds.

Inserting batch 3...
Batch 3 inserted in 7.211 seconds.

Inserting batch 4...


  for batch_df in df_iter:


Batch 4 inserted in 7.164 seconds.

Inserting batch 5...
Batch 5 inserted in 4.736 seconds.

Insertion completed! Total time: 34.422 seconds for 5 batches.


In [None]:
# Sanity-check the load by counting the rows now present in the trips table.
# The original text `count*(*` was malformed SQL and would raise a syntax error.
query = """
SELECT count(*) from yellow_taxi_data;
"""
pd.read_sql(query, con=engine)

### Ingest the taxi zone lookup table

In [6]:
# Re-point the shared source/destination names at the zone lookup data.
file_name = 'taxi_zone_lookup.csv'
table_name = 'taxi_zone_lookup'

In [10]:
# Read the lookup CSV in a single call (no chunking is used for this file).
# This rebinds `df`, replacing the trip-data sample loaded earlier.
df = pd.read_csv(filepath_or_buffer=file_name)

In [9]:
# Create (or recreate) an empty table whose columns mirror the DataFrame.
# head(0) keeps the column names and dtypes but carries no rows.
# index=False stops pandas from adding an extra "index" column, which the
# index=False append of the actual rows would otherwise leave NULL.
df.head(0).to_sql(name=table_name, con=engine, if_exists='replace', index=False)
print(f'Table {table_name} created successfully.\n')

Table taxi_zone_lookup created successfully.



In [12]:
# Append the whole lookup frame in one call and report how long it took.
started_at = time()
df.to_sql(name=table_name, con=engine, if_exists='append', index=False)
elapsed = time() - started_at
print(f'Insertion completed! Total time: {elapsed:.3f} seconds')

Insertion completed! Total time: 0.065 seconds
