In [22]:
from google.cloud import storage
import pandas as pd
import pyarrow.parquet as pq
import pyarrow as pa
import requests


# https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2022-01.parquet

In [18]:
base_url = "https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2022-{month:02d}.parquet"

# Fetch the twelve monthly 2022 green-taxi parquet files into the working directory.
for month in range(1, 13):
    url = base_url.format(month=month)
    try:
        # Without a timeout, requests waits indefinitely on a stalled connection,
        # which would hang the whole notebook run.
        response = requests.get(url, timeout=60)
    except requests.RequestException as exc:
        # Report network-level failures the same way HTTP failures are reported
        # below, and keep going so one bad month does not abort the others.
        print(f"Failed to download data for month {month:02d}: {exc}")
        continue
    if response.status_code == 200:
        filename = f"green_tripdata_2022-{month:02d}.parquet"
        with open(filename, "wb") as f:
            f.write(response.content)
        print(f"Downloaded {filename}")
    else:
        print(f"Failed to download data for month {month:02d}: HTTP {response.status_code}")



Downloaded green_tripdata_2022-01.parquet
Downloaded green_tripdata_2022-02.parquet
Downloaded green_tripdata_2022-03.parquet
Downloaded green_tripdata_2022-04.parquet
Downloaded green_tripdata_2022-05.parquet
Downloaded green_tripdata_2022-06.parquet
Downloaded green_tripdata_2022-07.parquet
Downloaded green_tripdata_2022-08.parquet
Downloaded green_tripdata_2022-09.parquet
Downloaded green_tripdata_2022-10.parquet
Downloaded green_tripdata_2022-11.parquet
Downloaded green_tripdata_2022-12.parquet


In [21]:
# Sanity check: load the January file and show its first rows.
file_path = "green_tripdata_2022-01.parquet"

# Read into an Arrow table first, then convert to a pandas DataFrame.
table = pq.read_table(file_path)
df = table.to_pandas()

print(df.head(n=5))

   VendorID lpep_pickup_datetime lpep_dropoff_datetime store_and_fwd_flag  \
0         2  2022-01-01 00:14:21   2022-01-01 00:15:33                  N   
1         1  2022-01-01 00:20:55   2022-01-01 00:29:38                  N   
2         1  2022-01-01 00:57:02   2022-01-01 01:13:14                  N   
3         2  2022-01-01 00:07:42   2022-01-01 00:15:57                  N   
4         2  2022-01-01 00:07:50   2022-01-01 00:28:52                  N   

   RatecodeID  PULocationID  DOLocationID  passenger_count  trip_distance  \
0         1.0            42            42              1.0           0.44   
1         1.0           116            41              1.0           2.10   
2         1.0            41           140              1.0           3.70   
3         1.0           181           181              1.0           1.69   
4         1.0            33           170              1.0           6.26   

   fare_amount  extra  mta_tax  tip_amount  tolls_amount ehail_fee  \
0   

In [23]:
file_paths = [f'green_tripdata_2022-{month:02d}.parquet' for month in range(1, 13)]

# Read every monthly file and concatenate them in a single call, in month order,
# instead of growing the table pairwise inside a loop. This also avoids
# rebinding the notebook-level `table` / `file_path` names as loop variables.
merged_table = pa.concat_tables([pq.read_table(path) for path in file_paths])

# Persist the combined table as one parquet file in the working directory.
output_file = 'merged_green_tripdata_2022.parquet'
pq.write_table(merged_table, output_file)

print(f'Merged file saved as {output_file}')

Merged file saved as merged_green_tripdata_2022.parquet


In [24]:
# Read the merged file back from disk and show its first rows.
file_path = "merged_green_tripdata_2022.parquet"

# Read into an Arrow table first, then convert to a pandas DataFrame.
table = pq.read_table(file_path)
df = table.to_pandas()

print(df.head(n=5))

   VendorID lpep_pickup_datetime lpep_dropoff_datetime store_and_fwd_flag  \
0         2  2022-01-01 00:14:21   2022-01-01 00:15:33                  N   
1         1  2022-01-01 00:20:55   2022-01-01 00:29:38                  N   
2         1  2022-01-01 00:57:02   2022-01-01 01:13:14                  N   
3         2  2022-01-01 00:07:42   2022-01-01 00:15:57                  N   
4         2  2022-01-01 00:07:50   2022-01-01 00:28:52                  N   

   RatecodeID  PULocationID  DOLocationID  passenger_count  trip_distance  \
0         1.0            42            42              1.0           0.44   
1         1.0           116            41              1.0           2.10   
2         1.0            41           140              1.0           3.70   
3         1.0           181           181              1.0           1.69   
4         1.0            33           170              1.0           6.26   

   fare_amount  extra  mta_tax  tip_amount  tolls_amount ehail_fee  \
0   

In [27]:
def upload_blob(bucket_name, source_file_name, destination_blob_name):
    """Upload a local file to a Google Cloud Storage bucket.

    Args:
        bucket_name: ID of the GCS bucket to upload into.
        source_file_name: Path of the local file to upload.
        destination_blob_name: ID (object name) the file gets inside the bucket.

    Prints a confirmation line once the upload call returns.
    """
    # The arguments are used as passed. Previously they were overwritten with
    # hard-coded values at the top of the body, so callers could not choose a
    # different bucket, file, or object name.
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)

    blob.upload_from_filename(source_file_name)

    print(
        f"File {source_file_name} uploaded to {destination_blob_name}."
    )
# Upload the merged 2022 parquet file to the bucket.
upload_blob(
    bucket_name="green_taxi_data_22",
    source_file_name="merged_green_tripdata_2022.parquet",
    destination_blob_name="green_taxi_data_2022",
)

File merged_green_tripdata_2022.parquet uploaded to green_taxi_data_2022.
