In [1]:
import requests

In [2]:
# Download the 2019 FHV trip data, one gzipped CSV per month (January..December).
# `base_url` is a plain template; the month is substituted exactly once per iteration.
base_url = "https://github.com/DataTalksClub/nyc-tlc-data/releases/download/fhv/fhv_tripdata_2019-{month:02d}.csv.gz"

for month in range(1, 13):
    url = base_url.format(month=month)
    # Local names use the unpadded month ("..._2019_1.csv.gz"); later cells build
    # the same names, so the format must stay in sync with them.
    filename = f"fhv_tripdata_2019_{month}.csv.gz"

    # stream=True avoids holding a whole archive in memory; the timeout keeps a
    # stalled connection from hanging the cell forever.
    with requests.get(url, stream=True, timeout=60) as response:
        if response.status_code != 200:
            print(f"Failed to download data for month {month:02d}: HTTP {response.status_code}")
            continue

        with open(filename, "wb") as f:
            # Write the body to disk in 1 MiB chunks.
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                f.write(chunk)

    print(f"Downloaded {filename}")

Downloaded fhv_tripdata_2019_1.csv.gz
Downloaded fhv_tripdata_2019_2.csv.gz
Downloaded fhv_tripdata_2019_3.csv.gz
Downloaded fhv_tripdata_2019_4.csv.gz
Downloaded fhv_tripdata_2019_5.csv.gz
Downloaded fhv_tripdata_2019_6.csv.gz
Downloaded fhv_tripdata_2019_7.csv.gz
Downloaded fhv_tripdata_2019_8.csv.gz
Downloaded fhv_tripdata_2019_9.csv.gz
Downloaded fhv_tripdata_2019_10.csv.gz
Downloaded fhv_tripdata_2019_11.csv.gz
Downloaded fhv_tripdata_2019_12.csv.gz


In [3]:
import pandas as pd
import glob

# Collect every gzipped CSV in the working directory.
# glob returns files in filesystem order, so sort for a reproducible row order.
# NOTE: the sort is lexicographic ("..._10" comes before "..._2"), not chronological.
csv_files = sorted(glob.glob('*.csv.gz'))

# Read each file once and concatenate in a single call. Calling pd.concat inside
# the loop re-copies the accumulated frame on every iteration (quadratic cost).
monthly_frames = [pd.read_csv(csv_file, compression='gzip') for csv_file in csv_files]

# pd.concat raises on an empty list, so fall back to an empty frame when no
# files matched (same result the incremental version produced).
combined_df = pd.concat(monthly_frames) if monthly_frames else pd.DataFrame()

# Show the combined dataframe and persist it as a single uncompressed CSV.
# The per-file row index is kept and written as the first column, as before.
print(combined_df)
combined_df.to_csv('fhv_data_2019.csv')

: 

In [10]:
from google.cloud.storage import Client, transfer_manager

def upload_many_blobs_with_transfer_manager(filenames, bucket_name="taxi_2019_2020", source_directory="", workers=8):
    """Upload a list of local files to a Cloud Storage bucket and report each outcome.

    Args:
        filenames: File names to upload; passed to the transfer manager together
            with `source_directory`.
        bucket_name: Name of the destination bucket.
        source_directory: Directory forwarded to the transfer manager as
            `source_directory`.
        workers: Forwarded to the transfer manager as `max_workers`.

    Prints one line per file: a failure message when the corresponding result
    is an Exception instance, otherwise a success message. Returns None.
    """
    bucket = Client().bucket(bucket_name)

    outcomes = transfer_manager.upload_many_from_filenames(
        bucket,
        filenames,
        source_directory=source_directory,
        max_workers=workers,
    )

    # Results are paired with the input names positionally.
    for name, result in zip(filenames, outcomes):
        if not isinstance(result, Exception):
            print(f"Uploaded {name} to {bucket.name}.")
            continue
        print(f"Failed to upload {name} due to exception: {result}")

# Names of the twelve monthly archives (unpadded month number, 1 through 12).
filenames = [
    f"fhv_tripdata_2019_{month}.csv.gz"
    for month in range(1, 13)
]

# Upload all of them to the target bucket.
upload_many_blobs_with_transfer_manager(filenames, bucket_name="taxi_2019_2020")


Uploaded fhv_tripdata_2019_1.csv.gz to taxi_2019_2020.
Uploaded fhv_tripdata_2019_2.csv.gz to taxi_2019_2020.
Uploaded fhv_tripdata_2019_3.csv.gz to taxi_2019_2020.
Uploaded fhv_tripdata_2019_4.csv.gz to taxi_2019_2020.
Uploaded fhv_tripdata_2019_5.csv.gz to taxi_2019_2020.
Uploaded fhv_tripdata_2019_6.csv.gz to taxi_2019_2020.
Uploaded fhv_tripdata_2019_7.csv.gz to taxi_2019_2020.
Uploaded fhv_tripdata_2019_8.csv.gz to taxi_2019_2020.
Uploaded fhv_tripdata_2019_9.csv.gz to taxi_2019_2020.
Uploaded fhv_tripdata_2019_10.csv.gz to taxi_2019_2020.
Uploaded fhv_tripdata_2019_11.csv.gz to taxi_2019_2020.
Uploaded fhv_tripdata_2019_12.csv.gz to taxi_2019_2020.


In [14]:

from google.cloud import bigquery

# Initialize a BigQuery client
client = bigquery.Client()

# Destination dataset and table
dataset_id = 'trips_data_all'
table_id = 'fhv_data'

# Reference to the destination table in the client's default project.
# Built explicitly because Client.dataset() is deprecated in google-cloud-bigquery.
table_ref = bigquery.DatasetReference(client.project, dataset_id).table(table_id)

# Configure the load job to append the data to the destination table
job_config = bigquery.LoadJobConfig(
    source_format=bigquery.SourceFormat.CSV,
    skip_leading_rows=1,  # Skip the header row of each CSV file
    # Schema auto-detection is OFF and no schema is passed here, so the load
    # presumably relies on the schema of an already existing destination table
    # -- confirm the table exists before running this cell.
    autodetect=False,
    # NOTE: WRITE_APPEND is not idempotent; re-running this cell appends the
    # same rows to the table again.
    write_disposition=bigquery.WriteDisposition.WRITE_APPEND,
)

# Cloud Storage URI of the source files, using a wildcard for the month
uri = "gs://taxi_2019_2020/fhv_tripdata_2019_*.csv.gz"

# Start the load job
load_job = client.load_table_from_uri(
    uri,
    table_ref,
    job_config=job_config
)

# Block until the load job completes; result() raises if the job failed
load_job.result()

print(f"Data from {uri} appended to {dataset_id}.{table_id}")