In [1]:
import os
import requests
import pandas as pd
import pyarrow.parquet as pq
import pyarrow as pa
from tqdm import tqdm

# NYC Taxi Data Base URL; "<month>.parquet" is appended to it for each download
BASE_URL = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_"

# Save directory (created if it does not exist yet)
SAVE_DIR = "nyc_taxi_data"
os.makedirs(SAVE_DIR, exist_ok=True)

# Months to Download
months = ["2024-01", "2024-02", "2024-03", "2024-04", "2024-05", "2024-06"]

# (connect, read) timeouts in seconds so a stalled connection cannot hang the cell forever
REQUEST_TIMEOUT = (10, 60)

# Stream in 1 MiB pieces; the files are tens of MB, so tiny chunks only add loop overhead
CHUNK_SIZE = 1024 * 1024

# Months that could not be fetched, reported once the loop is done
failed_months = []

for month in tqdm(months, desc="Downloading NYC Taxi Data"):
    file_url = f"{BASE_URL}{month}.parquet"
    save_path = os.path.join(SAVE_DIR, f"yellow_tripdata_{month}.parquet")
    # Write to a temporary name first so an interrupted download never leaves
    # a truncated file under the final ".parquet" name.
    partial_path = save_path + ".part"

    try:
        # The context manager releases the connection even if streaming fails
        with requests.get(file_url, stream=True, timeout=REQUEST_TIMEOUT) as response:
            if response.status_code != 200:
                print(f"❌ Failed to download {month} (HTTP {response.status_code})")
                failed_months.append(month)
                continue
            with open(partial_path, "wb") as file:
                for chunk in response.iter_content(chunk_size=CHUNK_SIZE):
                    file.write(chunk)
    except requests.RequestException as exc:
        # Network-level failure (timeout, dropped connection, ...): discard the
        # incomplete file and carry on with the remaining months.
        if os.path.exists(partial_path):
            os.remove(partial_path)
        print(f"❌ Failed to download {month} ({exc})")
        failed_months.append(month)
        continue

    # Only a fully written file is moved to its final name
    os.replace(partial_path, save_path)
    print(f"✅ {month} data saved at {save_path}")

# Report success only when every month actually arrived
if failed_months:
    print(f"❌ Download finished with failures: {', '.join(failed_months)}")
else:
    print("✅ All files downloaded and saved as Parquet.")


Downloading NYC Taxi Data:  17%|█▋        | 1/6 [00:13<01:05, 13.06s/it]

✅ 2024-01 data saved at nyc_taxi_data/yellow_tripdata_2024-01.parquet


Downloading NYC Taxi Data:  33%|███▎      | 2/6 [00:26<00:54, 13.57s/it]

✅ 2024-02 data saved at nyc_taxi_data/yellow_tripdata_2024-02.parquet


Downloading NYC Taxi Data:  50%|█████     | 3/6 [00:46<00:48, 16.24s/it]

✅ 2024-03 data saved at nyc_taxi_data/yellow_tripdata_2024-03.parquet


Downloading NYC Taxi Data:  67%|██████▋   | 4/6 [01:03<00:33, 16.77s/it]

✅ 2024-04 data saved at nyc_taxi_data/yellow_tripdata_2024-04.parquet


Downloading NYC Taxi Data:  83%|████████▎ | 5/6 [01:19<00:16, 16.21s/it]

✅ 2024-05 data saved at nyc_taxi_data/yellow_tripdata_2024-05.parquet


Downloading NYC Taxi Data: 100%|██████████| 6/6 [01:40<00:00, 16.79s/it]

✅ 2024-06 data saved at nyc_taxi_data/yellow_tripdata_2024-06.parquet
✅ All files downloaded and saved as Parquet.





In [1]:
import pandas as pd
import os

# Imported in this cell so it also runs on a fresh kernel
import pyarrow.parquet as pq

# Directory where Parquet files are stored
DATA_DIR = "nyc_taxi_data"

# List all parquet files (sorted so the order is stable between runs)
parquet_files = sorted(
    os.path.join(DATA_DIR, f) for f in os.listdir(DATA_DIR) if f.endswith(".parquet")
)

# Count total records from each file's footer metadata instead of loading
# every column of every file into memory just to read the row count.
total_records = sum(pq.read_metadata(file).num_rows for file in parquet_files)

print(f"✅ Total records in 2024 Yellow Taxi Data: {total_records}")


✅ Total records in 2024 Yellow Taxi Data: 20332093


In [3]:
import pandas as pd
import os

# Folder holding the downloaded Parquet files
DATA_DIR = "nyc_taxi_data"

# Build the full path of every ".parquet" entry found in DATA_DIR
parquet_files = []
for entry in os.listdir(DATA_DIR):
    if entry.endswith(".parquet"):
        parquet_files.append(os.path.join(DATA_DIR, entry))

# Gather the non-null PULocationID values seen across all files
distinct_pu_set = set()
for file in parquet_files:
    # Read just the one column that is needed
    df = pd.read_parquet(file, columns=["PULocationID"])
    pickup_ids = df["PULocationID"].dropna()
    distinct_pu_set |= set(pickup_ids.unique())

print(f"✅ Distinct PULocationID count: {len(distinct_pu_set)}")


✅ Distinct PULocationID count: 262
