In [None]:
from pathlib import Path
import requests

def download_all_zip_files():
    # List of confirmed file names based on the screenshot
    file_names = [
        "202401-citibike-tripdata.csv.zip",
        "202402-citibike-tripdata.csv.zip",
        "202403-citibike-tripdata.csv.zip",
        "202404-citibike-tripdata.csv.zip",
        "202405-citibike-tripdata.zip",
        "202406-citibike-tripdata.zip",
        "202407-citibike-tripdata.zip",
        "202408-citibike-tripdata.zip",
        "202409-citibike-tripdata.zip",
        "202410-citibike-tripdata.zip",
        "202411-citibike-tripdata.zip",
        "202412-citibike-tripdata.zip",
        "202501-citibike-tripdata.zip",
        "202502-citibike-tripdata.zip",
        "202503-citibike-tripdata.csv.zip"
    ]

    for fname in file_names:
        url = f"https://s3.amazonaws.com/tripdata/{fname}"
        response = requests.get(url)
        if response.status_code == 200:
            path = Path("..") / "data" / "raw" / fname
            path.parent.mkdir(parents=True, exist_ok=True)
            path.write_bytes(response.content)
            print(f"✅ Downloaded: {path}")
        else:
            print(f"❌ Failed: {url}")

# Run the function
download_all_zip_files()


✅ Downloaded: ..\data\raw\202401-citibike-tripdata.csv.zip
✅ Downloaded: ..\data\raw\202402-citibike-tripdata.csv.zip
✅ Downloaded: ..\data\raw\202403-citibike-tripdata.csv.zip
✅ Downloaded: ..\data\raw\202404-citibike-tripdata.csv.zip
✅ Downloaded: ..\data\raw\202405-citibike-tripdata.zip
✅ Downloaded: ..\data\raw\202406-citibike-tripdata.zip
✅ Downloaded: ..\data\raw\202407-citibike-tripdata.zip
✅ Downloaded: ..\data\raw\202408-citibike-tripdata.zip
✅ Downloaded: ..\data\raw\202409-citibike-tripdata.zip
✅ Downloaded: ..\data\raw\202410-citibike-tripdata.zip
✅ Downloaded: ..\data\raw\202411-citibike-tripdata.zip
✅ Downloaded: ..\data\raw\202412-citibike-tripdata.zip
✅ Downloaded: ..\data\raw\202501-citibike-tripdata.zip
✅ Downloaded: ..\data\raw\202502-citibike-tripdata.zip
✅ Downloaded: ..\data\raw\202503-citibike-tripdata.csv.zip
✅ Downloaded: ..\data\raw\202504-citibike-tripdata.zip


In [1]:
import pandas as pd
from pathlib import Path
import zipfile

def fetch_raw_data(year: int, month: int) -> str:
    fname_zip = f"{year}{month:02}-citibike-tripdata.csv.zip"
    alt_fname_zip = f"{year}{month:02}-citibike-tripdata.zip"
    raw_dir = Path("..") / "data" / "raw"
    unzip_dir = Path("..") / "data" / "unzipped"
    unzip_dir.mkdir(parents=True, exist_ok=True)

    # Choose correct file format
    zip_path = raw_dir / fname_zip
    if not zip_path.exists():
        zip_path = raw_dir / alt_fname_zip
        if not zip_path.exists():
            print(f"❌ Not found: {fname_zip} or {alt_fname_zip}")
            return None

    try:
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(unzip_dir)
            for file in zip_ref.namelist():
                if file.endswith(".csv"):
                    return str(unzip_dir / file)
    except zipfile.BadZipFile:
        print(f"❌ Corrupted ZIP: {zip_path}")
        return None




In [2]:
def load_all_data(start_year=2024, end_year=2025, end_month=3):
    all_data = []
    for year in range(start_year, end_year + 1):
        for month in range(1, 13):
            if year == end_year and month > end_month:
                break
            print(f"📦 Processing: {year}-{month:02}")
            csv_path = fetch_raw_data(year, month)
            if csv_path:
                try:
                    df = pd.read_csv(csv_path, dtype={"start_station_id": str, "end_station_id": str})
                    all_data.append(df)
                except Exception as e:
                    print(f"⚠️ Error reading {csv_path}: {e}")
    combined_df = pd.concat(all_data, ignore_index=True)
    return combined_df

# Load and combine all data
combined_df = load_all_data()
print(f"✅ Total rows combined: {len(combined_df)}")
print(combined_df.head())

📦 Processing: 2024-01
📦 Processing: 2024-02
📦 Processing: 2024-03
📦 Processing: 2024-04
📦 Processing: 2024-05
📦 Processing: 2024-06
📦 Processing: 2024-07
📦 Processing: 2024-08
📦 Processing: 2024-09
📦 Processing: 2024-10
📦 Processing: 2024-11
📦 Processing: 2024-12
📦 Processing: 2025-01
📦 Processing: 2025-02
📦 Processing: 2025-03
✅ Total rows combined: 21023102
            ride_id  rideable_type               started_at  \
0  5078F3D302000BD2  electric_bike  2024-01-22 18:43:19.012   
1  814337105D37302A  electric_bike  2024-01-11 19:19:18.721   
2  A33A920E2B10710C  electric_bike  2024-01-30 19:17:41.693   
3  A3A5FC0DD7D34D74  electric_bike  2024-01-27 11:27:01.759   
4  6F96728ECEFBDAA4  electric_bike  2024-01-16 15:15:41.000   

                  ended_at                  start_station_name  \
0  2024-01-22 18:48:10.708  Frederick Douglass Blvd & W 145 St   
1  2024-01-11 19:47:36.007                     W 54 St & 6 Ave   
2  2024-01-30 19:32:49.857                     E 11 St & Ave 

In [None]:
import pandas as pd
from pathlib import Path
import re

def save_monthly_files_with_chunks(unzip_dir="../data/unzipped", output_dir="../data/processed/monthly"):
    unzip_path = Path(unzip_dir)
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    monthly_files = {}

    # Group files by month using regex
    for file in unzip_path.rglob("*.csv"):
        if file.name.startswith("._"):  # Skip macOS system files
            continue
        match = re.search(r"(20\d{2})(\d{2})", file.name)
        if match:
            year, month = match.groups()
            key = f"{year}_{month}"
            monthly_files.setdefault(key, []).append(file)

    for key, file_list in monthly_files.items():
        print(f"\n📦 Processing month: {key} with {len(file_list)} files")
        monthly_data = []

        for file in file_list:
            try:
                print(f"  🔄 Reading: {file.name}")
                chunks = pd.read_csv(file, low_memory=False, dtype={"start_station_id": str, "end_station_id": str}, chunksize=500_000)
                for chunk in chunks:
                    monthly_data.append(chunk)
            except Exception as e:
                print(f"⚠️ Skipped {file.name} due to error: {e}")

        if monthly_data:
            combined = pd.concat(monthly_data, ignore_index=True)
            output_file = output_path / f"rides_{key}.parquet"
            combined.to_parquet(output_file, index=False)
            print(f"✅ Saved: {output_file} with {len(combined)} rows")
        else:
            print(f"🚫 No valid data for {key}")

# Run the improved function
save_monthly_files_with_chunks()



📦 Processing month: 2024_01 with 1 files
  🔄 Reading: 202401-citibike-tripdata.csv
✅ Saved: ..\data\processed\monthly\rides_2024_01.parquet with 1888085 rows

📦 Processing month: 2024_02 with 1 files
  🔄 Reading: 202402-citibike-tripdata.csv
✅ Saved: ..\data\processed\monthly\rides_2024_02.parquet with 2121501 rows

📦 Processing month: 2024_03 with 1 files
  🔄 Reading: 202403-citibike-tripdata.csv
✅ Saved: ..\data\processed\monthly\rides_2024_03.parquet with 2663295 rows

📦 Processing month: 2024_04 with 1 files
  🔄 Reading: 202404-citibike-tripdata.csv
✅ Saved: ..\data\processed\monthly\rides_2024_04.parquet with 3217063 rows

📦 Processing month: 2024_05 with 5 files
  🔄 Reading: 202405-citibike-tripdata_1.csv
  🔄 Reading: 202405-citibike-tripdata_2.csv
  🔄 Reading: 202405-citibike-tripdata_3.csv
  🔄 Reading: 202405-citibike-tripdata_4.csv
  🔄 Reading: 202405-citibike-tripdata_5.csv
✅ Saved: ..\data\processed\monthly\rides_2024_05.parquet with 4230360 rows

📦 Processing month: 2024_0