In [1]:
import pandas as pd
from geopy.geocoders import Nominatim
import time
import os

# Initialize geolocator (Nominatim is free for small-scale use, but it has usage limits)
geolocator = Nominatim(user_agent="bike_station_locator")

# Function to fetch zip code using reverse geocoding
def get_zip_code(lat, lng, cache):
    # Check if this lat/lng pair is already in cache
    key = (lat, lng)
    if key in cache:
        return cache[key]
    
    try:
        location = geolocator.reverse((lat, lng), language='en', exactly_one=True)
        address = location.raw['address']
        zip_code = address.get('postcode', None)
        cache[key] = zip_code  # Store the result in cache
        return zip_code
    except Exception as e:
        print(f"Error fetching zip code for ({lat}, {lng}): {e}")
        cache[key] = None  # Store None in cache for failed requests
        return None

# List of CSV files you want to process
csv_files = [
    'JC-202301-citibike-tripdata.csv',
    'JC-202302-citibike-tripdata.csv',
    'JC-202303-citibike-tripdata.csv',
    'JC-202304-citibike-tripdata.csv',
    'JC-202305-citibike-tripdata.csv',
    'JC-202401-citibike-tripdata.csv',
    'JC-202402-citibike-tripdata.csv',
    'JC-202403-citibike-tripdata.csv',
    'JC-202404-citibike-tripdata.csv',
    'JC-202405-citibike-tripdata.csv'
]

# Initialize an empty list to store DataFrames
combined_df = []

# Loop through each file
for file_path in csv_files:
    print(f"Processing {file_path}...")
    
    # Load the CSV file
    df = pd.read_csv(file_path)
    
    # Step 1: Find unique start and end stations, taking the first lat/lng for each station
    unique_start_stations = df[['start_station_name', 'start_lat', 'start_lng']].drop_duplicates(subset='start_station_name', keep='first')
    unique_end_stations = df[['end_station_name', 'end_lat', 'end_lng']].drop_duplicates(subset='end_station_name', keep='first')

    # Step 2: Create a cache to store lat/lng to zip code mappings
    cache = {}

    # Fetch zip codes for unique start and end station lat/lng pairs
    start_station_zip_codes = {}
    end_station_zip_codes = {}

    # Reverse geocoding for start stations
    for index, row in unique_start_stations.iterrows():
        start_zip = get_zip_code(row['start_lat'], row['start_lng'], cache)
        start_station_zip_codes[row['start_station_name']] = start_zip
        time.sleep(1)  # Adding delay between requests to avoid hitting rate limits

    # Reverse geocoding for end stations
    for index, row in unique_end_stations.iterrows():
        end_zip = get_zip_code(row['end_lat'], row['end_lng'], cache)
        end_station_zip_codes[row['end_station_name']] = end_zip
        time.sleep(1)  # Adding delay between requests to avoid hitting rate limits

    # Step 3: Map the zip codes back to the original dataframe
    df['start_zip'] = df['start_station_name'].map(start_station_zip_codes)
    df['end_zip'] = df['end_station_name'].map(end_station_zip_codes)

    # Step 4: Convert the zip code columns to strings to preserve leading zeros
    df['start_zip'] = df['start_zip'].astype(str)
    df['end_zip'] = df['end_zip'].astype(str)

    # Step 5: Append the DataFrame to the combined list
    combined_df.append(df)

    print(f"CSV processed: {file_path}")

# Step 6: Concatenate all DataFrames into one
final_combined_df = pd.concat(combined_df, ignore_index=True)

# Step 7: Save the combined DataFrame to a new CSV file
final_combined_df.to_csv("combined_bike_rentals_with_zipcodes.csv", index=False)

print("All files processed and combined successfully!")


Processing JC-202301-citibike-tripdata.csv...
Error fetching zip code for (40.80722, -73.9418): HTTPSConnectionPool(host='nominatim.openstreetmap.org', port=443): Max retries exceeded with url: /reverse?lat=40.80722&lon=-73.9418&format=json&accept-language=en&addressdetails=1 (Caused by ReadTimeoutError("HTTPSConnectionPool(host='nominatim.openstreetmap.org', port=443): Read timed out. (read timeout=1)"))
CSV processed: JC-202301-citibike-tripdata.csv
Processing JC-202302-citibike-tripdata.csv...
Error fetching zip code for (40.7462009, -73.98855723): HTTPSConnectionPool(host='nominatim.openstreetmap.org', port=443): Max retries exceeded with url: /reverse?lat=40.7462009&lon=-73.98855723&format=json&accept-language=en&addressdetails=1 (Caused by ReadTimeoutError("HTTPSConnectionPool(host='nominatim.openstreetmap.org', port=443): Read timed out. (read timeout=1)"))
CSV processed: JC-202302-citibike-tripdata.csv
Processing JC-202303-citibike-tripdata.csv...
Error fetching zip code for (4