In [None]:
!pip install pandas
!pip install numpy
!pip install requests
!pip install tqdm

In [None]:
import requests
import pandas as pd
from tqdm import tqdm
import time
import os


In [None]:
# Read the accident records from the CSV export
df = pd.read_csv("us_accidents_40k_with_weather.csv")

# Fail fast if any column used later in the notebook is absent
required_cols = {"ID", "Street", "City"}
assert required_cols <= set(df.columns), f"Missing required columns: {required_cols - set(df.columns)}"

# Preview the first rows
df.head(5)


In [None]:
def get_lat_lng_from_address(street, city):
    """Geocode a street/city pair with the Nominatim search API.

    Args:
        street: Street name. Missing values (None/NaN) and blank strings
            make the lookup return ``(None, None)`` without a request.
        city: City name; handled the same way as ``street``.

    Returns:
        ``(lat, lon)`` taken from the first search hit, with the values
        exactly as Nominatim provides them, or ``(None, None)`` when the
        input is incomplete, the request fails, or nothing matches.
    """
    # Empty CSV cells arrive from pandas as NaN; formatting them into the
    # query would search for the literal text "nan", so bail out early.
    parts = []
    for part in (street, city):
        if part is None or pd.isna(part) or not str(part).strip():
            return None, None
        parts.append(str(part).strip())
    query = ", ".join(parts)

    url = "https://nominatim.openstreetmap.org/search"
    params = {
        "q": query,
        "format": "json",
        "limit": 1
    }
    try:
        response = requests.get(
            url,
            params=params,
            headers={'User-Agent': 'speed-limit-fetcher'},
            timeout=10,  # without a timeout one stalled connection hangs the whole run
        )
        response.raise_for_status()
        data = response.json()
        if data:
            return data[0]["lat"], data[0]["lon"]
    except (requests.RequestException, ValueError, KeyError, IndexError, TypeError):
        # Best-effort lookup: network errors, HTTP errors and malformed
        # payloads all mean "no coordinates". Unlike a bare ``except``, this
        # lets KeyboardInterrupt through so the loop can be stopped.
        pass
    return None, None

def get_speed_limit_from_osm(lat, lon):
    """Look up a ``maxspeed`` tag near a coordinate via the Overpass API.

    The query asks for ways carrying both a ``highway`` and a ``maxspeed``
    tag inside an ``around:20`` radius of the point (Overpass QL specifies
    that radius in metres).

    Args:
        lat: Latitude of the point; ``None`` short-circuits to ``None``.
        lon: Longitude of the point; ``None`` short-circuits to ``None``.

    Returns:
        The raw ``maxspeed`` tag value of the first matching way, or
        ``None`` when coordinates are missing, the request fails, or no
        way with a speed limit is returned.
    """
    # No coordinates means nothing to query; skip the network round trip.
    if lat is None or lon is None:
        return None

    overpass_url = "https://overpass-api.de/api/interpreter"
    query = f"""
    [out:json];
    way(around:20,{lat},{lon})["highway"]["maxspeed"];
    out tags;
    """
    try:
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.post(overpass_url, data={"data": query}, timeout=30)
        response.raise_for_status()
        data = response.json()
        for element in data.get("elements", []):
            # Skip elements without usable tags instead of aborting the lookup.
            maxspeed = element.get("tags", {}).get("maxspeed")
            if maxspeed is not None:
                return maxspeed
    except (requests.RequestException, ValueError, AttributeError, TypeError):
        # Best-effort lookup: network/HTTP failures and malformed payloads
        # mean "unknown speed limit". KeyboardInterrupt is not swallowed.
        pass
    return None


In [None]:
# CSV that accumulates one row per processed accident ID
output_file = "speed_limits_by_street.csv"

# Start empty unless an earlier run already left results on disk
if not os.path.exists(output_file):
    completed_ids, results = set(), []
else:
    existing_df = pd.read_csv(output_file)
    completed_ids = set(existing_df["ID"])
    results = existing_df.to_dict(orient="records")
    print(f"Resuming from {len(completed_ids)} completed records.")


In [None]:
# Only process accident IDs that are not already in the output file, so an
# interrupted run can be resumed without repeating API calls.
remaining_df = df[~df["ID"].isin(completed_ids)]

# Fixed column order so every appended row lines up with the CSV header.
output_columns = ["ID", "Street", "City", "Speed_Limit"]

# Iterate with tqdm over just the three columns the lookups need
for record_id, street, city in tqdm(
    remaining_df[["ID", "Street", "City"]].itertuples(index=False, name=None),
    total=len(remaining_df),
):
    lat, lon = get_lat_lng_from_address(street, city)
    time.sleep(1.1)  # Rate limit for Nominatim

    if lat is not None and lon is not None:
        speed = get_speed_limit_from_osm(lat, lon)
        time.sleep(1.0)  # Be nice to Overpass API
    else:
        speed = None

    record = {
        "ID": record_id,
        "Street": street,
        "City": city,
        "Speed_Limit": speed
    }
    results.append(record)

    # Save incrementally by appending only this row. Rewriting the whole
    # file after every record makes total I/O grow quadratically with the
    # number of rows; appending keeps the same crash safety at constant cost.
    write_header = not os.path.exists(output_file) or os.path.getsize(output_file) == 0
    pd.DataFrame([record], columns=output_columns).to_csv(
        output_file, mode="a", header=write_header, index=False
    )


In [None]:
# Reload the output from disk to confirm what was actually persisted
final_df = pd.read_csv(output_file)

# Rows are stored even when no speed limit was found, so count only the
# non-null values rather than reporting every row as a collected limit.
found_count = int(final_df["Speed_Limit"].notna().sum())
print(f"Total collected: {found_count} speed limits ({len(final_df)} records processed)")
final_df.head()
