### Description:

- Script for processing non-automated datasets (such as car crashes and traffic) so that they can be compared with the tool's output.

In [None]:
import osmnx as ox
import pandas as pd
from shapely.geometry import Point, Polygon
import geopandas as gpd
import os

def filter_crashes_within_area(data_file, place_name, output_file):
    """
    Reads crash data and saves the entries located inside the boundary of a place.

    The boundary is the geometry returned by osmnx's geocode_to_gdf for
    ``place_name``. A crash is kept when the point built from its LON/LAT
    values (interpreted as EPSG:4326) lies within that boundary.

    Parameters:
        data_file (str): Path to the CSV file containing crash data. It must
            provide ``LAT`` and ``LON`` columns.
        place_name (str): The place name to geocode into the bounding area
            (e.g., "Fredericksburg, VA, USA").
        output_file (str): Path to save the filtered data.

    Returns:
        None: Saves the filtered data to a new CSV file. When the LAT/LON
        columns are missing, an error is printed and nothing is written.
    """
    # 1. Read the crash dataset first so an unusable input is rejected
    #    before any geocoding work is done.
    print("Reading crash dataset...")
    df = pd.read_csv(data_file)

    # Check if the required columns (LAT and LON) exist
    if 'LAT' not in df.columns or 'LON' not in df.columns:
        print("Error: LAT and LON columns are missing in the dataset.")
        return

    # 2. Geocode the place into a GeoDataFrame holding its boundary.
    #    Only the boundary polygon is needed for filtering, so the street
    #    network graph is not downloaded.
    print("Loading graph for the specified place...")
    gdf_boundary = ox.geocode_to_gdf(place_name)
    print(gdf_boundary)

    # 3. Convert crash data to GeoDataFrame using LAT and LON
    #    (points_from_xy builds all points at once instead of row by row).
    print("Converting crash data to GeoDataFrame...")
    gdf_crashes = gpd.GeoDataFrame(
        df,
        geometry=gpd.points_from_xy(df['LON'], df['LAT']),
        crs="EPSG:4326",
    )

    # 4. Filter crashes that fall within the bounding area
    print("Filtering crashes within the area...")
    boundary_geoms = gdf_boundary.geometry
    # union_all() supersedes the deprecated unary_union attribute in newer
    # GeoPandas releases; fall back for older installations.
    if hasattr(boundary_geoms, "union_all"):
        boundary_shape = boundary_geoms.union_all()
    else:
        boundary_shape = boundary_geoms.unary_union
    inside_mask = gdf_crashes.within(boundary_shape)

    # 5. Save the filtered crashes to a new CSV file
    print(f"Saving filtered crashes to {output_file}...")
    # drop() returns a new frame, so the filtered slice is never mutated in
    # place; the geometry column is removed before saving.
    filtered = gdf_crashes.loc[inside_mask].drop(columns=['geometry'])
    filtered.to_csv(output_file, index=False)

    print("Process complete. Filtered data saved successfully.")



def main():
    """
    Filters the raw Fredericksburg crash export down to the crashes that lie
    inside the place boundary and writes them next to the input file.
    """
    place_name = "fredericksburg, VA, USA"
    # The text before the first comma is used as the data sub-directory name.
    simple_place_name = place_name.partition(",")[0]

    data_file = f"../data/{simple_place_name}/features (1).csv"
    output_file = f"../data/{simple_place_name}/car_crashes_{simple_place_name}.csv"

    # Make sure the target directory exists before anything is written to it.
    os.makedirs(f"../data/{simple_place_name}", exist_ok=True)

    filter_crashes_within_area(
        data_file=data_file,
        place_name=place_name,
        output_file=output_file,
    )



# Run the crash-filtering step when this cell is executed.
main()

### Viewing Crashes

In [None]:
from sklearn.neighbors import BallTree
import numpy as np
import pandas as pd
def find_nearest_pano(crash_df, ball_tree, pano_df, threshold=25):
    """
    Finds the nearest panoramic point to each crash point within a given threshold.

    Parameters:
        crash_df (pd.DataFrame): Crash records with 'LAT' and 'LON' columns
            (converted from degrees to radians before querying).
        ball_tree: Spatial index exposing ``query(coords, k=1)`` that returns
            ``(distances, indices)`` arrays with one column. It is expected to
            be built with the haversine metric over the radian [lat, lon]
            coordinates of ``pano_df``, in the same row order.
        pano_df (pd.DataFrame): Panoramic points with a 'pano_id' column.
        threshold (float): Maximum match distance in meters.

    Returns:
        pd.DataFrame: The same ``crash_df`` object, with a 'matched_pano_id'
        column added in place. Entries are None where no panoramic point lies
        within ``threshold`` meters.
    """
    # Nothing to match; this also avoids querying the tree with a zero-row array.
    if len(crash_df) == 0:
        crash_df['matched_pano_id'] = None
        return crash_df

    # Convert to radians for latitude and longitude
    crash_coords = np.radians(crash_df[['LAT', 'LON']].to_numpy())
    distances, indices = ball_tree.query(crash_coords, k=1)  # Query nearest neighbor

    # Convert distances from radians to meters (Earth's radius ~ 6371000 meters)
    distances_meters = distances[:, 0] * 6371000
    nearest_rows = indices[:, 0]

    # Pull the id column out once; a positional lookup in this array replaces
    # building a full row Series per crash and keeps the ids' own type.
    pano_ids = pano_df['pano_id'].to_numpy()

    # Assign pano_id if within threshold
    crash_df['matched_pano_id'] = [
        pano_ids[row] if dist <= threshold else None
        for row, dist in zip(nearest_rows, distances_meters)
    ]

    return crash_df

def get_car_crashes_with_panaramics():
    """
    Matches every Fredericksburg crash record to its nearest panoramic point
    and saves the crashes that have a match within 25 meters.

    Reads the panoramic and crash CSV files from the city's data directory,
    prints the panoramic column names and a preview of the matches, and
    writes the matched rows to a new CSV file in the same directory.
    """
    simple_name = "fredericksburg"
    # NOTE(review): this path is relative to "data/", while the crash-filtering
    # step writes under "../data/" - confirm both resolve to the same folder.
    base_directory = f"data/{simple_name}"

    # Input tables: panoramic points first, then the filtered crashes.
    panos = pd.read_csv(f"{base_directory}/{simple_name}_panoramic_data.csv")
    crashes = pd.read_csv(f"{base_directory}/car_crashes_{simple_name}.csv")

    # Show the available panoramic columns; the coordinates are read from
    # 'lat' and 'long' below.
    print(panos.columns)

    # Haversine distances operate on [lat, lon] pairs expressed in radians.
    pano_latlon = panos[['lat', 'long']].values
    tree = BallTree(np.radians(pano_latlon), metric='haversine')

    # Attach the nearest panoramic id (or None) to each crash row.
    matched = find_nearest_pano(crashes, tree, panos, threshold=25)

    # Preview the result for verification.
    print(matched.head())

    # Keep only the crashes that received a panoramic id, then save them.
    with_pano = matched.dropna(subset=['matched_pano_id'])
    with_pano.to_csv(
        f"{base_directory}/car_crashes_with_panoramics_{simple_name}.csv",
        index=False,
    )

# Run the crash-to-panorama matching when this cell is executed.
get_car_crashes_with_panaramics()
