In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from geopy.distance import geodesic

def compute_route_features(df, heading_threshold_deg=30, revisit_radius_m=50):
    """
    Aggregate waypoint data into OSM-style features, one row per track.

    Waypoints are grouped by ``id_tracking`` and ordered by ``time`` before any
    feature is computed.

    Parameters
    ----------
    df : pandas.DataFrame
        Waypoints with the columns ``id_tracking``, ``time``, ``latitude``,
        ``longitude`` and ``heading``. ``heading`` is assumed to be a compass
        bearing in degrees (TODO confirm against the data source).
    heading_threshold_deg : float, default 30
        A change of heading between two consecutive waypoints larger than this
        is counted as a "sharp direction change".
    revisit_radius_m : float, default 50
        Two waypoints closer than this many metres count as a revisit pair.

    Returns
    -------
    pandas.DataFrame
        One row per ``id_tracking`` with the columns:

        - ``total_distance``: sum of geodesic distances (km) between
          consecutive waypoints.
        - ``node_density``: waypoints per km² of the lat/lon bounding box.
        - ``street_density``: ``total_distance`` per km² of the bounding box.
        - ``avg_street_segment_length``: mean consecutive-waypoint distance (km).
        - ``intersection_density``: sharp direction changes per km travelled.
        - ``circuity_avg``: ``total_distance`` divided by the start-to-end
          distance; 1 when that distance is zero.
        - ``self_loop_proportion``: number of waypoint pairs closer than
          ``revisit_radius_m``, divided by the number of waypoints.
        - ``street_segment_count``: number of waypoints in the track.
        - ``streets_per_node_avg``: waypoints per km travelled.

        Ratios whose denominator is zero are reported as 0. An input without
        any track yields an empty frame that still carries these columns.
    """
    feature_columns = [
        "id_tracking",
        "total_distance",
        "node_density",
        "street_density",
        "avg_street_segment_length",
        "intersection_density",
        "circuity_avg",
        "self_loop_proportion",
        "street_segment_count",
        "streets_per_node_avg",
    ]
    route_features = []

    for track_id, group in df.groupby("id_tracking"):
        group = group.sort_values("time")

        coords = list(zip(group["latitude"], group["longitude"]))
        n_points = len(coords)

        # Total distance traveled: sum of geodesic (ellipsoidal, not Haversine)
        # distances between consecutive waypoints.
        distances = [geodesic(a, b).km for a, b in zip(coords, coords[1:])]
        total_distance = sum(distances)

        # Average street segment length (0 for a single-waypoint track).
        avg_street_segment_length = np.mean(distances) if len(distances) > 0 else 0

        # Intersection density (sharp direction changes per km).
        # Headings live on a circle, so the raw difference is folded into
        # [0, 180]: 350 -> 10 is a 20 degree turn, not a 340 degree one.
        raw_changes = np.abs(np.diff(group["heading"])) % 360
        heading_changes = np.minimum(raw_changes, 360 - raw_changes)
        sharp_turns = np.sum(heading_changes > heading_threshold_deg)
        intersection_density = sharp_turns / total_distance if total_distance > 0 else 0

        # Circuity (total distance / direct distance).
        start, end = coords[0], coords[-1]
        direct_distance = geodesic(start, end).km if start != end else 0
        circuity_avg = total_distance / direct_distance if direct_distance > 0 else 1

        # Bounding-box area in km²: south edge length times west edge length.
        min_lat, max_lat = group["latitude"].min(), group["latitude"].max()
        min_lon, max_lon = group["longitude"].min(), group["longitude"].max()
        area_km2 = (geodesic((min_lat, min_lon), (max_lat, min_lon)).km *
                    geodesic((min_lat, min_lon), (min_lat, max_lon)).km)
        node_density = n_points / area_km2 if area_km2 > 0 else 0
        street_density = total_distance / area_km2 if area_km2 > 0 else 0

        # Self-loop proportion: waypoint pairs closer than the revisit radius.
        # This evaluates n * (n - 1) / 2 geodesic distances per track.
        # NOTE(review): consecutive waypoints are included in the pairs and the
        # pair count is divided by the waypoint count, so the value can exceed
        # 1 -- confirm this is the intended definition.
        revisit_count = sum(
            1
            for i in range(n_points)
            for j in range(i + 1, n_points)
            if geodesic(coords[i], coords[j]).m < revisit_radius_m
        )
        self_loop_proportion = revisit_count / n_points if n_points > 0 else 0

        route_features.append({
            "id_tracking": track_id,
            "total_distance": total_distance,
            "node_density": node_density,
            "street_density": street_density,
            "avg_street_segment_length": avg_street_segment_length,
            "intersection_density": intersection_density,
            "circuity_avg": circuity_avg,
            "self_loop_proportion": self_loop_proportion,
            # NOTE(review): this is the waypoint count, not the number of
            # segments (n - 1) -- kept as-is for downstream compatibility.
            "street_segment_count": n_points,
            "streets_per_node_avg": n_points / total_distance if total_distance > 0 else 0
        })

    # Passing the column list keeps the schema stable even with zero tracks.
    return pd.DataFrame(route_features, columns=feature_columns)


In [3]:
# Load the waypoint table and derive one OSM-style feature row per track.
track_features = pd.read_parquet("./gps_data_relaxed_parameters_more.parquet")
track_features_osm = compute_route_features(track_features)

# to_parquet does not create missing parent directories, so make sure the
# output folder exists before writing (e.g. on a fresh checkout).
Path("./osm_style_data").mkdir(parents=True, exist_ok=True)
track_features_osm.to_parquet("./osm_style_data/gps_data_osm_style.parquet")
