In [4]:
import pandas as pd
import numpy as np
from geopy.distance import geodesic
from tqdm import tqdm

# Build one row of summary features per track ("id_tracking") from the
# filtered waypoint table and write the result to tracking_features.csv.
df = pd.read_parquet("../data/filtered_waypoints.parquet")

# Order rows by track and then by "sequence" so that, within each group,
# neighbouring rows are neighbouring waypoints (groupby keeps row order
# inside each group).
df = df.sort_values(by=["id_tracking", "sequence"]).reset_index(drop=True)

grouped = df.groupby("id_tracking")

# Number of distinct tracks present in the input.
print(len(grouped))

features = []

for tracking_id, group in tqdm(grouped, desc="Extracting features"):
    # Rows missing either coordinate cannot contribute to any geometry
    # feature, so they are discarded before anything is measured.
    group = group.dropna(subset=["latitude", "longitude"])
    coords = list(zip(group["latitude"], group["longitude"]))

    # At least two points are needed to form a segment; shorter tracks
    # are left out of the output entirely.
    if len(coords) < 2:
        continue

    num_points = len(coords)

    # Axis-aligned bounding box of the track. The spans are plain
    # differences of latitude/longitude values, so the "area" is in
    # squared coordinate units (not square metres).
    latitudes = group["latitude"]
    longitudes = group["longitude"]
    lat_span = latitudes.max() - latitudes.min()
    lon_span = longitudes.max() - longitudes.min()
    bbox_area = lat_span * lon_span

    # The small epsilon keeps the ratio finite when the box is degenerate
    # (all points share a latitude or a longitude).
    point_density = num_points / (bbox_area + 1e-6)

    # Geodesic length, in metres, of every segment between consecutive
    # waypoints, then averaged over the track.
    dists = [
        geodesic(start, end).meters
        for start, end in zip(coords, coords[1:])
    ]
    avg_segment_distance = np.mean(dists)

    # Count of waypoints whose recorded speed is exactly 0. This is a
    # per-point count, not a count of distinct stop events.
    num_stops = (group["speed"] == 0).sum()

    # NOTE: two further per-track features were sketched but are disabled:
    #   duration = group["duration"].iloc[0] / 1e9
    #   length   = group["length"].iloc[0]
    # Re-enable them here and in the dict below if those columns are needed.

    features.append({
        "tracking_id": tracking_id,
        "num_points": num_points,
        "bbox_area": bbox_area,
        "point_density": point_density,
        "avg_segment_distance": avg_segment_distance,
        "num_stops": num_stops,
    })

features_df = pd.DataFrame(features)
features_df.to_csv("tracking_features.csv", index=False)
print("Feature extraction complete. Saved to tracking_features.csv")


850


Extracting features: 100%|██████████| 850/850 [00:23<00:00, 36.28it/s]

Feature extraction complete. Saved to tracking_features.csv



