In [5]:
import pandas as pd
import numpy as np
from geopy.distance import geodesic
from tqdm import tqdm

# --- Configuration -----------------------------------------------------------
DATA_PATH = "../data/all_gps_data_without_sorting.parquet"
OUTPUT_PATH = "tracking_features.csv"
# Added to the bounding-box area before dividing so that tracks whose points
# all share a latitude or longitude (zero area) do not divide by zero.
BBOX_AREA_EPS = 1e-6


def extract_track_features(tracking_id, group):
    """Compute summary features for the GPS points of one tracking.

    Parameters
    ----------
    tracking_id
        Group key (value of ``id_tracking``); copied into the result as-is.
    group : pandas.DataFrame
        Rows of a single tracking, already in the order in which consecutive
        points should be connected. Must contain the columns ``latitude``,
        ``longitude`` and ``speed``.

    Returns
    -------
    dict or None
        ``None`` when fewer than two rows have both coordinates present;
        otherwise a dict with the keys

        * ``tracking_id``          - the key passed in
        * ``num_points``           - rows with non-null latitude and longitude
        * ``bbox_area``            - latitude span * longitude span, in the
          squared units of those columns (no projection is applied)
        * ``point_density``        - ``num_points / (bbox_area + BBOX_AREA_EPS)``
        * ``avg_segment_distance`` - mean geodesic distance in metres between
          consecutive points
        * ``num_stops``            - number of rows whose ``speed`` equals 0
          (a per-row count; consecutive zero-speed rows are not merged)
    """
    # Rows without a full coordinate pair cannot contribute to any feature.
    group = group.dropna(subset=["latitude", "longitude"])
    num_points = len(group)
    if num_points < 2:
        # At least one segment is needed for the distance feature.
        return None

    lat = group["latitude"]
    lon = group["longitude"]

    # Vectorized Series reductions instead of Python-level max()/min().
    bbox_area = (lat.max() - lat.min()) * (lon.max() - lon.min())

    # Pair every point with its successor and measure each segment.
    coords = list(zip(lat, lon))
    dists = [
        geodesic(start, end).meters
        for start, end in zip(coords, coords[1:])
    ]

    return {
        "tracking_id": tracking_id,
        "num_points": num_points,
        "bbox_area": bbox_area,
        "point_density": num_points / (bbox_area + BBOX_AREA_EPS),
        "avg_segment_distance": np.mean(dists),
        "num_stops": (group["speed"] == 0).sum(),
    }


# --- Load and order the raw points -------------------------------------------
df = pd.read_parquet(DATA_PATH)

# Sorting by sequence inside each tracking fixes the order in which
# consecutive points are connected into segments.
df = df.sort_values(by=["id_tracking", "sequence"]).reset_index(drop=True)

grouped = df.groupby("id_tracking")

print(len(grouped))

print(df.head())

# --- Per-tracking feature extraction -----------------------------------------
features = []

for tracking_id, group in tqdm(grouped, desc="Extracting features"):
    row = extract_track_features(tracking_id, group)
    if row is not None:
        features.append(row)

# --- Export ------------------------------------------------------------------
features_df = pd.DataFrame(features)
features_df.to_csv(OUTPUT_PATH, index=False)
print(f"Feature extraction complete. Saved to {OUTPUT_PATH}")


4919
   id_tracking         id                time  type  sequence comment  speed  \
0        96435  120000001 2024-05-10 06:57:10     0       391    None    0.0   
1        96435  120000002 2024-05-10 06:57:12     8       392    None    0.0   
2        96435  120000003 2024-05-10 06:57:21     0       393    None    0.0   
3        96435  120000004 2024-05-10 06:57:42     8       394    None    0.0   
4        96435  120000005 2024-05-10 06:58:26    34       395    None    0.0   

   heading  duration  block_type   log   latitude  longitude  altitude  \
0      0.0         0           0  None  48.296030  15.140959       0.0   
1      0.0         0           0  None  48.296030  15.140959       0.0   
2      0.0         0           0  None  48.296120  15.140970       0.0   
3      0.0         0           0  None  48.296371  15.140750       0.0   
4      0.0         0           0  None  48.296371  15.140750       0.0   

   meta_tag meta_value  
0         0             
1        14        

Extracting features: 100%|██████████| 4919/4919 [13:49<00:00,  5.93it/s]

Feature extraction complete. Saved to tracking_features.csv



