In [16]:
import os
import pandas as pd

# Root directory that is scanned for drive folders and that each drive
# folder name is joined onto when its files are read.
BASE_PATH = "."  # current directory

In [17]:
def get_drive_folders(base_path):
    """Return the sorted names of the drive folders directly under ``base_path``.

    An entry is kept when its name starts with ``"D5-"``, does not contain
    ``"Normal2"``, and refers to a directory.

    The names are sorted because ``os.listdir`` returns entries in arbitrary
    order; sorting keeps the processing order of the drives the same on every
    run and every machine.

    Parameters
    ----------
    base_path : str
        Directory to scan.

    Returns
    -------
    list of str
        Folder names (not full paths), in ascending order.
    """
    return sorted(
        name
        for name in os.listdir(base_path)
        if name.startswith("D5-")
        and "Normal2" not in name
        and os.path.isdir(os.path.join(base_path, name))
    )

# Discover the drive folders under BASE_PATH once; the processing loop
# further down iterates over this list.
drive_folders = get_drive_folders(BASE_PATH)
drive_folders

['D5-Aggressive-motor',
 'D5-Aggressive-secondary',
 'D5-Drowsy-motor',
 'D5-Drowsy-secondary',
 'D5-Normal-motor',
 'D5-Normal-secondary']

In [18]:
def get_behavior(folder):
    """Derive the behavior label from a folder name (case-insensitive).

    Returns ``"drowsy"`` if the name contains that word, otherwise
    ``"aggressive"`` if it contains that word, otherwise ``"normal"``.
    """
    lowered = folder.lower()
    # Checked in this order, so "drowsy" wins when both keywords are present.
    for label in ("drowsy", "aggressive"):
        if label in lowered:
            return label
    return "normal"

def get_road_type(folder):
    """Derive the road-type label from a folder name (case-insensitive).

    Returns ``"motor"`` when the name contains that word; every other name
    maps to ``"secondary"``.
    """
    if "motor" in folder.lower():
        return "motor"
    return "secondary"

In [19]:
def load_gps(folder):
    """Load ``RAW_GPS.txt`` from ``folder`` into a DataFrame with 12 named columns.

    The file is read as whitespace-delimited text without a header row.
    The ``timestamp`` column is converted with ``astype(int)``, which
    truncates any fractional part.

    Parameters
    ----------
    folder : str
        Directory containing ``RAW_GPS.txt``.

    Returns
    -------
    pandas.DataFrame
        One row per line of the file.
    """
    # Every name needs its own trailing comma: adjacent string literals are
    # concatenated by Python, which would leave fewer names than columns and
    # make pandas use the first column as the index, shifting every label.
    column_names = [
        "timestamp",      # 1
        "speed_kmh",      # 2
        "lat",            # 3
        "lon",            # 4
        "alt",            # 5
        "vert_acc",       # 6
        "horiz_acc",      # 7
        "course",         # 8
        "difcourse",      # 9
        "hdop",           # 10
        "vdop",           # 11
        "pdop",           # 12
    ]

    gps = pd.read_csv(
        f"{folder}/RAW_GPS.txt",
        sep=r"\s+",  # replaces the deprecated delim_whitespace=True
        header=None,
        names=column_names,
    )

    gps["timestamp"] = gps["timestamp"].astype(int)
    return gps

In [20]:
def load_acc(folder):
    """Load ``RAW_ACCELEROMETERS.txt`` from ``folder`` into a DataFrame with 11 named columns.

    The file is read as whitespace-delimited text without a header row.
    The ``timestamp`` column is converted with ``astype(int)``, which
    truncates any fractional part.

    Parameters
    ----------
    folder : str
        Directory containing ``RAW_ACCELEROMETERS.txt``.

    Returns
    -------
    pandas.DataFrame
        One row per line of the file.
    """
    column_names = [
        "timestamp",          # 1
        "system_active",      # 2
        "acc_x",              # 3
        "acc_y",              # 4
        "acc_z",              # 5
        "acc_x_kf",           # 6
        "acc_y_kf",           # 7
        "acc_z_kf",           # 8
        "roll",               # 9
        "pitch",              # 10
        "yaw",                # 11
    ]

    acc = pd.read_csv(
        f"{folder}/RAW_ACCELEROMETERS.txt",
        sep=r"\s+",  # replaces the deprecated delim_whitespace=True
        header=None,
        names=column_names,
    )

    acc["timestamp"] = acc["timestamp"].astype(int)
    return acc

In [21]:
def load_lane(folder):
    """Load ``PROC_LANE_DETECTION.txt`` from ``folder`` into a DataFrame with 5 named columns.

    The file is read as whitespace-delimited text without a header row; no
    column is converted after parsing.

    Parameters
    ----------
    folder : str
        Directory containing ``PROC_LANE_DETECTION.txt``.

    Returns
    -------
    pandas.DataFrame
        One row per line of the file.
    """
    column_names = [
        "timestamp",
        "x_lane",
        "phi_lane",
        "road_width",
        "lane_state",
    ]
    return pd.read_csv(
        f"{folder}/PROC_LANE_DETECTION.txt",
        sep=r"\s+",  # replaces the deprecated delim_whitespace=True
        header=None,
        names=column_names,
    )

In [22]:
def load_vehicle(folder):
    """Load ``PROC_VEHICLE_DETECTION.txt`` from ``folder`` into a DataFrame with 5 named columns.

    The file is read as whitespace-delimited text without a header row; no
    column is converted after parsing.

    Parameters
    ----------
    folder : str
        Directory containing ``PROC_VEHICLE_DETECTION.txt``.

    Returns
    -------
    pandas.DataFrame
        One row per line of the file.
    """
    column_names = [
        "timestamp",
        "dist_ahead_vehicle",
        "time_to_impact",
        "num_detected_vehicles",
        "gps_speed_kmh",
    ]
    return pd.read_csv(
        f"{folder}/PROC_VEHICLE_DETECTION.txt",
        sep=r"\s+",  # replaces the deprecated delim_whitespace=True
        header=None,
        names=column_names,
    )

In [23]:
def load_osm(folder):
    """Load ``PROC_OPENSTREETMAP_DATA.txt`` from ``folder`` into a DataFrame with 10 named columns.

    The file is read as whitespace-delimited text without a header row; no
    column is converted after parsing.

    Parameters
    ----------
    folder : str
        Directory containing ``PROC_OPENSTREETMAP_DATA.txt``.

    Returns
    -------
    pandas.DataFrame
        One row per line of the file.
    """
    column_names = [
        "timestamp",
        "max_speed_kmh",
        "max_speed_reliability",
        "road_type",
        "num_lanes",
        "estimated_lane",
        "osm_lat",
        "osm_lon",
        "osm_query_delay",
        "gps_speed_kmh",
    ]
    return pd.read_csv(
        f"{folder}/PROC_OPENSTREETMAP_DATA.txt",
        sep=r"\s+",  # replaces the deprecated delim_whitespace=True
        header=None,
        names=column_names,
    )

In [24]:
# =========================================================
# STEP 1: Build unified 10Hz timeline (paper-compliant)
# =========================================================

def build_10hz_timebase(max_timestamp_sec):
    """
    Build a two-column frame of 10 Hz ticks covering t=0 .. max_timestamp_sec.

    ``t_10hz`` is the integer tick counter 0, 1, 2, ... and ``timestamp`` is
    that counter divided by 10.0. The last tick is
    ``int(max_timestamp_sec * 10)``, i.e. the product is truncated.
    """
    tick_count = int(max_timestamp_sec * 10) + 1
    timebase = pd.DataFrame({"t_10hz": range(tick_count)})
    return timebase.assign(timestamp=timebase["t_10hz"] / 10.0)

In [25]:
def _read_stream(path, filename, columns):
    """Read one header-less, whitespace-delimited stream file and name its columns."""
    stream = pd.read_csv(os.path.join(path, filename), sep=r"\s+", header=None)
    # Assigning the full list raises if the file's column count differs.
    stream.columns = columns
    # merge_asof requires both join keys to share a dtype and the timebase key
    # is float; a file whose timestamps all parse as integers would otherwise fail.
    stream["timestamp"] = stream["timestamp"].astype(float)
    return stream


def _align_to_timebase(timebase_10hz, stream, direction):
    """Attach to every 10 Hz tick the row of ``stream`` selected by ``merge_asof``."""
    return pd.merge_asof(
        timebase_10hz.sort_values("timestamp"),
        stream.sort_values("timestamp"),
        on="timestamp",
        direction=direction,
    )


def build_phase2_drive(folder):
    """Build the 10 Hz, multi-stream table for one drive folder.

    Reads the five stream files under ``BASE_PATH/folder``, aligns each one to
    a common 10 Hz timebase that spans 0 .. the largest GPS timestamp, joins
    the aligned streams on the timebase, and adds the labels parsed from the
    folder name.

    GPS and OpenStreetMap rows are matched with ``direction="backward"`` (the
    latest row at or before each tick, so ticks before the first row get NaN);
    the other streams use ``direction="nearest"``.

    Parameters
    ----------
    folder : str
        Folder name of the form ``"<driver>-<behavior>-<road_type>"``; it must
        split into exactly three parts on ``"-"`` (otherwise ``ValueError``).

    Returns
    -------
    pandas.DataFrame
        One row per 10 Hz tick, with ``t_10hz`` and ``timestamp`` first,
        followed by the stream columns and then ``driver``, ``behavior`` and
        ``road_type``.
    """
    path = os.path.join(BASE_PATH, folder)

    # RAW_GPS (1 Hz -> 10 Hz); it is read first because it defines the
    # length of the timebase.
    gps = _read_stream(
        path,
        "RAW_GPS.txt",
        [
            "timestamp", "speed_kmh", "lat", "lon", "alt",
            "vert_acc", "horiz_acc", "course", "difcourse",
            "hdop", "vdop", "pdop"
        ],
    )

    max_time = gps["timestamp"].max()
    timebase_10hz = build_10hz_timebase(max_time)

    data = _align_to_timebase(timebase_10hz, gps, "backward")

    # (file name, column names, merge_asof direction) for the remaining streams,
    # in the order their columns appear in the result.
    other_streams = [
        # RAW_ACCELEROMETERS (already 10 Hz)
        (
            "RAW_ACCELEROMETERS.txt",
            [
                "timestamp", "active",
                "acc_x", "acc_y", "acc_z",
                "acc_x_kf", "acc_y_kf", "acc_z_kf",
                "roll", "pitch", "yaw"
            ],
            "nearest",
        ),
        # PROC_LANE_DETECTION (~30 Hz -> 10 Hz)
        (
            "PROC_LANE_DETECTION.txt",
            ["timestamp", "x_lane", "phi", "road_width", "lane_state"],
            "nearest",
        ),
        # PROC_VEHICLE_DETECTION (~10 Hz)
        (
            "PROC_VEHICLE_DETECTION.txt",
            [
                "timestamp", "dist_front", "ttc_front",
                "num_vehicles", "gps_speed"
            ],
            "nearest",
        ),
        # PROC_OPENSTREETMAP_DATA (~1 Hz -> 10 Hz)
        (
            "PROC_OPENSTREETMAP_DATA.txt",
            [
                "timestamp", "max_speed", "speed_rel",
                "road_type_osm", "num_lanes", "lane_id",
                "lat_osm", "lon_osm", "osm_delay", "gps_speed_osm"
            ],
            "backward",
        ),
    ]

    # MERGE ALL STREAMS (10 Hz aligned): every aligned frame carries the same
    # (timestamp, t_10hz) pairs, so the left merge adds columns without
    # changing the number of rows.
    for filename, columns, direction in other_streams:
        aligned = _align_to_timebase(
            timebase_10hz, _read_stream(path, filename, columns), direction
        )
        data = data.merge(aligned, on=["timestamp", "t_10hz"], how="left")

    # LABELS
    driver, behavior, road_type = folder.split("-")

    data["driver"] = driver
    data["behavior"] = behavior
    data["road_type"] = road_type

    # Keep time columns first
    time_cols = ["t_10hz", "timestamp"]
    data = data[time_cols + [c for c in data.columns if c not in time_cols]]

    return data

In [26]:
# Build the 10 Hz table for every discovered drive, then stack the tables
# into a single frame.
all_drives = []

for folder in drive_folders:
    print("Processing:", folder)
    df = build_phase2_drive(folder)
    all_drives.append(df)

# ignore_index=True gives the combined frame a fresh 0..N-1 row index
# instead of repeating each drive's own index.
driver5_phase2 = pd.concat(all_drives, ignore_index=True)


Processing: D5-Aggressive-motor
Processing: D5-Aggressive-secondary
Processing: D5-Drowsy-motor
Processing: D5-Drowsy-secondary
Processing: D5-Normal-motor
Processing: D5-Normal-secondary


Missing values are imputed with the K-nearest-neighbors (KNN) algorithm: each missing entry is estimated from the rows that are closest to it, where closeness is measured on the other features.

In [27]:
from sklearn.impute import KNNImputer

# =====================================================
# KNN IMPUTATION (paper-compliant)
# =====================================================

# Columns that are not passed to the imputer and are carried over unchanged:
# the two time columns, the three label columns, and road_type_osm (so any
# missing values in road_type_osm stay missing).
exclude_cols = [
    "t_10hz",
    "timestamp",
    "driver",
    "behavior",
    "road_type",
    "road_type_osm"
]

feature_cols = [c for c in driver5_phase2.columns if c not in exclude_cols]

X = driver5_phase2[feature_cols]

# Sanity check (before KNN)
print("NaNs before KNN:")
print(X.isna().sum().sort_values(ascending=False).head(10))

# Apply KNN
# NOTE(review): features are not scaled before imputation, so columns with
# large magnitudes may dominate the neighbor distance, and integer-like
# columns receive weighted averages - confirm this matches the paper.
imputer = KNNImputer(
    n_neighbors=5,
    weights="distance"
)

# Reuse X's index: the assignment below aligns on index labels, so a default
# 0..N-1 index would only line up while driver5_phase2 itself has that index.
X_imputed = pd.DataFrame(
    imputer.fit_transform(X),
    columns=feature_cols,
    index=X.index
)

# Put imputed values back
driver5_phase2_imputed = driver5_phase2.copy()
driver5_phase2_imputed[feature_cols] = X_imputed

assert not driver5_phase2_imputed[feature_cols].isna().any().any(), \
    "feature columns still contain NaN after KNN imputation"


NaNs before KNN:
lat_osm          1135
lane_id          1135
num_lanes        1135
speed_rel        1135
max_speed        1135
osm_delay        1135
lon_osm          1135
gps_speed_osm    1135
vert_acc          946
speed_kmh         946
dtype: int64


In [28]:
# NaN count per column after imputation, ten largest first.
driver5_phase2_imputed.isna().sum().sort_values(ascending=False).head(10)

road_type_osm    1135
timestamp           0
t_10hz              0
lat                 0
lon                 0
alt                 0
vert_acc            0
horiz_acc           0
course              0
difcourse           0
dtype: int64

In [29]:
# (rows, columns) of the imputed table.
driver5_phase2_imputed.shape

(47046, 43)

In [30]:
# Write the imputed table to CSV; index=False leaves the row index out of the file.
driver5_phase2_imputed.to_csv("D5_FINAL_10Hz.csv", index=False)