In [1]:
import pandas as pd

# Read the three phase-2 CSV files into d1, d2, d3 (in that order) and
# print each frame's (rows, columns) shape.
CSV_PATHS = (
    "D1_phase2_LESS.csv",
    "D2_phase2_LESS.csv",
    "D3_phase2_LESS.csv",
)
d1, d2, d3 = map(pd.read_csv, CSV_PATHS)

for frame in (d1, d2, d3):
    print(frame.shape)

(748, 42)
(755, 42)
(1050, 42)


In [2]:
# Count fully duplicated rows per dataset. Each label is paired with its own
# frame so a dataset can never be reported using another dataset's count.
for name, frame in (("D1", d1), ("D2", d2), ("D3", d3)):
    print(f"{name} duplicate rows:", frame.duplicated().sum())

D1 duplicate rows: 16
D2 duplicate rows: 0
D3 duplicate rows: 0


In [3]:
# Remove exact duplicate rows from every dataset and renumber the index from 0.
d1, d2, d3 = (
    frame.drop_duplicates().reset_index(drop=True)
    for frame in (d1, d2, d3)
)

for caption, frame in (
    ("D1 shape after duplicate removal:", d1),
    ("D2 shape after duplicate removal:", d2),
    ("D3 shape after duplicate removal:", d3),
):
    print(caption, frame.shape)

D1 shape after duplicate removal: (732, 42)
D2 shape after duplicate removal: (755, 42)
D3 shape after duplicate removal: (1050, 42)


In [4]:
# Print the number of NaN entries in every column, one report per dataset.
for heading, frame in (
    ("D1 missing values per column:", d1),
    ("\nD2 missing values per column:", d2),
    ("\nD3 missing values per column:", d3),
):
    print(heading)
    print(frame.isna().sum())

D1 missing values per column:
t_sec            0
speed_kmh        0
lat              0
lon              0
alt              0
vert_acc         0
horiz_acc        0
course           0
difcourse        0
hdop             0
vdop             0
pdop             0
active           0
acc_x            0
acc_y            0
acc_z            0
acc_x_kf         0
acc_y_kf         0
acc_z_kf         0
roll             0
pitch            0
yaw              0
x_lane           0
phi              0
road_width       0
lane_state       0
dist_front       0
ttc_front        0
num_vehicles     0
gps_speed        0
max_speed        0
speed_rel        0
road_type_osm    0
num_lanes        0
lane_id          0
lat_osm          0
lon_osm          0
osm_delay        0
gps_speed_osm    0
driver           0
behavior         0
road_type        0
dtype: int64

D2 missing values per column:
t_sec            0
speed_kmh        0
lat              0
lon              0
alt              0
vert_acc         0
horiz_acc     

In [5]:
# Columns that are excluded from the model input:
#   t_sec         - time index
#   driver        - driver ID
#   behavior      - label
#   road_type     - meta label
#   road_type_osm - map category
#   lane_id       - identifier
NON_FEATURE_COLS = [
    "t_sec", "driver", "behavior",
    "road_type", "road_type_osm", "lane_id",
]

# Keep every remaining d1 column, preserving the original column order.
is_feature = ~d1.columns.isin(NON_FEATURE_COLS)
feature_cols = d1.columns[is_feature].tolist()

print("Number of features:", len(feature_cols))
print(feature_cols)


Number of features: 36
['speed_kmh', 'lat', 'lon', 'alt', 'vert_acc', 'horiz_acc', 'course', 'difcourse', 'hdop', 'vdop', 'pdop', 'active', 'acc_x', 'acc_y', 'acc_z', 'acc_x_kf', 'acc_y_kf', 'acc_z_kf', 'roll', 'pitch', 'yaw', 'x_lane', 'phi', 'road_width', 'lane_state', 'dist_front', 'ttc_front', 'num_vehicles', 'gps_speed', 'max_speed', 'speed_rel', 'num_lanes', 'lat_osm', 'lon_osm', 'osm_delay', 'gps_speed_osm']


In [6]:
# Name of the target column; referenced below so the label is always
# kept out of the feature set.
LABEL_COL = "behavior"

# Columns that must not be fed to the model (time index, IDs, labels,
# categorical map/meta fields).
NON_FEATURE_COLS = ["t_sec", "driver", LABEL_COL]
NON_FEATURE_COLS += ["road_type", "road_type_osm", "lane_id"]

In [7]:
def get_feature_columns(df, non_feature_cols):
    """Return the columns of ``df`` that are not listed in ``non_feature_cols``.

    The result is a plain list that keeps the column order of ``df``.
    """
    kept = []
    for column in df.columns:
        if column in non_feature_cols:
            continue
        kept.append(column)
    return kept

# Derive the model input columns from d1 using the exclusion list.
feature_cols = get_feature_columns(df=d1, non_feature_cols=NON_FEATURE_COLS)

n_features_used = len(feature_cols)
print("Features used:", n_features_used)
print(feature_cols)


Features used: 36
['speed_kmh', 'lat', 'lon', 'alt', 'vert_acc', 'horiz_acc', 'course', 'difcourse', 'hdop', 'vdop', 'pdop', 'active', 'acc_x', 'acc_y', 'acc_z', 'acc_x_kf', 'acc_y_kf', 'acc_z_kf', 'roll', 'pitch', 'yaw', 'x_lane', 'phi', 'road_width', 'lane_state', 'dist_front', 'ttc_front', 'num_vehicles', 'gps_speed', 'max_speed', 'speed_rel', 'num_lanes', 'lat_osm', 'lon_osm', 'osm_delay', 'gps_speed_osm']


In [8]:
def split_by_label(df, label_col):
    """Partition ``df`` into one sub-frame per distinct value of ``label_col``.

    Returns a dict keyed by label value (in order of first appearance);
    every sub-frame has its index reset to 0..n-1.
    """
    labels = df[label_col]
    return {
        value: df.loc[labels == value].reset_index(drop=True)
        for value in labels.unique()
    }

In [9]:
# Split every dataset into per-behavior sub-frames.
d1_splits, d2_splits, d3_splits = (
    split_by_label(frame, LABEL_COL) for frame in (d1, d2, d3)
)

# Show which behavior labels each dataset contains.
for caption, splits in (
    ("D1 behaviors:", d1_splits),
    ("D2 behaviors:", d2_splits),
    ("D3 behaviors:", d3_splits),
):
    print(caption, splits.keys())

D1 behaviors: dict_keys(['Normal', 'Aggressive'])
D2 behaviors: dict_keys(['Normal', 'Aggressive'])
D3 behaviors: dict_keys(['Normal', 'Aggressive'])


In [10]:
import numpy as np

def window_data(df, feature_cols, label,
                window_size=96, stride=24):
    """Cut ``df`` into fixed-length windows that all carry the same label.

    Parameters
    ----------
    df : DataFrame holding the rows to window, in row order.
    feature_cols : list of column names to extract as window features.
    label : value repeated once per window in the returned ``y``.
    window_size : number of consecutive rows per window (must be > 0).
    stride : row offset between the starts of successive windows
        (must be > 0); a stride smaller than ``window_size`` produces
        overlapping windows.

    Returns
    -------
    X : ndarray of shape (n_windows, window_size, len(feature_cols)).
    y : ndarray of shape (n_windows,) filled with ``label``.

    When ``df`` has fewer than ``window_size`` rows, ``n_windows`` is 0 and
    ``X`` still has three dimensions, so the result can be concatenated
    with the windows of other segments.

    Raises
    ------
    ValueError
        If ``window_size`` or ``stride`` is not positive.
    """
    if window_size <= 0 or stride <= 0:
        raise ValueError("window_size and stride must be positive")

    features = df[feature_cols].to_numpy()
    starts = range(0, len(features) - window_size + 1, stride)

    if len(starts) == 0:
        # Too few rows for even one window: keep the 3-D layout so that
        # np.concatenate with non-empty window blocks does not fail.
        X = np.empty((0, window_size, features.shape[1]), dtype=features.dtype)
    else:
        X = np.stack([features[s:s + window_size] for s in starts])

    y = np.full(len(starts), label)
    return X, y

In [11]:
# Window every (dataset, behavior) segment separately and collect the
# per-segment arrays; order is D1, D2, D3 and, within each, the label
# order of the corresponding splits dict.
X_all, y_all = [], []

for splits in (d1_splits, d2_splits, d3_splits):
    for label, df_part in splits.items():
        X_tmp, y_tmp = window_data(df_part, feature_cols, label)
        X_all.append(X_tmp)
        y_all.append(y_tmp)

In [12]:
# Stack the per-segment window blocks along the sample axis.
X = np.concatenate(X_all, axis=0)
y = np.concatenate(y_all, axis=0)

for caption, array in (("Final X shape:", X), ("Final y shape:", y)):
    print(caption, array.shape)

Final X shape: (84, 96, 36)
Final y shape: (84,)


In [13]:
# Row count of every per-behavior sub-frame, dataset by dataset.
for prefix, splits in (
    ("D1 behavior", d1_splits),
    ("D2 behavior", d2_splits),
    ("D3 behavior", d3_splits),
):
    for label, df_part in splits.items():
        print(prefix, label, "rows:", len(df_part))

D1 behavior Normal rows: 377
D1 behavior Aggressive rows: 355
D2 behavior Normal rows: 415
D2 behavior Aggressive rows: 340
D3 behavior Normal rows: 575
D3 behavior Aggressive rows: 475


In [14]:
from sklearn.model_selection import train_test_split

# Stratified 80/20 split of the windows with a fixed seed.
# NOTE(review): the windows are produced with window_size=96 and stride=24,
# so neighbouring windows share rows. Splitting individual windows at random
# can therefore put overlapping windows into both train and test, which may
# inflate the test score. Consider splitting the rows (or whole segments)
# before windowing.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

print("Train:", X_train.shape)
print("Test :", X_test.shape)


Train: (67, 96, 36)
Test : (17, 96, 36)


In [15]:
# Per-feature minimum and maximum, reduced over the sample axis (0) and the
# time axis (1) of the training windows only.
REDUCE_AXES = (0, 1)
min_val = np.min(X_train, axis=REDUCE_AXES)
max_val = np.max(X_train, axis=REDUCE_AXES)

In [16]:
def min_max_normalize(X, min_val, max_val):
    """Rescale ``X`` with ``(X - min_val) / (max_val - min_val + 1e-8)``.

    The small constant keeps the denominator non-zero when ``max_val``
    equals ``min_val``.
    """
    span = max_val - min_val + 1e-8
    shifted = X - min_val
    return shifted / span

In [17]:
# Scale both splits with the statistics computed from the training split.
X_train_norm, X_test_norm = (
    min_max_normalize(part, min_val, max_val) for part in (X_train, X_test)
)

In [18]:
# Sanity check: overall value range of the normalised training windows.
for caption, value in (
    ("Train min:", X_train_norm.min()),
    ("Train max:", X_train_norm.max()),
):
    print(caption, value)

Train min: 0.0
Train max: 0.9999999999623352


In [19]:
# Model input dimensions come from axes 1 and 2 of the training tensor;
# the class count is the number of distinct training labels.
timesteps, num_features = X_train_norm.shape[1:3]
num_classes = np.unique(y_train).size

In [20]:
import tensorflow as tf
from tensorflow.keras.layers import (
    Input, LSTM, Conv1D, BatchNormalization,
    Activation, GlobalAveragePooling1D,
    Dense, Concatenate
)
from tensorflow.keras.models import Model

In [21]:
def build_lstm_fcn(timesteps, num_features, num_classes):
    """Build an (uncompiled) two-branch LSTM + convolutional Keras model.

    The input of shape ``(timesteps, num_features)`` is fed to
    * an LSTM layer with 128 units and input dropout 0.3, and
    * three Conv1D -> BatchNormalization -> ReLU stages
      (128 filters / kernel 8, 256 / 5, 128 / 3, all ``padding="same"``)
      followed by global average pooling.
    Both branch outputs are concatenated and passed to a softmax Dense
    layer with ``num_classes`` units.
    """
    inputs = Input(shape=(timesteps, num_features))

    # Recurrent branch.
    lstm_features = LSTM(128, dropout=0.3)(inputs)

    # Convolutional branch: kernel width shrinks from stage to stage.
    conv_stages = ((128, 8), (256, 5), (128, 3))
    conv_features = inputs
    for filters, kernel_size in conv_stages:
        conv_features = Conv1D(filters, kernel_size=kernel_size, padding="same")(conv_features)
        conv_features = BatchNormalization()(conv_features)
        conv_features = Activation("relu")(conv_features)

    # Collapse the time dimension of the convolutional feature maps.
    pooled_features = GlobalAveragePooling1D()(conv_features)

    # Join the branches and classify.
    merged = Concatenate()([lstm_features, pooled_features])
    probabilities = Dense(num_classes, activation="softmax")(merged)

    return Model(inputs=inputs, outputs=probabilities)

In [22]:
# Seed the Python, NumPy and TensorFlow random generators before any layer
# is created, so weight initialisation, dropout masks and batch shuffling
# repeat across "Restart & Run All" executions.
# NOTE(review): some GPU kernels may still be non-deterministic; call
# tf.config.experimental.enable_op_determinism() if exact repeatability is
# required.
SEED = 42
tf.keras.utils.set_random_seed(SEED)

model = build_lstm_fcn(timesteps, num_features, num_classes)

# Sparse categorical cross-entropy expects integer class ids as targets.
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)

model.summary()

In [23]:
from sklearn.preprocessing import LabelEncoder

# Map the string labels to integer class ids; the encoder is fitted on the
# training labels only and then reused for the test labels.
label_encoder = LabelEncoder()
y_train_enc = label_encoder.fit_transform(y_train)
y_test_enc = label_encoder.transform(y_test)

class_names = label_encoder.classes_
class_codes = label_encoder.transform(class_names)
print("Label mapping:", dict(zip(class_names, class_codes)))


Label mapping: {np.str_('Aggressive'): np.int64(0), np.str_('Normal'): np.int64(1)}


In [24]:
# Train for a fixed number of epochs.
# NOTE(review): the test split is also passed as validation data, so the
# val_* curves and the final test score come from the same samples.
fit_options = {
    "validation_data": (X_test_norm, y_test_enc),
    "epochs": 80,
    "batch_size": 16,
    "verbose": 1,
}
history = model.fit(X_train_norm, y_train_enc, **fit_options)

Epoch 1/80
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 225ms/step - accuracy: 0.5337 - loss: 0.6640 - val_accuracy: 0.5294 - val_loss: 0.7048
Epoch 2/80
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 120ms/step - accuracy: 0.8303 - loss: 0.4220 - val_accuracy: 0.6471 - val_loss: 0.6361
Epoch 3/80
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 114ms/step - accuracy: 0.8919 - loss: 0.2617 - val_accuracy: 0.5882 - val_loss: 0.6120
Epoch 4/80
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 118ms/step - accuracy: 0.9683 - loss: 0.1657 - val_accuracy: 0.6471 - val_loss: 0.6096
Epoch 5/80
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 118ms/step - accuracy: 0.9118 - loss: 0.1968 - val_accuracy: 0.7647 - val_loss: 0.5636
Epoch 6/80
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 140ms/step - accuracy: 0.9950 - loss: 0.1277 - val_accuracy: 0.7647 - val_loss: 0.5452
Epoch 7/80
[1m5/5[0m [32m━━━━━━━━━━━━

In [26]:
# Final loss / accuracy on the normalised test windows.
evaluation = model.evaluate(X_test_norm, y_test_enc)
test_loss, test_acc = evaluation
print("Test accuracy:", test_acc)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step - accuracy: 0.8235 - loss: 0.4663
Test accuracy: 0.8235294222831726
