In [1]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
import joblib

In [3]:
from sklearn.impute import SimpleImputer


df = pd.read_csv('../data/driving_data_raw.csv')
df = df.drop(columns=['timestamp'])
df["speed_limit"] = df['road_type'].map({"city":50, "highway":110, "rural":90}) 
df["speed_diff"] = df["speed"] - df["speed_limit"]
df["abs_jerk"] = df["jerk"].abs()
df["turn_sharpness"] = df["turn_angle"] * df["speed"] / 100.0
df["brake_positive"] = df["brake"].clip(lower=0.0)

num_features = ["speed","accel","brake_positive","turn_angle","steering_var","lateral_accel",
                "jerk","abs_jerk","speed_diff","turn_sharpness","night"]

cat_features = ["road_type","weather"]

X = df[num_features + cat_features]
y_style = df["style_label"]
target_cols = ["hard_brake","aggressive_accel","sharp_turn","speeding"]
y_events = df[target_cols]

num_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])
cat_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent", )),
    ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])
preprocessor = ColumnTransformer([
    ("num", num_transformer, num_features),
    ("cat", cat_transformer, cat_features)
])

style_le = LabelEncoder()
y_style_enc = style_le.fit_transform(y_style)

X_train, X_test, y_style_train, y_style_test, y_events_train, y_events_test = train_test_split(
    X, y_style_enc, y_events, test_size=0.2, random_state=42
)
X_train


Unnamed: 0,speed,accel,brake_positive,turn_angle,steering_var,lateral_accel,jerk,abs_jerk,speed_diff,turn_sharpness,night,road_type,weather
4227,63.399243,0.189088,0.000000,14.108120,0.343608,-0.585515,0.179307,0.179307,-46.600757,8.944441,0,highway,rain
4676,69.009518,0.014437,0.077754,5.566902,0.144629,0.185172,0.090449,0.090449,-40.990482,3.841692,0,highway,rain
800,106.235951,1.896413,0.535837,10.821897,0.793132,0.144117,-0.154190,0.154190,56.235951,11.496745,1,city,fog
3671,47.372785,-0.237857,0.054890,7.578618,1.802333,0.193900,0.010063,0.010063,-62.627215,3.590202,0,highway,fog
4193,66.967343,1.186432,0.084274,15.083000,0.676140,0.030273,0.257110,0.257110,-23.032657,10.100684,0,rural,fog
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4426,50.022448,0.592856,0.019173,9.872829,0.942346,0.333406,-0.080069,0.080069,0.022448,4.938631,1,city,fog
466,86.430784,2.438205,0.566354,16.647305,0.114826,0.024398,0.146289,0.146289,-3.569216,14.388396,1,rural,fog
3092,136.656035,2.358910,1.304406,8.655711,1.229923,0.108890,-0.056195,0.056195,26.656035,11.828551,1,highway,clear
3772,70.784639,0.869066,0.249220,7.512960,0.025966,0.456902,0.149242,0.149242,-19.215361,5.318022,1,rural,fog


In [4]:
print("\n CHECKING FOR DATA LEAKAGE:")
print(f"Features: {X.columns.tolist()}")

duplicates = df.duplicated().sum()
print(f"Duplicate rows: {duplicates}")

# Vérifier corrélations suspectes - UNIQUEMENT SUR FEATURES NUMERIQUES
temp = X[num_features].copy()  # ← Seulement les features numériques
temp['target'] = y_style_enc

corr = temp.corr()['target'].abs().sort_values(ascending=False)
print("\nTop correlations with target (numerical features only):")
print(corr.head(10))

if any(corr[1:] > 0.95):  # Exclure la corrélation avec elle-même
    print(" WARNING: Very high correlation detected! Possible data leakage!")
else:
    print(" No suspicious correlations detected")


 CHECKING FOR DATA LEAKAGE:
Features: ['speed', 'accel', 'brake_positive', 'turn_angle', 'steering_var', 'lateral_accel', 'jerk', 'abs_jerk', 'speed_diff', 'turn_sharpness', 'night', 'road_type', 'weather']
Duplicate rows: 0

Top correlations with target (numerical features only):
target            1.000000
brake_positive    0.151379
accel             0.133289
speed             0.084045
speed_diff        0.050548
night             0.022073
turn_sharpness    0.018406
jerk              0.017138
turn_angle        0.014162
abs_jerk          0.011233
Name: target, dtype: float64
 No suspicious correlations detected


In [5]:
classifier = RandomForestClassifier(
    n_estimators=50,        
    max_depth=4,            
    min_samples_split=50,   
    min_samples_leaf=10,    
)
style_pipeline = Pipeline([
    ("pre", preprocessor),
    ("clf", classifier )
])
style_pipeline.fit(X_train, y_style_train)
print("Style model trained.")


Style model trained.


In [6]:
from sklearn.metrics import classification_report, accuracy_score
y_style_pred = style_pipeline.predict(X_test)
print("Training Style accuracy:", accuracy_score(y_style_train, style_pipeline.predict(X_train)))
print("Test Style accuracy:", accuracy_score(y_style_test, y_style_pred))
print(classification_report(y_style_test, y_style_pred, target_names=style_le.classes_))


Training Style accuracy: 0.90225
Test Style accuracy: 0.885
              precision    recall  f1-score   support

  aggressive       1.00      0.15      0.26        86
        calm       0.98      0.80      0.88       153
   dangerous       0.82      0.98      0.89       221
      normal       0.89      0.99      0.94       540

    accuracy                           0.89      1000
   macro avg       0.92      0.73      0.74      1000
weighted avg       0.90      0.89      0.86      1000



In [10]:
# Saving the style model
joblib.dump(classifier, "../ai/models/style_model.pkl")
print(" Model saved: driving_style_model_balanced.pkl")

 Model saved: driving_style_model_balanced.pkl
