In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import joblib


# Feature groups routed to the scaler and to the one-hot encoder respectively.
numerical_features = [
    'speed', 'accel', 'brake', 'turn_angle',
    'steering_var', 'lateral_accel', 'jerk', 'night',
]
categorical_features = ['road_type', 'weather']

# Read the raw records, drop every row that has a missing value in any column,
# then lower-case the two categorical columns so that differently-cased
# spellings collapse into a single category.
df = (
    pd.read_csv("../data/driving_data_raw.csv")
    .dropna()
    .assign(
        road_type=lambda frame: frame['road_type'].astype(str).str.lower(),
        weather=lambda frame: frame['weather'].astype(str).str.lower(),
    )
)

# Design matrix: numeric columns first, categorical columns last.
X = df.loc[:, numerical_features + categorical_features]
# Target: driving style. The recorded evaluation output of this cell reports
# the classes aggressive, calm, dangerous and normal.
y = df.loc[:, 'style_label']


# Per-column preprocessing: standardise the numeric features and one-hot
# encode the categorical ones; categories not seen during fit are ignored at
# transform time instead of raising an error.
preprocess = ColumnTransformer([
    ('num', StandardScaler(), numerical_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
])

# Preprocessing and classifier are chained in a single Pipeline so the search
# below tunes and fits them together.
pipe = Pipeline([
    ('preprocess', preprocess),
    # class_weight='balanced' compensates for the class imbalance.
    ('model', LogisticRegression(class_weight='balanced', max_iter=2000)),
])

# Candidate values for C: the grid was deliberately kept small because a lower
# C means stronger regularisation, which reduces overfitting.
# NOTE(review): the recorded run selected C=1, the largest candidate, so the
# optimum may lie beyond this grid -- consider trying larger values.
params = {
    'model__C': [0.001, 0.01, 0.1, 1],
}

# 5-fold cross-validated grid search scored with macro-averaged F1.
clf = GridSearchCV(estimator=pipe, param_grid=params, scoring='f1_macro', cv=5)


# Hold out 20% of the rows for testing. The split is stratified on the target
# so each driving style keeps its proportion in both partitions, and the seed
# is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Run the grid search on the training partition only. With the default
# refit=True the best pipeline is refitted on the whole training partition.
clf.fit(X_train, y_train)

# Evaluate the refitted best pipeline on the held-out rows.
y_pred = clf.predict(X_test)

print("Best params:", clf.best_params_)

# Label the confusion matrix explicitly: a bare array forces the reader to
# guess which class each row and column stands for.
class_names = clf.classes_
labelled_confusion = pd.DataFrame(
    confusion_matrix(y_test, y_pred, labels=class_names),
    index=pd.Index(class_names, name='true'),
    columns=pd.Index(class_names, name='predicted'),
)
print("\nConfusion Matrix (rows = true, columns = predicted):")
print(labelled_confusion)
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Persist the fitted search object; its predict() delegates to the best
# pipeline found, so it can be loaded and used directly for inference.
joblib.dump(clf, "driving_style_model_balanced.pkl")
print(" Model saved: driving_style_model_balanced.pkl")


Best params: {'model__C': 1}

Confusion Matrix:
 [[ 71   0  13   6]
 [  0 154   0   4]
 [ 44   0 163   0]
 [ 56  47   0 442]]

Classification Report:
               precision    recall  f1-score   support

  aggressive       0.42      0.79      0.54        90
        calm       0.77      0.97      0.86       158
   dangerous       0.93      0.79      0.85       207
      normal       0.98      0.81      0.89       545

    accuracy                           0.83      1000
   macro avg       0.77      0.84      0.78      1000
weighted avg       0.88      0.83      0.84      1000

 Model saved: driving_style_model_balanced.pkl


In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier # hedhy ili zedneha : permet de transformer un modèle de classification binaire ou multi-classe classique en un modèle multi-label.
from sklearn.metrics import classification_report, accuracy_score
import joblib


# Input columns for the event model.
# Original author's note: features too close to the raw events were left out
# to avoid overfitting (the excluded columns are not visible in this cell).
numerical_features = [
    'speed', 'accel', 'brake', 'turn_angle',
    'steering_var', 'lateral_accel', 'jerk', 'night',
]
categorical_features = ['road_type', 'weather']

# Load the raw records, drop rows with any missing value, and lower-case the
# categorical columns so casing variants map to the same category.
df = pd.read_csv("../data/driving_data_raw.csv").dropna()
df = df.assign(**{
    name: df[name].astype(str).str.lower()
    for name in categorical_features
})

X = df[[*numerical_features, *categorical_features]]

# Multi-label targets: one column per driving event.
event_cols = ['hard_brake', 'aggressive_accel', 'sharp_turn', 'speeding']
y = df[event_cols]


# Preprocessing: scale the numeric columns and one-hot encode the categorical
# ones, ignoring categories that were not seen during fit.
preprocess = ColumnTransformer(
    [
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ]
)

# Base classifier with strong regularisation (small C) and balanced class
# weights.
log_reg = LogisticRegression(C=0.01, class_weight='balanced', max_iter=1000)

# MultiOutputClassifier wraps the base classifier so the pipeline can predict
# several target columns at once (multi-label output).
multi_pipe = Pipeline(steps=[
    ('preprocess', preprocess),
    ('model', MultiOutputClassifier(estimator=log_reg)),
])

# 80/20 train/test split with a fixed seed; no stratification is applied to
# the multi-label target.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Train the model.
multi_pipe.fit(X_train, y_train)

# Evaluation: predictions come back as one column per event, in the same order
# as event_cols, so walk the transposed array alongside the column names.
y_pred = multi_pipe.predict(X_test)

for col, col_pred in zip(event_cols, y_pred.T):
    col_true = y_test[col]
    print(f"\n=== Event: {col} ===")
    print(classification_report(col_true, col_pred))
    print("Accuracy:", accuracy_score(col_true, col_pred))

# Save the fitted multi-label pipeline.
joblib.dump(multi_pipe, "driving_event_model_balanced.pkl")
print("Modèle multi-label d’événements sauvegardé: driving_event_model_balanced.pkl")



=== Event: hard_brake ===
              precision    recall  f1-score   support

           0       1.00      0.95      0.97       873
           1       0.73      0.99      0.84       127

    accuracy                           0.95      1000
   macro avg       0.86      0.97      0.91      1000
weighted avg       0.96      0.95      0.96      1000

Accuracy: 0.952

=== Event: aggressive_accel ===
              precision    recall  f1-score   support

           0       0.99      0.94      0.97       808
           1       0.80      0.98      0.88       192

    accuracy                           0.95      1000
   macro avg       0.90      0.96      0.92      1000
weighted avg       0.96      0.95      0.95      1000

Accuracy: 0.948

=== Event: sharp_turn ===
              precision    recall  f1-score   support

           0       1.00      0.97      0.98       995
           1       0.13      1.00      0.23         5

    accuracy                           0.97      1000
   macro 