In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import joblib


# 1) Load dataset
# Read the raw CSV and discard every row that has a missing value in any column.
df = pd.read_csv("../data/driving_data_raw.csv").dropna()

# Normalize categorical features: cast to str and lowercase, so values that
# differ only by letter case end up as the same category.
df = df.assign(
    road_type=df['road_type'].astype(str).str.lower(),
    weather=df['weather'].astype(str).str.lower(),
)


# 2) Define features (remove event columns)
# Only the columns named in the two lists below go into X; every other column
# of df is left out of the model input.

# NOTE(review): 'night' looks like a binary flag; it is treated as numeric
# here along with the continuous signals — confirm that is intended.
numerical_features = [
    'speed',
    'accel',
    'brake',
    'turn_angle',
    'steering_var',
    'lateral_accel',
    'jerk',
    'night',
]

categorical_features = ['road_type', 'weather']

# Numeric columns first, then categorical ones.
X = df[[*numerical_features, *categorical_features]]

# Target column. The classification report recorded with this cell lists the
# classes aggressive, calm, dangerous and normal.
y = df['style_label']


# 3) Preprocessing pipeline
# Numeric columns are standardized; categorical columns are one-hot encoded.
# handle_unknown='ignore' keeps transform() from raising on a category that
# was not present when the encoder was fitted.

preprocess = ColumnTransformer(
    [
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ],
)


# 4) ML Pipeline
# Preprocessing followed by a logistic-regression classifier. The step names
# are significant: the hyperparameter grid addresses the classifier through
# the 'model__' prefix.

pipe = Pipeline([
    ('preprocess', preprocess),
    (
        'model',
        # class_weight='balanced' handles class imbalance; max_iter caps the
        # solver at 2000 iterations.
        LogisticRegression(class_weight='balanced', max_iter=2000),
    ),
])


# 5) Hyperparameter search (stronger regularization)
# C is the inverse regularization strength: smaller C → less overfit.
# NOTE(review): the run recorded with this cell selected C=1, the largest
# value in the grid — consider whether larger values should also be tried.

params = {
    'model__C': [0.001, 0.01, 0.1, 1],
}

# 5-fold cross-validated grid search; candidates are ranked by macro F1.
clf = GridSearchCV(estimator=pipe, param_grid=params, scoring='f1_macro', cv=5)


# 6) Train/Test split
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both parts; the fixed seed makes the split repeatable.

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


# 7) Train
# Run the grid search using the training portion only.

clf.fit(X_train, y_train)


# 8) Evaluate
# Predict on the held-out test rows with the fitted search object, then report
# the chosen hyperparameters and the test-set metrics.

y_pred = clf.predict(X_test)

print("Best params:", clf.best_params_)
print(
    "\nConfusion Matrix:\n",
    confusion_matrix(y_true=y_test, y_pred=y_pred),
)
print(
    "\nClassification Report:\n",
    classification_report(y_true=y_test, y_pred=y_pred),
)


# 9) Save pipeline
# This persists the whole fitted GridSearchCV object (not only the inner
# Pipeline), written to the current working directory.
# NOTE: joblib files are pickle-based; only load them from trusted sources.

joblib.dump(value=clf, filename="driving_style_model_balanced.pkl")
print(" Model saved: driving_style_model_balanced.pkl")


Best params: {'model__C': 1}

Confusion Matrix:
 [[ 71   0  13   6]
 [  0 154   0   4]
 [ 44   0 163   0]
 [ 56  47   0 442]]

Classification Report:
               precision    recall  f1-score   support

  aggressive       0.42      0.79      0.54        90
        calm       0.77      0.97      0.86       158
   dangerous       0.93      0.79      0.85       207
      normal       0.98      0.81      0.89       545

    accuracy                           0.83      1000
   macro avg       0.77      0.84      0.78      1000
weighted avg       0.88      0.83      0.84      1000

✅ Model saved: driving_style_model_balanced.pkl
