## Future driver performance prediction

In [93]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import RandomOverSampler
import pickle

In [94]:
data = pd.read_csv("/content/driver_future_performance.csv")

In [95]:
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 5 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Level                10000 non-null  object 
 1   Engagement Score     10000 non-null  float64
 2   Engagement Time (s)  10000 non-null  float64
 3   Predicted Time (s)   10000 non-null  float64
 4   Future Performance   10000 non-null  object 
dtypes: float64(3), object(2)
memory usage: 390.8+ KB
None


In [96]:
X = data[["Level", "Engagement Score", "Engagement Time (s)", "Predicted Time (s)"]]
y = data["Future Performance"]

In [97]:
encoder = OneHotEncoder()
X_encoded = encoder.fit_transform(X[["Level"]]).toarray()
encoded_feature_names = encoder.get_feature_names_out(["Level"])

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

In [98]:
X_combined = pd.DataFrame(X_encoded, columns=encoded_feature_names)
X_combined = pd.concat([X_combined, X[["Engagement Score", "Engagement Time (s)", "Predicted Time (s)"]].reset_index(drop=True)], axis=1)

In [99]:
scaler = StandardScaler()
X_combined.iloc[:, len(encoded_feature_names):] = scaler.fit_transform(X_combined.iloc[:, len(encoded_feature_names):])

In [100]:
print(y.value_counts())

Future Performance
Needs Improvement    6513
Excellent            1861
Good                 1626
Name: count, dtype: int64


In [101]:
sampler = RandomOverSampler(random_state=42)
X_resampled, y_resampled = sampler.fit_resample(X_combined, y_encoded)

print(pd.Series(y_resampled).value_counts())

0    6513
2    6513
1    6513
Name: count, dtype: int64


In [102]:
X_train, X_temp, y_train, y_temp = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [103]:
model = LogisticRegression(max_iter=200)
model.fit(X_train, y_train)

In [104]:
cv = cross_val_score(model, X_train, y_train, cv=10)
print(cv)
print('\nMean: ', cv.mean())

[0.87280702 0.8625731  0.87865497 0.87426901 0.86038012 0.86769006
 0.86842105 0.87564009 0.86100951 0.87417703]

Mean:  0.8695621949289218


In [105]:
y_pred = model.predict(X_val)
print(classification_report(y_val, y_pred))

accuracy = accuracy_score(y_val, y_pred)
print("Accuracy:", accuracy)

              precision    recall  f1-score   support

           0       0.95      0.91      0.93       998
           1       0.80      0.88      0.84       939
           2       0.88      0.82      0.85       994

    accuracy                           0.87      2931
   macro avg       0.87      0.87      0.87      2931
weighted avg       0.88      0.87      0.87      2931

Accuracy: 0.8723984988058683


In [106]:
y_pred_test = model.predict(X_test)
print(classification_report(y_test, y_pred_test))

accuracy = accuracy_score(y_test, y_pred_test)
print("Accuracy:", accuracy)

              precision    recall  f1-score   support

           0       0.95      0.89      0.92       984
           1       0.79      0.87      0.83       972
           2       0.86      0.83      0.85       975

    accuracy                           0.86      2931
   macro avg       0.87      0.86      0.86      2931
weighted avg       0.87      0.86      0.86      2931

Accuracy: 0.8631866257250085


In [107]:
with open('future_prediction_model.pkl', 'wb') as f:
    pickle.dump(model, f)