## Synthetic data generation for driver future performance

In [22]:
import numpy as np
import pandas as pd

In [23]:
# Number of synthetic rows to generate
n_samples = 10_000

In [24]:

# Seed the global NumPy RNG so that re-running this cell (and the cells below
# that also draw from np.random on a top-to-bottom run) reproduces the same data.
np.random.seed(42)

# Levels and the probability of sampling each one
levels = ["Starting Vehicle", "Turning", "Stopping Vehicle", "Navigating Turns"]
level_probs = [0.4, 0.3, 0.2, 0.1]
level = np.random.choice(levels, n_samples, p=level_probs)

# Per-level (low, high) bounds for the uniform engagement-score draw
engagement_score_ranges = {
    "Starting Vehicle": (30, 100),
    "Turning": (30, 90),
    "Stopping Vehicle": (30, 80),
    "Navigating Turns": (30, 90),
}

# Score = uniform draw inside the level's bounds + Gaussian noise (std 5).
# The noise means individual values can land outside the bounds above.
engagement_score = np.array([
    np.random.uniform(*engagement_score_ranges[lvl]) + np.random.normal(0, 5)
    for lvl in level
])

# Per-level (low, high) bounds for the engagement time
engagement_time_ranges = {
    "Starting Vehicle": (30, 60),
    "Turning": (40, 80),
    "Stopping Vehicle": (50, 100),
    "Navigating Turns": (60, 120),
}
# Per-level (low, high) bounds for the offset that is added ON TOP of the
# engagement time to obtain the predicted time
predicted_time_ranges = {
    "Starting Vehicle": (10, 30),
    "Turning": (15, 35),
    "Stopping Vehicle": (20, 40),
    "Navigating Turns": (25, 50),
}

# Engagement time = uniform draw inside the level's bounds + Gaussian noise (std 3)
engagement_time = np.array([
    np.random.uniform(*engagement_time_ranges[lvl]) + np.random.normal(0, 3)
    for lvl in level
])

# Predicted time = the sample's engagement time + a level-dependent uniform
# offset + Gaussian noise (std 3)
predicted_time = np.array([
    eng_time + np.random.uniform(*predicted_time_ranges[lvl]) + np.random.normal(0, 3)
    for eng_time, lvl in zip(engagement_time, level)
])

In [25]:
# Function to add future performance labels
def performance_label(score, eng_time, pred_time,
                      excellent_score=80, good_score=70, mislabel_rate=0.05):
    """Return the future-performance label for a single sample.

    The rule-based label is decided in this order:

    * ``"Excellent"`` if ``score > excellent_score`` and ``eng_time < pred_time``
    * ``"Good"`` if ``score > good_score``
    * ``"Needs Improvement"`` otherwise

    Afterwards, with probability ``mislabel_rate``, the label is replaced by
    one drawn uniformly from all three labels (so the replacement can happen
    to equal the rule-based label).

    Parameters
    ----------
    score : float
        Engagement score of the sample.
    eng_time : float
        Engagement time of the sample.
    pred_time : float
        Predicted time of the sample.
    excellent_score : float, default 80
        Score that must be strictly exceeded for ``"Excellent"``.
    good_score : float, default 70
        Score that must be strictly exceeded for ``"Good"``.
    mislabel_rate : float, default 0.05
        Probability of replacing the rule-based label with a random one.
        ``0`` disables the label noise.

    Returns
    -------
    str
        One of ``"Excellent"``, ``"Good"`` or ``"Needs Improvement"``.
    """
    if score > excellent_score and eng_time < pred_time:
        label = "Excellent"
    elif score > good_score:
        label = "Good"
    else:
        label = "Needs Improvement"

    # Random mislabeling. The uniform draw is always made, whatever the rate,
    # so the global random stream advances the same way on every call.
    if np.random.rand() < mislabel_rate:
        # np.random.choice returns a NumPy string scalar; convert it so every
        # code path returns a plain Python str.
        label = str(np.random.choice(["Excellent", "Good", "Needs Improvement"]))
    return label

# Label every sample by walking the three feature arrays in lockstep
future_performance = list(
    map(performance_label, engagement_score, engagement_time, predicted_time)
)

In [26]:
# Assemble the final table; engagement scores are limited to [0, 100] here
data = pd.DataFrame(
    data={
        "Level": level,
        "Engagement Score": engagement_score.clip(0, 100),
        "Engagement Time (s)": engagement_time,
        "Predicted Time (s)": predicted_time,
        "Future Performance": future_performance,
    }
)

# Write the dataset to disk without the row index
data.to_csv(path_or_buf="driver_future_performance.csv", index=False)

In [27]:
# Preview the first five rows. Leaving the expression bare as the last line
# of the cell lets the notebook render it with its rich table display.
data.head()

              Level  Engagement Score  Engagement Time (s)  \
0           Turning         90.217708            46.141552   
1  Starting Vehicle         36.568457            46.613846   
2  Starting Vehicle         47.628482            35.779767   
3           Turning         44.091658            52.568557   
4  Stopping Vehicle         82.847142            72.037649   

   Predicted Time (s) Future Performance  
0           79.814961          Excellent  
1           58.350919  Needs Improvement  
2           62.553693  Needs Improvement  
3           85.185325  Needs Improvement  
4           95.932005          Excellent  


In [28]:
# DataFrame.info() prints its summary itself and returns None, so it is called
# without print() to avoid a stray "None" line at the end of the output.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 5 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Level                10000 non-null  object 
 1   Engagement Score     10000 non-null  float64
 2   Engagement Time (s)  10000 non-null  float64
 3   Predicted Time (s)   10000 non-null  float64
 4   Future Performance   10000 non-null  object 
dtypes: float64(3), object(2)
memory usage: 390.8+ KB
None
