In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Synthetic traffic generator: one record per 15-minute interval, starting 2025-01-01.
# The legacy global NumPy seed makes every run produce the same draws.
np.random.seed(42)
samples = 10000

# Build the timeline once as a DatetimeIndex and read hour / weekday straight from it,
# instead of re-parsing each formatted timestamp string inside the loop.
# .tolist() yields plain Python ints, matching what Timestamp.hour / .weekday() return.
timeline = pd.date_range(start='2025-01-01', periods=samples, freq='15min')
hours = timeline.hour.tolist()
weekdays = timeline.dayofweek.tolist()  # Monday == 0, same convention as Timestamp.weekday()

data = {
    'timestamp': timeline.strftime('%Y-%m-%d %H:%M:%S'),
    'vehicle_count': [],
    'speed': [],
    'cyclist_presence': [],
    'time_of_day': [],
    'day_of_week': [],
    'lag_vehicle_count': []  # Previous interval's vehicle count
}

for i in range(samples):
    hour = hours[i]
    day = weekdays[i]
    # Simulating realistic traffic patterns.
    # NOTE: the order of the random draws below determines the seeded output; keep it stable.
    if 7 <= hour <= 9 or 17 <= hour <= 19:  # Rush hours
        vehicles = np.random.randint(30, 51)  # upper bound is exclusive -> 30..50
        speed = np.random.uniform(5, 20)
    else:  # Non-rush hours
        vehicles = np.random.randint(5, 25)  # upper bound is exclusive -> 5..24
        speed = np.random.uniform(20, 50)
    # Cyclists can only appear on weekdays between 06:00 and 18:59. The `and` chain
    # short-circuits, so a random number is consumed only when both conditions hold.
    cyclist = 1 if (6 <= hour <= 18 and day < 5 and np.random.random() < 0.4) else 0
    data['vehicle_count'].append(vehicles)
    data['speed'].append(speed)
    data['cyclist_presence'].append(cyclist)
    data['time_of_day'].append(hour)
    data['day_of_week'].append(day)
    # Lag feature (use 0 for first sample, which has no predecessor)
    data['lag_vehicle_count'].append(data['vehicle_count'][i-1] if i > 0 else 0)

df = pd.DataFrame(data)

# Cyclic encodings: map each periodic feature onto the unit circle so the end of a
# period sits next to its start (hour 23 next to hour 0, day 6 next to day 0).
for column, period in (('time_of_day', 24), ('day_of_week', 7)):
    df[f'{column}_sin'] = np.sin(2 * np.pi * df[column] / period)
    df[f'{column}_cos'] = np.cos(2 * np.pi * df[column] / period)

# Prediction targets: the value observed in the next interval. The final row has no
# successor, so its target is filled with that row's own value.
for column in ('vehicle_count', 'speed', 'cyclist_presence'):
    last_value = df[column].iloc[-1]
    df[f'pred_{column}'] = df[column].shift(-1, fill_value=last_value)

# Min-max scale the continuous inputs and the two continuous targets.
# Columns not listed here (cyclist flags, day_of_week, sin/cos encodings) are left as they are.
columns_to_normalize = [
    'vehicle_count',
    'speed',
    'time_of_day',
    'lag_vehicle_count',
    'pred_vehicle_count',
    'pred_speed',
]
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df[columns_to_normalize])
df[columns_to_normalize] = scaled_values

# Sanity check: every scaled column must now lie inside [0, 1].
for col in columns_to_normalize:
    assert df[col].between(0, 1).all(), f"{col} not normalized"

# Write the frame to disk without the index column, then show a short preview.
df.to_csv('traffic_data_preprocessed.csv', index=False)
print("Dataset generated and normalized successfully")
print(df.head())

Dataset generated and normalized successfully
             timestamp  vehicle_count     speed  cyclist_presence  \
0  2025-01-01 00:00:00       0.133333  0.864411                 0   
1  2025-01-01 00:15:00       0.311111  0.821372                 0   
2  2025-01-01 00:30:00       0.133333  0.630574                 0   
3  2025-01-01 00:45:00       0.222222  0.639519                 0   
4  2025-01-01 01:00:00       0.066667  0.428571                 0   

   time_of_day  day_of_week  lag_vehicle_count  time_of_day_sin  \
0     0.000000            2               0.00         0.000000   
1     0.000000            2               0.22         0.000000   
2     0.000000            2               0.38         0.000000   
3     0.000000            2               0.22         0.000000   
4     0.043478            2               0.30         0.258819   

   time_of_day_cos  day_of_week_sin  day_of_week_cos  pred_vehicle_count  \
0         1.000000         0.974928        -0.222521        

In [None]:
# Validation checks
# Re-read the exported CSV so the checks run against what was actually written to disk.
df = pd.read_csv('traffic_data_preprocessed.csv')

# Target columns: present and inside the min-max range.
assert 'pred_vehicle_count' in df.columns, "pred_vehicle_count missing"
assert df['pred_vehicle_count'].between(0, 1).all(), "pred_vehicle_count not normalized"
# pred_speed is min-max scaled together with pred_vehicle_count, so it gets the same check.
assert df['pred_speed'].between(0, 1).all(), "pred_speed not normalized"

# Timeline integrity. The column is read back as strings; the zero-padded
# '%Y-%m-%d %H:%M:%S' format sorts lexicographically in chronological order.
assert df['timestamp'].is_unique, "Duplicate timestamps"
assert df['timestamp'].is_monotonic_increasing, "Non-sequential timestamps"
assert df.notna().all().all(), "Missing values"

# Sampling frequency = 1 / mean spacing between consecutive timestamps (in seconds).
print(f"Frequency: {1 / (pd.to_datetime(df['timestamp']).diff().dt.total_seconds().mean())} Hz")
print(f"pred_vehicle_count sample: {df['pred_vehicle_count'].head(5)}")

Frequency: 0.0011111111111111111 Hz
pred_vehicle_count sample: 0    0.311111
1    0.133333
2    0.222222
3    0.066667
4    0.044444
Name: pred_vehicle_count, dtype: float64


In [None]:
# Label the first 80% of rows (in file order) as 'train' and the remainder as 'test',
# then save the labelled frame to a separate CSV.
df = pd.read_csv('traffic_data_preprocessed.csv')
n_rows = len(df)
n_train = int(0.8 * n_rows)
split = ['train' if position < n_train else 'test' for position in range(n_rows)]
df['split'] = split
df.to_csv('traffic_data_with_split.csv', index=False)

In [45]:
# Reload the exported dataset and verify its basic invariants.
df = pd.read_csv('traffic_data_preprocessed.csv')

# Targets: the vehicle-count target must exist, and both continuous targets must be in [0, 1].
assert 'pred_vehicle_count' in df.columns, "pred_vehicle_count missing"
assert df['pred_vehicle_count'].ge(0).all() and df['pred_vehicle_count'].le(1).all(), "pred_vehicle_count not normalized"
assert df['pred_speed'].ge(0).all() and df['pred_speed'].le(1).all(), "pred_speed not normalized"

# Timeline: no repeated timestamps, and rows stored in non-decreasing timestamp order.
assert df['timestamp'].is_unique, "Duplicate timestamps"
assert df['timestamp'].is_monotonic_increasing, "Non-sequential timestamps"

# Completeness: no NaN anywhere in the frame.
assert not df.isna().any().any(), "Missing values"

# Report the sampling frequency (inverse of the mean gap in seconds) and a target preview.
print(f"Frequency: {1 / (pd.to_datetime(df['timestamp']).diff().dt.total_seconds().mean())} Hz")
print(f"pred_vehicle_count sample: {df['pred_vehicle_count'].head(5)}")

Frequency: 0.0011111111111111111 Hz
pred_vehicle_count sample: 0    0.311111
1    0.133333
2    0.222222
3    0.066667
4    0.044444
Name: pred_vehicle_count, dtype: float64
