In [5]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Load data
df = pd.read_csv('traffic_data.csv')

# Features and labels
features = ['vehicle_count', 'speed', 'cyclist_presence', 'time_of_day', 'day_of_week']
labels = ['pred_vehicle_count', 'pred_speed', 'pred_cyclist_presence']

# Continuous inputs that get min-max scaled. cyclist_presence and day_of_week
# are passed through untouched by this cell.
continuous_cols = ['vehicle_count', 'speed', 'time_of_day']

# Identify the training rows BEFORE scaling so the scaler never sees the
# held-out rows (fitting on the full frame leaks test-set min/max into the
# features). train_test_split picks rows by position from n_samples and
# random_state, so using the same test_size=0.2 / random_state=42 as the
# modelling cells selects exactly the rows they later train on.
train_rows, _ = train_test_split(df.index.to_numpy(), test_size=0.2, random_state=42)

# Normalize continuous features: fit on the training rows only, then apply the
# same transform to every row. Held-out rows can therefore fall slightly
# outside [0, 1] when they exceed the training min/max -- that is expected.
scaler = MinMaxScaler()
scaler.fit(df.loc[train_rows, continuous_cols])
df[continuous_cols] = scaler.transform(df[continuous_cols])

# Save preprocessed data
df.to_csv('traffic_data_preprocessed.csv', index=False)

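To keep training stable, the continuous features (vehicle_count, speed, time_of_day) are min-max normalized to the [0, 1] range. day_of_week is expected to be integer-encoded (0–6); the cell above does not transform it, so this assumes the raw CSV already stores it that way. Cyclist presence is binary, so no preprocessing is needed.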

In [6]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split

# Design matrix plus the two continuous regression targets
# (pred_cyclist_presence is excluded from this model for now).
regression_targets = ['pred_vehicle_count', 'pred_speed']
X = df[features]
y = df[regression_targets]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a multi-output linear regression, then predict on the held-out rows
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Per-target scores: multioutput='raw_values' returns one value per target column,
# in the order of regression_targets
r2, mae = (
    metric(y_test, y_pred, multioutput='raw_values')
    for metric in (r2_score, mean_absolute_error)
)

# NOTE(review): the 50 used as the percentage denominator below is a hard-coded
# reference scale for both targets -- presumably their expected range; confirm
# against the data before relying on the "% Error" figures.
print(f"R²: {r2} (Vehicle Count: {r2[0]:.3f}, Speed: {r2[1]:.3f})")
print(f"MAE: {mae} (Vehicle Count: {mae[0]:.3f}, Speed: {mae[1]:.3f})")
print(f"MAE % Error: {(mae / [50, 50]) * 100} (Vehicle Count: {mae[0]/50*100:.2f}%, Speed: {mae[1]/50*100:.2f}%)")

R²: [0.42812966 0.3662777 ] (Vehicle Count: 0.428, Speed: 0.366)
MAE: [7.41343045 8.81458203] (Vehicle Count: 7.413, Speed: 8.815)
MAE % Error: [14.82686091 17.62916405] (Vehicle Count: 14.83%, Speed: 17.63%)


Since Edge Impulse supports linear regression but may require a neural network for multi-output regression, I’ll first train a model locally using scikit-learn to verify performance, then replicate in Edge Impulse. For cyclist presence (binary), linear regression may not be ideal, so I’ll train a separate logistic regression model or adjust outputs later.

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Binary target for the cyclist-presence classifier; X is reused from the regression cell
y_cyclist = df['pred_cyclist_presence']

# Same 80/20 split settings as the regression model
cyclist_split = train_test_split(X, y_cyclist, test_size=0.2, random_state=42)
X_train_c, X_test_c, y_train_c, y_test_c = cyclist_split

# Fit a default logistic regression and classify the held-out rows
model_cyclist = LogisticRegression().fit(X_train_c, y_train_c)
y_pred_c = model_cyclist.predict(X_test_c)

# Fraction of held-out rows classified correctly
accuracy = accuracy_score(y_true=y_test_c, y_pred=y_pred_c)
print(f"Cyclist Presence Accuracy: {accuracy:.3f}")

Cyclist Presence Accuracy: 0.879


In [None]:
# 1. Load the raw data. This cell always reloads and re-scales from the raw CSV,
#    so it does not depend on state left behind by earlier cells.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Load data
df = pd.read_csv('traffic_data.csv')

# Define features and labels
features = ['vehicle_count', 'speed', 'cyclist_presence', 'time_of_day', 'day_of_week']
labels = ['pred_vehicle_count', 'pred_speed', 'pred_cyclist_presence']

# Continuous inputs that get min-max scaled
continuous_cols = ['vehicle_count', 'speed', 'time_of_day']

# 2. Decide the train/test membership BEFORE scaling. Splitting the row labels
#    with the same test_size/random_state yields the same rows (in the same
#    order) as splitting X and y directly.
train_rows, test_rows = train_test_split(df.index.to_numpy(), test_size=0.2, random_state=42)

# 3. Fit the scaler on the training rows only, so test-set min/max never leak
#    into the features, then apply that transform to every row. Test rows may
#    land slightly outside [0, 1]; that is expected.
scaler = MinMaxScaler()
scaler.fit(df.loc[train_rows, continuous_cols])
df[continuous_cols] = scaler.transform(df[continuous_cols])

# 4. Prepare features and target variables
X = df[features]
y = df[labels]  # Using all labels

# 5. Materialize the split from the row labels chosen in step 2
X_train, X_test = X.loc[train_rows], X.loc[test_rows]
y_train, y_test = y.loc[train_rows], y.loc[test_rows]

# The two subsets must partition the data exactly
assert len(X_train) + len(X_test) == len(X)

# Print shapes to verify split
print("Training set shapes:")
print(f"X_train: {X_train.shape}")
print(f"y_train: {y_train.shape}")
print("\nTest set shapes:")
print(f"X_test: {X_test.shape}")
print(f"y_test: {y_test.shape}")
