In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np

# Load traffic and station data
traffic = pd.read_csv('datasets_cleaned/trafficData.csv')
stations = pd.read_csv('datasets_cleaned/trafficStations.csv')

# Melt hourly traffic data
hour_columns = [f'hour_{i:02d}' for i in range(24)]
traffic_long = traffic.melt(
    id_vars=['station_key', 'traffic_direction_seq', 'cardinal_direction_seq',
             'classification_seq', 'year', 'month', 'day', 'day_of_week',
             'public_holiday', 'school_holiday'],
    value_vars=hour_columns,
    var_name='hour',
    value_name='traffic_count'
)

# Convert hour column from 'hour_00' to int
traffic_long['hour'] = traffic_long['hour'].str.extract('hour_(\d+)').astype(int)

# Create datetime for each row
traffic_long['datetime'] = pd.to_datetime(traffic_long[['year', 'month', 'day']]) + pd.to_timedelta(traffic_long['hour'], unit='h')

df = traffic_long.merge(stations, on='station_key', how='left')

# Create basic features
df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)

# Drop irrelevant text columns (optional)
drop_cols = ['name', 'full_name', 'intersection']
df = df.drop(columns=drop_cols, errors='ignore')

# Encode categorical variables
categorical_cols = ['mab_way_type', 'road_functional_hierarchy', 'road_on_type',
                    'lane_count', 'road_classification_type', 'rms_region',
                    'lga', 'suburb']
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Define features and target
X = df.drop(columns=['traffic_count', 'datetime', 'station_key'])  # Adjust as needed
y = df['traffic_count']

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate
y_pred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"RMSE: {rmse:.2f}")



  traffic_long['hour'] = traffic_long['hour'].str.extract('hour_(\d+)').astype(int)
  traffic_long['hour'] = traffic_long['hour'].str.extract('hour_(\d+)').astype(int)


KeyboardInterrupt: 

Training model for a smaller dataset

In [None]:
import pandas as pd

# Load the first 50 rows of each file
traffic = pd.read_csv("datasets_cleaned\trafficData.csv").head(500)
stations = pd.read_csv("datasets_cleaned\trafficStations.csv")

# Melt traffic data from wide to long format
hour_columns = [f'hour_{i:02d}' for i in range(24)]
traffic_long = traffic.melt(
    id_vars=['station_key', 'traffic_direction_seq', 'cardinal_direction_seq',
             'classification_seq', 'year', 'month', 'day', 'day_of_week',
             'public_holiday', 'school_holiday'],
    value_vars=hour_columns,
    var_name='hour',
    value_name='traffic_count'
)

# Convert hour to numeric
traffic_long['hour'] = traffic_long['hour'].str.extract('hour_(\d+)').astype(int)

# Create datetime
traffic_long['datetime'] = pd.to_datetime(traffic_long[['year', 'month', 'day']]) + pd.to_timedelta(traffic_long['hour'], unit='h')

# Merge with station metadata
df = traffic_long.merge(stations, on='station_key', how='left')

# Add features
df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)

# Drop text columns (optional)
drop_cols = ['name', 'full_name', 'intersection']
df = df.drop(columns=drop_cols, errors='ignore')

# Encode categorical variables
categorical_cols = ['mab_way_type', 'road_functional_hierarchy', 'road_on_type',
                    'lane_count', 'road_classification_type', 'rms_region',
                    'lga', 'suburb']
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Limit to 50 rows again after transformation (optional for speed)
df = df.head(500)

# Split and train
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np

X = df.drop(columns=['traffic_count', 'datetime', 'station_key'], errors='ignore')
y = df['traffic_count']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"RMSE: {rmse:.2f}")



  traffic_long['hour'] = traffic_long['hour'].str.extract('hour_(\d+)').astype(int)


RMSE: 8.11


Cross-validation

In [None]:
import pandas as pd

# Load the first 50 rows of each file
traffic = pd.read_csv("datasets_cleaned\trafficData.csv").head(500)
stations = pd.read_csv("datasets_cleaned\trafficStations.csv")

# Melt traffic data from wide to long format
hour_columns = [f'hour_{i:02d}' for i in range(24)]
traffic_long = traffic.melt(
    id_vars=['station_key', 'traffic_direction_seq', 'cardinal_direction_seq',
             'classification_seq', 'year', 'month', 'day', 'day_of_week',
             'public_holiday', 'school_holiday'],
    value_vars=hour_columns,
    var_name='hour',
    value_name='traffic_count'
)

# Convert hour to numeric
traffic_long['hour'] = traffic_long['hour'].str.extract('hour_(\d+)').astype(int)

# Create datetime
traffic_long['datetime'] = pd.to_datetime(traffic_long[['year', 'month', 'day']]) + pd.to_timedelta(traffic_long['hour'], unit='h')

# Merge with station metadata
df = traffic_long.merge(stations, on='station_key', how='left')

# Add features
df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)

# Drop text columns (optional)
drop_cols = ['name', 'full_name', 'intersection']
df = df.drop(columns=drop_cols, errors='ignore')

# Encode categorical variables
categorical_cols = ['mab_way_type', 'road_functional_hierarchy', 'road_on_type',
                    'lane_count', 'road_classification_type', 'rms_region',
                    'lga', 'suburb']
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Limit to 50 rows again after transformation (optional for speed)
df = df.head(500)

from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
import numpy as np

# Assume df is already prepared with features and target (traffic_count)

X = df.drop(columns=['traffic_count', 'datetime', 'station_key'], errors='ignore')
y = df['traffic_count']

# Create the Random Forest Regressor model
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Perform K-Fold Cross-Validation
# Let's use 5-folds (you can adjust k)
cv_scores = cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error')

# Convert negative MSE to positive and calculate RMSE for each fold
rmse_scores = np.sqrt(-cv_scores)

# Print the results
print(f"Cross-Validation RMSE Scores: {rmse_scores}")
print(f"Average RMSE: {rmse_scores.mean():.2f}")
print(f"Standard Deviation of RMSE: {rmse_scores.std():.2f}")


Cross-Validation RMSE Scores: [ 8.72494527 13.98610997 20.92654628 10.85487821  7.24731433]
Average RMSE: 12.35
Standard Deviation of RMSE: 4.85


Stratified Split

In [None]:
import pandas as pd

# Load the first 50 rows of each file
traffic = pd.read_csv("datasets_cleaned\trafficData.csv").head(500)
stations = pd.read_csv("datasets_cleaned\trafficStations.csv")

# Melt traffic data from wide to long format
hour_columns = [f'hour_{i:02d}' for i in range(24)]
traffic_long = traffic.melt(
    id_vars=['station_key', 'traffic_direction_seq', 'cardinal_direction_seq',
             'classification_seq', 'year', 'month', 'day', 'day_of_week',
             'public_holiday', 'school_holiday'],
    value_vars=hour_columns,
    var_name='hour',
    value_name='traffic_count'
)

# Convert hour to numeric
traffic_long['hour'] = traffic_long['hour'].str.extract('hour_(\d+)').astype(int)

# Create datetime
traffic_long['datetime'] = pd.to_datetime(traffic_long[['year', 'month', 'day']]) + pd.to_timedelta(traffic_long['hour'], unit='h')

# Merge with station metadata
df = traffic_long.merge(stations, on='station_key', how='left')

# Add features
df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)

# Drop text columns (optional)
drop_cols = ['name', 'full_name', 'intersection']
df = df.drop(columns=drop_cols, errors='ignore')

# Encode categorical variables
categorical_cols = ['mab_way_type', 'road_functional_hierarchy', 'road_on_type',
                    'lane_count', 'road_classification_type', 'rms_region',
                    'lga', 'suburb']
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Limit to 50 rows again after transformation (optional for speed)
df = df.head(500)

from sklearn.model_selection import StratifiedShuffleSplit

# Binning the target variable (traffic counts) into categories
# Let's categorize traffic counts into 3 bins: low, medium, and high
traffic_bins = pd.cut(df['traffic_count'], bins=[0, 100, 500, float('inf')], labels=['Low', 'Medium', 'High'])

# Stratified split using the bins
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# Create the train-test split
for train_index, test_index in sss.split(df, traffic_bins):
    train_data, test_data = df.iloc[train_index], df.iloc[test_index]

# Separate features (X) and target (y) for training and testing
X_train = train_data.drop(columns=['traffic_count', 'datetime', 'station_key'], errors='ignore')
y_train = train_data['traffic_count']

X_test = test_data.drop(columns=['traffic_count', 'datetime', 'station_key'], errors='ignore')
y_test = test_data['traffic_count']

print(f"Train size: {len(X_train)}, Test size: {len(X_test)}")

ValueError: Input contains NaN

Stratified K-Fold

In [None]:
import pandas as pd

# Load the first 50 rows of each file
traffic = pd.read_csv("datasets_cleaned\trafficData.csv").head(500)
stations = pd.read_csv("datasets_cleaned\trafficStations.csv")

# Melt traffic data from wide to long format
hour_columns = [f'hour_{i:02d}' for i in range(24)]
traffic_long = traffic.melt(
    id_vars=['station_key', 'traffic_direction_seq', 'cardinal_direction_seq',
             'classification_seq', 'year', 'month', 'day', 'day_of_week',
             'public_holiday', 'school_holiday'],
    value_vars=hour_columns,
    var_name='hour',
    value_name='traffic_count'
)

# Convert hour to numeric
traffic_long['hour'] = traffic_long['hour'].str.extract('hour_(\d+)').astype(int)

# Create datetime
traffic_long['datetime'] = pd.to_datetime(traffic_long[['year', 'month', 'day']]) + pd.to_timedelta(traffic_long['hour'], unit='h')

# Merge with station metadata
df = traffic_long.merge(stations, on='station_key', how='left')

# Add features
df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)

# Drop text columns (optional)
drop_cols = ['name', 'full_name', 'intersection']
df = df.drop(columns=drop_cols, errors='ignore')

# Encode categorical variables
categorical_cols = ['mab_way_type', 'road_functional_hierarchy', 'road_on_type',
                    'lane_count', 'road_classification_type', 'rms_region',
                    'lga', 'suburb']
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Limit to 50 rows again after transformation (optional for speed)
df = df.head(500)

from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

# Binning the target variable (traffic counts) into categories
traffic_bins = pd.cut(df['traffic_count'], bins=[0, 100, 500, float('inf')], labels=['Low', 'Medium', 'High'])

# Create Stratified K-Fold cross-validation (e.g., 5 folds)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Model
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Store RMSE for each fold
fold_rmse = []

# Cross-validation loop
for train_index, test_index in skf.split(df, traffic_bins):
    # Split data
    train_data, test_data = df.iloc[train_index], df.iloc[test_index]
    
    # Features and target
    X_train = train_data.drop(columns=['traffic_count', 'datetime', 'station_key'], errors='ignore')
    y_train = train_data['traffic_count']
    
    X_test = test_data.drop(columns=['traffic_count', 'datetime', 'station_key'], errors='ignore')
    y_test = test_data['traffic_count']
    
    # Train the model
    model.fit(X_train, y_train)
    
    # Predict and calculate RMSE for this fold
    y_pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    fold_rmse.append(rmse)

# Print the RMSE for each fold and average RMSE
print(f"Fold-wise RMSE: {fold_rmse}")
print(f"Average RMSE: {np.mean(fold_rmse):.2f}")

  traffic_long['hour'] = traffic_long['hour'].str.extract('hour_(\d+)').astype(int)
  traffic_long['hour'] = traffic_long['hour'].str.extract('hour_(\d+)').astype(int)


ValueError: Input contains NaN