In [20]:
import copy

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from mlxtend.evaluate.time_series import GroupTimeSeriesSplit, plot_splits
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer

In [21]:
# Load the feature matrix and attach a zero-padded "YYYY-MM" key built from
# the `year` and `month` columns.
data = pd.read_csv("Data/feature_matrix.csv").assign(
    date=lambda frame: (
        frame['year'].astype(str) + '-' + frame['month'].astype(str).str.rjust(2, '0')
    )
)
data.head()

Unnamed: 0,route,current_garage,day_type,lagged_total_precip,lagged_avg_temp,year,month,lagged_avg_riders,season,num_unique_stops,num_unique_stops_with_shelter,covid,avg_riders,date
0,1,Ross,SAT.,3.43,33.6,2017,1,,Winter,224.0,17.0,0,969.5,2017-01
1,1,Ross,SAT.,3.54,34.6,2017,2,969.5,Winter,224.0,17.0,0,1238.75,2017-02
2,1,Ross,SAT.,1.46,40.6,2017,3,1238.75,Spring,224.0,17.0,0,1178.25,2017-03
3,1,Ross,SAT.,5.02,39.9,2017,4,1178.25,Spring,224.0,17.0,0,1285.2,2017-04
4,1,Ross,SAT.,3.54,57.3,2017,5,1285.2,Spring,224.0,17.0,0,1235.5,2017-05


In [22]:
## Order the rows by the "YYYY-MM" key and promote that key to the index
df_sorted = data.sort_values(by='date').set_index('date')
df_sorted.head()

Unnamed: 0_level_0,route,current_garage,day_type,lagged_total_precip,lagged_avg_temp,year,month,lagged_avg_riders,season,num_unique_stops,num_unique_stops_with_shelter,covid,avg_riders
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2017-01,1,Ross,SAT.,3.43,33.6,2017,1,,Winter,224.0,17.0,0,969.5
2017-01,74,East Liberty,SAT.,3.43,33.6,2017,1,,Winter,177.0,6.0,0,392.0
2017-01,74,East Liberty,WEEKDAY,3.43,33.6,2017,1,,Winter,177.0,6.0,0,874.590909
2017-01,75,East Liberty,SAT.,3.43,33.6,2017,1,,Winter,157.0,13.0,0,1783.5
2017-01,75,East Liberty,SUN.,3.43,33.6,2017,1,,Winter,157.0,13.0,0,1111.2


In [23]:
## Configure the grouped time-series splitter (test_size=1, n_splits=10)
cv_args = dict(test_size=1, n_splits=10)
tscv = GroupTimeSeriesSplit(**cv_args)

# Sorted copy of the "YYYY-MM" index labels, one entry per row of df_sorted
months = np.sort(df_sorted.index.to_numpy())
months

array(['2017-01', '2017-01', '2017-01', ..., '2024-10', '2024-10',
       '2024-10'], dtype=object)

In [28]:
# Give every distinct month an integer id (its rank in sorted order), then
# label each entry of `months` with the id of its month.
unique_months = np.unique(months)
month_to_group = dict(zip(unique_months, range(len(unique_months))))
groups = np.array([month_to_group[label] for label in months])

In [29]:
## Split into features (everything except the target) and target
y = df_sorted['avg_riders']
X = df_sorted.drop(columns=['avg_riders'])

## Drop "sparse" columns: those whose count of distinct non-null values is
## below 0.01% of the number of rows (i.e. very low-cardinality columns)
distinct_pct = X.nunique() / len(X) * 100
sparse_columns = list(X.columns[distinct_pct < 0.01])
X = X.drop(columns=sparse_columns)

In [30]:
# Define the Neural Net Regressor wrapped as a sklearn estimator
class NeuralNetRegressor(RegressorMixin, BaseEstimator):
    """Small fully-connected PyTorch regressor exposing the scikit-learn estimator API.

    The network is ``Linear -> ReLU -> Linear -> ReLU -> Linear(1)`` and is
    trained with Adam on a mean-squared-error loss.

    The mixin is listed before ``BaseEstimator`` as recommended by the
    scikit-learn developer guide, so that the mixin's estimator metadata is
    resolved first in the MRO.

    Parameters
    ----------
    hidden_dim : int, default=64
        Width of each of the two hidden layers.
    epochs : int, default=50
        Number of full passes over the training data.
    batch_size : int, default=32
        Mini-batch size used by the training ``DataLoader``.
    lr : float, default=1e-3
        Learning rate passed to ``torch.optim.Adam``.
    random_state : int or None, default=None
        If given, ``torch.manual_seed`` is called with this value at the start
        of ``fit`` so weight initialisation and batch shuffling are repeatable.
        Note that this seeds torch's *global* RNG. ``None`` leaves the RNG
        untouched (the previous behaviour).

    Attributes
    ----------
    model : torch.nn.Sequential or None
        The trained network; ``None`` until ``fit`` has been called.
    criterion : torch.nn.MSELoss
        Loss used during training.
    optimizer : torch.optim.Adam or None
        Optimizer created in ``fit``.
    """

    def __init__(self, hidden_dim=64, epochs=50, batch_size=32, lr=1e-3, random_state=None):
        self.hidden_dim = hidden_dim
        self.epochs = epochs
        self.batch_size = batch_size
        self.lr = lr
        self.random_state = random_state
        self.model = None
        self.criterion = nn.MSELoss()
        self.optimizer = None

    def fit(self, X, y):
        """Train a fresh network on ``X`` / ``y``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Numeric feature matrix; converted to float32.
        y : array-like of shape (n_samples,)
            Target values (pandas Series, ndarray or list); converted to float32.

        Returns
        -------
        self : NeuralNetRegressor
            The fitted estimator, per the scikit-learn ``fit`` contract.
        """
        if self.random_state is not None:
            torch.manual_seed(self.random_state)

        # np.asarray makes this work for Series/DataFrames/lists as well as
        # ndarrays; the original required `y` to expose `.values`.
        X_tensor = torch.tensor(np.asarray(X, dtype=np.float32))
        y_tensor = torch.tensor(np.asarray(y, dtype=np.float32).reshape(-1, 1))

        input_dim = X_tensor.shape[1]
        self.model = nn.Sequential(
            nn.Linear(input_dim, self.hidden_dim),
            nn.ReLU(),
            nn.Linear(self.hidden_dim, self.hidden_dim),
            nn.ReLU(),
            nn.Linear(self.hidden_dim, 1)
        )
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.lr)

        dataset = torch.utils.data.TensorDataset(X_tensor, y_tensor)
        loader = torch.utils.data.DataLoader(dataset, batch_size=self.batch_size, shuffle=True)

        self.model.train()
        for epoch in range(self.epochs):
            for xb, yb in loader:
                pred = self.model(xb)
                loss = self.criterion(pred, yb)
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()

        return self

    def predict(self, X):
        """Predict targets for ``X``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Numeric feature matrix; converted to float32.

        Returns
        -------
        numpy.ndarray of shape (n_samples,)
            Predictions. Always 1-D, including when ``X`` has a single row.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If called before ``fit``. This exception subclasses both
            ``ValueError`` and ``AttributeError``.
        """
        if self.model is None:
            from sklearn.exceptions import NotFittedError
            raise NotFittedError(
                "This NeuralNetRegressor instance is not fitted yet. "
                "Call 'fit' before using 'predict'."
            )

        self.model.eval()
        with torch.no_grad():
            X_tensor = torch.tensor(np.asarray(X, dtype=np.float32))
            # Squeeze only the trailing output dimension: a bare .squeeze()
            # would also collapse the sample axis when n_samples == 1 and
            # return a 0-d array.
            return self.model(X_tensor).squeeze(-1).numpy()

In [31]:
# Function to build the full pipeline
def build_neural_network_model(X):
    """Build the preprocessing + neural-network pipeline for feature frame ``X``.

    Columns are routed by dtype as reported by ``X.select_dtypes``:

    * ``'int'`` and ``'float'`` columns: mean imputation, then standard scaling.
    * ``'object'`` columns: most-frequent imputation, then dense one-hot
      encoding (categories unseen at fit time are ignored at transform time).

    Columns of any other dtype are not listed in the ``ColumnTransformer`` and
    are therefore dropped by its default ``remainder`` setting.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame; only its dtypes and column names are inspected here.

    Returns
    -------
    dict
        ``{"Neural Network": Pipeline}`` where the pipeline is
        preprocessor -> float32 cast -> ``NeuralNetRegressor``.
    """
    # Identify feature types
    int_cols = X.select_dtypes(include='int').columns.tolist()
    float_cols = X.select_dtypes(include='float').columns.tolist()
    cat_cols = X.select_dtypes(include='object').columns.tolist()

    numeric_preprocessor = Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ])

    categorical_preprocessor = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))  # force dense
    ])

    preprocessor = ColumnTransformer([
        ('num', numeric_preprocessor, int_cols + float_cols),
        ('cat', categorical_preprocessor, cat_cols)
    ])

    # Cast with np.asarray + kw_args rather than a lambda: a FunctionTransformer
    # wrapping a lambda cannot be pickled with the standard pickle module,
    # which blocks persisting the fitted pipeline.
    to_float32 = FunctionTransformer(np.asarray, kw_args={'dtype': np.float32}, validate=False)

    # `preprocessor` is a fresh local used exactly once, so no defensive copy
    # is needed before handing it to the pipeline.
    model = Pipeline([
        ('preprocessor', preprocessor),
        ('to_float32', to_float32),
        ('regressor', NeuralNetRegressor())
    ])

    return {"Neural Network": model}

In [32]:
# Build the pipeline and pull out the single model it contains
neural_net_model = build_neural_network_model(X)
model = neural_net_model["Neural Network"]

# Cross-validate with the grouped time-series splitter. `groups` carries the
# per-row month id used by the splitter to form the folds.
# (`cross_val_score` is imported in the notebook's import cell.)
scores = cross_val_score(
    model,
    X,
    y,
    cv=tscv,
    groups=groups,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1
)

# The scorer returns negated RMSE (higher is better); flip the sign back
rmse_scores = -scores

Neural Network RMSE mean: 286.7828523744138


In [36]:
# Single quotes inside the replacement field: reusing the enclosing double
# quote inside an f-string is only valid syntax from Python 3.12 onward.
print(f"{'Neural Network':<20} | Mean RMSE: {rmse_scores.mean():>10,.3f} | Std: {rmse_scores.std():.3f}")

Neural Network       | Mean RMSE:    286.783 | Std: 129.152
