# Model Selection

This Notebook tests multiple regression models in their base configs to select the most promising algorithm for the task

In [3]:
import os
import polars as pl
from sklearn.preprocessing import (
    StandardScaler,
    PolynomialFeatures,
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, root_mean_squared_error as rmse

In [4]:
def get_data(poly_features: int = 1):
    # Get Data
    data = pl.read_parquet("data.parquet")
    data = data.drop(["Step", "Light_ID", "Lane", "Intersection_u", "Sim_ID"])
    data = data.with_columns(pl.col("Is_Entrypoint").cast(pl.Int8))
    print(f"Data: {data.shape}")
    print(f"{data.collect_schema()}")

    # Split Data
    X = data.drop("Num_Cars").to_numpy()
    y = data.select(pl.col("Num_Cars")).to_numpy()
    y = y.ravel()
    print("")
    print(f"X: {X.shape}")
    print(f"y: {y.shape}")

    # Scale
    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    # Polynomial Features
    if poly_features > 1:
        poly = PolynomialFeatures(degree=poly_features)
        X = poly.fit_transform(X)

    # Train Test Split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.6, test_size=0.4, random_state=42
    )
    print("")
    print(f"X_train: {X_train.shape}")
    print(f"X_test: {X_test.shape}")
    print(f"y_train: {y_train.shape}")
    print(f"y_test {y_test.shape}")

    # Train Test Validation Split
    X_test, X_val, y_test, y_val = train_test_split(
        X_test, y_test, train_size=0.5, test_size=0.5, random_state=42
    )
    print("")
    print(f"X_test: {X_test.shape}")
    print(f"X_val: {X_val.shape}")
    print(f"y_test: {y_test.shape}")
    print(f"y_val: {y_val.shape}")

    return X_train, X_test, X_val, y_train, y_test, y_val

## Linear Regression


In [4]:
from sklearn.linear_model import LinearRegression

In [3]:
X_train, X_test, X_val, y_train, y_test, y_val = get_data()

Data: (6610000, 5)
Schema({'Time': Int16, 'Num_Cars': Int16, 'Centrality': Float32, 'Is_Entrypoint': Int8, 'Distance': Int16})

X: (6610000, 4)
y: (6610000,)

X_train: (3966000, 4)
X_test: (2644000, 4)
y_train: (3966000,)
y_test (2644000,)

X_test: (1322000, 4)
X_val: (1322000, 4)
y_test: (1322000,)
y_val: (1322000,)


In [5]:
model = LinearRegression(n_jobs=(os.cpu_count() - 1))
model.fit(X_train, y_train)
print(model)

print("")

y_pred = model.predict(X_train)
print(f"R2 Score (Training Data): {r2_score(y_true=y_train, y_pred=y_pred)}")
print(f"RMSE (Training Data): {rmse(y_true=y_train, y_pred=y_pred)}")
y_pred = model.predict(X_test)
print(f"R2 Score (Test Data): {r2_score(y_true=y_test, y_pred=y_pred)}")
print(f"RMSE (Training Data): {rmse(y_true=y_test, y_pred=y_pred)}")

LinearRegression(n_jobs=7)

R2 Score (Training Data): 0.09408396482467651
RMSE (Training Data): 1.4932094812393188
R2 Score (Test Data): 0.09340804815292358
RMSE (Training Data): 1.4924378395080566


## Linear Regression with Polynomial Features


In [None]:
from sklearn.linear_model import LinearRegression

In [6]:
X_train, X_test, X_val, y_train, y_test, y_val = get_data(poly_features=2)

Data: (6610000, 5)
Schema({'Time': Int16, 'Num_Cars': Int16, 'Centrality': Float32, 'Is_Entrypoint': Int8, 'Distance': Int16})

X: (6610000, 4)
y: (6610000,)

X_train: (3966000, 15)
X_test: (2644000, 15)
y_train: (3966000,)
y_test (2644000,)

X_test: (1322000, 15)
X_val: (1322000, 15)
y_test: (1322000,)
y_val: (1322000,)


In [7]:
model = LinearRegression(n_jobs=(os.cpu_count() - 1))
model.fit(X_train, y_train)
print(model)

print("")

y_pred = model.predict(X_train)
print(f"R2 Score (Training Data): {r2_score(y_true=y_train, y_pred=y_pred)}")
print(f"RMSE (Training Data): {rmse(y_true=y_train, y_pred=y_pred)}")
y_pred = model.predict(X_test)
print(f"R2 Score (Test Data): {r2_score(y_true=y_test, y_pred=y_pred)}")
print(f"RMSE (Training Data): {rmse(y_true=y_test, y_pred=y_pred)}")

LinearRegression(n_jobs=7)

R2 Score (Training Data): 3.9517879486083984e-05
RMSE (Training Data): 1.5688021183013916
R2 Score (Test Data): 5.6743621826171875e-05
RMSE (Training Data): 1.5673933029174805


In [8]:
X_train, X_test, X_val, y_train, y_test, y_val = get_data(poly_features=3)

Data: (6610000, 5)
Schema({'Time': Int16, 'Num_Cars': Int16, 'Centrality': Float32, 'Is_Entrypoint': Int8, 'Distance': Int16})

X: (6610000, 4)
y: (6610000,)

X_train: (3966000, 35)
X_test: (2644000, 35)
y_train: (3966000,)
y_test (2644000,)

X_test: (1322000, 35)
X_val: (1322000, 35)
y_test: (1322000,)
y_val: (1322000,)


In [9]:
model = LinearRegression(n_jobs=(os.cpu_count() - 1))
model.fit(X_train, y_train)
print(model)

print("")

y_pred = model.predict(X_train)
print(f"R2 Score (Training Data): {r2_score(y_true=y_train, y_pred=y_pred)}")
print(f"RMSE (Training Data): {rmse(y_true=y_train, y_pred=y_pred)}")
y_pred = model.predict(X_test)
print(f"R2 Score (Test Data): {r2_score(y_true=y_test, y_pred=y_pred)}")
print(f"RMSE (Training Data): {rmse(y_true=y_test, y_pred=y_pred)}")

LinearRegression(n_jobs=7)

R2 Score (Training Data): 3.9517879486083984e-05
RMSE (Training Data): 1.5688021183013916
R2 Score (Test Data): 5.6862831115722656e-05
RMSE (Training Data): 1.567393183708191


## Decision Tree Regression


In [10]:
from sklearn.tree import DecisionTreeRegressor

In [11]:
X_train, X_test, X_val, y_train, y_test, y_val = get_data()

Data: (6610000, 5)
Schema({'Time': Int16, 'Num_Cars': Int16, 'Centrality': Float32, 'Is_Entrypoint': Int8, 'Distance': Int16})

X: (6610000, 4)
y: (6610000,)

X_train: (3966000, 4)
X_test: (2644000, 4)
y_train: (3966000,)
y_test (2644000,)

X_test: (1322000, 4)
X_val: (1322000, 4)
y_test: (1322000,)
y_val: (1322000,)


In [12]:
model = DecisionTreeRegressor(random_state=42)
model.fit(X_train, y_train)
print(model)

print("")

y_pred = model.predict(X_train)
print(f"R2 Score (Training Data): {r2_score(y_true=y_train, y_pred=y_pred)}")
print(f"RMSE (Training Data): {rmse(y_true=y_train, y_pred=y_pred)}")
y_pred = model.predict(X_test)
print(f"R2 Score (Test Data): {r2_score(y_true=y_test, y_pred=y_pred)}")
print(f"RMSE (Training Data): {rmse(y_true=y_test, y_pred=y_pred)}")

DecisionTreeRegressor(random_state=42)

R2 Score (Training Data): 0.46521877733411565
RMSE (Training Data): 1.1472678598443036
R2 Score (Test Data): 0.02001084905453221
RMSE (Training Data): 1.5516754321690063


## Random Forest Regression


In [13]:
from sklearn.ensemble import RandomForestRegressor

In [14]:
X_train, X_test, X_val, y_train, y_test, y_val = get_data()

Data: (6610000, 5)
Schema({'Time': Int16, 'Num_Cars': Int16, 'Centrality': Float32, 'Is_Entrypoint': Int8, 'Distance': Int16})

X: (6610000, 4)
y: (6610000,)

X_train: (3966000, 4)
X_test: (2644000, 4)
y_train: (3966000,)
y_test (2644000,)

X_test: (1322000, 4)
X_val: (1322000, 4)
y_test: (1322000,)
y_val: (1322000,)


In [15]:
model = RandomForestRegressor(n_estimators=10, random_state=42, verbose=2)
model.fit(X_train, y_train)
print(model)

print("")

y_pred = model.predict(X_train)
print(f"R2 Score (Training Data): {r2_score(y_true=y_train, y_pred=y_pred)}")
print(f"RMSE (Training Data): {rmse(y_true=y_train, y_pred=y_pred)}")
y_pred = model.predict(X_test)
print(f"R2 Score (Test Data): {r2_score(y_true=y_test, y_pred=y_pred)}")
print(f"RMSE (Training Data): {rmse(y_true=y_test, y_pred=y_pred)}")

building tree 1 of 10
building tree 2 of 10
building tree 3 of 10
building tree 4 of 10
building tree 5 of 10
building tree 6 of 10
building tree 7 of 10
building tree 8 of 10
building tree 9 of 10
building tree 10 of 10
RandomForestRegressor(n_estimators=10, random_state=42, verbose=2)

R2 Score (Training Data): 0.446212053208886
RMSE (Training Data): 1.1674774526151541
R2 Score (Test Data): 0.0692549301794868
RMSE (Training Data): 1.5121874250399012


Increase min. samples per leaf to avoid overfitting:

In [16]:
model = RandomForestRegressor(
    n_estimators=10, random_state=42, verbose=2, min_samples_leaf=2
)
model.fit(X_train, y_train)
print(model)

print("")

y_pred = model.predict(X_train)
print(f"R2 Score (Training Data): {r2_score(y_true=y_train, y_pred=y_pred)}")
print(f"RMSE (Training Data): {rmse(y_true=y_train, y_pred=y_pred)}")
y_pred = model.predict(X_test)
print(f"R2 Score (Test Data): {r2_score(y_true=y_test, y_pred=y_pred)}")
print(f"RMSE (Training Data): {rmse(y_true=y_test, y_pred=y_pred)}")

building tree 1 of 10
building tree 2 of 10
building tree 3 of 10
building tree 4 of 10
building tree 5 of 10
building tree 6 of 10
building tree 7 of 10
building tree 8 of 10
building tree 9 of 10
building tree 10 of 10
RandomForestRegressor(min_samples_leaf=2, n_estimators=10, random_state=42,
                      verbose=2)

R2 Score (Training Data): 0.4398256155684619
RMSE (Training Data): 1.1741899937798117
R2 Score (Test Data): 0.14421481836910033
RMSE (Training Data): 1.4500154548136859


Increase estimators for better performance:

In [17]:
model = RandomForestRegressor(
    n_estimators=20, random_state=42, verbose=2, min_samples_leaf=2
)
model.fit(X_train, y_train)
print(model)

print("")

y_pred = model.predict(X_train)
print(f"R2 Score (Training Data): {r2_score(y_true=y_train, y_pred=y_pred)}")
print(f"RMSE (Training Data): {rmse(y_true=y_train, y_pred=y_pred)}")
y_pred = model.predict(X_test)
print(f"R2 Score (Test Data): {r2_score(y_true=y_test, y_pred=y_pred)}")
print(f"RMSE (Test Data): {rmse(y_true=y_test, y_pred=y_pred)}")

building tree 1 of 20
building tree 2 of 20
building tree 3 of 20
building tree 4 of 20
building tree 5 of 20
building tree 6 of 20
building tree 7 of 20
building tree 8 of 20
building tree 9 of 20
building tree 10 of 20
building tree 11 of 20
building tree 12 of 20
building tree 13 of 20
building tree 14 of 20
building tree 15 of 20
building tree 16 of 20
building tree 17 of 20
building tree 18 of 20
building tree 19 of 20
building tree 20 of 20
RandomForestRegressor(min_samples_leaf=2, n_estimators=20, random_state=42,
                      verbose=2)

R2 Score (Training Data): 0.44669807222173186
RMSE (Training Data): 1.1669650355502112
R2 Score (Test Data): 0.1513488170667192
RMSE (Test Data): 1.4439589950923968


Test limiting max. features considered for splitting decision to avoid overfitting:

In [18]:
model = RandomForestRegressor(
    n_estimators=20,
    random_state=42,
    verbose=2,
    min_samples_leaf=2,
    n_jobs=(os.cpu_count() - 1),
    max_features="sqrt",
)
model.fit(X_train, y_train)
print(model)

print("")

y_pred = model.predict(X_train)
print(f"R2 Score (Training Data): {r2_score(y_true=y_train, y_pred=y_pred)}")
print(f"RMSE (Training Data): {rmse(y_true=y_train, y_pred=y_pred)}")
y_pred = model.predict(X_test)
print(f"R2 Score (Test Data): {r2_score(y_true=y_test, y_pred=y_pred)}")
print(f"RMSE (Test Data): {rmse(y_true=y_test, y_pred=y_pred)}")

[Parallel(n_jobs=7)]: Using backend ThreadingBackend with 7 concurrent workers.


building tree 1 of 20building tree 2 of 20

building tree 3 of 20
building tree 4 of 20
building tree 5 of 20
building tree 6 of 20
building tree 7 of 20
building tree 8 of 20
building tree 9 of 20
building tree 10 of 20
building tree 11 of 20
building tree 12 of 20
building tree 13 of 20
building tree 14 of 20
building tree 15 of 20
building tree 16 of 20
building tree 17 of 20
building tree 18 of 20
building tree 19 of 20
building tree 20 of 20


[Parallel(n_jobs=7)]: Done  18 out of  20 | elapsed:   20.0s remaining:    2.2s
[Parallel(n_jobs=7)]: Done  20 out of  20 | elapsed:   20.1s finished
[Parallel(n_jobs=7)]: Using backend ThreadingBackend with 7 concurrent workers.


RandomForestRegressor(max_features='sqrt', min_samples_leaf=2, n_estimators=20,
                      n_jobs=7, random_state=42, verbose=2)



[Parallel(n_jobs=7)]: Done  18 out of  20 | elapsed:    9.8s remaining:    1.1s
[Parallel(n_jobs=7)]: Done  20 out of  20 | elapsed:    9.9s finished
[Parallel(n_jobs=7)]: Using backend ThreadingBackend with 7 concurrent workers.


R2 Score (Training Data): 0.44369678332711837
RMSE (Training Data): 1.1701257533491618
R2 Score (Test Data): 0.146967210174067
RMSE (Test Data): 1.4476817939244613


[Parallel(n_jobs=7)]: Done  18 out of  20 | elapsed:    3.4s remaining:    0.4s
[Parallel(n_jobs=7)]: Done  20 out of  20 | elapsed:    3.4s finished


## Gradient Boosting Regression


In [3]:
from xgboost import XGBRegressor

In [4]:
X_train, X_test, X_val, y_train, y_test, y_val = get_data()

Data: (6610000, 5)
Schema({'Time': Int16, 'Num_Cars': Int16, 'Centrality': Float32, 'Is_Entrypoint': Int8, 'Distance': Int16})

X: (6610000, 4)
y: (6610000,)

X_train: (3966000, 4)
X_test: (2644000, 4)
y_train: (3966000,)
y_test (2644000,)

X_test: (1322000, 4)
X_val: (1322000, 4)
y_test: (1322000,)
y_val: (1322000,)


In [5]:
model = XGBRegressor(n_jobs=(os.cpu_count() - 1), random_state=42)
model.fit(X_train, y_train)
print(model)

print("")

y_pred = model.predict(X_train)
print(f"R2 Score (Training Data): {r2_score(y_true=y_train, y_pred=y_pred)}")
print(f"RMSE (Training Data): {rmse(y_true=y_train, y_pred=y_pred)}")
y_pred = model.predict(X_test)
print(f"R2 Score (Test Data): {r2_score(y_true=y_test, y_pred=y_pred)}")
print(f"RMSE (Test Data): {rmse(y_true=y_test, y_pred=y_pred)}")

XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             feature_weights=None, gamma=None, grow_policy=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=None, max_bin=None, max_cat_threshold=None,
             max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
             max_leaves=None, min_child_weight=None, missing=nan,
             monotone_constraints=None, multi_strategy=None, n_estimators=None,
             n_jobs=7, num_parallel_tree=None, ...)

R2 Score (Training Data): 0.21278494596481323
RMSE (Training Data): 1.3919494152069092
R2 Score (Test Data): 0.20956403017044067
RMSE (Test Data): 1.3935534954071045


## KNN Regression


In [6]:
from sklearn.neighbors import KNeighborsRegressor

In [7]:
X_train, X_test, X_val, y_train, y_test, y_val = get_data()

Data: (6610000, 5)
Schema({'Time': Int16, 'Num_Cars': Int16, 'Centrality': Float32, 'Is_Entrypoint': Int8, 'Distance': Int16})

X: (6610000, 4)
y: (6610000,)

X_train: (3966000, 4)
X_test: (2644000, 4)
y_train: (3966000,)
y_test (2644000,)

X_test: (1322000, 4)
X_val: (1322000, 4)
y_test: (1322000,)
y_val: (1322000,)


In [9]:
model = KNeighborsRegressor(n_jobs=(os.cpu_count() - 1))
model.fit(X_train, y_train)
print(model)

print("")

y_pred = model.predict(X_train)
print(f"R2 Score (Training Data): {r2_score(y_true=y_train, y_pred=y_pred)}")
print(f"RMSE (Training Data): {rmse(y_true=y_train, y_pred=y_pred)}")
y_pred = model.predict(X_test)
print(f"R2 Score (Test Data): {r2_score(y_true=y_test, y_pred=y_pred)}")
print(f"RMSE (Test Data): {rmse(y_true=y_test, y_pred=y_pred)}")

KNeighborsRegressor(n_jobs=7)

R2 Score (Training Data): 0.37521500359296933
RMSE (Training Data): 1.2400581420494516
R2 Score (Test Data): 0.09948158281760355
RMSE (Test Data): 1.487430045139285


## Bayesian Linear Regression


In [10]:
from sklearn.linear_model import BayesianRidge

In [11]:
X_train, X_test, X_val, y_train, y_test, y_val = get_data()

Data: (6610000, 5)
Schema({'Time': Int16, 'Num_Cars': Int16, 'Centrality': Float32, 'Is_Entrypoint': Int8, 'Distance': Int16})

X: (6610000, 4)
y: (6610000,)

X_train: (3966000, 4)
X_test: (2644000, 4)
y_train: (3966000,)
y_test (2644000,)

X_test: (1322000, 4)
X_val: (1322000, 4)
y_test: (1322000,)
y_val: (1322000,)


In [12]:
model = BayesianRidge(verbose=True)
model.fit(X_train, y_train)
print(model)

print("")

y_pred = model.predict(X_train)
print(f"R2 Score (Training Data): {r2_score(y_true=y_train, y_pred=y_pred)}")
print(f"RMSE (Training Data): {rmse(y_true=y_train, y_pred=y_pred)}")
y_pred = model.predict(X_test)
print(f"R2 Score (Test Data): {r2_score(y_true=y_test, y_pred=y_pred)}")
print(f"RMSE (Test Data): {rmse(y_true=y_test, y_pred=y_pred)}")

Convergence after  1  iterations
BayesianRidge(verbose=True)

R2 Score (Training Data): 0.09408408403396606
RMSE (Training Data): 1.4932093620300293
R2 Score (Test Data): 0.09340924024581909
RMSE (Test Data): 1.4924367666244507


## MLP Regression


In [13]:
from sklearn.neural_network import MLPRegressor

In [14]:
X_train, X_test, X_val, y_train, y_test, y_val = get_data()

Data: (6610000, 5)
Schema({'Time': Int16, 'Num_Cars': Int16, 'Centrality': Float32, 'Is_Entrypoint': Int8, 'Distance': Int16})

X: (6610000, 4)
y: (6610000,)

X_train: (3966000, 4)
X_test: (2644000, 4)
y_train: (3966000,)
y_test (2644000,)

X_test: (1322000, 4)
X_val: (1322000, 4)
y_test: (1322000,)
y_val: (1322000,)


In [15]:
model = MLPRegressor(random_state=42, verbose=True)
model.fit(X_train, y_train)
print(model)

print("")

y_pred = model.predict(X_train)
print(f"R2 Score (Training Data): {r2_score(y_true=y_train, y_pred=y_pred)}")
print(f"RMSE (Training Data): {rmse(y_true=y_train, y_pred=y_pred)}")
y_pred = model.predict(X_test)
print(f"R2 Score (Test Data): {r2_score(y_true=y_test, y_pred=y_pred)}")
print(f"RMSE (Test Data): {rmse(y_true=y_test, y_pred=y_pred)}")

Iteration 1, loss = 1.04002500
Iteration 2, loss = 1.03235185
Iteration 3, loss = 1.02938068
Iteration 4, loss = 1.02779078
Iteration 5, loss = 1.02649188
Iteration 6, loss = 1.02544248
Iteration 7, loss = 1.02457106
Iteration 8, loss = 1.02347517
Iteration 9, loss = 1.02260268
Iteration 10, loss = 1.02186048
Iteration 11, loss = 1.02130949
Iteration 12, loss = 1.02098083
Iteration 13, loss = 1.02094650
Iteration 14, loss = 1.02058840
Iteration 15, loss = 1.02036154
Iteration 16, loss = 1.02026320
Iteration 17, loss = 1.02010155
Iteration 18, loss = 1.01990640
Iteration 19, loss = 1.01977825
Iteration 20, loss = 1.01974905
Iteration 21, loss = 1.01959395
Iteration 22, loss = 1.01933146
Iteration 23, loss = 1.01891053
Iteration 24, loss = 1.01834548
Iteration 25, loss = 1.01804018
Iteration 26, loss = 1.01783276
Iteration 27, loss = 1.01754177
Iteration 28, loss = 1.01745856
Iteration 29, loss = 1.01734650
Iteration 30, loss = 1.01707852
Iteration 31, loss = 1.01704168
Iteration 32, los

Try an adaptive learning rate to increase performance:

In [None]:
model = MLPRegressor(random_state=42, verbose=True, learning_rate="adaptive")
model.fit(X_train, y_train)
print(model)

print("")

y_pred = model.predict(X_train)
print(f"R2 Score (Training Data): {r2_score(y_true=y_train, y_pred=y_pred)}")
print(f"RMSE (Training Data): {rmse(y_true=y_train, y_pred=y_pred)}")
y_pred = model.predict(X_test)
print(f"R2 Score (Test Data): {r2_score(y_true=y_test, y_pred=y_pred)}")
print(f"RMSE (Test Data): {rmse(y_true=y_test, y_pred=y_pred)}")

Iteration 1, loss = 1.04002500
Iteration 2, loss = 1.03235185
Iteration 3, loss = 1.02938068
Iteration 4, loss = 1.02779078
Iteration 5, loss = 1.02649188
Iteration 6, loss = 1.02544248
Iteration 7, loss = 1.02457106
Iteration 8, loss = 1.02347517
Iteration 9, loss = 1.02260268
Iteration 10, loss = 1.02186048
Iteration 11, loss = 1.02130949
Iteration 12, loss = 1.02098083
Iteration 13, loss = 1.02094650
Iteration 14, loss = 1.02058840
Iteration 15, loss = 1.02036154
Iteration 16, loss = 1.02026320
Iteration 17, loss = 1.02010155
Iteration 18, loss = 1.01990640
Iteration 19, loss = 1.01977825
Iteration 20, loss = 1.01974905
Iteration 21, loss = 1.01959395
Iteration 22, loss = 1.01933146
Iteration 23, loss = 1.01891053
Iteration 24, loss = 1.01834548
Iteration 25, loss = 1.01804018
Iteration 26, loss = 1.01783276
Iteration 27, loss = 1.01754177
Iteration 28, loss = 1.01745856
Iteration 29, loss = 1.01734650
Iteration 30, loss = 1.01707852
Iteration 31, loss = 1.01704168
Iteration 32, los

Increase model size:

In [None]:
model = MLPRegressor(
    random_state=42,
    verbose=True,
    learning_rate="adaptive",
    hidden_layer_sizes=(100, 100),
)
model.fit(X_train, y_train)
print(model)

print("")

y_pred = model.predict(X_train)
print(f"R2 Score (Training Data): {r2_score(y_true=y_train, y_pred=y_pred)}")
print(f"RMSE (Training Data): {rmse(y_true=y_train, y_pred=y_pred)}")
y_pred = model.predict(X_test)
print(f"R2 Score (Test Data): {r2_score(y_true=y_test, y_pred=y_pred)}")
print(f"RMSE (Test Data): {rmse(y_true=y_test, y_pred=y_pred)}")

Iteration 1, loss = 1.02910233
Iteration 2, loss = 1.01558268
Iteration 3, loss = 1.01191688
Iteration 4, loss = 1.01002657
Iteration 5, loss = 1.00901532
Iteration 6, loss = 1.00826156
Iteration 7, loss = 1.00774109
Iteration 8, loss = 1.00720823
Iteration 9, loss = 1.00675797
Iteration 10, loss = 1.00661278
Iteration 11, loss = 1.00636470
Iteration 12, loss = 1.00593662
Iteration 13, loss = 1.00554991
Iteration 14, loss = 1.00547671
Iteration 15, loss = 1.00525570
Iteration 16, loss = 1.00500309
Iteration 17, loss = 1.00496173
Iteration 18, loss = 1.00466895
Iteration 19, loss = 1.00465298
Iteration 20, loss = 1.00445390
Iteration 21, loss = 1.00440884
Iteration 22, loss = 1.00389993
Iteration 23, loss = 1.00400865
Iteration 24, loss = 1.00385201
Iteration 25, loss = 1.00384152
Iteration 26, loss = 1.00370872
Iteration 27, loss = 1.00360417
Iteration 28, loss = 1.00350821
Iteration 29, loss = 1.00347757
Iteration 30, loss = 1.00333762
Iteration 31, loss = 1.00335276
Iteration 32, los

In [None]:
model = MLPRegressor(
    random_state=42,
    verbose=True,
    learning_rate="adaptive",
    hidden_layer_sizes=(100, 100, 100),
)
model.fit(X_train, y_train)
print(model)

print("")

y_pred = model.predict(X_train)
print(f"R2 Score (Training Data): {r2_score(y_true=y_train, y_pred=y_pred)}")
print(f"RMSE (Training Data): {rmse(y_true=y_train, y_pred=y_pred)}")
y_pred = model.predict(X_test)
print(f"R2 Score (Test Data): {r2_score(y_true=y_test, y_pred=y_pred)}")
print(f"RMSE (Test Data): {rmse(y_true=y_test, y_pred=y_pred)}")

Iteration 1, loss = 1.02563238
Iteration 2, loss = 1.01324058
Iteration 3, loss = 1.01000822
Iteration 4, loss = 1.00877106
Iteration 5, loss = 1.00761306
Iteration 6, loss = 1.00686157
Iteration 7, loss = 1.00587726
Iteration 8, loss = 1.00556612
Iteration 9, loss = 1.00503755
Iteration 10, loss = 1.00477731
Iteration 11, loss = 1.00428271
Iteration 12, loss = 1.00398505
Iteration 13, loss = 1.00362897
Iteration 14, loss = 1.00346816
Iteration 15, loss = 1.00302374
Iteration 16, loss = 1.00281954
Iteration 17, loss = 1.00257349
Iteration 18, loss = 1.00221217
Iteration 19, loss = 1.00213695
Iteration 20, loss = 1.00215912
Iteration 21, loss = 1.00180042
Iteration 22, loss = 1.00177586
Iteration 23, loss = 1.00168824
Iteration 24, loss = 1.00144851
Iteration 25, loss = 1.00114262
Iteration 26, loss = 1.00104582
Iteration 27, loss = 1.00115407
Iteration 28, loss = 1.00122845
Iteration 29, loss = 1.00104868
Iteration 30, loss = 1.00096953
Iteration 31, loss = 1.00082231
Iteration 32, los

## Tensorflow Dense Neural Net

In [5]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import RootMeanSquaredError as RMSE, R2Score

In [6]:
X_train, X_test, X_val, y_train, y_test, y_val = get_data()


Data: (6610000, 5)
Schema({'Time': Int16, 'Num_Cars': Int16, 'Centrality': Float32, 'Is_Entrypoint': Int8, 'Distance': Int16})

X: (6610000, 4)
y: (6610000,)

X_train: (3966000, 4)
X_test: (2644000, 4)
y_train: (3966000,)
y_test (2644000,)

X_test: (1322000, 4)
X_val: (1322000, 4)
y_test: (1322000,)
y_val: (1322000,)


In [7]:
model = Sequential(
    [
        Dense(128, activation="relu"),
        Dense(64, activation="relu"),
        Dense(32, activation="relu"),
        Dense(1),
    ]
)
model.build(input_shape=(None, X_train.shape[1]))

model.compile(
    optimizer=Adam(learning_rate=0.001), loss="mse", metrics=[RMSE(), R2Score()]
)

history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=10,
    batch_size=32,
    verbose=1,
)

loss, rmse, r2_score = model.evaluate(X_test, y_test, verbose=1)

print("")
print(f"Loss: {loss}")
print(f"RMSE: {rmse}")
print(f"R2Score: {r2_score}")

Epoch 1/10
[1m123938/123938[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 470us/step - loss: 2.0646 - r2_score: 0.1583 - root_mean_squared_error: 1.4368 - val_loss: 2.0736 - val_r2_score: 0.1546 - val_root_mean_squared_error: 1.4400
Epoch 2/10
[1m123938/123938[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 476us/step - loss: 2.0444 - r2_score: 0.1722 - root_mean_squared_error: 1.4298 - val_loss: 2.0235 - val_r2_score: 0.1750 - val_root_mean_squared_error: 1.4225
Epoch 3/10
[1m123938/123938[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 481us/step - loss: 2.0213 - r2_score: 0.1767 - root_mean_squared_error: 1.4217 - val_loss: 2.0147 - val_r2_score: 0.1786 - val_root_mean_squared_error: 1.4194
Epoch 4/10
[1m123938/123938[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 482us/step - loss: 2.0303 - r2_score: 0.1771 - root_mean_squared_error: 1.4249 - val_loss: 2.0127 - val_r2_score: 0.1794 - val_root_mean_squared_error: 1.4187
Epoch 5/10
[1m123938/123938

The XGB Regressor and Neural Net are the most promising candidates.