In [4]:
import polars as pl
import numpy as np
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, BayesianRidge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.metrics import r2_score
from xgboost import XGBRegressor

## Preprocess Data

In [2]:
data = pl.read_parquet("data.parquet")
data = data.drop(["Light_ID", "Is_Entrypoint"])
data = data.filter(pl.col("Incoming_Cars") <= data["Incoming_Cars"].quantile(0.95))
data

Time,Arrivals,Centrality,Distance,Incoming_Cars
i16,i16,f32,i16,i16
198,0,0.026253,9,15
198,0,0.026253,7,15
198,0,0.026253,7,15
198,0,0.026253,19,15
197,0,0.026253,9,15
…,…,…,…,…
17,0,0.026685,9,14
17,0,0.026685,7,14
10,1,0.026685,8,13
10,1,0.026685,9,13


In [3]:
X = data.drop("Arrivals").to_numpy()
y = data.select(pl.col("Arrivals")).to_numpy()
print(f"X: {X.shape}")
print(f"y: {y.shape}")

X: (1495920, 4)
y: (1495920, 1)


In [4]:
scaler = StandardScaler()
X = scaler.fit_transform(X)
X

array([[ 1.6034868 , -0.21799846, -0.33276877, -0.61164314],
       [ 1.6034868 , -0.21799846, -0.73915625, -0.61164314],
       [ 1.6034868 , -0.21799846, -0.73915625, -0.61164314],
       ...,
       [-1.7010854 , -0.14746545, -0.53596246, -0.715857  ],
       [-1.7010854 , -0.14746545, -0.33276877, -0.715857  ],
       [-1.7010854 , -0.14746545, -0.73915625, -0.715857  ]],
      shape=(1495920, 4), dtype=float32)

In [5]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.6, test_size=0.4, random_state=42
)
print(f"X_train: {X_train.shape}")
print(f"X_test: {X_test.shape}")
print(f"y_train: {y_train.shape}")
print(f"y_test {y_test.shape}")

X_train: (897552, 4)
X_test: (598368, 4)
y_train: (897552, 1)
y_test (598368, 1)


In [6]:
X_test, X_val, y_test, y_val = train_test_split(
    X_test, y_test, train_size=0.5, test_size=0.5, random_state=42
)
print(f"X_test: {X_test.shape}")
print(f"X_val: {X_val.shape}")
print(f"y_test: {y_test.shape}")
print(f"y_val: {y_val.shape}")


X_test: (299184, 4)
X_val: (299184, 4)
y_test: (299184, 1)
y_val: (299184, 1)


## Linear Regression

In [8]:
model = LinearRegression()
model.fit(X_train, y_train)
model

In [11]:
y_pred = model.predict(X_train)
r2_score(y_true=y_train, y_pred=y_pred)

0.1211400032043457

In [None]:
y_pred = model.predict(X_test)
r2_score(y_true=y_test, y_pred=y_pred)

0.12077599763870239

## Polynomial Features

In [32]:
data = pl.read_parquet("data.parquet")
data = data.drop(["Light_ID", "Is_Entrypoint"])
data = data.filter(pl.col("Incoming_Cars") <= data["Incoming_Cars"].quantile(0.95))
print(f"Data: {data.shape}")
print(f"{data.collect_schema()}")

X = data.drop("Arrivals").to_numpy()
y = data.select(pl.col("Arrivals")).to_numpy()
print(f"X: {X.shape}")
print(f"y: {y.shape}")

Data: (1495920, 5)
Schema({'Time': Int16, 'Arrivals': Int16, 'Centrality': Float32, 'Distance': Int16, 'Incoming_Cars': Int16})
X: (1495920, 4)
y: (1495920, 1)


In [33]:
scaler = StandardScaler()
X = scaler.fit_transform(X)
X

array([[ 1.6034868 , -0.21799846, -0.33276877, -0.61164314],
       [ 1.6034868 , -0.21799846, -0.73915625, -0.61164314],
       [ 1.6034868 , -0.21799846, -0.73915625, -0.61164314],
       ...,
       [-1.7010854 , -0.14746545, -0.53596246, -0.715857  ],
       [-1.7010854 , -0.14746545, -0.33276877, -0.715857  ],
       [-1.7010854 , -0.14746545, -0.73915625, -0.715857  ]],
      shape=(1495920, 4), dtype=float32)

In [34]:
poly = PolynomialFeatures(degree=2, include_bias=False)
X = poly.fit_transform(X)
X

array([[ 1.6034868 , -0.21799846, -0.33276877, ...,  0.11073505,
         0.20353574,  0.37410733],
       [ 1.6034868 , -0.21799846, -0.73915625, ...,  0.54635197,
         0.45209983,  0.37410733],
       [ 1.6034868 , -0.21799846, -0.73915625, ...,  0.54635197,
         0.45209983,  0.37410733],
       ...,
       [-1.7010854 , -0.14746545, -0.53596246, ...,  0.28725576,
         0.3836725 ,  0.5124513 ],
       [-1.7010854 , -0.14746545, -0.33276877, ...,  0.11073505,
         0.23821487,  0.5124513 ],
       [-1.7010854 , -0.14746545, -0.73915625, ...,  0.54635197,
         0.5291302 ,  0.5124513 ]], shape=(1495920, 14), dtype=float32)

In [35]:
# Train Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.6, test_size=0.4, random_state=42
)
print(f"X_train: {X_train.shape}")
print(f"X_test: {X_test.shape}")
print(f"y_train: {y_train.shape}")
print(f"y_test {y_test.shape}")

# Train Test Validation Split
X_test, X_val, y_test, y_val = train_test_split(
    X_test, y_test, train_size=0.5, test_size=0.5, random_state=42
)
print("\n")
print(f"X_test: {X_test.shape}")
print(f"X_val: {X_val.shape}")
print(f"y_test: {y_test.shape}")
print(f"y_val: {y_val.shape}")

X_train: (897552, 14)
X_test: (598368, 14)
y_train: (897552, 1)
y_test (598368, 1)


X_test: (299184, 14)
X_val: (299184, 14)
y_test: (299184, 1)
y_val: (299184, 1)


In [36]:
model = LinearRegression()
model.fit(X_train, y_train)
model

In [37]:
y_pred = model.predict(X_train)
print(f"R2 Score (Training Data): {r2_score(y_true=y_train, y_pred=y_pred)}")

R2 Score (Training Data): 0.1571146845817566


In [38]:
y_pred = model.predict(X_test)
print(f"R2 Score (Test Data): {r2_score(y_true=y_test, y_pred=y_pred)}")

R2 Score (Test Data): 0.15710312128067017


Test higher degree polynomials

In [39]:
# Get Data
data = pl.read_parquet("data.parquet")
data = data.drop(["Light_ID", "Is_Entrypoint"])
data = data.filter(pl.col("Incoming_Cars") <= data["Incoming_Cars"].quantile(0.95))
print(f"Data: {data.shape}")
print(f"{data.collect_schema()}")

# Split Data
X = data.drop("Arrivals").to_numpy()
y = data.select(pl.col("Arrivals")).to_numpy()
print("\n")
print(f"X: {X.shape}")
print(f"y: {y.shape}")

# Scale
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Create Polynomial Features
poly = PolynomialFeatures(degree=3, include_bias=False)
X = poly.fit_transform(X)

# Train Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.6, test_size=0.4, random_state=42
)
print("\n")
print(f"X_train: {X_train.shape}")
print(f"X_test: {X_test.shape}")
print(f"y_train: {y_train.shape}")
print(f"y_test {y_test.shape}")

# Train Test Validation Split
X_test, X_val, y_test, y_val = train_test_split(
    X_test, y_test, train_size=0.5, test_size=0.5, random_state=42
)
print("\n")
print(f"X_test: {X_test.shape}")
print(f"X_val: {X_val.shape}")
print(f"y_test: {y_test.shape}")
print(f"y_val: {y_val.shape}")

Data: (1495920, 5)
Schema({'Time': Int16, 'Arrivals': Int16, 'Centrality': Float32, 'Distance': Int16, 'Incoming_Cars': Int16})


X: (1495920, 4)
y: (1495920, 1)


X_train: (897552, 34)
X_test: (598368, 34)
y_train: (897552, 1)
y_test (598368, 1)


X_test: (299184, 34)
X_val: (299184, 34)
y_test: (299184, 1)
y_val: (299184, 1)


In [40]:
model = LinearRegression()
model.fit(X_train, y_train)
model

In [41]:
y_pred = model.predict(X_train)
print(f"R2 Score (Training Data): {r2_score(y_true=y_train, y_pred=y_pred)}")
y_pred = model.predict(X_test)
print(f"R2 Score (Test Data): {r2_score(y_true=y_test, y_pred=y_pred)}")

R2 Score (Training Data): 0.1038588285446167
R2 Score (Test Data): 0.10201376676559448


## Decision Tree Regression

In [17]:
# Get Data
data = pl.read_parquet("data.parquet")
data = data.drop(["Light_ID", "Is_Entrypoint"])
data = data.filter(pl.col("Incoming_Cars") <= data["Incoming_Cars"].quantile(0.95))
print(f"Data: {data.shape}")
print(f"{data.collect_schema()}")

# Split Data
X = data.drop("Arrivals").to_numpy()
y = data.select(pl.col("Arrivals")).to_numpy()
print("\n")
print(f"X: {X.shape}")
print(f"y: {y.shape}")

# Scale
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Train Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.6, test_size=0.4, random_state=42
)
print("\n")
print(f"X_train: {X_train.shape}")
print(f"X_test: {X_test.shape}")
print(f"y_train: {y_train.shape}")
print(f"y_test {y_test.shape}")

# Train Test Validation Split
X_test, X_val, y_test, y_val = train_test_split(
    X_test, y_test, train_size=0.5, test_size=0.5, random_state=42
)
print("\n")
print(f"X_test: {X_test.shape}")
print(f"X_val: {X_val.shape}")
print(f"y_test: {y_test.shape}")
print(f"y_val: {y_val.shape}")

Data: (1495920, 5)
Schema({'Time': Int16, 'Arrivals': Int16, 'Centrality': Float32, 'Distance': Int16, 'Incoming_Cars': Int16})


X: (1495920, 4)
y: (1495920, 1)


X_train: (897552, 4)
X_test: (598368, 4)
y_train: (897552, 1)
y_test (598368, 1)


X_test: (299184, 4)
X_val: (299184, 4)
y_test: (299184, 1)
y_val: (299184, 1)


In [None]:
model = DecisionTreeRegressor(random_state=42)
model.fit(X_train, y_train)
model

In [19]:
y_pred = model.predict(X_train)
print(f"R2 Score (Training Data): {r2_score(y_true=y_train, y_pred=y_pred)}")
y_pred = model.predict(X_test)
print(f"R2 Score (Test Data): {r2_score(y_true=y_test, y_pred=y_pred)}")

R2 Score (Training Data): 0.9807314211451464
R2 Score (Test Data): 0.19405347896624503


In [25]:
print(f"Tree Depth: {model.get_depth()}")
print(f"Leaves: {model.get_n_leaves()}")

Tree Depth: 58
Leaves: 397797


Try to minimize overfitting

In [28]:
model = DecisionTreeRegressor(
    max_depth=29, min_samples_split=5, min_samples_leaf=5, random_state=42
)
model.fit(X_train, y_train)
model

In [30]:
print(f"Tree Depth: {model.get_depth()}")
print(f"Leaves: {model.get_n_leaves()}")

Tree Depth: 29
Leaves: 113241


In [29]:
y_pred = model.predict(X_train)
print(f"R2 Score (Training Data): {r2_score(y_true=y_train, y_pred=y_pred)}")
y_pred = model.predict(X_test)
print(f"R2 Score (Test Data): {r2_score(y_true=y_test, y_pred=y_pred)}")

R2 Score (Training Data): 0.5462822039353408
R2 Score (Test Data): 0.19354928202464317


Try Polynomial Features

In [45]:
# Get Data
data = pl.read_parquet("data.parquet")
data = data.drop(["Light_ID", "Is_Entrypoint"])
data = data.filter(pl.col("Incoming_Cars") <= data["Incoming_Cars"].quantile(0.95))
print(f"Data: {data.shape}")
print(f"{data.collect_schema()}")

# Split Data
X = data.drop("Arrivals").to_numpy()
y = data.select(pl.col("Arrivals")).to_numpy()
print("\n")
print(f"X: {X.shape}")
print(f"y: {y.shape}")

# Scale
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Polynomial Features
poly = PolynomialFeatures(2, include_bias=False)
X = poly.fit_transform(X)

# Train Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.6, test_size=0.4, random_state=42
)
print("\n")
print(f"X_train: {X_train.shape}")
print(f"X_test: {X_test.shape}")
print(f"y_train: {y_train.shape}")
print(f"y_test {y_test.shape}")

# Train Test Validation Split
X_test, X_val, y_test, y_val = train_test_split(
    X_test, y_test, train_size=0.5, test_size=0.5, random_state=42
)
print("\n")
print(f"X_test: {X_test.shape}")
print(f"X_val: {X_val.shape}")
print(f"y_test: {y_test.shape}")
print(f"y_val: {y_val.shape}")


Data: (1495920, 5)
Schema({'Time': Int16, 'Arrivals': Int16, 'Centrality': Float32, 'Distance': Int16, 'Incoming_Cars': Int16})


X: (1495920, 4)
y: (1495920, 1)


X_train: (897552, 14)
X_test: (598368, 14)
y_train: (897552, 1)
y_test (598368, 1)


X_test: (299184, 14)
X_val: (299184, 14)
y_test: (299184, 1)
y_val: (299184, 1)


In [46]:
model = DecisionTreeRegressor(
    max_depth=29, min_samples_split=5, min_samples_leaf=5, random_state=42
)
model.fit(X_train, y_train)
model

In [47]:
print(f"Tree Depth: {model.get_depth()}")
print(f"Leaves: {model.get_n_leaves()}")
y_pred = model.predict(X_train)
print(f"R2 Score (Training Data): {r2_score(y_true=y_train, y_pred=y_pred)}")
y_pred = model.predict(X_test)
print(f"R2 Score (Test Data): {r2_score(y_true=y_test, y_pred=y_pred)}")

Tree Depth: 29
Leaves: 98146
R2 Score (Training Data): 0.5721988727038445
R2 Score (Test Data): 0.13636073201550325


## Random Forest Regression

In [48]:
# Get Data
data = pl.read_parquet("data.parquet")
data = data.drop(["Light_ID", "Is_Entrypoint"])
data = data.filter(pl.col("Incoming_Cars") <= data["Incoming_Cars"].quantile(0.95))
print(f"Data: {data.shape}")
print(f"{data.collect_schema()}")

# Split Data
X = data.drop("Arrivals").to_numpy()
y = data.select(pl.col("Arrivals")).to_numpy()
print("\n")
print(f"X: {X.shape}")
print(f"y: {y.shape}")

# Scale
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Train Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.6, test_size=0.4, random_state=42
)
print("\n")
print(f"X_train: {X_train.shape}")
print(f"X_test: {X_test.shape}")
print(f"y_train: {y_train.shape}")
print(f"y_test {y_test.shape}")

# Train Test Validation Split
X_test, X_val, y_test, y_val = train_test_split(
    X_test, y_test, train_size=0.5, test_size=0.5, random_state=42
)
print("\n")
print(f"X_test: {X_test.shape}")
print(f"X_val: {X_val.shape}")
print(f"y_test: {y_test.shape}")
print(f"y_val: {y_val.shape}")

Data: (1495920, 5)
Schema({'Time': Int16, 'Arrivals': Int16, 'Centrality': Float32, 'Distance': Int16, 'Incoming_Cars': Int16})


X: (1495920, 4)
y: (1495920, 1)


X_train: (897552, 4)
X_test: (598368, 4)
y_train: (897552, 1)
y_test (598368, 1)


X_test: (299184, 4)
X_val: (299184, 4)
y_test: (299184, 1)
y_val: (299184, 1)


In [50]:
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train.ravel())
model

In [51]:
y_pred = model.predict(X_train)
print(f"R2 Score (Training Data): {r2_score(y_true=y_train, y_pred=y_pred)}")
y_pred = model.predict(X_test)
print(f"R2 Score (Test Data): {r2_score(y_true=y_test, y_pred=y_pred)}")

R2 Score (Training Data): 0.9149511456337214
R2 Score (Test Data): 0.4850469986089051


Reduce Overfitting

In [2]:
# Get Data
data = pl.read_parquet("data.parquet")
data = data.drop(["Light_ID", "Is_Entrypoint"])
data = data.filter(pl.col("Incoming_Cars") <= data["Incoming_Cars"].quantile(0.95))
print(f"Data: {data.shape}")
print(f"{data.collect_schema()}")

# Split Data
X = data.drop("Arrivals").to_numpy()
y = data.select(pl.col("Arrivals")).to_numpy()
print("\n")
print(f"X: {X.shape}")
print(f"y: {y.shape}")

# Scale
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Train Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.6, test_size=0.4, random_state=42
)
print("\n")
print(f"X_train: {X_train.shape}")
print(f"X_test: {X_test.shape}")
print(f"y_train: {y_train.shape}")
print(f"y_test {y_test.shape}")

# Train Test Validation Split
X_test, X_val, y_test, y_val = train_test_split(
    X_test, y_test, train_size=0.5, test_size=0.5, random_state=42
)
print("\n")
print(f"X_test: {X_test.shape}")
print(f"X_val: {X_val.shape}")
print(f"y_test: {y_test.shape}")
print(f"y_val: {y_val.shape}")

Data: (1495920, 5)
Schema({'Time': Int16, 'Arrivals': Int16, 'Centrality': Float32, 'Distance': Int16, 'Incoming_Cars': Int16})


X: (1495920, 4)
y: (1495920, 1)


X_train: (897552, 4)
X_test: (598368, 4)
y_train: (897552, 1)
y_test (598368, 1)


X_test: (299184, 4)
X_val: (299184, 4)
y_test: (299184, 1)
y_val: (299184, 1)


In [57]:
model = RandomForestRegressor(
    max_depth=50, min_samples_leaf=5, min_samples_split=5, random_state=42
)
model.fit(X_train, y_train.ravel())
model

In [58]:
y_pred = model.predict(X_train)
print(f"R2 Score (Training Data): {r2_score(y_true=y_train, y_pred=y_pred)}")
y_pred = model.predict(X_test)
print(f"R2 Score (Test Data): {r2_score(y_true=y_test, y_pred=y_pred)}")

R2 Score (Training Data): 0.5782973338010617
R2 Score (Test Data): 0.3381628895584391


In [59]:
model = RandomForestRegressor(
    max_depth=50, min_samples_leaf=3, min_samples_split=3, random_state=42
)
model.fit(X_train, y_train.ravel())
model

In [60]:
y_pred = model.predict(X_train)
print(f"R2 Score (Training Data): {r2_score(y_true=y_train, y_pred=y_pred)}")
y_pred = model.predict(X_test)
print(f"R2 Score (Test Data): {r2_score(y_true=y_test, y_pred=y_pred)}")

R2 Score (Training Data): 0.7016317214008073
R2 Score (Test Data): 0.3854639580693364


In [61]:
model = RandomForestRegressor(
    max_depth=75, min_samples_leaf=3, min_samples_split=3, random_state=42
)
model.fit(X_train, y_train.ravel())
model

In [62]:
y_pred = model.predict(X_train)
print(f"R2 Score (Training Data): {r2_score(y_true=y_train, y_pred=y_pred)}")
y_pred = model.predict(X_test)
print(f"R2 Score (Test Data): {r2_score(y_true=y_test, y_pred=y_pred)}")

R2 Score (Training Data): 0.701638297685877
R2 Score (Test Data): 0.38546963715565696


In [63]:
model = RandomForestRegressor(
    max_depth=75, min_samples_leaf=2, min_samples_split=2, random_state=42
)
model.fit(X_train, y_train.ravel())
model

In [64]:
y_pred = model.predict(X_train)
print(f"R2 Score (Training Data): {r2_score(y_true=y_train, y_pred=y_pred)}")
y_pred = model.predict(X_test)
print(f"R2 Score (Test Data): {r2_score(y_true=y_test, y_pred=y_pred)}")

R2 Score (Training Data): 0.7996864699352769
R2 Score (Test Data): 0.4259918566258796


In [65]:
model = RandomForestRegressor(
    n_estimators=150, max_depth=75, min_samples_leaf=2, random_state=42
)
model.fit(X_train, y_train.ravel())
model

In [66]:
y_pred = model.predict(X_train)
print(f"R2 Score (Training Data): {r2_score(y_true=y_train, y_pred=y_pred)}")
y_pred = model.predict(X_test)
print(f"R2 Score (Test Data): {r2_score(y_true=y_test, y_pred=y_pred)}")

R2 Score (Training Data): 0.80055190684938
R2 Score (Test Data): 0.4267974575972209


In [68]:
model = RandomForestRegressor(n_estimators=150, min_samples_leaf=2, random_state=42)
model.fit(X_train, y_train.ravel())
print(model)

y_pred = model.predict(X_train)
print(f"R2 Score (Training Data): {r2_score(y_true=y_train, y_pred=y_pred)}")
y_pred = model.predict(X_test)
print(f"R2 Score (Test Data): {r2_score(y_true=y_test, y_pred=y_pred)}")

RandomForestRegressor(min_samples_leaf=2, n_estimators=150, random_state=42)
R2 Score (Training Data): 0.80055190684938
R2 Score (Test Data): 0.4267974575972209


In [69]:
model = RandomForestRegressor(n_estimators=200, min_samples_leaf=2, random_state=42)
model.fit(X_train, y_train.ravel())
print(model)

print("")

y_pred = model.predict(X_train)
print(f"R2 Score (Training Data): {r2_score(y_true=y_train, y_pred=y_pred)}")
y_pred = model.predict(X_test)
print(f"R2 Score (Test Data): {r2_score(y_true=y_test, y_pred=y_pred)}")

RandomForestRegressor(min_samples_leaf=2, n_estimators=200, random_state=42)

R2 Score (Training Data): 0.8012846384463441
R2 Score (Test Data): 0.4281483111702832


Test Pruning

In [None]:
model = RandomForestRegressor(
    n_estimators=150, min_samples_leaf=2, random_state=42, ccp_alpha=0.1, verbose=2
)
model.fit(X_train, y_train.ravel())
print(model)

print("")

y_pred = model.predict(X_train)
print(f"R2 Score (Training Data): {r2_score(y_true=y_train, y_pred=y_pred)}")
y_pred = model.predict(X_test)
print(f"R2 Score (Test Data): {r2_score(y_true=y_test, y_pred=y_pred)}")

building tree 1 of 150
building tree 2 of 150


## Gradient Boosting Machines

In [2]:
# Get Data
data = pl.read_parquet("data.parquet")
data = data.drop(["Light_ID", "Is_Entrypoint"])
data = data.filter(pl.col("Incoming_Cars") <= data["Incoming_Cars"].quantile(0.95))
print(f"Data: {data.shape}")
print(f"{data.collect_schema()}")

# Split Data
X = data.drop("Arrivals").to_numpy()
y = data.select(pl.col("Arrivals")).to_numpy()
print("\n")
print(f"X: {X.shape}")
print(f"y: {y.shape}")

# Scale
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Train Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.6, test_size=0.4, random_state=42
)
print("\n")
print(f"X_train: {X_train.shape}")
print(f"X_test: {X_test.shape}")
print(f"y_train: {y_train.shape}")
print(f"y_test {y_test.shape}")

# Train Test Validation Split
X_test, X_val, y_test, y_val = train_test_split(
    X_test, y_test, train_size=0.5, test_size=0.5, random_state=42
)
print("\n")
print(f"X_test: {X_test.shape}")
print(f"X_val: {X_val.shape}")
print(f"y_test: {y_test.shape}")
print(f"y_val: {y_val.shape}")

Data: (1495920, 5)
Schema({'Time': Int16, 'Arrivals': Int16, 'Centrality': Float32, 'Distance': Int16, 'Incoming_Cars': Int16})


X: (1495920, 4)
y: (1495920, 1)


X_train: (897552, 4)
X_test: (598368, 4)
y_train: (897552, 1)
y_test (598368, 1)


X_test: (299184, 4)
X_val: (299184, 4)
y_test: (299184, 1)
y_val: (299184, 1)


In [None]:
model = XGBRegressor(n_jobs=4)
model.fit(X_train, y_train.ravel())
print(model)

print("")

y_pred = model.predict(X_train)
print(f"R2 Score (Training Data): {r2_score(y_true=y_train, y_pred=y_pred)}")
y_pred = model.predict(X_test)
print(f"R2 Score (Test Data): {r2_score(y_true=y_test, y_pred=y_pred)}")

XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             feature_weights=None, gamma=None, grow_policy=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=None, max_bin=None, max_cat_threshold=None,
             max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
             max_leaves=None, min_child_weight=None, missing=nan,
             monotone_constraints=None, multi_strategy=None, n_estimators=None,
             n_jobs=4, num_parallel_tree=None, ...)

R2 Score (Training Data): 0.23501378297805786
R2 Score (Test Data): 0.21222811937332153


## Support Vector Regression

In [3]:
# Get Data
data = pl.read_parquet("data.parquet")
data = data.drop(["Light_ID", "Is_Entrypoint"])
data = data.filter(pl.col("Incoming_Cars") <= data["Incoming_Cars"].quantile(0.95))
print(f"Data: {data.shape}")
print(f"{data.collect_schema()}")

# Split Data
X = data.drop("Arrivals").to_numpy()
y = data.select(pl.col("Arrivals")).to_numpy()
print("")
print(f"X: {X.shape}")
print(f"y: {y.shape}")

# Scale
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Train Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.6, test_size=0.4, random_state=42
)
print("")
print(f"X_train: {X_train.shape}")
print(f"X_test: {X_test.shape}")
print(f"y_train: {y_train.shape}")
print(f"y_test {y_test.shape}")

# Train Test Validation Split
X_test, X_val, y_test, y_val = train_test_split(
    X_test, y_test, train_size=0.5, test_size=0.5, random_state=42
)
print("")
print(f"X_test: {X_test.shape}")
print(f"X_val: {X_val.shape}")
print(f"y_test: {y_test.shape}")
print(f"y_val: {y_val.shape}")

Data: (1495920, 5)
Schema({'Time': Int16, 'Arrivals': Int16, 'Centrality': Float32, 'Distance': Int16, 'Incoming_Cars': Int16})

X: (1495920, 4)
y: (1495920, 1)

X_train: (897552, 4)
X_test: (598368, 4)
y_train: (897552, 1)
y_test (598368, 1)

X_test: (299184, 4)
X_val: (299184, 4)
y_test: (299184, 1)
y_val: (299184, 1)


In [10]:
model = SVR(verbose=True, max_iter=10000)
model.fit(X_train, y_train.ravel())
print(model)

print("")

y_pred = model.predict(X_train)
print(f"R2 Score (Training Data): {r2_score(y_true=y_train, y_pred=y_pred)}")
y_pred = model.predict(X_test)
print(f"R2 Score (Test Data): {r2_score(y_true=y_test, y_pred=y_pred)}")

[LibSVM].........WARN: libsvm Solver reached max_iter
optimization finished, #iter = 10000
obj = -140236.790936, rho = -5.212795
nSV = 19902, nBSV = 19902
SVR(max_iter=10000, verbose=True)





R2 Score (Training Data): -1.9115265123486735
R2 Score (Test Data): -1.9290390471665098


## KNN Regression

In [None]:
# Get Data
data = pl.read_parquet("data.parquet")
data = data.drop(["Light_ID", "Is_Entrypoint"])
data = data.filter(pl.col("Incoming_Cars") <= data["Incoming_Cars"].quantile(0.95))
print(f"Data: {data.shape}")
print(f"{data.collect_schema()}")

# Split Data
X = data.drop("Arrivals").to_numpy()
y = data.select(pl.col("Arrivals")).to_numpy()
print("")
print(f"X: {X.shape}")
print(f"y: {y.shape}")

# Scale
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Train Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.6, test_size=0.4, random_state=42
)
print("")
print(f"X_train: {X_train.shape}")
print(f"X_test: {X_test.shape}")
print(f"y_train: {y_train.shape}")
print(f"y_test {y_test.shape}")

# Train Test Validation Split
X_test, X_val, y_test, y_val = train_test_split(
    X_test, y_test, train_size=0.5, test_size=0.5, random_state=42
)
print("")
print(f"X_test: {X_test.shape}")
print(f"X_val: {X_val.shape}")
print(f"y_test: {y_test.shape}")
print(f"y_val: {y_val.shape}")

Data: (1495920, 5)
Schema({'Time': Int16, 'Arrivals': Int16, 'Centrality': Float32, 'Distance': Int16, 'Incoming_Cars': Int16})

X: (1495920, 4)
y: (1495920, 1)

X_train: (897552, 4)
X_test: (598368, 4)
y_train: (897552, 1)
y_test (598368, 1)

X_test: (299184, 4)
X_val: (299184, 4)
y_test: (299184, 1)
y_val: (299184, 1)


In [13]:
model = KNeighborsRegressor(n_jobs=8)
model.fit(X_train, y_train.ravel())
print(model)

print("")

y_pred = model.predict(X_train)
print(f"R2 Score (Training Data): {r2_score(y_true=y_train, y_pred=y_pred)}")
y_pred = model.predict(X_test)
print(f"R2 Score (Test Data): {r2_score(y_true=y_test, y_pred=y_pred)}")

KNeighborsRegressor(n_jobs=8)

R2 Score (Training Data): 0.40220480792704294
R2 Score (Test Data): 0.08947333358600329


In [14]:
model = KNeighborsRegressor(algorithm="ball_tree", n_jobs=8)
model.fit(X_train, y_train.ravel())
print(model)

print("")

y_pred = model.predict(X_train)
print(f"R2 Score (Training Data): {r2_score(y_true=y_train, y_pred=y_pred)}")
y_pred = model.predict(X_test)
print(f"R2 Score (Test Data): {r2_score(y_true=y_test, y_pred=y_pred)}")

KNeighborsRegressor(algorithm='ball_tree', n_jobs=8)

R2 Score (Training Data): 0.40217776093846014
R2 Score (Test Data): 0.08945919796685986


In [15]:
model = KNeighborsRegressor(algorithm="kd_tree", n_jobs=8)
model.fit(X_train, y_train.ravel())
print(model)

print("")

y_pred = model.predict(X_train)
print(f"R2 Score (Training Data): {r2_score(y_true=y_train, y_pred=y_pred)}")
y_pred = model.predict(X_test)
print(f"R2 Score (Test Data): {r2_score(y_true=y_test, y_pred=y_pred)}")

KNeighborsRegressor(algorithm='kd_tree', n_jobs=8)

R2 Score (Training Data): 0.40220480792704294
R2 Score (Test Data): 0.08947333358600329


## Gaussian Process Regression

In [2]:
# Get Data
data = pl.read_parquet("data.parquet")
data = data.drop(["Light_ID", "Is_Entrypoint"])
data = data.filter(pl.col("Incoming_Cars") <= data["Incoming_Cars"].quantile(0.95))
print(f"Data: {data.shape}")
print(f"{data.collect_schema()}")

# Split Data
X = data.drop("Arrivals").to_numpy()
y = data.select(pl.col("Arrivals")).to_numpy()
print("")
print(f"X: {X.shape}")
print(f"y: {y.shape}")

# Scale
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Train Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.6, test_size=0.4, random_state=42
)
print("")
print(f"X_train: {X_train.shape}")
print(f"X_test: {X_test.shape}")
print(f"y_train: {y_train.shape}")
print(f"y_test {y_test.shape}")

# Train Test Validation Split
X_test, X_val, y_test, y_val = train_test_split(
    X_test, y_test, train_size=0.5, test_size=0.5, random_state=42
)
print("")
print(f"X_test: {X_test.shape}")
print(f"X_val: {X_val.shape}")
print(f"y_test: {y_test.shape}")
print(f"y_val: {y_val.shape}")

Data: (1495920, 5)
Schema({'Time': Int16, 'Arrivals': Int16, 'Centrality': Float32, 'Distance': Int16, 'Incoming_Cars': Int16})

X: (1495920, 4)
y: (1495920, 1)

X_train: (897552, 4)
X_test: (598368, 4)
y_train: (897552, 1)
y_test (598368, 1)

X_test: (299184, 4)
X_val: (299184, 4)
y_test: (299184, 1)
y_val: (299184, 1)


In [3]:
model = GaussianProcessRegressor(random_state=42)
model.fit(X_train, y_train.ravel())
print(model)

print("")

y_pred = model.predict(X_train)
print(f"R2 Score (Training Data): {r2_score(y_true=y_train, y_pred=y_pred)}")
y_pred = model.predict(X_test)
print(f"R2 Score (Test Data): {r2_score(y_true=y_test, y_pred=y_pred)}")

: 

## Bayesian Linear Regression

In [3]:
# Get Data
data = pl.read_parquet("data.parquet")
data = data.drop(["Light_ID", "Is_Entrypoint"])
data = data.filter(pl.col("Incoming_Cars") <= data["Incoming_Cars"].quantile(0.95))
print(f"Data: {data.shape}")
print(f"{data.collect_schema()}")

# Split Data
X = data.drop("Arrivals").to_numpy()
y = data.select(pl.col("Arrivals")).to_numpy()
print("")
print(f"X: {X.shape}")
print(f"y: {y.shape}")

# Scale
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Train Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.6, test_size=0.4, random_state=42
)
print("")
print(f"X_train: {X_train.shape}")
print(f"X_test: {X_test.shape}")
print(f"y_train: {y_train.shape}")
print(f"y_test {y_test.shape}")

# Train Test Validation Split
X_test, X_val, y_test, y_val = train_test_split(
    X_test, y_test, train_size=0.5, test_size=0.5, random_state=42
)
print("")
print(f"X_test: {X_test.shape}")
print(f"X_val: {X_val.shape}")
print(f"y_test: {y_test.shape}")
print(f"y_val: {y_val.shape}")

Data: (1495920, 5)
Schema({'Time': Int16, 'Arrivals': Int16, 'Centrality': Float32, 'Distance': Int16, 'Incoming_Cars': Int16})

X: (1495920, 4)
y: (1495920, 1)

X_train: (897552, 4)
X_test: (598368, 4)
y_train: (897552, 1)
y_test (598368, 1)

X_test: (299184, 4)
X_val: (299184, 4)
y_test: (299184, 1)
y_val: (299184, 1)


In [5]:
model = BayesianRidge(verbose=True)
model.fit(X_train, y_train.ravel())
print(model)

print("")

y_pred = model.predict(X_train)
print(f"R2 Score (Training Data): {r2_score(y_true=y_train, y_pred=y_pred)}")
y_pred = model.predict(X_test)
print(f"R2 Score (Test Data): {r2_score(y_true=y_test, y_pred=y_pred)}")

Convergence after  1  iterations
BayesianRidge(verbose=True)

R2 Score (Training Data): 0.12113994359970093
R2 Score (Test Data): 0.12077641487121582
