In [1]:
%load_ext autoreload
%autoreload 2

# --------------------------------------------------------------------------------

import os
import sys
from pathlib import Path
import numpy as np
np.warnings.filterwarnings('ignore')

# hex2cec
HOME = os.environ["HOME"]

sys.path.insert(0, f"{os.environ['HOME']}/hex2vec")
sys.path.insert(0, f"{os.environ['HOME']}/amazon-routing-challenge")

# add codebase
sys.path.insert(0, f"/gcsmount-notebook/codebase")

from src.data.make_dataset import h3_to_polygon
import urban_tools.constants as uc
import urban_tools.hex_pipeline as hp
from urban_tools.hex_pipeline import RouteHexHandler, TestTrainManager
from urban_tools.pipelines import route_hex_pipeline
import numpy as np
import plotly.graph_objects as go
import pandas as pd
import geopandas as gpd
import h3
import gcsfs

%matplotlib inline
import matplotlib.pyplot as plt
## Read in the DataFrame

p = Path("/gcsmount-research-data-staging/osmnx-cities/hexed-routes/12.12.22-Revised-Embeddings/hh.pkl")
hh = RouteHexHandler.from_pickle(p)
hh.print_history()
hh = hh.drop_zero_tags()
### Drop Tags that occur in <X% of a City
# percentage = 0.01
# import pandas as pd

# drop_cols = pd.Index([])
# for _hh in [hh]:
#     assert _hh.df["city"].unique().shape[0] == 1
#     h3_df = _hh.df.groupby('h3')[_hh.all_tags].first()
#     percent_occurance = (h3_df > 0).sum() / h3_df.shape[0]
#     drop_cols = drop_cols.union(percent_occurance[percent_occurance < percentage].index)
# hh.drop_cols(drop_cols)
# len(hh.all_tags)
### Append the embedding to the dataframe
# embedding_df = pd.read_parquet(
#     hh.my_folder / "embeddings.prq"
# )
embedding_df = pd.read_feather(
    "/gcsmount-research-data-staging/hex2vec-models/paper-final/embedding_dfs/subtags+cities+littlemodel.feather"
)
embedding_df = embedding_df.set_index("h3")
# embedding_df.columns = [f"e_{e}" for e in embedding_df.columns]
# embedding_df = embedding_df[embedding_df.columns.str.]
# embedding_columns = embedding_df.columns.copy()
# embedding_df = embedding_df.reset_index()

hh.df = hh.df.merge(embedding_df, on="h3", how="left")
hh.update_tags()
hh.df = hh.df.dropna(subset=hh.embeddings)
### Filter for only H3 with > X Data Points

# tagged_df = tagged_df.loc[tagged_df["h3_9"] > 20].copy()
hh = hh.filter_hex_occurance(20)
hh.df = hh.df[hh.df.city.str.contains('Boston')]
# hh.df.drop("h3_9", axis=1, inplace=True)
# print(tagged_df.shape)
hh.df = hh.df.reset_index(drop=True)

read_cities(*(PosixPath('/gcsmount-research-data-staging/osmnx-cities/hexed-complete'),), **{'cities': ['Boston, USA', 'Austin, USA', 'Seattle, USA', 'Los Angeles, USA'], 'add_city_col': True})
create_super_tags(*(), **{})
Merge building_residential + building_house -> building_yes(*(), **{})
drop_zero_buildings(*(), **{})
drop_zero_building_area(*(), **{})
add_new_special_tag(*('building.area.average',), **{})
add_new_special_tag(*('parking.area.average',), **{})
append_route_df(*(), **{'grouped': False, 'tukey_fences': False})
open_route_df(*(), **{'grouped': False, 'tukey_fences': False})
drop_zero_tags(*(), **{})


### Create the TT Manager

In [2]:
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn import metrics
from scipy.stats import pearsonr

In [3]:
tt = TestTrainManager(
    hh.df,
    x_col=hh.embeddings.union(hh.special_tags).union(hh.super_tags),
    y_col=["planned_service_time_log"],
    scaler=RobustScaler,
    grouped=False,
    desired_quantiles=[0.1, 0.5, 0.90]
)

tt.split_test_train(train_size=0.8, random_seed=42)
tt.scale_test_train()
# split the tt.X_test into several chunks. Have to do this because of memory issues
tt.build_test_df(agg=False)

In [4]:
X_train, y_train, X_test, y_test = tt.X_train.values, tt.Y_train.planned_service_time_log.values.ravel(), tt.X_test.values, tt.Y_test.loc.values
X_train, X_cal, y_train, y_cal = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [5]:
tt.X_train.shape, tt.X_test.shape

((14135, 74), (3347, 74))

In [7]:
X_train.shape, X_cal.shape, X_test.shape

((11308, 74), (2827, 74), (3347, 74))

In [14]:
import numpy as np
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import statistics as stat
import math
from sklearn import metrics
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error
%matplotlib inline

### 1. NGBOOST

In [8]:
from ngboost import NGBRegressor
from ngboost.learners import default_tree_learner, default_linear_learner
from ngboost.scores import CRPS, MLE 
from ngboost.distns import LogNormal, Normal

In [10]:
ngb = NGBRegressor(
    n_estimators=1000,
    learning_rate=0.01,
    Dist=Normal,
    Base=default_tree_learner,
    natural_gradient=True,
    # minibatch_frac=1.0,
    Score=MLE,
    # minibatch_frac=0.5, 
    # col_sample=0.5
)

In [11]:
tt.scale_test_train()
print("NGBRegressor on full train set. Dimensions of X_train: ", tt.X_train.shape)

ngb.fit(tt.X_train, tt.Y_train.values.ravel(), )  #X_val=tt.X_test, Y_val=tt.Y_test.loc.values.ravel(), early_stopping_rounds=200)


NGBRegressor on full train set. Dimensions of X_train:  (14135, 74)
[iter 0] loss=1.0852 val_loss=0.0000 scale=1.0000 norm=0.8079
[iter 100] loss=0.9945 val_loss=0.0000 scale=2.0000 norm=1.5021
[iter 200] loss=0.9645 val_loss=0.0000 scale=1.0000 norm=0.7488
[iter 300] loss=0.9527 val_loss=0.0000 scale=2.0000 norm=1.5005
[iter 400] loss=0.9469 val_loss=0.0000 scale=1.0000 norm=0.7501
[iter 500] loss=0.9430 val_loss=0.0000 scale=2.0000 norm=1.4993
[iter 600] loss=0.9399 val_loss=0.0000 scale=1.0000 norm=0.7491
[iter 700] loss=0.9374 val_loss=0.0000 scale=1.0000 norm=0.7486
[iter 800] loss=0.9355 val_loss=0.0000 scale=2.0000 norm=1.4962
[iter 900] loss=0.9336 val_loss=0.0000 scale=2.0000 norm=1.4953


##### Train Performance

In [12]:
ngb_train_dist = ngb.pred_dist(tt.X_train)

nbg_train_MAPE = metrics.mean_absolute_percentage_error(y_train, ngb_train_dist.loc)
nbg_train_R2 = metrics.r2_score(y_train,  ngb_train_dist.loc)
nbg_train_MSE = metrics.mean_squared_error(y_train,  ngb_train_dist.loc, squared=True)
nbg_train_RMSE = metrics.mean_squared_error(y_train,  ngb_train_dist.loc, squared=False)

print(f"MAPE: {nbg_train_MAPE:.2f}, R2: {nbg_train_R2:.2f}, MSE: {nbg_train_MSE:.2f}, RMSE: {nbg_train_RMSE:.2f}")
plt.figure(figsize=(10, 10))
plt.scatter(y_train,  ngb_train_dist.loc, s=10)


ValueError: Found input variables with inconsistent numbers of samples: [11308, 14135]

##### Test Performance

In [None]:
# test on the grouped train set
h3_index = hh.df.loc[tt._test_slice, 'h3']
x_test_grouped = tt.X_test.groupby(hh.df['h3'].iloc[h3_index]).first()
y_test_grouped = tt.Y_test.groupby(hh.df['h3'].iloc[h3_index]).agg(('mean', 'std'))
# drop level 0
y_test_grouped.columns = y_test_grouped.columns.droplevel(0)

y_pred_grouped = ngb.pred_dist(x_test_grouped)

nbg_test_mean_MAPE = metrics.mean_absolute_percentage_error(y_test_grouped['mean'], y_pred_grouped.loc)
nbg_test_mean_R2 = metrics.r2_score(y_test_grouped['mean'],  y_pred_grouped.loc)
nbg_test_mean_MSE = metrics.mean_squared_error(y_test_grouped['mean'],  y_pred_grouped.loc, squared=True)
nbg_test_mean_RMSE = metrics.mean_squared_error(y_test_grouped['mean'],  y_pred_grouped.loc, squared=False)

print(f"MAPE: {nbg_test_mean_MAPE:.2f}, R2: {nbg_test_mean_R2:.2f}, MSE: {nbg_test_mean_MSE:.2f}, RMSE: {nbg_test_mean_RMSE:.2f}")
plt.figure(figsize=(10, 10))
plt.scatter(y_test_grouped['mean'],  y_pred_grouped.loc, s=10)

#  testing on the scaled 
nbg_test_std_MAPE = metrics.mean_absolute_percentage_error(y_test_grouped['std'], y_pred_grouped.scale)
nbg_test_std_R2 = metrics.r2_score(y_test_grouped['std'], y_pred_grouped.scale)
nbg_test_std_MSE = metrics.mean_squared_error(y_test_grouped['std'], y_pred_grouped.scale, squared=True)
nbg_test_std_RMSE = metrics.mean_squared_error(y_test_grouped['std'], y_pred_grouped.scale, squared=False)

print(f"MAPE: {nbg_test_std_MAPE:.2f}, R2: {nbg_test_std_R2:.2f}, MSE: {nbg_test_std_MSE:.2f}, RMSE: {nbg_test_std_RMSE:.2f}")
plt.figure(figsize=(10, 10))
plt.scatter(y_test_grouped['std'], y_pred_grouped.scale, s=10)

### 2. NN

In [13]:
from sklearn.neural_network import MLPRegressor

# 16, 32, 64, 128, 64, 32, 8
# 16, 32, 128, 32, 16

mlp = MLPRegressor(
    hidden_layer_sizes=[16, 32, 128, 32, 16],
    max_iter=1000,
    activation='relu',
    solver='adam',
    learning_rate='constant',
    learning_rate_init=1e-3,
    random_state=12323,
    verbose=True,
    early_stopping=True,
    validation_fraction=0.2,
)


mlp.fit(tt.X_train.values,tt.Y_train.values.ravel())
# mlp.fit(tt.X_train.iloc[:batch_size,:].values,tt.Y_train.iloc[:batch_size,:].values.ravel())

Iteration 1, loss = 0.22519293
Validation score: 0.155506
Iteration 2, loss = 0.21256592
Validation score: 0.166689
Iteration 3, loss = 0.20861406
Validation score: 0.180832
Iteration 4, loss = 0.20586463
Validation score: 0.182109
Iteration 5, loss = 0.20515468
Validation score: 0.181525
Iteration 6, loss = 0.20447696
Validation score: 0.185493
Iteration 7, loss = 0.20367596
Validation score: 0.182074
Iteration 8, loss = 0.20389400
Validation score: 0.178804
Iteration 9, loss = 0.20238575
Validation score: 0.183528
Iteration 10, loss = 0.20248813
Validation score: 0.182719
Iteration 11, loss = 0.20200614
Validation score: 0.190579
Iteration 12, loss = 0.20092184
Validation score: 0.190191
Iteration 13, loss = 0.20114971
Validation score: 0.193537
Iteration 14, loss = 0.20078023
Validation score: 0.195553
Iteration 15, loss = 0.20072532
Validation score: 0.178089
Iteration 16, loss = 0.20132224
Validation score: 0.193952
Iteration 17, loss = 0.20035361
Validation score: 0.187834
Iterat

In [20]:
print(f"Train-RMSE: {mean_squared_error(tt.Y_train.values.ravel(), mlp.predict(tt.X_train.values), squared=False).round(2)}, Train-R2: {r2_score(tt.Y_train.values.ravel(), mlp.predict(tt.X_train.values)).round(2)}")
print(f"Test-RMSE: {mean_squared_error(tt.Y_test.loc.values.ravel(), mlp.predict(tt.X_test.values), squared=False).round(2)}, Test-R2: {r2_score(tt.Y_test.loc.values.ravel(), mlp.predict(tt.X_test.values)).round(2)}")

Train-RMSE: 0.63, Train-R2: 0.22
Test-RMSE: 0.65, Test-R2: 0.07


### 3. GP Regression (RBF Kernel + KISS GP)

### 3. Conditional GAN