In [9]:
# Imports
import pandas as pd
import numpy as np
import geopandas as gpd
import matplotlib.pyplot as plt
from shapely.geometry import Point
import h3
from datetime import datetime
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.svm import SVR, LinearSVR
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import papermill as pm
import concurrent.futures
import random
import os 


In [None]:
# Constants

In [12]:
def preprocess_data(df, train_ratio=0.75, validation_ratio=0.15, random_state=42):
    features = ['demand_h-2', 'demand_h-24', 'hour_sin', 'hour_cos', 'weekend', 'season_sin', 'season_cos', 'public_holiday', 'temperature', 'precip']
    features_to_scale = ['demand_h-2', 'demand_h-24', 'temperature', 'precip']
    target = 'demand'
    
    # Copy the input DataFrame
    df_copy = df.copy()

    # Select features and target
    X = df_copy[features]
    y = df_copy[target]

    # Split into train, validation, and test sets
    X_train_unscaled, X_test_unscaled, y_train, y_test = train_test_split(X, y, test_size=(1 - train_ratio), random_state=random_state)
    X_val_unscaled, X_test_unscaled, y_val, y_test = train_test_split(X_test_unscaled, y_test, test_size=validation_ratio / (validation_ratio + test_ratio), random_state=random_state)

    # Scaling
    scaler = StandardScaler()
    scaler.fit(X_train_unscaled[features_to_scale])

    X_train = X_train_unscaled.copy()
    X_val = X_val_unscaled.copy()
    X_test = X_test_unscaled.copy()

    X_train[features_to_scale] = scaler.transform(X_train_unscaled[features_to_scale])
    X_val[features_to_scale] = scaler.transform(X_val_unscaled[features_to_scale])
    X_test[features_to_scale] = scaler.transform(X_test_unscaled[features_to_scale])

    return (X_train, X_val, X_test, y_train, y_val, y_test)

In [5]:
# Pull Datasets from Feature Engineering
# This takes around ~ 25-35 Minutes and will fill you RAM and CPU nearly completely. 

# Constants
TIME_RESOLUTIONS = ['1H', '2H', '6H', '24H']
SPATIAL_RESOLUTIONS = [6, 7, 8]
DATASET_SUFFIX = ['_h3', '_census']
PROCESSING_NOTEBOOK_FILE = './predicitve_feature_engineering.ipynb'
FILE_BASE_NAME='./data/predictive/dataset'
# Max 4 on 32 GB Ram (Adrians Machine)
MAX_WORKER_THREADS = 4

In [6]:
%%capture
# ^ supress notebook outputs as to not get spammed by 12 Data preparation notebooks. Output can be found under /data/notebook_outs

output_filenames = []

# Function to execute a notebook with given parameters
def execute_notebook(notebook, params):
    output_notebook = f"./data/notebook_outs/output_{random.randint(1, 100)}"
    pm.execute_notebook(notebook, output_notebook, parameters=params)

# Generate notebooks and parameters
notebooks_and_params = []
for time_res in TIME_RESOLUTIONS:
    for spatial_res in SPATIAL_RESOLUTIONS:
        output_filename_base = f'{FILE_BASE_NAME}-spatial_{spatial_res}-temporal_{time_res}'
        output_filenames.append(output_filename_base)

        notebook = PROCESSING_NOTEBOOK_FILE  # Replace with your notebook filename
        params = {
            "TIME_RESOLUTION": time_res,
            "SPATIAL_RESOLUTION": spatial_res,
            "OUTPUT_FILENAME_BASE": output_filename_base
        }
        notebooks_and_params.append((notebook, params))

# Parallel execution using concurrent.futures
with concurrent.futures.ThreadPoolExecutor(MAX_WORKER_THREADS) as executor:
    futures = [executor.submit(execute_notebook, nb, params) for nb, params in notebooks_and_params]

# Wait for all futures to complete
concurrent.futures.wait(futures)

# Print exception details
for future in futures:
    exception = future.exception()
    if exception:
        print(f"Exception in future: {exception}")


In [15]:
# Import datasets
datasets = {}

for filepath in output_filenames:
    filename = f'{os.path.basename(filepath)}_census'
    datasets[filename] = preprocess_data(pd.read_csv(f'{filepath}.csv'))

    filename = f'{os.path.basename(filepath)}_h3'
    datasets[filename] = preprocess_data(pd.read_csv(f'{filepath}.csv'))

FileNotFoundError: [Errno 2] No such file or directory: './data/predictive/dataset-spatial_6-temporal_1H.csv'

### Function for Hyperparameter Tuning
This function enables us to do hyperparameter tuning for any model in the sklearn universe. We have the choice to either do a RandomizedGridSearch with cross validation or a standard GridSearch, the latter is computationally heavier.

In [None]:
def optimize_hyperparameters(param_grid, model, X, y, randomized=False):
    if randomized:
        grid = RandomizedSearchCV(model, param_grid)
    else:
        grid = GridSearchCV(model, param_grid, verbose=3)

    grid.fit(X, y)
    print(f"Best params: {grid.best_params_}")
    print(f"Scoring: {grid.best_score_}")
    return grid


### Model Evaluation Function

In [None]:
# Model Evaluation function:
def evaluate_model(y, y_pred):
    mse = mean_squared_error(y, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y, y_pred)
    r2 = r2_score(y, y_pred)
    return rmse, mae, r2

## SVR Prediction

We will now do SVR to estimate demand given our features. The issue is that the implementation of our SVR is based on libsvm. The fit time complexity is more than quadratic with the number of samples which makes it hard to scale to datasets with more than a couple of 10000 samples. For large datasets, we can either downsample or use sklearn.svm.LinearSVR which has a more performant implementation. Libsvm scales either O(n_features * n_samples^2) or O(n_features * n_samples^3). We will therefore focus our eperimentations on the linear kernel and only briefly try the other ones with reduced dataset sizes.

Overview:


In [None]:
# Hyperparameters
C=100
EPSILON=0.1
POLY_DEGREE=3

# Training Parameters
CACHE_SIZE = 2048 # in MB

# Parallel Training
MAX_WORKER_THREADS = 4

In [7]:
def train_linear_svr_models(dataset_name, dataset, C, EPSILON, CACHE_SIZE):
    X_train, X_val, X_test, y_train, y_val, y_test = dataset

    svr_lin = LinearSVR(C=C, epsilon=EPSILON, cache_size=CACHE_SIZE)
    
    print(f'Fitting for {dataset_name}..')
    svr_lin.fit(X_train, y_train)

    print(f'Predicting for {dataset_name}..')
    y_lin = svr_rbf.predict(X_test)

    print(f'Evaluating for {dataset_name}..')
    metrics[dataset_name] = evaluate_model(y_test, y_lin)
    models[dataset_name] = svr_lin


metrics = {}
models = {}

# Parallel execution using concurrent.futures
with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKER_THREADS) as executor:
    futures = [executor.submit(train_linear_svr_models, dataset.key(), dataset.value(), C, EPSILON, CACHE_SIZE) for dataset in datasets]

# Wait for all futures to complete
concurrent.futures.wait(futures)

# Print exception details
for future in futures:
    exception = future.exception()
    if exception:
        print(f"Exception in future: {exception}")

NameError: name 'datasets' is not defined

In [None]:
# Create SVR models with different kernels
# Important - Only 
svr_rbf = SVR(kernel='rbf', C=C, gamma=C, epsilon=EPSILON, cache_size=CACHE_SIZE)
#svr_lin = LinearSVR(C=C, epsilon=EPSILON, cache_size=CACHE_SIZE)
#svr_poly = SVR(kernel='poly', C=C, degree=POLY_DEGREE, epsilon=EPSILON, cache_size=CACHE_SIZE)

# Fit the models
print('Fitting')
#svr_lin.fit(X_val, y_val)
#svr_poly.fit(X, y)

y_rbf = svr_rbf.predict(X_test)
print('Predicting')
#y_lin = svr_lin.predict(X_test)
#y_poly = svr_poly.predict(X_test)
print()

# Problem: for normal SVMs, the 

## NN Prediction