In [1]:
# Move up one directory (to the TrafficEmu project root, per the output below);
# presumably so the `sumo_grid_simulation` import and the relative pickle paths resolve.
# NOTE: the path is relative, so re-running this cell moves up another level.
%cd ../

/Users/harrysonghurst/Documents/Coursework/L48 Machine Learning and the Physical World/TrafficEmu


In [2]:
# If you get a SciPy error when installing Emukit, build it from source:

# git clone https://github.com/amzn/Emukit.git
# cd Emukit
# pip install -r requirements/requirements.txt
# python setup.py develop

In [3]:
import numpy as np
import pickle
import emukit as ek
import GPy

from emukit.model_wrappers import GPyModelWrapper
from emukit.experimental_design.experimental_design_loop import ExperimentalDesignLoop
from emukit.core import ParameterSpace, ContinuousParameter, DiscreteParameter
from emukit.core.initial_designs import RandomDesign, latin_design
from emukit.experimental_design.acquisitions import ModelVariance, IntegratedVarianceReduction
from emukit.core.loop import UserFunctionWrapper
from emukit.bayesian_optimization.acquisitions import ExpectedImprovement
from emukit.bayesian_optimization.loops import BayesianOptimizationLoop

from sumo_grid_simulation.grid_simulation import Simulator

### Pseudocode

Fitting a Gaussian process to a simulator using Emukit takes the following form (taken from the L48 lectures):

```
initialize GP with some randomly chosen points
while stopping condition is not met:
    compute candidate point(s) using GP and acquisition function (model_variance) -> new point
    evaluate this new point with our simulator/user function -> observation
    update model with new observation -> new GP
```

### Simulator

In [4]:
# Use a trips generator period of 10; otherwise the simulation is very slow.
TRIPS_GENERATOR_PERIOD = 10
simulator = Simulator(trips_generator_period=TRIPS_GENERATOR_PERIOD)

### User Function
 
This is the function we want to understand: namely, how CO2 emissions and timeLoss (the time lost due to driving below the ideal speed) are affected by the following parameters:

```
edgeMaxSpeed: legal speed limit in m/s - this can be exceeded (11.11 == 40km/h)
maxSpeed: the absolute maximum velocity of any vehicle in m/s (55.55 == 200 km/h)
edgeLength: length of the roads between intersections in meters
numLanes: number of lanes per road
accel: The acceleration ability of vehicles in m/s^2.
```

*A complete list of the parameters analysed is discussed in our report.*

In [5]:
def user_function_time_loss(X):
    """Run the simulator at each input point and return time loss per unit route length.

    Emukit does not pass named arguments, just an N x M ndarray with one row
    per point. Each row must hold, in this order:
    ``gridSize, edgeMaxSpeed, maxSpeed, edgeLength, numLanes, accel``.
    ``gridSize``, ``edgeLength`` and ``numLanes`` are truncated with ``int()``
    before being passed to the simulator.

    Parameters
    ----------
    X : array-like, shape (N, 6) or (6,)
        Points to evaluate. A single 1-D point is treated as a one-row batch.

    Returns
    -------
    numpy.ndarray, shape (N, 1)
        ``timeLoss / routeLength`` for each simulated point.
    """
    print(X)  # log the points that are about to be simulated

    # Promote a single 1-D point to a one-row batch so the row unpacking below
    # works; an empty input is left alone so it still yields a (0, 1) result.
    points = np.asarray(X)
    if points.ndim == 1 and points.size:
        points = points[np.newaxis, :]

    result = []
    for gridSize, edgeMaxSpeed, maxSpeed, edgeLength, numLanes, accel in points:
        s = simulator.simulate(
            gridSize      = int(gridSize),
            edgeMaxSpeed  = edgeMaxSpeed,
            maxSpeed      = maxSpeed,
            edgeLength    = int(edgeLength),
            numberOfLanes = int(numLanes),
            accel         = accel
        )
        # average time loss / average route length
        result.append(s['timeLoss']/s['routeLength'])

    # expand_dims is essential: the acquisition function breaks on a flat (N,) output
    return np.expand_dims(np.array(result), 1)

### Model (GP)

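Our surrogate model is our emulator: in this case, a Gaussian process. Note that Emukit does not one-hot encode a `DiscreteParameter` (used for `numberOfLanes` below); it is treated as a numeric input restricted to the values in its domain. One-hot encodings apply only to `CategoricalParameter`.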

#### Model Inputs

In [6]:
# Search range of every simulator input.
gridSize = ContinuousParameter('gridSize', min_value=3, max_value=20)
edgeMaxSpeed = ContinuousParameter('edgeMaxSpeed', min_value=1, max_value=25)  # m/s
maxSpeed = ContinuousParameter('maxSpeed', min_value=1, max_value=25)  # m/s
edgeLength = ContinuousParameter('edgeLength', min_value=2, max_value=200)  # meters
numberOfLanes = DiscreteParameter('numberOfLanes', domain=[1,2,3])
accel = ContinuousParameter('accel', min_value=1., max_value=6.)  # m/s^2

# The order here must match the column order unpacked by user_function_time_loss.
parameter_space = ParameterSpace([
    gridSize,
    edgeMaxSpeed,
    maxSpeed,
    edgeLength,
    numberOfLanes,
    accel,
])

#### Initialize Model / Emulator (GP)

In [7]:
# Load the pre-computed initial design (inputs and simulator outputs).
# A `with` block closes the file handle, which a bare `open(...)` would leak.
# Only unpickle files you trust: pickle can execute arbitrary code on load.
with open('experimental_design/init_points_6_param.pkl', 'rb') as f:
    init_X, init_Y = pickle.load(f)

# To regenerate the initial points instead of loading them, uncomment:
# design = RandomDesign(parameter_space)  # initialize with random points
# num_data_points = 250
# init_X = design.get_samples(num_data_points)
# init_Y = user_function_time_loss(init_X)
# with open('init_points_6_param.pkl', "wb") as f:
#      pickle.dump((init_X, init_Y), f)

In [8]:
# Alternative: resume from a previously saved emulator instead of starting fresh.
# emulator = pickle.load(open('experimental_design_timeLoss_50_iter_batch_size_3.pkl',"rb"))
# GP regression on the initial design. No kernel is passed, so GPy's default is
# used (the summary below lists rbf and Gaussian_noise parameters).
emulator = GPy.models.GPRegression(init_X, init_Y)
# Wrap the GPy model in Emukit's model wrapper so the loop below can use it.
emukit_model = GPyModelWrapper(emulator)
# Display the model's parameter summary as the cell output.
emulator

GP_regression.,value,constraints,priors
rbf.variance,1.0,+ve,
rbf.lengthscale,1.0,+ve,
Gaussian_noise.variance,1.0,+ve,


### Optimization

#### Acquisition Function

In [9]:
# Acquisition function handed to the experimental-design loop below; it is built on the wrapped GP.
model_variance = ModelVariance(model=emukit_model)

#### Experimental Design

In [10]:
# Experimental-design loop over the parameter space, driven by the
# model-variance acquisition, with one new point per iteration.
ed_loop = ExperimentalDesignLoop(
    model=emukit_model,
    space=parameter_space,
    acquisition=model_variance,
    batch_size=1,
)

In [None]:
# Run the loop for 100 iterations, evaluating each proposed point with the simulator-backed user function.
ed_loop.run_loop(user_function_time_loss, 100)

### Evaluate

#### Test on a 25-point test set

In [None]:
# Load the held-out test points (inputs and simulator outputs).
# A `with` block closes the file handle, which a bare `open(...)` would leak.
# Only unpickle files you trust: pickle can execute arbitrary code on load.
with open('experimental_design/test_points_6_param.pkl', 'rb') as f:
    test_X, test_Y = pickle.load(f)

# To regenerate the test points instead of loading them, uncomment:
# design = RandomDesign(parameter_space)  # initialize with random points
# num_data_points = 25
# test_X = design.get_samples(num_data_points)
# test_Y = user_function_time_loss(test_X)
# with open('test_points_6_param.pkl', "wb") as f:
#      pickle.dump((test_X, test_Y), f)

In [33]:
# Predictive mean and variance of the emulator at the held-out test inputs.
pred_Y, pred_Y_variance = emukit_model.predict(test_X)

# Guard against silent broadcasting: (N, 1) against (N,) would give an N x N
# matrix of differences and a meaningless error value.
assert pred_Y.shape == test_Y.shape, (pred_Y.shape, test_Y.shape)

# The square root makes this the *root* mean squared error, so name it accordingly.
rmse = np.sqrt(np.mean((pred_Y - test_Y) ** 2))
mse = rmse  # backward-compatible alias for the original (misleading) name
print('Root mean squared error: ', rmse)

Root mean squared error:  1.4445145369062213


In [34]:
# Show the emulator's predictions alongside the simulator's ground-truth values.
pred_Y, test_Y

(array([[2.45691158e-01],
        [2.38863539e-02],
        [4.69330831e-03],
        [1.00988806e-01],
        [1.35885465e-02],
        [1.05232247e-01],
        [2.28397253e-01],
        [9.31499341e-03],
        [7.26287369e+00],
        [3.83722195e-02],
        [1.34149205e-01],
        [2.27769023e-02],
        [3.36298296e-01],
        [1.33301769e-02],
        [1.01401805e-01],
        [4.72236366e-02],
        [7.42585919e-02],
        [1.17316377e-01],
        [1.51749925e-01],
        [6.84317207e-02],
        [1.48797727e-01],
        [2.29759632e-03],
        [2.24372114e-01],
        [1.03947071e-02],
        [1.12581139e-01]]),
 array([[0.07507789],
        [0.01842446],
        [0.01340747],
        [0.04006658],
        [0.01636273],
        [0.1492    ],
        [0.29265569],
        [0.01211676],
        [0.05204098],
        [0.02777613],
        [0.03821875],
        [0.02842034],
        [0.11653083],
        [0.01095996],
        [0.01356708],
        [0.0250339

### Save Model

In [20]:
# Persist the GPy model so it can be reloaded later without re-running the loop.
with open('timeLoss_model_variance_100_iter_batch_size_1.pkl', "wb") as model_file:
    pickle.dump(emulator, model_file)

### Analysis

As we can see, optimizing for model_variance causes emukit to alternate between extreme choices of \[gridSize, edgeMaxSpeed, maxSpeed, numberOfLanes, accel\] whilst only varying edgeLength sensibly. This means that we don't explore the parameter space comprehensively - which is undesirable.