In [1]:
import sys
import os

# Set the main path in the root folder of the project.
sys.path.append(os.path.join('..'))

In [2]:
# Settings for autoreloading.
%load_ext autoreload
%autoreload 2

In [3]:
from src.utils.seed import set_random_seed

# Set the random seed for deterministic operations.
SEED = 42
set_random_seed(SEED)

In [4]:
import torch

# Set the device for training and querying the model.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'The selected device is: "{DEVICE}"')

The selected device is: "cuda"


# Loading the Data

In [5]:
import os

BASE_DATA_DIR = os.path.join('..', 'data', 'metr-la')

In [6]:
import pickle
with open(os.path.join(BASE_DATA_DIR, 'processed', 'scaler.pkl'), 'rb') as f:
    scaler = pickle.load(f)

In [7]:
from src.spatial_temporal_gnn.model import SpatialTemporalGNN
from src.explanation.navigator.model import Navigator
from src.data.data_extraction import get_adjacency_matrix

# Get the adjacency matrix
adj_matrix_structure = get_adjacency_matrix(
    os.path.join(BASE_DATA_DIR, 'raw', 'adj_mx_metr_la.pkl'))

# Get the header of the adjacency matrix, the node indices and the
# matrix itself.
header, node_ids_dict, adj_matrix = adj_matrix_structure

# Get the STGNN and load the checkpoints.
spatial_temporal_gnn = SpatialTemporalGNN(9, 1, 12, 12, adj_matrix, DEVICE, 64)

stgnn_checkpoints_path = os.path.join('..', 'models', 'checkpoints',
                                      'st_gnn_metr_la.pth')

stgnn_checkpoints = torch.load(stgnn_checkpoints_path)
spatial_temporal_gnn.load_state_dict(stgnn_checkpoints['model_state_dict'])

# Set the STGNN in evaluation mode.
spatial_temporal_gnn.eval();

In [8]:
from src.data.data_extraction import get_locations_dataframe

# Get the dataframe containing the latitude and longitude of each sensor.
locations_df = get_locations_dataframe(
    os.path.join(BASE_DATA_DIR, 'raw', 'graph_sensor_locations_metr_la.csv'),
    has_header=True)

In [9]:
# Get the node positions dictionary.
node_pos_dict = { i: id for id, i in node_ids_dict.items() }

In [10]:
import pickle

# Get the data scaler.
with open(os.path.join(BASE_DATA_DIR, 'processed', 'scaler.pkl'), 'rb') as f:
    scaler = pickle.load(f)

In [11]:
import os
import numpy as np

# Get the data and the values predicted by the STGNN.
x_train = np.load(os.path.join(BASE_DATA_DIR, 'explainable', 'x_train.npy'))
y_train = np.load(os.path.join(BASE_DATA_DIR, 'explainable', 'y_train.npy'))
x_val = np.load(os.path.join(BASE_DATA_DIR, 'explainable', 'x_val.npy'))
y_val = np.load(os.path.join(BASE_DATA_DIR, 'explainable', 'y_val.npy'))
x_test = np.load(os.path.join(BASE_DATA_DIR, 'explainable', 'x_test.npy'))
y_test = np.load(os.path.join(BASE_DATA_DIR, 'explainable', 'y_test.npy'))

# Get the time intervals.
x_test_time = np.load(os.path.join(BASE_DATA_DIR, 'explainable', 'x_test_time.npy'))
y_test_time = np.load(os.path.join(BASE_DATA_DIR, 'explainable', 'y_test_time.npy'))

In [12]:
from src.data.data_processing import get_distance_matrix

# Build the distance matrix between the nodes.
distance_matrix = get_distance_matrix(locations_df, node_ids_dict)

In [13]:
from src.explanation.monte_carlo.evaluation import apply_grid_search

apply_grid_search(
    x_train[::10],
    y_train[::10],
    distance_matrix,
    spatial_temporal_gnn,
    scaler,
    n_rollouts_list=[30, 50],
    explanation_size_factor_list=[2, 3, 5],
    cut_size_factor_list=[2, 3],
    exploration_weight_list=[5, 10, 20])

Testing: cut_size_factor: 2 explanation_size_factor: 2 exploration_weight: 5 n_rollouts: 30
[100/100] - 462s - MAE: 1.61 - RMSE: 1.99 - MAPE: 5.5% - Average time: 4.62s              

Testing: cut_size_factor: 2 explanation_size_factor: 2 exploration_weight: 5 n_rollouts: 50
[100/100] - 754s - MAE: 1.48 - RMSE: 1.77 - MAPE: 4.96% - Average time: 7.54s             

Testing: cut_size_factor: 2 explanation_size_factor: 2 exploration_weight: 10 n_rollouts: 30
[100/100] - 460s - MAE: 1.62 - RMSE: 1.97 - MAPE: 5.4% - Average time: 4.6s               

Testing: cut_size_factor: 2 explanation_size_factor: 2 exploration_weight: 10 n_rollouts: 50
[100/100] - 755s - MAE: 1.53 - RMSE: 1.89 - MAPE: 5.15% - Average time: 7.55s             

Testing: cut_size_factor: 2 explanation_size_factor: 2 exploration_weight: 20 n_rollouts: 30
[100/100] - 616s - MAE: 1.82 - RMSE: 2.22 - MAPE: 6.31% - Average time: 6.16s             

Testing: cut_size_factor: 2 explanation_size_factor: 2 exploration_weight: 20

In [13]:
from src.explanation.monte_carlo.search import get_best_input_events_subset

# Randomize list
ls = list(zip(x_train, y_train))

import random
#random.shuffle(ls)

res = []
y_samples = []
ls1 = [53, 224, 259, 666, 715, 770, 838, 885, 891, 947]

for x_sample, y_sample in ls[1:2]:#[ls[i] for i in [53, 224, 259, 666, 715, 770, 838, 885, 891, 947]]:
    explanation_size = int((y_sample.flatten() != 0).sum() * 2)
    subset = get_best_input_events_subset(
        x_sample,
        y_sample,
        distance_matrix,
        spatial_temporal_gnn,
        scaler,
        n_rollouts=50,
        n_top_events=explanation_size*2,
        exploration_weight=20,
        explanation_size=explanation_size,
        remove_value=0.,
        verbose=True)
    res.append(subset)
    y_samples.append(y_sample)
    print()


Execution 1/50
reward: -0.1568386843706285 , mae: 6.375977993011475
Execution 2/50
reward: -0.1568386843706285 , mae: 6.375977993011475
Execution 3/50
reward: -0.17692743278257236 , mae: 5.652034759521484
Execution 4/50
reward: -0.17692743278257236 , mae: 5.652034759521484
Execution 5/50
reward: -0.17692743278257236 , mae: 5.652034759521484
Execution 6/50
reward: -0.17692743278257236 , mae: 5.652034759521484
Execution 7/50
reward: -0.17692743278257236 , mae: 5.652034759521484
Execution 8/50
reward: -0.514044160105184 , mae: 1.945358157157898
Execution 9/50
reward: -0.514044160105184 , mae: 1.945358157157898
Execution 10/50
reward: -0.514044160105184 , mae: 1.945358157157898
Execution 11/50
reward: -0.514044160105184 , mae: 1.945358157157898
Execution 12/50
reward: -0.514044160105184 , mae: 1.945358157157898
Execution 13/50
reward: -0.514044160105184 , mae: 1.945358157157898
Execution 14/50
reward: -0.514044160105184 , mae: 1.945358157157898
Execution 15/50
reward: -0.514044160105184 , 

In [175]:
print(res[0])

[(9, 28), (8, 28), (7, 27), (6, 27), (5, 28), (4, 27), (3, 27), (3, 28), (2, 27), (2, 28), (1, 27), (7, 177), (6, 177), (4, 177), (3, 177), (2, 177), (1, 177), (0, 2), (11, 79), (10, 79), (9, 79), (8, 79), (7, 79), (6, 79)]


In [176]:
# y_train[8].flatten().sum() / (y_train[8] != 0).flatten().sum()

In [188]:
i = 3

input_events_subset = res[i]
y_sample = y_samples[i]

In [189]:
#print(input_events_subset)

In [190]:
print(input_events_subset)

[(2, 69), (7, 113), (5, 73), (3, 52), (0, 69), (5, 113), (3, 73), (2, 52), (4, 113), (1, 73), (3, 113), (0, 52), (0, 73), (2, 113), (0, 113), (9, 141), (8, 141), (5, 141), (1, 141), (11, 164), (10, 164), (5, 164), (4, 164), (2, 164)]


In [191]:
from src.explanation.events import remove_features_by_events

x_subset = x_sample.copy()

x_subset = remove_features_by_events(x_subset, input_events_subset)

#for e in input_events_subset:
#    x_subset[e[1], e[2], 0] = 0.
    
x_subset = x_subset[..., :1]

In [192]:
print(x_subset.shape)
print(y_sample.shape)

(12, 207, 1)
(12, 207, 1)


In [193]:
# Concatenate the input events subset with the output events.
explained_instance = np.concatenate((x_subset, y_sample), axis=0)


MPH_TO_KMH_FACTOR = 1.609344
#explained_instance *= MPH_TO_KMH_FACTOR

In [194]:
print(explained_instance.shape)

(24, 207, 1)


In [195]:
clusters = np.zeros_like(explained_instance)
clusters[12:] = 1.
clusters[explained_instance == 0.] = -1

In [196]:
from src.explanation.clustering.analyisis import (
    get_node_values_with_clusters_and_location_dataframe)

location_df_with_clusters = \
    get_node_values_with_clusters_and_location_dataframe(
        explained_instance, clusters, node_pos_dict, locations_df)

In [197]:
from keplergl.keplergl import KeplerGl

m = KeplerGl(height=800, show_docs=False, data={'data': location_df_with_clusters})

In [198]:
m

KeplerGl(data={'data':      sensor_id  latitude  longitude  cluster  speed  datetime
0       773869  34.15497 …

In [480]:
from src.explanation.monte_carlo.search import get_explanations_from_data

x_explained, y_explained = get_explanations_from_data(
    x_test[:10],
    y_test[:10],
    adj_matrix,
    spatial_temporal_gnn,
    navigator,
    distance_matrix,
    scaler,
    n_rollouts=30,
    exploration_weight=20,
)

0.7947202115917098 1.258304476737976
0.3220503831806981 3.1051042079925537
0.10994488160575627 9.095466613769531
0.4848132338640501 2.062649965286255
0.6544738526296934 1.5279449224472046
0.7828435674063169 1.277394413948059
0.391424523348731 2.5547709465026855
0.38892625281693777 2.571181535720825
0.5395853457235139 1.853274941444397
0.37837483818295814 2.6428818702697754


In [481]:
import os
import numpy as np

EXPLAINED_DATA_DIR = os.path.join(BASE_DATA_DIR, 'explained')
os.makedirs(EXPLAINED_DATA_DIR, exist_ok=True)

# Save the explained data.
np.save(os.path.join(EXPLAINED_DATA_DIR, 'x_test.npy'), x_explained)
np.save(os.path.join(EXPLAINED_DATA_DIR, 'y_test.npy'), y_explained)

# Save the explained time information of the datasets.
np.save(os.path.join(EXPLAINED_DATA_DIR, 'x_test_time.npy'), x_test_time[:10])
np.save(os.path.join(EXPLAINED_DATA_DIR, 'y_test_time.npy'), y_test_time[:10])