In [1]:
import sys
import os

# Make the root folder of the project (the parent of this notebook's folder)
# importable. The entry is added only when missing, so that re-running the
# cell does not keep appending duplicates to `sys.path`.
if os.path.join('..') not in sys.path:
    sys.path.append(os.path.join('..'))

In [2]:
# Settings for autoreloading: load the `autoreload` extension and enable it in
# mode 2, so that edits to the imported project modules are picked up live.
%load_ext autoreload
%autoreload 2

In [3]:
from src.utils.seed import set_random_seed

# Set the random seed for deterministic operations.
# NOTE(review): which libraries get seeded depends on `set_random_seed`
# (defined in `src.utils.seed`, not visible here) - confirm it covers all the
# ones used below.
SEED = 42
set_random_seed(SEED)

In [4]:
import torch

# Choose where the model is trained and queried: the GPU when CUDA is
# available, the CPU otherwise.
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'
print(f'The selected device is: "{DEVICE}"')

The selected device is: "cuda"


# Loading the Data

In [5]:
import os

# Folder of the METR-LA data, relative to the folder of this notebook. The
# cells below read its `raw`, `processed` and `predicted` sub-folders.
BASE_DATA_DIR = os.path.join('..', 'data', 'metr-la')

In [6]:
import pickle
# Load the pickled scaler stored among the processed data.
# NOTE: unpickling can execute arbitrary code, so this file must come from a
# trusted source.
scaler_path = os.path.join(BASE_DATA_DIR, 'processed', 'scaler.pkl')
with open(scaler_path, 'rb') as scaler_file:
    scaler = pickle.load(scaler_file)

In [7]:
from src.spatial_temporal_gnn.model import SpatialTemporalGNN
from src.data.data_extraction import get_adjacency_matrix

# Get the adjacency matrix
adj_matrix_structure = get_adjacency_matrix(
    os.path.join(BASE_DATA_DIR, 'raw', 'adj_mx_metr_la.pkl'))

# Get the header of the adjacency matrix, the node indices and the
# matrix itself.
header, node_ids_dict, adj_matrix = adj_matrix_structure

# Get the STGNN and load the checkpoints.
# NOTE(review): the positional arguments are not named here. Judging from the
# array shapes printed at the end of the notebook they presumably stand for
# 9 input features, 1 output feature and 12 input/output timesteps, while the
# meaning of 64 cannot be told from this notebook - confirm against the
# `SpatialTemporalGNN` signature.
spatial_temporal_gnn = SpatialTemporalGNN(9, 1, 12, 12, adj_matrix, DEVICE, 64)

stgnn_checkpoints_path = os.path.join('..', 'models', 'checkpoints',
                                      'st_gnn_metr_la.pth')

# Remap the stored tensors onto the selected device while loading, so that a
# checkpoint saved from a GPU can also be restored on a CPU-only machine.
stgnn_checkpoints = torch.load(stgnn_checkpoints_path, map_location=DEVICE)
spatial_temporal_gnn.load_state_dict(stgnn_checkpoints['model_state_dict'])

# Set the model in evaluation mode.
spatial_temporal_gnn.eval();

In [8]:
from src.data.data_extraction import get_locations_dataframe

# Read the raw CSV of the sensor locations (it has a header row) into a
# dataframe with the latitude and longitude of each sensor.
locations_csv_path = os.path.join(
    BASE_DATA_DIR, 'raw', 'graph_sensor_locations_metr_la.csv')
locations_df = get_locations_dataframe(locations_csv_path, has_header=True)

In [9]:
# Get the node positions dictionary by inverting `node_ids_dict`: its values
# become the keys and its keys become the values.
node_pos_dict = {
    node_index: node_id for node_id, node_index in node_ids_dict.items()}

In [10]:
import os
import numpy as np
from src.spatial_temporal_gnn.prediction import predict

# Get the data and the values predicted by the STGNN. The arrays are loaded
# in the order of the listed file names and bound to the matching variables.
predicted_dir = os.path.join(BASE_DATA_DIR, 'predicted')
x_train, y_train, x_val, y_val, x_test, y_test = (
    np.load(os.path.join(predicted_dir, file_name))
    for file_name in ('x_train.npy', 'y_train.npy',
                      'x_val.npy', 'y_val.npy',
                      'x_test.npy', 'y_test.npy'))

# Get the time information of the train, validation and test sets.
processed_dir = os.path.join(BASE_DATA_DIR, 'processed')
(x_train_time, y_train_time,
 x_val_time, y_val_time,
 x_test_time, y_test_time) = (
    np.load(os.path.join(processed_dir, file_name))
    for file_name in ('x_train_time.npy', 'y_train_time.npy',
                      'x_val_time.npy', 'y_val_time.npy',
                      'x_test_time.npy', 'y_test_time.npy'))

In [11]:
# Turn the results from miles per hour into kilometers per hour.
# NOTE: the three names are rebound to the converted arrays, so running this
# cell again without reloading the data applies the factor a second time.
MPH_TO_KMH_FACTOR = 1.609344

y_train, y_val, y_test = (
    speeds * MPH_TO_KMH_FACTOR for speeds in (y_train, y_val, y_test))

In [12]:
# Read the number of timesteps and nodes from the two middle axes of the
# training targets. The axes are taken by index instead of unpacking the
# unused ones into `_`, since binding `_` in a notebook overrides IPython's
# last-output shortcut. The rank is still checked, as the unpacking did.
if y_train.ndim != 4:
    raise ValueError(
        f'Expected a 4-dimensional array, got shape {y_train.shape}.')
n_timesteps, n_nodes = y_train.shape[1:3]

# Adjacency Distance Matrix

In [13]:
from src.explanation.clustering.clustering import (
    get_adjacency_distance_matrix)

# Build the adjacency distance matrix from the adjacency matrix and the
# number of timesteps of an instance.
adj_distance_matrix = get_adjacency_distance_matrix(adj_matrix, n_timesteps)

In [14]:
# Report the size of the adjacency distance matrix.
print(f'Shape of the Adjacency Distance Matrix: {adj_distance_matrix.shape}')

Shape of the Adjacency Distance Matrix: (2484, 2484)


# Temporal Distance Matrix

In [15]:
from src.explanation.clustering.clustering import (
    get_temporal_distance_matrix)

# Build the temporal distance matrix from the number of nodes and the number
# of timesteps of an instance.
temporal_distance_matrix = get_temporal_distance_matrix(n_nodes, n_timesteps)

In [16]:
# Report the size of the temporal distance matrix.
print('Shape of the Temporal Distance Matrix:',
      f'{temporal_distance_matrix.shape}')

Shape of the Temporal Distance Matrix: (2484, 2484)


# Clustering Function

In [19]:
from src.explanation.clustering.evaluation import apply_grid_search

# Candidate values explored by the grid search.
eps_candidates = [.1, .15, .2, .25, .3, .35, .4, .45, .5]
min_samples_candidates = [5, 7, 10, 12, 15, 17, 20]

# Run the grid search on the first 200 instances of the training set only.
apply_grid_search(
    instances=y_train[:200],
    eps_list=eps_candidates,
    min_samples_list=min_samples_candidates,
    adj_distance_matrix=adj_distance_matrix,
    temporal_distance_matrix=temporal_distance_matrix)

eps: 0.1 min_samples: 5
	Within-Cluster Variance: 0.999 Connected Cluster Dissimilarity: 6.07 Noise points ratio: 0.997

eps: 0.1 min_samples: 7
	Within-Cluster Variance: 1 Connected Cluster Dissimilarity: 0 Noise points ratio: 1

eps: 0.1 min_samples: 10
	Within-Cluster Variance: 1 Connected Cluster Dissimilarity: 0 Noise points ratio: 1

eps: 0.1 min_samples: 12
	Within-Cluster Variance: 1 Connected Cluster Dissimilarity: 0 Noise points ratio: 1

eps: 0.1 min_samples: 15
	Within-Cluster Variance: 1 Connected Cluster Dissimilarity: 0 Noise points ratio: 1

eps: 0.1 min_samples: 17
	Within-Cluster Variance: 1 Connected Cluster Dissimilarity: 0 Noise points ratio: 1

eps: 0.1 min_samples: 20
	Within-Cluster Variance: 1 Connected Cluster Dissimilarity: 0 Noise points ratio: 1

eps: 0.15 min_samples: 5
	Within-Cluster Variance: 0.946 Connected Cluster Dissimilarity: 12.1 Noise points ratio: 0.913

eps: 0.15 min_samples: 7
	Within-Cluster Variance: 0.993 Connected Cluster Dissimilarity: 8.

In [17]:
# Set the best parameters based on the results of the grid search.

EPS = .35  # NOTE(review): `.5` was left next to this value as an alternative - confirm which one is intended.
MIN_SAMPLES = 5

In [18]:
from src.explanation.clustering.evaluation import get_dataset_clustering_scores

# Score the clustering obtained with the selected parameters on the test set.
test_scores = get_dataset_clustering_scores(
    y_test, adj_distance_matrix, temporal_distance_matrix, EPS, MIN_SAMPLES)
(avg_within_cluster_variance,
 avg_connected_cluster_dissimilarity,
 avg_noise_ratio) = test_scores

# Print the three scores on a single line, with three significant digits.
test_report = [
    'Within-Cluster Variance on the test set:',
    f'{avg_within_cluster_variance:.3g}',
    'Connected Cluster Dissimilarity on the test set:',
    f'{avg_connected_cluster_dissimilarity:.3g}',
    'Noise points ratio on the test set:',
    f'{avg_noise_ratio:.3g}',
]
print(*test_report)

Within-Cluster Variance on the test set: 0.0919 Connected Cluster Dissimilarity on the test set: 15.2 Noise points ratio on the test set: 0.0544


In [19]:
# Pick a single test instance (the one at index 100) to inspect its clusters.
sample = y_test[100]

In [20]:
from src.explanation.clustering.clustering import get_clusters
from src.explanation.clustering.evaluation import (
    get_within_clusters_variance, get_connected_cluster_dissimilarity,
    get_noise_ratio)

# Cluster the selected sample with the parameters chosen above.
clusters = get_clusters(
    sample,
    adj_distance_matrix,
    temporal_distance_matrix,
    eps=EPS,
    min_samples=MIN_SAMPLES)

# Compute the three evaluation metrics of the sample clustering, then print
# them on a single line with three significant digits.
sample_variance = get_within_clusters_variance(sample, clusters)
sample_dissimilarity = get_connected_cluster_dissimilarity(sample, clusters)
sample_noise_ratio = get_noise_ratio(sample, clusters)

sample_report = [
    'Sample Within Cluster Variance:',
    f'{sample_variance:.3g}',
    'Sample Connected Cluster Dissimilarity:',
    f'{sample_dissimilarity:.3g}',
    'Sample Noise Ratio:',
    f'{sample_noise_ratio:.3g}',
]
print(*sample_report)

Sample Within Cluster Variance: 0.0421 Sample Connected Cluster Dissimilarity: 18 Sample Noise Ratio: 0.00966


In [21]:
# Count the distinct labels assigned to the sample.
# NOTE(review): every distinct value of `clusters` is counted; if noise points
# carry a label of their own (e.g. -1) it is included in this number - confirm
# against `get_clusters`.
print('Number of clusters found:', len(np.unique(clusters)))

Number of clusters found: 116


In [22]:
from src.explanation.clustering.analyisis import (
    get_node_values_with_clusters_and_location_dataframe)


In [23]:
# Combine the sample values, their cluster labels and the sensor locations
# into a single dataframe.
location_df_with_clusters = (
    get_node_values_with_clusters_and_location_dataframe(
        sample, clusters, node_pos_dict, locations_df))

In [24]:
# Preview the first rows of the dataframe.
location_df_with_clusters.head()

Unnamed: 0,sensor_id,latitude,longitude,cluster,speed,datetime
0,773869,34.15497,-118.31829,0,106.795815,0
1,767541,34.11621,-118.23799,1,107.163132,0
2,767542,34.11641,-118.23819,2,111.637352,0
3,717447,34.07248,-118.26772,3,84.031723,0
4,717446,34.07142,-118.26572,52,45.019501,0


In [25]:
from keplergl.keplergl import KeplerGl

# Create the Kepler.gl map, feeding it the dataframe of the clustered sample
# under the 'data' key.
m = KeplerGl(
    height=800,
    show_docs=False,
    data={'data': location_df_with_clusters})

In [26]:
# Disabled alternative visualisation: the statements below sit inside a string
# literal, so they are never executed, and the trailing `;` keeps the string
# out of the cell output.
'''from src.data.data_analysis import show_kepler_map

print('Metr-LA speed clusters on the first Monday:')
show_kepler_map(location_df_with_clusters, None)''';

In [27]:
# Show the map as the output of the cell.
m

KeplerGl(data={'data':      sensor_id  latitude  longitude  cluster       speed  datetime
0       773869  34.1…

In [19]:
import os

# Folder where the datasets for the explainability task are saved. It is
# derived from `BASE_DATA_DIR`, so that the root of the METR-LA data is
# defined in one place only; the resulting path is the same as before.
DATA_DIR = os.path.join(BASE_DATA_DIR, 'explainable')

In [20]:
from numpy import save
from src.explanation.clustering.clustering import (
    get_dataset_for_explainability)

def _extract_and_save_split(split_name, x, y, x_time, y_time, total_samples):
    """Extract the explainability subset of a data split and save it.

    The subset is obtained through `get_dataset_for_explainability`, using the
    notebook-level `EPS`, `MIN_SAMPLES`, `adj_distance_matrix` and
    `temporal_distance_matrix`. The four resulting arrays are saved in
    `DATA_DIR` as `x_<split_name>.npy`, `y_<split_name>.npy`,
    `x_<split_name>_time.npy` and `y_<split_name>_time.npy`.

    Parameters
    ----------
    split_name : str
        Name of the split used in the file names (e.g. 'train').
    x, y : ndarray
        Input and target data of the split.
    x_time, y_time : ndarray
        Time information of the input and target data of the split.
    total_samples : int
        Number of instances requested for the subset. In the recorded run
        some splits came back with slightly fewer instances than requested.

    Returns
    -------
    tuple
        The extracted `(x, y, x_time, y_time)` arrays.
    """
    x_expl, y_expl, x_time_expl, y_time_expl = get_dataset_for_explainability(
        x,
        y,
        x_time,
        y_time,
        EPS,
        MIN_SAMPLES,
        adj_distance_matrix,
        temporal_distance_matrix,
        total_samples=total_samples)

    # Save the arrays in the same order used by the original per-split code.
    save(os.path.join(DATA_DIR, f'x_{split_name}.npy'), x_expl)
    save(os.path.join(DATA_DIR, f'y_{split_name}.npy'), y_expl)
    save(os.path.join(DATA_DIR, f'x_{split_name}_time.npy'), x_time_expl)
    save(os.path.join(DATA_DIR, f'y_{split_name}_time.npy'), y_time_expl)

    return x_expl, y_expl, x_time_expl, y_time_expl


# Make sure that the output folder exists before saving anything.
os.makedirs(DATA_DIR, exist_ok=True)

# Each split is extracted and saved before the next one is processed.
(x_train_expl, y_train_expl,
 x_train_time_expl, y_train_time_expl) = _extract_and_save_split(
    'train', x_train, y_train, x_train_time, y_train_time,
    total_samples=1_000)

(x_val_expl, y_val_expl,
 x_val_time_expl, y_val_time_expl) = _extract_and_save_split(
    'val', x_val, y_val, x_val_time, y_val_time,
    total_samples=200)

(x_test_expl, y_test_expl,
 x_test_time_expl, y_test_time_expl) = _extract_and_save_split(
    'test', x_test, y_test, x_test_time, y_test_time,
    total_samples=300)

In [21]:
# Report the shapes of the input and target arrays extracted for each split.
shape_reports = (
    ('Train dataset for explainability shapes:',
     x_train_expl, y_train_expl),
    ('Validation dataset for explainability shapes:',
     x_val_expl, y_val_expl),
    ('Test dataset for explainability shapes:',
     x_test_expl, y_test_expl),
)
for heading, x_subset, y_subset in shape_reports:
    print(heading, x_subset.shape, y_subset.shape)

Train dataset for explainability shapes: (999, 12, 207, 9) (999, 12, 207, 1)
Validation dataset for explainability shapes: (198, 12, 207, 9) (198, 12, 207, 1)
Test dataset for explainability shapes: (300, 12, 207, 9) (300, 12, 207, 1)
