# Preprocess ML data
This notebook generates a machine-learning dataset from the result folders of SUMO simulations.

Note that the notebook calls *infer_incident_data* (imported from `utils.preprocess_utils`) on every simulation folder.

In [1]:
import sys
sys.path.append('..')

In [4]:
import os
import glob
import json
import pandas as pd
from tqdm import tqdm
import numpy as np
import shutil
     
from utils.preprocess_utils import infer_incident_data


## Select experiment

In [38]:
# Experiment to preprocess.
scenario = 'motorway'
experiment_name = 'incident3'

# Folders derived from the selection above.
scenario_folder = f'../{scenario}'
path = f'../{scenario}/Results/{experiment_name}'
sim_path = f'../{scenario}/Results/{experiment_name}/simulations'

# Every entry found in the simulations folder counts as one simulation here.
result_folders = os.listdir(sim_path)
n_simulations = len(result_folders)
print(f'{n_simulations} simulations')



1635 simulations


In [4]:
# Number of entries a simulation folder must contain to be treated as complete.
# NOTE(review): 11 is carried over from the original check - confirm it still
# matches what a finished simulation writes.
EXPECTED_FILE_COUNT = 11

simulation_folders = [folder for folder in result_folders if folder.startswith(experiment_name)]

# Check for uncaught errors.
# zero_folders:  folders without any content (removed in the next cell).
# error_folders: folders with an unexpected number of files and/or a non-empty
#                *.err log. Each folder is recorded at most once, because the
#                clean-up cell moves every listed folder and moving the same
#                folder a second time fails.
error_folders = []
zero_folders = []
for folder in simulation_folders:
    folder_dir = os.listdir(f'{sim_path}/{folder}')
    has_error = False

    if len(folder_dir) != EXPECTED_FILE_COUNT:
        print(f'{folder} is missing files it has only {len(folder_dir)}')
        if len(folder_dir) == 0:
            # Nothing to inspect in an empty folder.
            zero_folders.append(folder)
            continue
        has_error = True

    # Any non-empty error log marks the simulation as failed.
    for log in glob.glob(f'{sim_path}/{folder}/*.err'):
        if os.path.getsize(log) > 0:
            print(f'{folder} has error in {log}')
            has_error = True

    if has_error:
        error_folders.append(folder)
                

In [5]:
# Delete the simulation folders that were found to be completely empty.
# os.rmdir only removes empty directories, so nothing with content is lost.
for empty_name in zero_folders:
    empty_dir = f'{sim_path}/{empty_name}'
    os.rmdir(empty_dir)

In [6]:
# Failed simulations are moved next to the experiment folder instead of being deleted.
error_path = f'../{scenario}/Results/{experiment_name}_error_sims'
os.makedirs(error_path, exist_ok=True)

# A folder can be listed more than once in error_folders (e.g. missing files
# and a non-empty error log). Moving it a second time would fail because the
# source no longer exists, so each folder is moved exactly once, in the order
# it was first reported.
for folder in dict.fromkeys(error_folders):
    shutil.move(f'{sim_path}/{folder}', f'{error_path}/{folder}')

# Load data

### Load simulation data

In [7]:
# Read the incident settings of the first ten simulations only.
preview_folders = simulation_folders[:10]

incident_settings_arr = []
for simulation in tqdm(preview_folders):
    simulation_path = f'{sim_path}/{simulation}'
    settings_file = f'{simulation_path}/incident_settings.json'
    with open(settings_file) as f:
        incident_settings = json.load(f)
    incident_settings_arr.append(incident_settings)
    

    

100%|████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 15517.22it/s]


In [8]:
from multiprocessing import Pool

In [29]:
# simulation_folders was built before the clean-up cells ran, so it can still
# contain folders that have since been removed (empty) or moved away (errors).
# Only folders that are still present in sim_path are handed to the workers.
simulation_paths = [
    f'{sim_path}/{simulation}'
    for simulation in simulation_folders
    if os.path.isdir(f'{sim_path}/{simulation}')
]

In [30]:
# Number of cores that are not used for the pool (value kept from the original
# `os.cpu_count() - 24`).
RESERVED_CORES = 24

# os.cpu_count() may return None, and on machines with RESERVED_CORES or fewer
# cores the difference is not positive, which Pool rejects. Always use at
# least one worker.
n_workers = max(1, (os.cpu_count() or 1) - RESERVED_CORES)

# pool.imap yields the results in the order of simulation_paths. Passing
# `total` lets tqdm show actual progress, since the imap iterator has no length.
with Pool(n_workers) as pool:
    returns = list(tqdm(pool.imap(infer_incident_data, simulation_paths),
                        total=len(simulation_paths)))

1627it [03:48,  7.13it/s]


In [31]:
# Required size of axis 2 of a simulation's full incident data (results[4]).
# Simulations with a different size are left out of the dataset.
# NOTE(review): 231 is presumably the length of a complete simulation - confirm.
EXPECTED_AXIS2_LEN = 231

# Keep only the simulations whose full incident data has the expected size.
valid_results = [results for results in returns
                 if results[4].shape[2] == EXPECTED_AXIS2_LEN]

# Report how many simulations were filtered out instead of dropping them silently.
n_dropped = len(returns) - len(valid_results)
if n_dropped:
    print(f'Dropped {n_dropped} of {len(returns)} simulations: '
          f'full incident data axis 2 != {EXPECTED_AXIS2_LEN}')
if not valid_results:
    raise ValueError('No simulation passed the shape check, nothing to stack')

# Split the per-simulation result tuples into one list per field.
input_data_arr = [results[0] for results in valid_results]
target_data_arr = [results[1] for results in valid_results]
incident_info_arr = [results[2] for results in valid_results]
network_info_arr = [results[3] for results in valid_results]
full_inci_data_arr = [results[4] for results in valid_results]
counter_data_arr = [results[5] for results in valid_results]
ind_to_edge_arr = [results[6] for results in valid_results]
incident_settings_arr = [results[7] for results in valid_results]

# Stack the per-simulation entries along a new leading axis.
input_data = np.array(input_data_arr)
target_data = np.array(target_data_arr)
incident_info = np.array(incident_info_arr)
network_info = np.array(network_info_arr)
full_inci_data = np.array(full_inci_data_arr)
counter_data = np.array(counter_data_arr)
incident_settings = np.array(incident_settings_arr)
# Only the mapping of the first kept simulation is used.
# NOTE(review): this assumes the mapping is identical for all simulations - confirm.
ind_to_edge = ind_to_edge_arr[0]

In [32]:
# Shapes of the stacked arrays, in the order they were built.
array_shapes = {
    'input data': input_data.shape,
    'target data': target_data.shape,
    'incident info': incident_info.shape,
    'network info': network_info.shape,
    'full inci data': full_inci_data.shape,
    'counter data': counter_data.shape,
}
for label, shape in array_shapes.items():
    print(f'{label} {shape}')

# For these two only the number of entries is reported.
print(f'ind_to_edge {len(ind_to_edge)}')
print(f'incident_settings {len(incident_settings)}')


input data (1422, 147, 6, 10, 5)
target data (1422, 147, 4)
incident info (1422, 4)
network info (1422, 147, 13)
full inci data (1422, 147, 6, 231, 5)
counter data (1422, 147, 6, 231, 3)
ind_to_edge 147
incident_settings 1422


In [40]:
# Arrays to persist, keyed by the file name (without extension) they are
# written to inside the experiment folder.
arrays_to_save = {
    'input_data': input_data,
    'target_data': target_data,
    'incident_info': incident_info,
    'network_info': network_info,
    'full_inci_data': full_inci_data,
    'counter_data': counter_data,
    'incident_settings': incident_settings,
}
for file_stem, array in arrays_to_save.items():
    np.save(f'{path}/{file_stem}.npy', array)

# The index-to-edge mapping is stored as JSON rather than as an array.
with open(f'{path}/ind_to_edge.json', 'w') as fp:
    json.dump(ind_to_edge_arr[0], fp)

In [43]:
# Show the contents of the experiment folder.
os.listdir(path)

['full_inci_data.npy',
 'network_info.npy',
 'ind_to_edge.json',
 'inci_data.npy',
 'simulations',
 'incident_settings.npy',
 'target_data.npy',
 'input_data.npy',
 'incident_info.npy',
 'counter_data.npy']