# Preprocess ML data
This notebook generates a dataset for machine learning from the result folders of SUMO simulations.

Note that the notebook calls *infer_incident_data* (imported from `util_folder.preprocess_utils`).

In [1]:
import sys
sys.path.append('..')

In [2]:
import os
import glob
import json
import pandas as pd
from tqdm import tqdm
import numpy as np
import shutil
     
from util_folder.preprocess_utils import infer_incident_data


## Select experiment

In [3]:
# --- Experiment selection -------------------------------------------------
# Name prefix of the result folders that belong to this experiment.
experiment_name = 'small_run'

# Scenario the simulations were run on, and its folder relative to this notebook.
scenario = 'motorway'
scenario_folder = f'../{scenario}'

# Directory whose entries are the simulation result folders.
sim_path = '/mnt/raid1/manity/folder_small_run'

# Every entry in sim_path is counted here; filtering by experiment name happens later.
result_folders = os.listdir(sim_path)
print(f'{len(result_folders)} simulations')



20 simulations


In [4]:
# Result folders of this experiment. Plain files that happen to share the
# prefix are ignored, since os.listdir() below would raise on them.
simulation_folders = [
    folder for folder in result_folders
    if folder.startswith(experiment_name) and os.path.isdir(f'{sim_path}/{folder}')
]

# Number of entries a result folder must contain to pass the check below.
expected_n_files = 7

# Check for uncaught errors.
# Every folder ends up in AT MOST ONE of the three lists, and at most once:
#   zero_folders  - folder is empty (deleted by a later cell)
#   error_folders - wrong number of files or a non-empty *.err log
#   missing_csvs  - otherwise fine, but a detector csv is missing
# The later cells delete/move the listed folders one list after the other,
# so a folder listed twice would make the second delete/move fail.
error_folders = []
zero_folders = []
missing_csvs = []
for folder in simulation_folders:
    folder_dir = os.listdir(f'{sim_path}/{folder}')
    has_error = False
    has_missing_csv = False

    if len(folder_dir) != expected_n_files:
        print(f'{folder} is missing files it has only {len(folder_dir)}')
        if len(folder_dir) == 0:
            # Nothing else to inspect in an empty folder.
            zero_folders.append(folder)
            continue
        has_error = True

    for log in glob.glob(f'{sim_path}/{folder}/*.err'):
        with open(log, 'r') as read_obj:
            # A single readable character means the error log is not empty.
            if read_obj.read(1):
                print(f'{folder} has error in {log}')
                has_error = True

    if 'detectordata_counterfactual.csv' not in folder_dir:
        print(f'{folder} is missing counterfactual csv')
        has_missing_csv = True

    if 'detectordata.csv' not in folder_dir:
        print(f'{folder} is missing csv')
        has_missing_csv = True

    # Errors take precedence because the error folders are moved first.
    if has_error:
        error_folders.append(folder)
    elif has_missing_csv:
        missing_csvs.append(folder)

    

In [5]:
# Summary of the folder check: how many folders fell into each category.
for label, flagged in (
    ('zero', zero_folders),
    ('error', error_folders),
    ('missing csv', missing_csvs),
):
    print(f'number of {label} folders: {len(flagged)}')

number of zero folders: 0
number of error folders: 0
number of missing csv folders: 0


In [6]:
# Delete the result folders that were found to be empty
# (os.rmdir only removes empty directories).
for empty_folder in zero_folders:
    os.rmdir(f'{sim_path}/{empty_folder}')

In [7]:
# Quarantine directory for result folders that failed the error check.
error_path = f'{sim_path}/error_folders'
os.makedirs(error_path, exist_ok=True)

# A folder can be listed more than once in error_folders (e.g. wrong file
# count AND a non-empty error log). dict.fromkeys drops the duplicates while
# keeping the order, and the isdir guard skips folders that are no longer in
# sim_path, so shutil.move never runs on a path that is already gone.
for folder in dict.fromkeys(error_folders):
    source = f'{sim_path}/{folder}'
    if os.path.isdir(source):
        shutil.move(source, f'{error_path}/{folder}')

In [8]:
# Quarantine directory for result folders that lack a detector csv.
missing_csv_path = f'{sim_path}/missing_csv'
os.makedirs(missing_csv_path, exist_ok=True)

# A folder is listed twice when both csv files are missing, and it may already
# have been deleted (empty folder) or moved (error folder) by the previous
# cells. Drop duplicates and skip folders that are no longer in sim_path so
# shutil.move never runs on a path that is already gone.
for folder in dict.fromkeys(missing_csvs):
    source = f'{sim_path}/{folder}'
    if os.path.isdir(source):
        shutil.move(source, f'{missing_csv_path}/{folder}')

In [9]:
# Report how many folders each quarantine directory currently holds.
for label, quarantine_dir in (
    ('error', error_path),
    ('missing csv', missing_csv_path),
):
    print(f'Number of moved {label} folders: {len(os.listdir(quarantine_dir))}')

Number of moved error folders: 0
Number of moved missing csv folders: 0


# Load data

### Load simulation data

In [10]:
# Folders flagged by the checks above have been deleted or moved out of
# sim_path by now. Keep only the simulations that are still in place, because
# every later cell reads files from these folders.
simulation_folders = [
    folder for folder in simulation_folders
    if os.path.isdir(f'{sim_path}/{folder}')
]
print(f'Number of simulations {len(simulation_folders)}')

Number of simulations 20


In [12]:
# Read the incident settings (JSON) stored in every simulation folder.
incident_settings_arr = []
for simulation in tqdm(simulation_folders[:]):
    simulation_path = f'{sim_path}/{simulation}'
    settings_file = f'{simulation_path}/incident_settings.json'
    with open(settings_file) as f:
        incident_settings = json.load(f)
    # Appending after the file is closed keeps the handle open no longer than needed.
    incident_settings_arr.append(incident_settings)
    

    

100%|██████████| 20/20 [00:00<00:00, 18359.83it/s]


In [13]:
from multiprocessing import Pool

In [14]:
# Network file handed to infer_incident_data together with each simulation folder.
net_path = '../Simulation_scenarios/motorway/Simulations/Base/network.net.xml'

# One [simulation folder, network file] argument pair per simulation.
# NOTE: only the first 10 simulation folders are processed.
simulation_paths = []
for simulation in simulation_folders[:10]:
    simulation_paths.append([f'{sim_path}/{simulation}', net_path])

In [15]:
# Leave 24 cores free for other work, but always use at least one worker:
# Pool() raises ValueError for a process count below 1, and os.cpu_count()
# may return None when the core count cannot be determined.
n_workers = max(1, (os.cpu_count() or 1) - 24)

with Pool(n_workers) as pool:
    # imap yields the results in input order; `total` lets tqdm show real progress.
    returns = list(tqdm(pool.imap(infer_incident_data, simulation_paths),
                        total=len(simulation_paths)))

10it [00:01,  6.34it/s]


In [16]:
# Number of results collected from the worker pool (one per entry of simulation_paths).
len(returns)

10

In [17]:
# Only simulations whose full incident array (results[4]) has this length
# along axis 2 are kept.
# NOTE(review): presumably the number of time steps, filtered so that all
# per-simulation arrays share one shape when stacked - confirm.
expected_axis2_len = 231

input_data_arr = []
target_data_arr = []
incident_info_arr = []
network_info_arr = []
full_inci_data_arr = []
counter_data_arr = []
ind_to_edge_arr = []
incident_settings_arr = []

# Each element of `returns` is the tuple produced by infer_incident_data for
# one simulation; split it into one list per output.
for results in returns:
    print(results[4].shape[2])
    if results[4].shape[2] == expected_axis2_len:
        input_data_arr.append(results[0])
        target_data_arr.append(results[1])
        incident_info_arr.append(results[2])
        network_info_arr.append(results[3])
        full_inci_data_arr.append(results[4])
        counter_data_arr.append(results[5])
        ind_to_edge_arr.append(results[6])
        incident_settings_arr.append(results[7])

# Fail with a clear message instead of an IndexError on ind_to_edge_arr[0]
# (and silently empty arrays) when every simulation was filtered out.
if not input_data_arr:
    raise ValueError(
        f'No simulation result has length {expected_axis2_len} along axis 2 '
        f'of the full incident data ({len(returns)} results checked)'
    )

# Stack the per-simulation outputs along a new leading axis.
input_data = np.array(input_data_arr)
target_data = np.array(target_data_arr)
incident_info = np.array(incident_info_arr)
network_info = np.array(network_info_arr)
full_inci_data = np.array(full_inci_data_arr)
counter_data = np.array(counter_data_arr)
incident_settings = np.array(incident_settings_arr)
# The mapping of the first kept simulation is used for the whole dataset.
ind_to_edge = ind_to_edge_arr[0]

231
231
231
231
231
231
231
231
231
231


In [18]:
# Shapes of the stacked arrays (the leading axis is the simulation index).
for label, stacked in (
    ('input data', input_data),
    ('target data', target_data),
    ('incident info', incident_info),
    ('network info', network_info),
    ('full inci data', full_inci_data),
    ('counter data', counter_data),
):
    print(f'{label} {stacked.shape}')

# These two are reported by length rather than shape.
print(f'ind_to_edge {len(ind_to_edge)}')
print(f'incident_settings {len(incident_settings)}')


input data (10, 147, 6, 10, 5)
target data (10, 147, 4)
incident info (10, 4)
network info (10, 147, 13)
full inci data (10, 147, 6, 231, 5)
counter data (10, 147, 6, 231, 3)
ind_to_edge 147
incident_settings 10


In [40]:
# Write every array of the dataset to `<path>/<name>.npy`.
# NOTE(review): `path` is not assigned in any cell visible in this notebook -
# it must be defined before this cell for a fresh-kernel run to work.
arrays_to_save = {
    'input_data': input_data,
    'target_data': target_data,
    'incident_info': incident_info,
    'network_info': network_info,
    'full_inci_data': full_inci_data,
    'counter_data': counter_data,
    'incident_settings': incident_settings,
}
for stem, array in arrays_to_save.items():
    np.save(f'{path}/{stem}.npy', array)

# The index-to-edge mapping is stored as JSON next to the arrays.
with open(f'{path}/ind_to_edge.json', 'w') as fp:
    json.dump(ind_to_edge_arr[0], fp)

In [43]:
# Show the contents of the directory the dataset files were written to.
os.listdir(f'{path}')

['full_inci_data.npy',
 'network_info.npy',
 'ind_to_edge.json',
 'inci_data.npy',
 'simulations',
 'incident_settings.npy',
 'target_data.npy',
 'input_data.npy',
 'incident_info.npy',
 'counter_data.npy']