# Data Aggregation

This notebook aggregates sim data to a dataset that can be used for ML training

In [33]:
import os
import json
import polars as pl

In [3]:
data_folder = "../data/"
data_paths = [os.path.join(data_folder, file) for file in os.listdir(data_folder)]
data_paths

['../data/2025-04-08-22:50:10',
 '../data/2025-04-09-00:54:16',
 '../data/2025-04-08-23:05:05',
 '../data/2025-04-08-21:54:43',
 '../data/2025-04-08-23:39:25',
 '../data/2025-04-09-01:33:08',
 '../data/2025-04-08-22:33:13',
 '../data/2025-04-08-21:45:18',
 '../data/2025-04-09-01:16:01',
 '../data/2025-04-09-01:44:54',
 '../data/2025-04-08-23:34:08',
 '../data/2025-04-08-23:19:32',
 '../data/2025-04-09-00:02:57',
 '../data/2025-04-08-23:29:58',
 '../data/2025-04-08-23:47:17',
 '../data/2025-04-09-00:21:06',
 '../data/2025-04-08-22:17:43',
 '../data/2025-04-08-21:26:58',
 '../data/2025-04-08-22:47:19',
 '../data/2025-04-08-22:58:32']

In [14]:
traffic = pl.read_parquet(os.path.join(data_paths[0], "traffic.parquet"))
traffic

Step,Light_ID,Time,Lane,Num_Cars
i32,i16,i16,str,i16
1,1,199,"""intersection_6""",0
1,1,199,"""intersection_46""",0
1,1,199,"""intersection_16""",0
1,1,199,"""intersection_24""",0
2,1,198,"""intersection_6""",0
…,…,…,…,…
999,52,1,"""intersection_47""",0
1000,52,200,"""intersection_16""",0
1000,52,200,"""intersection_27""",0
1000,52,200,"""intersection_38""",0


In [15]:
light_data = pl.read_parquet(os.path.join(data_paths[0], "light_data.parquet"))
light_data

Light_ID,Centrality,Is_Entrypoint
i16,f32,bool
1,0.024816,true
2,0.0231,true
3,0.019337,true
4,0.01774,true
5,0.019902,true
…,…,…
48,0.021564,true
49,0.023838,true
50,0.020373,true
51,0.017215,true


In [16]:
connections = pl.read_parquet(os.path.join(data_paths[0], "connections.parquet"))
connections

Intersection_u,Intersection_v,Distance
str,str,i16
"""intersection_0""","""intersection_6""",13
"""intersection_0""","""intersection_46""",9
"""intersection_0""","""intersection_16""",11
"""intersection_0""","""intersection_24""",12
"""intersection_0""","""border_190""",2
…,…,…
"""intersection_51""","""border_186""",4
"""intersection_51""","""border_115""",17
"""intersection_51""","""border_98""",4
"""intersection_51""","""border_76""",17


In [17]:
light_intersection_mapping = pl.read_parquet(
    os.path.join(data_paths[0], "light_intersection_mapping.parquet")
)
light_intersection_mapping

Light_ID,Intersection
i16,str
1,"""intersection_0"""
2,"""intersection_1"""
3,"""intersection_2"""
4,"""intersection_3"""
5,"""intersection_4"""
…,…
48,"""intersection_47"""
49,"""intersection_48"""
50,"""intersection_49"""
51,"""intersection_50"""


In [18]:
data = traffic.join(light_data, on="Light_ID", how="left")
data

Step,Light_ID,Time,Lane,Num_Cars,Centrality,Is_Entrypoint
i32,i16,i16,str,i16,f32,bool
1,1,199,"""intersection_6""",0,0.024816,true
1,1,199,"""intersection_46""",0,0.024816,true
1,1,199,"""intersection_16""",0,0.024816,true
1,1,199,"""intersection_24""",0,0.024816,true
2,1,198,"""intersection_6""",0,0.024816,true
…,…,…,…,…,…,…
999,52,1,"""intersection_47""",0,0.022771,true
1000,52,200,"""intersection_16""",0,0.022771,true
1000,52,200,"""intersection_27""",0,0.022771,true
1000,52,200,"""intersection_38""",0,0.022771,true


In [19]:
data = data.join(light_intersection_mapping, on="Light_ID", how="left")
data = data.rename(mapping={"Intersection": "Intersection_u"})
data

Step,Light_ID,Time,Lane,Num_Cars,Centrality,Is_Entrypoint,Intersection_u
i32,i16,i16,str,i16,f32,bool,str
1,1,199,"""intersection_6""",0,0.024816,true,"""intersection_0"""
1,1,199,"""intersection_46""",0,0.024816,true,"""intersection_0"""
1,1,199,"""intersection_16""",0,0.024816,true,"""intersection_0"""
1,1,199,"""intersection_24""",0,0.024816,true,"""intersection_0"""
2,1,198,"""intersection_6""",0,0.024816,true,"""intersection_0"""
…,…,…,…,…,…,…,…
999,52,1,"""intersection_47""",0,0.022771,true,"""intersection_51"""
1000,52,200,"""intersection_16""",0,0.022771,true,"""intersection_51"""
1000,52,200,"""intersection_27""",0,0.022771,true,"""intersection_51"""
1000,52,200,"""intersection_38""",0,0.022771,true,"""intersection_51"""


In [20]:
data = data.join(
    connections,
    left_on=["Intersection_u", "Lane"],
    right_on=["Intersection_u", "Intersection_v"],
    how="left",
)
data

Step,Light_ID,Time,Lane,Num_Cars,Centrality,Is_Entrypoint,Intersection_u,Distance
i32,i16,i16,str,i16,f32,bool,str,i16
1,1,199,"""intersection_6""",0,0.024816,true,"""intersection_0""",13
1,1,199,"""intersection_46""",0,0.024816,true,"""intersection_0""",9
1,1,199,"""intersection_16""",0,0.024816,true,"""intersection_0""",11
1,1,199,"""intersection_24""",0,0.024816,true,"""intersection_0""",12
2,1,198,"""intersection_6""",0,0.024816,true,"""intersection_0""",13
…,…,…,…,…,…,…,…,…
999,52,1,"""intersection_47""",0,0.022771,true,"""intersection_51""",11
1000,52,200,"""intersection_16""",0,0.022771,true,"""intersection_51""",9
1000,52,200,"""intersection_27""",0,0.022771,true,"""intersection_51""",16
1000,52,200,"""intersection_38""",0,0.022771,true,"""intersection_51""",32


In [21]:
data = data.with_columns([pl.lit(0).alias("Sim_ID")])
data

Step,Light_ID,Time,Lane,Num_Cars,Centrality,Is_Entrypoint,Intersection_u,Distance,Sim_ID
i32,i16,i16,str,i16,f32,bool,str,i16,i32
1,1,199,"""intersection_6""",0,0.024816,true,"""intersection_0""",13,0
1,1,199,"""intersection_46""",0,0.024816,true,"""intersection_0""",9,0
1,1,199,"""intersection_16""",0,0.024816,true,"""intersection_0""",11,0
1,1,199,"""intersection_24""",0,0.024816,true,"""intersection_0""",12,0
2,1,198,"""intersection_6""",0,0.024816,true,"""intersection_0""",13,0
…,…,…,…,…,…,…,…,…,…
999,52,1,"""intersection_47""",0,0.022771,true,"""intersection_51""",11,0
1000,52,200,"""intersection_16""",0,0.022771,true,"""intersection_51""",9,0
1000,52,200,"""intersection_27""",0,0.022771,true,"""intersection_51""",16,0
1000,52,200,"""intersection_38""",0,0.022771,true,"""intersection_51""",32,0


In [22]:
def agg_data(folder: str, sim_id: int) -> pl.DataFrame:
    """Function to aggregate data from a simulation run

    Args:
        folder (str): Path to the folder the data is stored in
        sim_id (int): ID of the simulation run

    Returns:
        pl.DataFrame: Aggregated data
    """
    traffic = pl.read_parquet(os.path.join(folder, "traffic.parquet"))
    light_data = pl.read_parquet(os.path.join(folder, "light_data.parquet"))
    connections = pl.read_parquet(os.path.join(folder, "connections.parquet"))
    light_intersection_mapping = pl.read_parquet(
        os.path.join(folder, "light_intersection_mapping.parquet")
    )

    data = traffic.join(light_data, on="Light_ID", how="left")
    data = data.join(light_intersection_mapping, on="Light_ID", how="left")
    data = data.rename(mapping={"Intersection": "Intersection_u"})
    data = data.join(
        connections,
        left_on=["Intersection_u", "Lane"],
        right_on=["Intersection_u", "Intersection_v"],
        how="left",
    )

    data = data.with_columns([pl.lit(sim_id).alias("Sim_ID")])

    return data

In [23]:
data = pl.DataFrame()
for i in range(len(data_paths)):
    new_data = agg_data(folder=data_paths[i], sim_id=i)
    data = data.vstack(new_data)

In [24]:
data

Step,Light_ID,Time,Lane,Num_Cars,Centrality,Is_Entrypoint,Intersection_u,Distance,Sim_ID
i32,i16,i16,str,i16,f32,bool,str,i16,i32
1,1,199,"""intersection_6""",0,0.024816,true,"""intersection_0""",13,0
1,1,199,"""intersection_46""",0,0.024816,true,"""intersection_0""",9,0
1,1,199,"""intersection_16""",0,0.024816,true,"""intersection_0""",11,0
1,1,199,"""intersection_24""",0,0.024816,true,"""intersection_0""",12,0
2,1,198,"""intersection_6""",0,0.024816,true,"""intersection_0""",13,0
…,…,…,…,…,…,…,…,…,…
999,84,1,"""intersection_61""",2,0.033812,true,"""intersection_83""",5,19
1000,84,200,"""intersection_1""",0,0.033812,true,"""intersection_83""",5,19
1000,84,200,"""intersection_23""",0,0.033812,true,"""intersection_83""",12,19
1000,84,200,"""intersection_43""",2,0.033812,true,"""intersection_83""",5,19


In [26]:
data.write_parquet("data.parquet")

In [28]:
metadata = pl.DataFrame()
for i in range(len(data_paths)):
    new_metadata = pl.read_json(os.path.join(data_paths[i], "config.json"))
    new_metadata = new_metadata.with_columns([pl.lit(i).alias("Sim_ID")])
    metadata = metadata.vstack(new_metadata)
metadata

num_intersections,num_cars,num_borders,min_distance,max_distance,optimization_type,steps,Sim_ID
i64,i64,i64,i64,i64,str,i64,i32
52,296,208,9,36,"""simple""",1000,0
125,1229,500,10,40,"""simple""",1000,1
81,409,324,5,10,"""simple""",1000,2
92,621,460,10,40,"""simple""",1000,3
66,429,198,9,27,"""simple""",1000,4
…,…,…,…,…,…,…,…
105,823,420,6,18,"""simple""",1000,15
99,933,297,5,15,"""simple""",1000,16
116,663,464,10,20,"""simple""",1000,17
85,773,340,10,20,"""simple""",1000,18


In [30]:
metadata.write_parquet("metadata.parquet")

In [32]:
agg_metadata = {
    "Sim Runs": (metadata["Sim_ID"].max() + 1),
    "Avg. Intersections": metadata["num_intersections"].mean(),
    "Avg. Cars": metadata["num_cars"].mean(),
    "Avg. Borders": metadata["num_borders"].mean(),
    "Avg. Min. Distance": metadata["min_distance"].mean(),
    "Avg. Max. Distance": metadata["max_distance"].mean(),
    "Optimization": metadata["optimization_type"].max(),
    "Steps": metadata["steps"].max(),
}
agg_metadata

{'Sim Runs': 20,
 'Avg. Intersections': 92.65,
 'Avg. Cars': 688.95,
 'Avg. Borders': 335.15,
 'Avg. Min. Distance': 7.85,
 'Avg. Max. Distance': 23.7,
 'Optimization': 'simple',
 'Steps': 1000}

In [34]:
with open(file="metadata.json", mode="w") as file:
    json.dump(obj=agg_metadata, fp=file, indent=4)