# Aggregate Sim Data

This notebook aggregates the output of multiple simulation runs into a single dataset used for ML training.

In [2]:
import os
import polars as pl

## Run 1

In [3]:
# Timestamped output directory of the first simulation run; the parquet files
# loaded in the following cells are all read from here.
folder = "../data/2025-04-04-14:58:25"

In [4]:
# Arrivals per (Step, Light_ID, Time); this frame is the base table that all
# other inputs are joined onto further down.
arrivals = pl.read_parquet(os.path.join(folder, "arrivals.parquet"))
arrivals

Step,Light_ID,Time,Arrivals
i32,i16,i16,i16
1,1,199,0
2,1,198,0
3,1,197,0
4,1,196,0
5,1,195,0
…,…,…,…
995,86,5,0
997,86,3,1
998,86,2,0
999,86,1,0


In [5]:
# Load the road connections and keep only those whose two endpoints are both
# real intersections, i.e. neither name starts with "border".
connections = pl.read_parquet(os.path.join(folder, "connections.parquet")).filter(
    ~pl.col("Intersection_u").str.starts_with("border")
    & ~pl.col("Intersection_v").str.starts_with("border")
)
connections

Intersection_u,Intersection_v,Distance
str,str,i16
"""intersection_0""","""intersection_6""",9
"""intersection_0""","""intersection_72""",7
"""intersection_0""","""intersection_8""",7
"""intersection_0""","""intersection_29""",19
"""intersection_1""","""intersection_27""",11
…,…,…
"""intersection_84""","""intersection_22""",19
"""intersection_84""","""intersection_36""",14
"""intersection_84""","""intersection_67""",7
"""intersection_85""","""intersection_5""",5


In [6]:
# Static per-light metadata (Centrality, Is_Entrypoint), keyed by Light_ID.
light_data = pl.read_parquet(os.path.join(folder, "light_data.parquet"))
light_data

Light_ID,Centrality,Is_Entrypoint
i16,f32,bool
1,0.026253,true
2,0.025178,true
3,0.030019,true
4,0.024715,true
5,0.030451,true
…,…,…
82,0.029477,true
83,0.024726,true
84,0.024884,true
85,0.025494,true


In [7]:
# Lookup table that maps each Light_ID to an Intersection name; needed because
# connections are expressed in intersection names rather than light ids.
light_intersection_mapping = pl.read_parquet(
    os.path.join(folder, "light_intersection_mapping.parquet")
)
light_intersection_mapping

Light_ID,Intersection
i16,str
1,"""intersection_0"""
2,"""intersection_1"""
3,"""intersection_2"""
4,"""intersection_3"""
5,"""intersection_4"""
…,…
82,"""intersection_81"""
83,"""intersection_82"""
84,"""intersection_83"""
85,"""intersection_84"""


In [8]:
# Car counts (Num_Cars) per (Step, Light_ID, Time); judging by the preview it
# uses the same key columns as arrivals.
traffic = pl.read_parquet(os.path.join(folder, "traffic.parquet"))
traffic

Step,Light_ID,Time,Num_Cars
i32,i16,i16,i16
1,1,199,0
2,1,198,0
3,1,197,6
4,1,196,11
5,1,195,11
…,…,…,…
995,86,5,5
997,86,3,7
998,86,2,6
999,86,1,7


### Aggregate Data

Arrivals Data is used as the base as arrivals will be used as the label for ML.

Add LightAgent Metadata

In [9]:
# Left join keeps every arrivals row; the per-light metadata is repeated on
# each step of that light.
data = arrivals.join(light_data, on="Light_ID", how="left")
data

Step,Light_ID,Time,Arrivals,Centrality,Is_Entrypoint
i32,i16,i16,i16,f32,bool
1,1,199,0,0.026253,true
2,1,198,0,0.026253,true
3,1,197,0,0.026253,true
4,1,196,0,0.026253,true
5,1,195,0,0.026253,true
…,…,…,…,…,…
995,86,5,0,0.028758,true
997,86,3,1,0.028758,true
998,86,2,0,0.028758,true
999,86,1,0,0.028758,true


Add Intersection Mapping

In [10]:
# Adds the Intersection name for each light, which is the key used to join
# against connections.
data = data.join(light_intersection_mapping, on="Light_ID", how="left")
data

Step,Light_ID,Time,Arrivals,Centrality,Is_Entrypoint,Intersection
i32,i16,i16,i16,f32,bool,str
1,1,199,0,0.026253,true,"""intersection_0"""
2,1,198,0,0.026253,true,"""intersection_0"""
3,1,197,0,0.026253,true,"""intersection_0"""
4,1,196,0,0.026253,true,"""intersection_0"""
5,1,195,0,0.026253,true,"""intersection_0"""
…,…,…,…,…,…,…
995,86,5,0,0.028758,true,"""intersection_85"""
997,86,3,1,0.028758,true,"""intersection_85"""
998,86,2,0,0.028758,true,"""intersection_85"""
999,86,1,0,0.028758,true,"""intersection_85"""


Add Connections Data

In [11]:
# Fans every row out across the connections that list this light's
# intersection as Intersection_u: one output row per (row, connection).
# Because it is a left join, rows without a matching connection are kept with
# null Intersection_v / Distance.
data = data.join(
    connections, left_on="Intersection", right_on="Intersection_u", how="left"
)
data

Step,Light_ID,Time,Arrivals,Centrality,Is_Entrypoint,Intersection,Intersection_v,Distance
i32,i16,i16,i16,f32,bool,str,str,i16
1,1,199,0,0.026253,true,"""intersection_0""","""intersection_6""",9
1,1,199,0,0.026253,true,"""intersection_0""","""intersection_72""",7
1,1,199,0,0.026253,true,"""intersection_0""","""intersection_8""",7
1,1,199,0,0.026253,true,"""intersection_0""","""intersection_29""",19
2,1,198,0,0.026253,true,"""intersection_0""","""intersection_6""",9
…,…,…,…,…,…,…,…,…
998,86,2,0,0.028758,true,"""intersection_85""","""intersection_30""",8
999,86,1,0,0.028758,true,"""intersection_85""","""intersection_5""",5
999,86,1,0,0.028758,true,"""intersection_85""","""intersection_30""",8
1000,86,200,0,0.028758,true,"""intersection_85""","""intersection_5""",5


Add Traffic

In [12]:
# Give traffic the same Intersection column that data has, so it can be
# expanded by connections in the same way.
traffic = traffic.join(light_intersection_mapping, on="Light_ID", how="left")
traffic

Step,Light_ID,Time,Num_Cars,Intersection
i32,i16,i16,i16,str
1,1,199,0,"""intersection_0"""
2,1,198,0,"""intersection_0"""
3,1,197,6,"""intersection_0"""
4,1,196,11,"""intersection_0"""
5,1,195,11,"""intersection_0"""
…,…,…,…,…
995,86,5,5,"""intersection_85"""
997,86,3,7,"""intersection_85"""
998,86,2,6,"""intersection_85"""
999,86,1,7,"""intersection_85"""


In [13]:
# Same fan-out as applied to data: traffic ends up with one row per
# (step, light, connection).
traffic = traffic.join(
    connections, left_on="Intersection", right_on="Intersection_u", how="left"
)
traffic


Step,Light_ID,Time,Num_Cars,Intersection,Intersection_v,Distance
i32,i16,i16,i16,str,str,i16
1,1,199,0,"""intersection_0""","""intersection_6""",9
1,1,199,0,"""intersection_0""","""intersection_72""",7
1,1,199,0,"""intersection_0""","""intersection_8""",7
1,1,199,0,"""intersection_0""","""intersection_29""",19
2,1,198,0,"""intersection_0""","""intersection_6""",9
…,…,…,…,…,…,…
998,86,2,6,"""intersection_85""","""intersection_30""",8
999,86,1,7,"""intersection_85""","""intersection_5""",5
999,86,1,7,"""intersection_85""","""intersection_30""",8
1000,86,200,8,"""intersection_85""","""intersection_5""",5


In [14]:
# Attach Num_Cars by key rather than by row position. hstack only pastes
# columns side by side, so it silently depends on `data` and `traffic` having
# exactly the same height and row order after their separate joins. Joining on
# the step keys gives the same columns without that assumption.
# `traffic` was fanned out per connection above, so collapse it back to one
# row per (Step, Light_ID, Time) first to avoid multiplying rows.
data = data.join(
    traffic.select(["Step", "Light_ID", "Time", "Num_Cars"]).unique(),
    on=["Step", "Light_ID", "Time"],
    how="left",
)
data

Step,Light_ID,Time,Arrivals,Centrality,Is_Entrypoint,Intersection,Intersection_v,Distance,Num_Cars
i32,i16,i16,i16,f32,bool,str,str,i16,i16
1,1,199,0,0.026253,true,"""intersection_0""","""intersection_6""",9,0
1,1,199,0,0.026253,true,"""intersection_0""","""intersection_72""",7,0
1,1,199,0,0.026253,true,"""intersection_0""","""intersection_8""",7,0
1,1,199,0,0.026253,true,"""intersection_0""","""intersection_29""",19,0
2,1,198,0,0.026253,true,"""intersection_0""","""intersection_6""",9,0
…,…,…,…,…,…,…,…,…,…
998,86,2,0,0.028758,true,"""intersection_85""","""intersection_30""",8,6
999,86,1,0,0.028758,true,"""intersection_85""","""intersection_5""",5,7
999,86,1,0,0.028758,true,"""intersection_85""","""intersection_30""",8,7
1000,86,200,0,0.028758,true,"""intersection_85""","""intersection_5""",5,8


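Shift the number of cars at each connected intersection 5 steps to the past as a proxy for incoming cars. Note that the join below matches on `Light_ID` and `Intersection_v`, so each row at step `s` receives the `Num_Cars` value recorded for the same light at step `s + 5`; the value is therefore identical for every connection of that light.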

In [15]:
# Lookup side of the self-join: relabel every row with the Step/Time it had
# 5 steps earlier. Time counts down while Step counts up, hence "+ 5" on Time.
# Time runs from 200 down to 1, so a wrapped value of 0 is mapped back to 200.
shifted_df = data.with_columns(
    shifted_step=pl.col("Step") - 5,
    shifted_time=(pl.col("Time") + 5) % 200,
).with_columns(
    shifted_time=pl.when(pl.col("shifted_time") == 0)
    .then(200)
    .otherwise(pl.col("shifted_time"))
)

shifted_df

Step,Light_ID,Time,Arrivals,Centrality,Is_Entrypoint,Intersection,Intersection_v,Distance,Num_Cars,shifted_step,shifted_time
i32,i16,i16,i16,f32,bool,str,str,i16,i16,i32,i16
1,1,199,0,0.026253,true,"""intersection_0""","""intersection_6""",9,0,-4,4
1,1,199,0,0.026253,true,"""intersection_0""","""intersection_72""",7,0,-4,4
1,1,199,0,0.026253,true,"""intersection_0""","""intersection_8""",7,0,-4,4
1,1,199,0,0.026253,true,"""intersection_0""","""intersection_29""",19,0,-4,4
2,1,198,0,0.026253,true,"""intersection_0""","""intersection_6""",9,0,-3,3
…,…,…,…,…,…,…,…,…,…,…,…
998,86,2,0,0.028758,true,"""intersection_85""","""intersection_30""",8,6,993,7
999,86,1,0,0.028758,true,"""intersection_85""","""intersection_5""",5,7,994,6
999,86,1,0,0.028758,true,"""intersection_85""","""intersection_30""",8,7,994,6
1000,86,200,0,0.028758,true,"""intersection_85""","""intersection_5""",5,8,995,5


In [16]:
# Self-join: match each row's (Step, Time) against the shifted labels of the
# same light and connection. The joined-in Num_Cars arrives as Num_Cars_right
# and stays null where no shifted row exists.
data = data.join(
    shifted_df.select(
        "shifted_step", "Light_ID", "shifted_time", "Intersection_v", "Num_Cars"
    ),
    how="left",
    left_on=["Step", "Light_ID", "Time", "Intersection_v"],
    right_on=["shifted_step", "Light_ID", "shifted_time", "Intersection_v"],
)
data

Step,Light_ID,Time,Arrivals,Centrality,Is_Entrypoint,Intersection,Intersection_v,Distance,Num_Cars,Num_Cars_right
i32,i16,i16,i16,f32,bool,str,str,i16,i16,i16
1,1,199,0,0.026253,true,"""intersection_0""","""intersection_6""",9,0,
1,1,199,0,0.026253,true,"""intersection_0""","""intersection_72""",7,0,
1,1,199,0,0.026253,true,"""intersection_0""","""intersection_8""",7,0,
1,1,199,0,0.026253,true,"""intersection_0""","""intersection_29""",19,0,
2,1,198,0,0.026253,true,"""intersection_0""","""intersection_6""",9,0,15
…,…,…,…,…,…,…,…,…,…,…
998,86,2,0,0.028758,true,"""intersection_85""","""intersection_30""",8,6,
999,86,1,0,0.028758,true,"""intersection_85""","""intersection_5""",5,7,
999,86,1,0,0.028758,true,"""intersection_85""","""intersection_30""",8,7,
1000,86,200,0,0.028758,true,"""intersection_85""","""intersection_5""",5,8,


In [17]:
# Drops every row that has a null in any column; this removes the rows for
# which the shifted lookup found no match (null Num_Cars_right).
data = data.drop_nulls()
data

Step,Light_ID,Time,Arrivals,Centrality,Is_Entrypoint,Intersection,Intersection_v,Distance,Num_Cars,Num_Cars_right
i32,i16,i16,i16,f32,bool,str,str,i16,i16,i16
2,1,198,0,0.026253,true,"""intersection_0""","""intersection_6""",9,0,15
2,1,198,0,0.026253,true,"""intersection_0""","""intersection_72""",7,0,15
2,1,198,0,0.026253,true,"""intersection_0""","""intersection_8""",7,0,15
2,1,198,0,0.026253,true,"""intersection_0""","""intersection_29""",19,0,15
3,1,197,0,0.026253,true,"""intersection_0""","""intersection_6""",9,6,15
…,…,…,…,…,…,…,…,…,…,…
993,86,7,1,0.028758,true,"""intersection_85""","""intersection_30""",8,6,6
994,86,6,2,0.028758,true,"""intersection_85""","""intersection_5""",5,6,7
994,86,6,2,0.028758,true,"""intersection_85""","""intersection_30""",8,6,7
995,86,5,0,0.028758,true,"""intersection_85""","""intersection_5""",5,5,8


In [18]:
# Remove the join keys and the unshifted Num_Cars; the remaining columns are
# the ones that end up in the final dataset.
data = data.drop(["Step", "Intersection", "Intersection_v", "Num_Cars"])
data

Light_ID,Time,Arrivals,Centrality,Is_Entrypoint,Distance,Num_Cars_right
i16,i16,i16,f32,bool,i16,i16
1,198,0,0.026253,true,9,15
1,198,0,0.026253,true,7,15
1,198,0,0.026253,true,7,15
1,198,0,0.026253,true,19,15
1,197,0,0.026253,true,9,15
…,…,…,…,…,…,…
86,7,1,0.028758,true,8,6
86,6,2,0.028758,true,5,7
86,6,2,0.028758,true,8,7
86,5,0,0.028758,true,5,8


In [19]:
# "Num_Cars_right" is the suffixed name the join gave the looked-up copy of
# Num_Cars; give it a descriptive name.
data = data.rename({"Num_Cars_right": "Incoming_Cars"})
data

Light_ID,Time,Arrivals,Centrality,Is_Entrypoint,Distance,Incoming_Cars
i16,i16,i16,f32,bool,i16,i16
1,198,0,0.026253,true,9,15
1,198,0,0.026253,true,7,15
1,198,0,0.026253,true,7,15
1,198,0,0.026253,true,19,15
1,197,0,0.026253,true,9,15
…,…,…,…,…,…,…
86,7,1,0.028758,true,8,6
86,6,2,0.028758,true,5,7
86,6,2,0.028758,true,8,7
86,5,0,0.028758,true,5,8


## Create Aggregation Function

In [20]:
def aggregate_data(
    folder: str, lag: int = 5, cycle_length: int = 200
) -> pl.DataFrame:
    """Build the training table for one simulation run.

    Reads ``arrivals``, ``connections``, ``light_data``,
    ``light_intersection_mapping`` and ``traffic`` parquet files from
    ``folder``, joins them into one row per (arrivals row, connection) and
    attaches a lagged car count as ``Incoming_Cars``.

    Args:
        folder: Directory containing the five parquet files of one run.
        lag: Number of steps between a row and the row its ``Incoming_Cars``
            value is taken from. Defaults to 5.
        cycle_length: Period of the ``Time`` column, which counts down from
            ``cycle_length`` to 1 and then wraps. Defaults to 200.

    Returns:
        DataFrame with the columns ``Light_ID``, ``Time``, ``Arrivals``,
        ``Centrality``, ``Is_Entrypoint``, ``Distance`` and ``Incoming_Cars``.
        Rows containing a null in any column (e.g. no lagged match, or no
        non-border connection) are dropped.
    """

    def load(name: str) -> pl.DataFrame:
        # All inputs of a run live side by side as "<name>.parquet".
        return pl.read_parquet(os.path.join(folder, f"{name}.parquet"))

    # Load Data
    arrivals = load("arrivals")
    # Keep only connections between two real intersections.
    connections = load("connections").filter(
        ~pl.col("Intersection_u").str.starts_with("border")
        & ~pl.col("Intersection_v").str.starts_with("border")
    )
    light_data = load("light_data")
    light_intersection_mapping = load("light_intersection_mapping")
    traffic = load("traffic")

    step_keys = ["Step", "Light_ID", "Time"]

    data = (
        arrivals
        # Add LightAgent Metadata
        .join(light_data, on="Light_ID", how="left")
        # Add Intersection Mapping
        .join(light_intersection_mapping, on="Light_ID", how="left")
        # Add Traffic. Joined by key instead of hstack-ing a separately joined
        # copy: hstack aligns purely by row position and therefore breaks (or
        # silently misaligns) as soon as the two frames differ in height or
        # row order.
        .join(traffic.select([*step_keys, "Num_Cars"]), on=step_keys, how="left")
        # Add Connections Data: one row per (row, connection).
        .join(
            connections,
            left_on="Intersection",
            right_on="Intersection_u",
            how="left",
        )
    )

    # Get Incoming Cars data.
    # Relabel every row with the Step/Time it had `lag` steps earlier, then
    # join that back on the real Step/Time. Time counts down while Step counts
    # up, hence "+ lag" on Time; a wrapped value of 0 corresponds to the top
    # of the cycle.
    # NOTE(review): the lookup is keyed on the row's own Light_ID, so
    # Incoming_Cars is this light's Num_Cars `lag` steps later, repeated for
    # every connection. Confirm this is the intended proxy rather than the
    # connected intersection's count.
    wrapped_time = (pl.col("Time") + lag) % cycle_length
    lagged = data.select(
        (pl.col("Step") - lag).alias("Step"),
        pl.col("Light_ID"),
        pl.when(wrapped_time == 0)
        .then(cycle_length)
        .otherwise(wrapped_time)
        .alias("Time"),
        pl.col("Intersection_v"),
        pl.col("Num_Cars").alias("Incoming_Cars"),
    )

    # Cleanup: drop rows without a complete set of values, then remove the
    # join keys and the unshifted Num_Cars.
    return (
        data.join(lagged, on=[*step_keys, "Intersection_v"], how="left")
        .drop_nulls()
        .drop(["Step", "Intersection", "Intersection_v", "Num_Cars"])
    )

## Run 2

In [21]:
# Second simulation run: build its table with aggregate_data instead of
# repeating the step-by-step cells used for run 1.
folder = "../data/2025-04-04-15:10:53"

new_data = aggregate_data(folder=folder)

new_data

Light_ID,Time,Arrivals,Centrality,Is_Entrypoint,Distance,Incoming_Cars
i16,i16,i16,f32,bool,i16,i16
1,198,0,0.02763,true,10,18
1,198,0,0.02763,true,8,18
1,198,0,0.02763,true,12,18
1,198,0,0.02763,true,11,18
1,197,0,0.02763,true,10,22
…,…,…,…,…,…,…
102,10,0,0.027976,true,8,19
102,5,0,0.027976,true,8,14
102,5,0,0.027976,true,9,14
102,5,0,0.027976,true,9,14


Combine Data

In [22]:
# Append run 2 below the rows collected so far; both frames must have the same
# columns and dtypes. Re-running this cell appends the same rows a second time.
data = data.vstack(new_data)
data

Light_ID,Time,Arrivals,Centrality,Is_Entrypoint,Distance,Incoming_Cars
i16,i16,i16,f32,bool,i16,i16
1,198,0,0.026253,true,9,15
1,198,0,0.026253,true,7,15
1,198,0,0.026253,true,7,15
1,198,0,0.026253,true,19,15
1,197,0,0.026253,true,9,15
…,…,…,…,…,…,…
102,10,0,0.027976,true,8,19
102,5,0,0.027976,true,8,14
102,5,0,0.027976,true,9,14
102,5,0,0.027976,true,9,14


## Run 3

In [23]:
# Third simulation run.
folder = "../data/2025-04-04-15:38:00"

new_data = aggregate_data(folder=folder)

new_data


Light_ID,Time,Arrivals,Centrality,Is_Entrypoint,Distance,Incoming_Cars
i16,i16,i16,f32,bool,i16,i16
1,198,0,0.038861,true,9,49
1,198,0,0.038861,true,6,49
1,198,0,0.038861,true,7,49
1,198,0,0.038861,true,7,49
1,197,0,0.038861,true,9,54
…,…,…,…,…,…,…
100,99,0,0.029218,true,8,2
100,94,0,0.029218,true,7,1
100,94,0,0.029218,true,8,1
100,93,0,0.029218,true,7,1


In [24]:
# Append run 3; re-running this cell appends the same rows a second time.
data = data.vstack(new_data)
data

Light_ID,Time,Arrivals,Centrality,Is_Entrypoint,Distance,Incoming_Cars
i16,i16,i16,f32,bool,i16,i16
1,198,0,0.026253,true,9,15
1,198,0,0.026253,true,7,15
1,198,0,0.026253,true,7,15
1,198,0,0.026253,true,19,15
1,197,0,0.026253,true,9,15
…,…,…,…,…,…,…
100,99,0,0.029218,true,8,2
100,94,0,0.029218,true,7,1
100,94,0,0.029218,true,8,1
100,93,0,0.029218,true,7,1


## Run 4

In [25]:
# Fourth simulation run.
folder = "../data/2025-04-04-15:39:21"

new_data = aggregate_data(folder=folder)

new_data


Light_ID,Time,Arrivals,Centrality,Is_Entrypoint,Distance,Incoming_Cars
i16,i16,i16,f32,bool,i16,i16
1,198,0,0.023113,true,14,28
1,198,0,0.023113,true,17,28
1,198,0,0.023113,true,10,28
1,198,0,0.023113,true,12,28
1,197,0,0.023113,true,14,29
…,…,…,…,…,…,…
109,74,0,0.020484,true,10,2
109,73,0,0.020484,true,9,1
109,73,0,0.020484,true,10,1
109,72,1,0.020484,true,9,0


In [26]:
# Append run 4; re-running this cell appends the same rows a second time.
data = data.vstack(new_data)
data

Light_ID,Time,Arrivals,Centrality,Is_Entrypoint,Distance,Incoming_Cars
i16,i16,i16,f32,bool,i16,i16
1,198,0,0.026253,true,9,15
1,198,0,0.026253,true,7,15
1,198,0,0.026253,true,7,15
1,198,0,0.026253,true,19,15
1,197,0,0.026253,true,9,15
…,…,…,…,…,…,…
109,74,0,0.020484,true,10,2
109,73,0,0.020484,true,9,1
109,73,0,0.020484,true,10,1
109,72,1,0.020484,true,9,0


## Run 5

In [27]:
# Fifth simulation run: aggregate and append in one cell.
# Re-running this cell appends the same rows a second time.
folder = "../data/2025-04-04-17:52:19"
new_data = aggregate_data(folder=folder)
data = data.vstack(new_data)
data

Light_ID,Time,Arrivals,Centrality,Is_Entrypoint,Distance,Incoming_Cars
i16,i16,i16,f32,bool,i16,i16
1,198,0,0.026253,true,9,15
1,198,0,0.026253,true,7,15
1,198,0,0.026253,true,7,15
1,198,0,0.026253,true,19,15
1,197,0,0.026253,true,9,15
…,…,…,…,…,…,…
95,9,3,0.036983,true,7,21
95,8,0,0.036983,true,5,21
95,8,0,0.036983,true,5,21
95,8,0,0.036983,true,6,21


## Run 6

In [28]:
# Sixth simulation run: aggregate and append in one cell.
# Re-running this cell appends the same rows a second time.
folder = "../data/2025-04-04-18:03:36"
new_data = aggregate_data(folder=folder)
data = data.vstack(new_data)
data


Light_ID,Time,Arrivals,Centrality,Is_Entrypoint,Distance,Incoming_Cars
i16,i16,i16,f32,bool,i16,i16
1,198,0,0.026253,true,9,15
1,198,0,0.026253,true,7,15
1,198,0,0.026253,true,7,15
1,198,0,0.026253,true,19,15
1,197,0,0.026253,true,9,15
…,…,…,…,…,…,…
88,10,1,0.025575,true,8,3
88,9,0,0.025575,true,7,4
88,9,0,0.025575,true,8,4
88,8,0,0.025575,true,7,5


## Run 7

In [29]:
# Seventh simulation run: aggregate and append in one cell.
# Re-running this cell appends the same rows a second time.
folder = "../data/2025-04-04-18:04:01"
new_data = aggregate_data(folder=folder)
data = data.vstack(new_data)
data


Light_ID,Time,Arrivals,Centrality,Is_Entrypoint,Distance,Incoming_Cars
i16,i16,i16,f32,bool,i16,i16
1,198,0,0.026253,true,9,15
1,198,0,0.026253,true,7,15
1,198,0,0.026253,true,7,15
1,198,0,0.026253,true,19,15
1,197,0,0.026253,true,9,15
…,…,…,…,…,…,…
85,8,0,0.028257,true,12,20
85,5,0,0.028257,true,8,30
85,5,0,0.028257,true,15,30
85,5,0,0.028257,true,13,30


## Run 8

In [30]:
# Eighth simulation run: aggregate and append in one cell.
# Re-running this cell appends the same rows a second time.
folder = "../data/2025-04-04-18:27:35"
new_data = aggregate_data(folder=folder)
data = data.vstack(new_data)
data


Light_ID,Time,Arrivals,Centrality,Is_Entrypoint,Distance,Incoming_Cars
i16,i16,i16,f32,bool,i16,i16
1,198,0,0.026253,true,9,15
1,198,0,0.026253,true,7,15
1,198,0,0.026253,true,7,15
1,198,0,0.026253,true,19,15
1,197,0,0.026253,true,9,15
…,…,…,…,…,…,…
113,8,0,0.015698,true,22,15
113,5,2,0.015698,true,14,18
113,5,2,0.015698,true,9,18
113,5,2,0.015698,true,29,18


## Run 9

In [31]:
# Ninth simulation run: aggregate and append in one cell.
# Re-running this cell appends the same rows a second time.
folder = "../data/2025-04-04-19:02:47"
new_data = aggregate_data(folder=folder)
data = data.vstack(new_data)
data


Light_ID,Time,Arrivals,Centrality,Is_Entrypoint,Distance,Incoming_Cars
i16,i16,i16,f32,bool,i16,i16
1,198,0,0.026253,true,9,15
1,198,0,0.026253,true,7,15
1,198,0,0.026253,true,7,15
1,198,0,0.026253,true,19,15
1,197,0,0.026253,true,9,15
…,…,…,…,…,…,…
81,8,0,0.018919,true,8,13
81,8,0,0.018919,true,20,13
81,7,0,0.018919,true,12,13
81,7,0,0.018919,true,8,13


## Run 10

In [32]:
# Tenth simulation run: aggregate and append in one cell.
# Re-running this cell appends the same rows a second time.
folder = "../data/2025-04-04-19:51:49"
new_data = aggregate_data(folder=folder)
data = data.vstack(new_data)
data


Light_ID,Time,Arrivals,Centrality,Is_Entrypoint,Distance,Incoming_Cars
i16,i16,i16,f32,bool,i16,i16
1,198,0,0.026253,true,9,15
1,198,0,0.026253,true,7,15
1,198,0,0.026253,true,7,15
1,198,0,0.026253,true,19,15
1,197,0,0.026253,true,9,15
…,…,…,…,…,…,…
122,17,0,0.026685,true,9,14
122,17,0,0.026685,true,7,14
122,10,1,0.026685,true,8,13
122,10,1,0.026685,true,9,13


Save the data

In [33]:
# Persist the combined table of all runs. The path is relative, so the file is
# written to the current working directory.
data.write_parquet("data.parquet")