In [16]:
import math
import random
import pygame
import sys
import numpy as np
import json
import os
import pandas as pd
import matplotlib.pyplot as plt
import torch
from torch.utils.data import Dataset
from torch_geometric.data import Data, InMemoryDataset, download_url, TemporalData

## Converting CSV To Input For Our Model

    Node Features: (1000 boids, 4 features (x,y, dx, dy))
    Edge Features: (499500 max edges, 1 feature (whether they are connected or not))
    Edge Index: (2, 499500 max edges)

### Grabbing Our CSV And Converting To DataFrame

In [7]:
path_to_sim = '../data/simulation.csv'
sim_df = pd.read_csv(path_to_sim)

sim_df.head(5)

Unnamed: 0,x,y,dx,dy,Boids,Simulation,Timestep
0,408.424572,113.159835,-1.478406,3.231182,0,0,0
1,8.392009,800.274387,-0.269981,4.851227,1,0,0
2,516.899396,835.291405,2.147703,-4.46383,2,0,0
3,701.850915,328.749664,-0.194744,-0.011315,3,0,0
4,652.138901,947.95758,-2.078118,-0.137789,4,0,0


In [8]:
path_to_sim_edges = '../data/simulation_edges.csv'
sim_edges_df = pd.read_csv(path_to_sim_edges)

sim_edges_df.head(5)

Unnamed: 0,Boid_i,Boid_j,Timestep,Simulation
0,0,81,0,0
1,1,50,0,0
2,2,26,0,0
3,2,57,0,0
4,3,7,0,0


### EDA Of Dataset

In [9]:
# TODO

In [11]:
sim_df[(sim_df['Simulation'] == 0) & (sim_df['Timestep'] == 0)]

Unnamed: 0,x,y,dx,dy,Boids,Simulation,Timestep
0,408.424572,113.159835,-1.478406,3.231182,0,0,0
1,8.392009,800.274387,-0.269981,4.851227,1,0,0
2,516.899396,835.291405,2.147703,-4.463830,2,0,0
3,701.850915,328.749664,-0.194744,-0.011315,3,0,0
4,652.138901,947.957580,-2.078118,-0.137789,4,0,0
...,...,...,...,...,...,...,...
95,31.299375,129.853999,-4.948772,-4.811826,95,0,0
96,877.973626,518.923052,3.075391,4.435759,96,0,0
97,222.640178,714.115298,-3.083768,-2.957470,97,0,0
98,645.935625,992.826904,1.476175,0.574492,98,0,0


In [15]:
sim_edges_df[['Boid_i', 'Boid_j']].to_numpy().T.shape

(2, 6841129)

### Converting DataFrame To Data Object From Pytorch Geometric

In [61]:
def toDataGraph(sim_df, sim_edges_df, node_features_names):
    """
    Converts simulation data into a PyTorch Geometric Data object.

    Parameters:
    - sim_df (DataFrame): DataFrame containing node features for a specific simulation and timestep.
    - sim_edges_df (DataFrame): DataFrame containing edge information for the simulation.
    - node_features_names (list of str): Names of the columns in sim_df that are node features.

    Returns:
    - Data: A PyTorch Geometric Data object representing the graph for the simulation.
    """
    # Convert node features and edge information into tensors
    node_features = torch.tensor(sim_df[node_features_names].to_numpy(), dtype=torch.float)
    edge_index = torch.tensor(sim_edges_df[['Boid_i', 'Boid_j']].to_numpy().T, dtype=torch.long)
    edge_attributes = torch.tensor(np.ones((sim_edges_df.shape[0], 1)), dtype=torch.float)

    # Create and return the Data object
    graph = Data(x=node_features, edge_index=edge_index, edge_attr=edge_attributes)
    return graph

def allDataGraph(sim_df, sim_edges_df):
    """
    Generates a list of PyTorch Geometric Data objects for each simulation and timestep.

    Parameters:
    - sim_df (DataFrame): DataFrame containing node features for all simulations and timesteps.
    - sim_edges_df (DataFrame): DataFrame containing edge information for all simulations and timesteps.

    Returns:
    - list of Data: A list of PyTorch Geometric Data objects, one for each simulation and timestep.
    """
    # Group the data by simulation and timestep
    sim_gb_df = sim_df.groupby(['Timestep', 'Simulation'])
    sim_edges_gb_df = sim_edges_df.groupby(['Timestep', 'Simulation'])

    graphs = []
    # Iterate over each group and convert to a Data object
    for key, _ in sim_gb_df:
        curr_sim_df = sim_gb_df.get_group(key)
        curr_sim_edges_df = sim_edges_gb_df.get_group(key)
        curr_graph = toDataGraph(curr_sim_df, curr_sim_edges_df, ['x', 'y', 'dx', 'dy'])
        graphs.append(curr_graph)

    return graphs

# Example usage
graphs = allDataGraph(sim_df, sim_edges_df)
bruh = [graphs[i-5:i-1] for i in range(5, len(graphs)+1, 5)] #[995, 999)

In [66]:
## TODO: NEXT STEP MAKES CLASS THAT GIVEN THE SIMULATION DATAFRAME AND SIMULATION EDGES DATAFRAME CREATES A DATASET OBJECT

class CustomDataset(Dataset):
    def __init__(self, sim_df, sim_edges_df):
        super(CustomDataset).__init__()
        self.all_graphs = allDataGraph(sim_df, sim_edges_df)
        self.sequences = [graphs[i-5:i-1] for i in range(5, len(self.all_graphs)+1, 5)]
        self.labels = [graphs[i-1] for i in range(5, len(self.all_graphs)+1, 5)]
        self.len = len(self.labels)
    def __getitem__(self, index):
        return self.sequences[index], self.labels[index]
    def __len__(self):
        return self.len

dataset = CustomDataset(sim_df, sim_edges_df)

2000

In [67]:
dataset[0]

([Data(x=[100, 4], edge_index=[2, 78], edge_attr=[78, 1]),
  Data(x=[100, 4], edge_index=[2, 99], edge_attr=[99, 1]),
  Data(x=[100, 4], edge_index=[2, 87], edge_attr=[87, 1]),
  Data(x=[100, 4], edge_index=[2, 86], edge_attr=[86, 1])],
 Data(x=[100, 4], edge_index=[2, 91], edge_attr=[91, 1]))

In [5]:
from torch import Tensor


class CustomDataset(Data):
    '''
    self.x: Node feature matrix with shape [num_nodes, num_node_features]

    self.edge_index: Graph connectivity in COO format with shape [2, num_edges] and type torch.long

    self.edge_attr: Edge feature matrix with shape [num_edges, num_edge_features]

    self.y: Target to train against (may have arbitrary shape), e.g., node-level targets of shape [num_nodes, *] or graph-level targets of shape [1, *]

    self.pos: Node position matrix with shape [num_nodes, num_dimensions]
    '''
    def __init__(self, x: Tensor | None = None, edge_index: Tensor | None = None, edge_attr: Tensor | None = None, y: Tensor | int | float | None = None, pos: Tensor | None = None, time: Tensor | None = None, **kwargs):
        super().__init__(x, edge_index, edge_attr, y, pos, time, **kwargs)
        