In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch_geometric.data import Data, Batch
from tqdm import tqdm
import torch.nn.functional as F


In [74]:
if torch.backends.mps.is_available():
    device = torch.device('mps')
    print("Apple GPU")
elif torch.cuda.is_available():
    device = torch.device('cuda')
    print("CUDA GPU")
else:
    device = torch.device('cpu')

Apple GPU


In [75]:
def getData(path):
    train_file = np.load(path+"/train.npz")
    train_data = train_file['data']
    test_file = np.load(path+"/test_input.npz")
    test_data = test_file['data']
    print(f"Training Data's shape is {train_data.shape} and Test Data's is {test_data.shape}")
    return train_data, test_data
trainData, testData = getData("./data/")

Training Data's shape is (10000, 50, 110, 6) and Test Data's is (2100, 50, 50, 6)


In [76]:
import numpy as np

all_zeros_mask = np.all(trainData == 0, axis=-1)
zero_indices = np.argwhere(all_zeros_mask)

In [77]:
len(zero_indices)

28855717

In [78]:
zero_indices

array([[   0,    2,    0],
       [   0,    2,   81],
       [   0,    2,   82],
       ...,
       [9999,   49,  107],
       [9999,   49,  108],
       [9999,   49,  109]], shape=(28855717, 3))

In [79]:
trainData[0, 2, 0, :], trainData[   0,    2,   81, :], trainData[9999,   49,  109, :]

(array([0., 0., 0., 0., 0., 0.]),
 array([0., 0., 0., 0., 0., 0.]),
 array([0., 0., 0., 0., 0., 0.]))

In [80]:
mask = (trainData[..., 0] == 0) & (trainData[..., 1] == 0)
indices = np.argwhere(mask)
len(indices)

28855717

In [81]:
zero_indices == indices

array([[ True,  True,  True],
       [ True,  True,  True],
       [ True,  True,  True],
       ...,
       [ True,  True,  True],
       [ True,  True,  True],
       [ True,  True,  True]], shape=(28855717, 3))

In [None]:
import numpy as np

def transform_to_relative_frame(data):
    data = np.copy(data)
    ref_t = 49

    pos_x = data[:, :, :, 0]
    pos_y = data[:, :, :, 1]
    vel_x = data[:, :, :, 2]
    vel_y = data[:, :, :, 3]
    heading = data[:, :, :, 4]

    x_ref = pos_x[:, :, ref_t]
    y_ref = pos_y[:, :, ref_t]
    theta_ref = heading[:, :, ref_t]

    x_ref_exp = x_ref[:, :, None]
    y_ref_exp = y_ref[:, :, None]
    theta_ref_exp = theta_ref[:, :, None]

    cos_theta = np.cos(-theta_ref_exp)
    sin_theta = np.sin(-theta_ref_exp)

    dx = pos_x - x_ref_exp
    dy = pos_y - y_ref_exp

    new_x = dx * cos_theta - dy * sin_theta
    new_y = dx * sin_theta + dy * cos_theta
    new_vx = vel_x * cos_theta - vel_y * sin_theta
    new_vy = vel_x * sin_theta + vel_y * cos_theta
    new_heading = heading - theta_ref_exp
    new_heading = (new_heading + np.pi) % (2 * np.pi) - np.pi

    all_zero_timestamp_mask = np.all(data == 0, axis=-1)

    for i in range(data.shape[0]):
        for j in range(data.shape[1]):
            for t in range(data.shape[2]):
                if all_zero_timestamp_mask[i, j, t]:
                    continue 
                data[i, j, t, 0] = new_x[i, j, t]
                data[i, j, t, 1] = new_y[i, j, t]
                data[i, j, t, 2] = new_vx[i, j, t]
                data[i, j, t, 3] = new_vy[i, j, t]
                data[i, j, t, 4] = new_heading[i, j, t]

    return data


In [114]:
def unnormalize_from_relative_frame(transformed_data, original_data):
    transformed_data = np.copy(transformed_data)
    ref_t = 49

    x_ref = original_data[:, :, ref_t, 0]
    y_ref = original_data[:, :, ref_t, 1]
    theta_ref = original_data[:, :, ref_t, 4]

    x_ref_exp = x_ref[:, :, None]
    y_ref_exp = y_ref[:, :, None]
    theta_ref_exp = theta_ref[:, :, None]

    cos_theta = np.cos(theta_ref_exp)
    sin_theta = np.sin(theta_ref_exp)

    rel_x = transformed_data[:, :, :, 0]
    rel_y = transformed_data[:, :, :, 1]
    rel_vx = transformed_data[:, :, :, 2]
    rel_vy = transformed_data[:, :, :, 3]
    rel_heading = transformed_data[:, :, :, 4]

    global_x = rel_x * cos_theta - rel_y * sin_theta + x_ref_exp
    global_y = rel_x * sin_theta + rel_y * cos_theta + y_ref_exp
    global_vx = rel_vx * cos_theta - rel_vy * sin_theta
    global_vy = rel_vx * sin_theta + rel_vy * cos_theta
    global_heading = rel_heading + theta_ref_exp
    global_heading = (global_heading + np.pi) % (2 * np.pi) - np.pi

    # Mask for timestamps where all features are zero (shape: N,50,110)
    all_zero_timestamp_mask = np.all(original_data == 0, axis=-1)

    for i in range(transformed_data.shape[0]):
        for j in range(transformed_data.shape[1]):
            for t in range(transformed_data.shape[2]):
                if all_zero_timestamp_mask[i, j, t]:
                    continue  # skip untransforming this timestamp
                transformed_data[i, j, t, 0] = global_x[i, j, t]
                transformed_data[i, j, t, 1] = global_y[i, j, t]
                transformed_data[i, j, t, 2] = global_vx[i, j, t]
                transformed_data[i, j, t, 3] = global_vy[i, j, t]
                transformed_data[i, j, t, 4] = global_heading[i, j, t]

    return transformed_data


In [115]:
trainData[0, 0, :5, :]

array([[ 338.59322192, -672.21574762,   -5.32538052,    1.61518358,
           2.84662927,    0.        ],
       [ 338.06105992, -672.05375338,   -5.32538052,    1.61518358,
           2.84649174,    0.        ],
       [ 337.40442818, -671.85373445,  -10.68688785,    3.2413244 ,
           2.84631882,    0.        ],
       [ 336.62778653, -671.6169553 ,  -10.62519386,    3.24042872,
           2.84611797,    0.        ],
       [ 335.73981124, -671.34594719,  -10.61315332,    3.2487452 ,
           2.84590647,    0.        ]])

In [116]:
transformedData = transform_to_relative_frame(trainData)

In [117]:
transformedData[0, 0, :5, :]

array([[-5.26532783e+01, -1.14474392e-01,  5.56485288e+00,
         3.01362068e-02,  4.93283902e-03,  0.00000000e+00],
       [-5.20970116e+01, -1.12026513e-01,  5.56485288e+00,
         3.01362068e-02,  4.79530686e-03,  0.00000000e+00],
       [-5.14105975e+01, -1.09135185e-01,  1.11674571e+01,
         6.04768544e-02,  4.62238569e-03,  0.00000000e+00],
       [-5.05986700e+01, -1.05909909e-01,  1.11082521e+01,
         4.31068562e-02,  4.42153067e-03,  0.00000000e+00],
       [-4.96702661e+01, -1.02495550e-01,  1.10992058e+01,
         3.16045334e-02,  4.21003003e-03,  0.00000000e+00]])

In [118]:
trueData = unnormalize_from_relative_frame(transformedData, trainData)
trueData[0, 0, :5, :]

array([[ 338.59322192, -672.21574762,   -5.32538052,    1.61518358,
           2.84662927,    0.        ],
       [ 338.06105992, -672.05375338,   -5.32538052,    1.61518358,
           2.84649174,    0.        ],
       [ 337.40442818, -671.85373445,  -10.68688785,    3.2413244 ,
           2.84631882,    0.        ],
       [ 336.62778653, -671.6169553 ,  -10.62519386,    3.24042872,
           2.84611797,    0.        ],
       [ 335.73981124, -671.34594719,  -10.61315332,    3.2487452 ,
           2.84590647,    0.        ]])

In [119]:
transformedData[0, 2, 0, :], transformedData[0,    2,   81, :], transformedData[9999,   49,  109, :], transformedData[0, 49, 0, :], transformedData[0, 49, 1, :], transformedData[0, 49, 49, :], transformedData[0, 49, 109, :]

(array([0., 0., 0., 0., 0., 0.]),
 array([0., 0., 0., 0., 0., 0.]),
 array([0., 0., 0., 0., 0., 0.]),
 array([0., 0., 0., 0., 0., 0.]),
 array([0., 0., 0., 0., 0., 0.]),
 array([0., 0., 0., 0., 0., 0.]),
 array([0., 0., 0., 0., 0., 0.]))

In [110]:
trainData[0, 2, 0, :], trainData[0,    2,   81, :], trainData[9999,   49,  109, :], trainData[0, 49, 0, :], trainData[0, 49, 1, :], trainData[0, 49, 49, :], trainData[0, 49, 109, :]

(array([0., 0., 0., 0., 0., 0.]),
 array([0., 0., 0., 0., 0., 0.]),
 array([0., 0., 0., 0., 0., 0.]),
 array([0., 0., 0., 0., 0., 0.]),
 array([0., 0., 0., 0., 0., 0.]),
 array([0., 0., 0., 0., 0., 0.]),
 array([0., 0., 0., 0., 0., 0.]))

In [120]:
trueData[0, 2, 0, :], trueData[0,    2,   81, :], trueData[9999,   49,  109, :], trueData[0, 49, 0, :], trueData[0, 49, 1, :], trueData[0, 49, 49, :], trueData[0, 49, 109, :]

(array([0., 0., 0., 0., 0., 0.]),
 array([0., 0., 0., 0., 0., 0.]),
 array([0., 0., 0., 0., 0., 0.]),
 array([0., 0., 0., 0., 0., 0.]),
 array([0., 0., 0., 0., 0., 0.]),
 array([0., 0., 0., 0., 0., 0.]),
 array([0., 0., 0., 0., 0., 0.]))

In [None]:
import torch
import torch.nn as nn

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=1000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-torch.log(torch.tensor(10000.0)) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.pe = pe.unsqueeze(0)

    def forward(self, x):
        x = x + self.pe[:, :x.size(1), :].to(x.device)
        return x

class MotionTransformer(nn.Module):
    def __init__(self, feature_dim=6, agent_count=50, past_seq_len=50, future_seq_len=60, d_model=128, nhead=8, num_layers=3):
        super().__init__()
        self.feature_dim = feature_dim
        self.agent_count = agent_count
        self.past_seq_len = past_seq_len
        self.future_seq_len = future_seq_len
        self.d_model = d_model
        
        # Input projection (flatten agents and features)
        # Each timestamp: 50 agents * 6 features = 300
        self.input_proj = nn.Linear(agent_count * feature_dim, d_model)
        
        # Positional encoding for past sequence
        self.pos_encoder = PositionalEncoding(d_model, max_len=past_seq_len)
        
        # Transformer Encoder for past
        encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        
        # For decoder input: use zeros + positional encoding for future timestamps
        self.future_pos_encoder = PositionalEncoding(d_model, max_len=future_seq_len)
        
        # Learnable query vector for decoding (one agent)
        self.query_embed = nn.Parameter(torch.randn(1, 1, d_model))  # (1, 1, d_model)
        
        # Transformer Decoder
        decoder_layer = nn.TransformerDecoderLayer(d_model=d_model, nhead=nhead)
        self.transformer_decoder = nn.TransformerDecoder(decoder_layer, num_layers=num_layers)
        
        # Output projection to 6 features for future timestamps
        self.output_proj = nn.Linear(d_model, feature_dim)
    
    def forward(self, x):
        # x: (B, 50 agents, 50 timestamps, 6 features)
        B, A, T, F = x.shape
        assert A == self.agent_count and T == self.past_seq_len and F == self.feature_dim
        
        # Rearrange to (B, T, A*F)
        x = x.permute(0, 2, 1, 3).contiguous().view(B, T, A * F)  # (B, T, 300)
        
        # Input projection
        x = self.input_proj(x)  # (B, T, d_model)
        
        # Add positional encoding
        x = self.pos_encoder(x)  # (B, T, d_model)
        
        # Transformer expects shape (T, B, d_model)
        memory = self.transformer_encoder(x.permute(1, 0, 2))  # (T, B, d_model)
        
        # Prepare decoder input queries for future timestamps
        # Repeat query for future timestamps
        query = self.query_embed.repeat(self.future_seq_len, B, 1)  # (future_seq_len, B, d_model)
        
        # Add positional encoding for future timestamps
        query = self.future_pos_encoder(query.permute(1, 0, 2)).permute(1, 0, 2)  # (future_seq_len, B, d_model)
        
        # Decode future sequence based on memory
        output = self.transformer_decoder(tgt=query, memory=memory)  # (future_seq_len, B, d_model)
        
        # Project to output features
        output = self.output_proj(output)  # (future_seq_len, B, feature_dim)
        
        # Rearrange to (B, 1 agent, future_seq_len, feature_dim)
        output = output.permute(1, 0, 2)
        
        return output

model = MotionTransformer()
x = torch.randn(8, 50, 50, 6)
out = model(x)
print(out.shape)


torch.Size([8, 60, 6])
