In [None]:
import pandas as pd
import sqlite3
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
from torchvision.utils import save_image
from torch.distributions.normal import Normal

from PIL import Image

import math
import os
import numpy as np
import random
import matplotlib.pyplot as plt
from skimage.io import imread, imshow
import datetime
from skimage.util import img_as_ubyte
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
import argparse

from datetime import datetime

from models.indiv_crossAttention_seq2seq_code_size import crossAttention

from IPython import display

from torch.utils.tensorboard import SummaryWriter

# Decoder sampling switch read by Seq2Seq.forward: a truthy value enables the
# path_mode-dependent sampling branches, 0 copies the decoder's mean outputs directly.
stochastic_mode         = 1

# Make folder for outputs and logs


In [None]:
# Dataset name: selects the data folder / databases and is embedded in the run-folder name.
dataset_name = "eth" # dataset options: 'university', 'zara_01', 'zara_02', 'eth', 'hotel'

In [None]:
# Use the current working directory as a stand-in for __file__
# (this overwrites the name if it already exists).
__file__=os.getcwd()
print(__file__)

# Timestamp (dd_mm_yy_HH_MM_SS) that makes the run folder name unique per second.
now = datetime.now() # current date and time
current_time_date = now.strftime("%d_%m_%y_%H_%M_%S")
# NOTE(review): the directory basename and the timestamp are concatenated without a separator.
run_folder  = "Outputs/traj_pred_"+ dataset_name + "_" + str(os.path.basename(__file__)) + str(current_time_date)
# Raises FileExistsError if the folder already exists (no exist_ok).
os.makedirs(run_folder)

Skip to left side bar
>
/
Name
Last Modified

# Make log folder for tensorboard

In [None]:
# Directory that receives the TensorBoard event files.
SummaryWriter_path = "2Encod_Transf_decoder_eth_ca/"
# The path is fixed (no timestamp), so it already exists on every run after the
# first one; exist_ok=True keeps "Restart & Run All" from raising FileExistsError.
os.makedirs(SummaryWriter_path, exist_ok=True)
writer = SummaryWriter(SummaryWriter_path,comment="ADE_FDE_Train")

# Make image folder to save outputs


In [None]:
# Sub-folder of the run folder for saved visual predictions.
image_path  = run_folder + "/Visual_Prediction"
# run_folder is created earlier, so only this leaf directory is new.
os.makedirs(image_path)

# GPU


In [None]:
# Device handle passed to the model classes below.
# NOTE(review): hard-coded to the first GPU; the commented-out line is the CPU-fallback variant.
#device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = torch.device('cuda:0')

In [None]:
# Expose only GPU 0 to this process, then query the CUDA runtime.
# NOTE(review): torch is imported before this variable is set — confirm it still takes effect.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
torch.cuda.device_count()
# Last expression of the cell, so this is the value the notebook displays.
torch.cuda.current_device()

# Variables


In [None]:
# Database variables: input databases (train / validation) and the per-run results database.
image_folder_path = 'data/data_trajpred/'+dataset_name
DB_PATH_train = "./data/data_trajpred/"+dataset_name+"/pos_data_train.db"
cnx_train    = sqlite3.connect(DB_PATH_train)
DB_PATH_val  = "./data/data_trajpred/"+dataset_name+"/pos_data_val.db"
cnx_val      = sqlite3.connect(DB_PATH_val)
# Results database lives inside the run folder.
DB_DIR       = run_folder + '/database'
os.makedirs( DB_DIR )
DB_PATH2     = DB_DIR+'/db_one_ped_delta_coordinates_results.db'
cnx2         = sqlite3.connect(DB_PATH2)

# Other variables
# Number of observed steps fed to the encoder.
T_obs                   = 8
# Number of future steps produced by the decoder loop.
T_pred                  = 12
T_total                 = T_obs + T_pred #8+12=20
data_id                 = 0 
batch_size              = 40
chunk_size              = batch_size * T_total # Chunksize should be multiple of T_total
# Coordinates per step (x, y).
in_size                 = 2
# Four decoder outputs per step; Seq2Seq.forward reads them as (mean_x, scale_x, mean_y, scale_y).
stochastic_out_size     = in_size * 2
hidden_size             = 256 
embed_size              = 64 
# NOTE(review): `global` at module level has no effect; dropout_val is already a module global.
global dropout_val
dropout_val             = 0.2 #0.5
# Probability (per forward pass, training only) of feeding ground truth back into the decoder.
teacher_forcing_ratio   = 0.7 # 0.9
# NOTE(review): presumably the weight of the line-regularizer loss; its use is not visible here.
regularization_factor   = 0.5 # 0.001
# Number of noise samples drawn per step in path_mode "avg".
avg_n_path_eval         = 20
# Number of candidate samples drawn per step in path_mode "bst" and "top5".
bst_n_path_eval         = 20
path_mode               = "top5" #"avg","bst","single","top5"
# Selects the distance weighting inside distance_from_line_regularizer.
regularization_mode     = "regular" #"weighted","e_weighted", "regular"
# "on": Seq2Seq.forward zeroes the first observed step before encoding.
startpoint_mode         = "on" #"on","off"
# NOTE(review): not read by any code visible in this part of the notebook.
enc_out                 = "on" #"on","off"
biased_loss_mode        = 0 # 0 , 1

table_out   = "results_delta"
# Table read by TrajectoryPredictionDataset.
table       = "dataset_T_length_20delta_coordinates"
df_id       = pd.read_sql_query("SELECT data_id FROM "+table, cnx_train)
# Largest data_id times the steps per trajectory.
data_size   = df_id.data_id.max() * T_total
epoch_num   = 400
from_epoch  = 0

# Visual variables
# Frames are resized to image_size x image_size by the dataset transform.
image_size              = 256  
image_dimension         = 3
mask_size               = 16
visual_features_size    = 128 
visual_embed_size       = 64
vsn_module_out_size    = 256
to_pil = torchvision.transforms.ToPILImage()

# Model path: directory is created first, then model_path becomes the checkpoint filename prefix.
model_path = run_folder + "/NNmodel" 
os.makedirs(model_path)   
model_path = model_path + str("/model")

# Handle Sequential Data

In [None]:
class TrajectoryPredictionDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing delta-coordinate trajectories with scene frames.

    On construction the whole position table is loaded into memory and every
    image found in ``<ROOT_DIR>/visual_data`` is resized, normalized and
    stacked into a single tensor (``self.visual_data``), in sorted filename order.

    Args:
        ROOT_DIR: Dataset folder that contains the ``visual_data`` directory.
        DB_PATH: Unused; kept for interface compatibility.
        cnx: Open database connection from which the module-level ``table`` is read.
    """

    def __init__(self, ROOT_DIR, DB_PATH, cnx):
        self.pos_df    = pd.read_sql_query("SELECT * FROM "+str(table), cnx)
        self.root_dir  = ROOT_DIR+'/visual_data'
        self.transform = torchvision.transforms.Compose([
            torchvision.transforms.Resize((image_size,image_size)),
            torchvision.transforms.ToTensor(),
            torchvision.transforms.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
        ])
        # Read the frames in sorted filename order. Each file is opened in a
        # `with` block so its handle is released as soon as the tensor is built.
        frames = []
        for img in sorted(os.listdir(self.root_dir)):
            with Image.open(os.path.join(self.root_dir, img)) as frame:
                frames.append(self.transform(frame))
        self.visual_data = torch.stack(frames)

    def __len__(self):
        # Largest data_id present in the table.
        # NOTE(review): as a length this assumes data_id values line up with the
        # indices 0..len-1 that a DataLoader requests — confirm the id range.
        return self.pos_df.data_id.max()

    def __getitem__(self, idx):
        """Return ``(obs, pred, frames, start_frames)`` for the rows whose ``data_id == idx``.

        ``obs`` / ``pred`` are the first ``T_obs`` / last ``T_pred`` delta steps,
        ``frames`` holds ``T_obs`` consecutive frames starting at each
        trajectory's first frame index (``frame_num`` minimum divided by 10), and
        ``start_frames`` holds those indices as a tensor.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        extracted_df     = self.pos_df[ self.pos_df["data_id"] == idx ]
        tensor           = torch.tensor(extracted_df[['pos_x_delta','pos_y_delta']].values).reshape(-1,T_total,in_size)
        obs, pred        = torch.split(tensor,[T_obs,T_pred],dim=1)
        start_frames     = (extracted_df.groupby('data_id').frame_num.min().values/10).astype('int')
        # Build the list locally: the original appended to a name that was never
        # defined, which raised NameError on the first item.
        extracted_frames = [self.visual_data[i:i+T_obs] for i in start_frames]
        frames = torch.stack(extracted_frames) # stack concatenates the per-trajectory clips along a new dimension
        start_frames = torch.tensor(start_frames)
        return obs, pred, frames, start_frames

# Initialize random weights for NN models


In [None]:
def init_weights(m):
    """Re-draw every parameter reachable from ``m`` from U(-0.2, 0.2), in place."""
    for weight in m.parameters():
        nn.init.uniform_(weight.data, a=-0.2, b=0.2)

# Regularizer loss
The `distance_from_line_regularizer` function implements a regularization term that encourages the predicted trajectory to stay close to the direction of the observed trajectory. It fits a straight line to the observed trajectory and penalizes the mean Euclidean distance between each step of the predicted trajectory and that line.
It takes 2 inputs:
-input_tensor : The observed trajectory in the form of a tensor of size (batch_size, T_obs, 2)
-prediction : The predicted trajectory in the form of a tensor of size (batch_size, T_pred, 2)

In [None]:
sum_sigma_distance  = torch.zeros(1)

def distance_from_line_regularizer(input_tensor,prediction):
    """Mean distance between predicted points and a line fitted to the observation.

    A least-squares line ``y = a*x + b`` is fitted per sample to the observed
    positions (cumulative sum of the observed deltas). The prediction is shifted
    by the last observed position and the point-to-line distance of every
    predicted step is averaged over steps and then over the batch.
    ``regularization_mode`` selects optional per-step weights that decrease
    with the step index ('weighted': linear, 'e_weighted': exponential of the
    linear weights).

    Args:
        input_tensor: Observed deltas, shape (batch_size, T_obs, 2).
        prediction: Predicted offsets relative to the last observed position,
            shape (batch_size, T_pred, 2). It is never modified.

    Returns:
        A scalar tensor with the regularization value. If the computation
        fails, "SINGULAR VALUE" is printed and a constant tensor holding 20 is
        returned. The result is also stored in the module-level
        ``sum_sigma_distance``.
    """
    global sum_sigma_distance
    # Follow the input's device instead of hard-coding 'cuda'.
    device          = input_tensor.device
    # Observed deltas -> positions relative to the start of the trajectory.
    observed        = input_tensor.double().cumsum(dim=1)
    # Design matrix [x, 1] and target y for the per-sample least-squares fit.
    X               = torch.ones_like(observed)
    X[:,:,0]        = observed[:,:,0]
    Y               = observed[:,:,1].unsqueeze(-1)
    try:
        XTX             = torch.matmul( X.transpose(-1,-2), X)
        try:
            XTX_1       = XTX.inverse()
        except Exception:
            # Singular normal matrix: fall back to the pseudo-inverse.
            XTX_1       = XTX.pinverse()
        XTY             = torch.matmul( X.transpose(-1,-2), Y)
        theta           = torch.matmul( XTX_1, XTY)     # (batch, 2, 1): slope, intercept
        slope           = theta[:,0,:]
        intercept       = theta[:,1,:]

        # Shift the prediction by the last observed position. Done out of place so
        # a float64 `prediction` passed by the caller is left untouched.
        pred_x          = prediction[:,:,0].double() + observed[:,-1,0].unsqueeze(-1)
        pred_y          = prediction[:,:,1].double() + observed[:,-1,1].unsqueeze(-1)

        # Distance from each predicted point to the line slope*x - y + intercept = 0.
        denominator     = torch.sqrt( slope * slope + 1 )
        nominator       = slope * pred_x + intercept - pred_y
        distance        = nominator.abs() / denominator

        if regularization_mode in ('weighted', 'e_weighted'):
            # Weights T_pred/T_pred ... 1/T_pred: early steps count most.
            weight      = torch.arange(T_pred, 0, -1, device=device).float() / T_pred
            if regularization_mode == 'e_weighted':
                weight  = torch.exp(weight)
            weighted_distance = weight * distance
        else:
            weighted_distance = distance
        sigma_distance  = torch.mean(weighted_distance,1)
        sum_sigma_distance  = torch.mean(sigma_distance)
        return sum_sigma_distance
    except Exception:
        # Deliberate best-effort fallback, narrowed from a bare `except:` so that
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        print("SINGULAR VALUE")
        sum_sigma_distance = torch.zeros(1, device=device) + 20
        return sum_sigma_distance

# Vision Transformer

### Spatial Features Extraction (Resnet 18)

In [None]:
from torch.utils.data import DataLoader 
from torchvision import datasets 
from torchvision.transforms import ToTensor 
from torchsummary import summary

In [None]:
def Resnet(pretrain=True,layers_to_unfreeze=8,layers_to_delete=2,in_planes=3):
    """
    Build a truncated ResNet-18 as an nn.Sequential of its top-level children.

    param:
        pretrain: Passed to torchvision as `pretrained` (load pretrained weights or not)
        layers_to_unfreeze: Number of kept children, counted from the end, that stay trainable
        layers_to_delete: Number of trailing children that are dropped
        in_planes: Number of input image channels (supported values: 1, 2 or 3)
    return: The truncated ResNet as an nn.Sequential
    """
    resnet = torchvision.models.resnet18(pretrained=pretrain)
    # Build a new container: the trailing children (pooling and classifier with the default of 2) are not wanted
    model = nn.Sequential()
    number_of_layers = len(list(resnet.children())) - layers_to_delete # Number of children that are kept

    # Cannot unfreeze more children than are kept
    if number_of_layers<layers_to_unfreeze:
        layers_to_unfreeze = number_of_layers
    layers_to_freeze = number_of_layers - layers_to_unfreeze
    i=0
    for child in resnet.children():
        # First child only, and only for fewer than 3 input channels: derive a reduced-channel weight from the 3-channel one
        if i==0 and in_planes<3:
            if i<layers_to_freeze: # Freeze this child if it falls in the frozen prefix
                for param in child.parameters():
                    param.requires_grad = False # Freezing = turning off gradient tracking
            w = child._parameters['weight'].data # Weight of the first child as loaded (3 input channels)
            # NOTE(review): this registers a new Conv2d as sub-module '0' of the first child rather than replacing the child — confirm intent
            child._modules['0'] = nn.Conv2d(in_planes, 64, kernel_size=3, padding=1)
            if in_planes == 1:
                child._parameters['weight'].data = w.mean(dim=1, keepdim=True) # 1 channel: average the weight over the input-channel dimension
            else:
                # 2 channels: drop the last input channel and scale the remaining weights by 1.5
                child._parameters['weight'].data = w[:, :-1] * 1.5

        if i<layers_to_freeze: # Freeze this child if it falls in the frozen prefix
            for param in child.parameters():
                param.requires_grad = False # Freezing = turning off gradient tracking
        if i<number_of_layers: # Keep only the first number_of_layers children
            model.append(child) 
        i+=1
    return model


class features_extraction(nn.Module):
    """
    Run a convolutional backbone on every frame and average-pool the result to 1x1.

    param:
    conv_model: Convolutional backbone applied to the flattened batch of frames
    in_planes: Number of channels of each image
    """
    def __init__(self, conv_model, in_planes: int):
        super().__init__()
        self.conv_model = conv_model
        self.in_planes = in_planes
        self.pooling = nn.AdaptiveAvgPool2d((1, 1))

    def forward(self, input):
        # Merge all leading dimensions so the backbone sees (N, C, H, W).
        height, width = input.shape[-2:]
        frames = input.view(-1, self.in_planes, height, width)
        return self.pooling(self.conv_model(frames))

### Transformer : EncoderSelfAttention

In [None]:
def position_embedding(input, d_model):
    """Sinusoidal embedding of the positions in ``input``.

    Returns a (number_of_positions, d_model) tensor whose even columns hold
    sin(pos / 10000^(2i/d_model)) and whose odd columns hold the matching cosine.
    """
    positions = input.view(-1, 1)
    half_index = torch.arange(d_model // 2, dtype=torch.float32, device=positions.device).view(1, -1)
    angles = positions / 10000 ** (2 * half_index / d_model)

    table = torch.zeros((positions.shape[0], d_model), device=positions.device)
    table[:, 0::2] = torch.sin(angles)
    table[:, 1::2] = torch.cos(angles)
    return table

def sinusoid_encoding_table(max_len, d_model):
    """Return the (max_len, d_model) sinusoidal table for positions 0 .. max_len-1."""
    return position_embedding(torch.arange(max_len, dtype=torch.float32), d_model)

class ScaledDotProductAttention(nn.Module):
    """Multi-head scaled dot-product attention with learned input/output projections."""

    def __init__(self, d_model, d_k, d_v, h):
        """
        param:
        d_model: Output dimensionality of the model
        d_k: Dimensionality of queries and keys
        d_v: Dimensionality of values
        h: Number of heads
        """
        super().__init__()
        self.fc_q = nn.Linear(d_model, h * d_k)
        self.fc_k = nn.Linear(d_model, h * d_k)
        self.fc_v = nn.Linear(d_model, h * d_v)
        self.fc_o = nn.Linear(h * d_v, d_model)

        self.d_model, self.d_k, self.d_v, self.h = d_model, d_k, d_v, h

        self.init_weights(gain=1.0)

    def init_weights(self, gain=1.0):
        """Xavier-normal weights and zero biases for the four projections."""
        projections = (self.fc_q, self.fc_k, self.fc_v, self.fc_o)
        for proj in projections:
            nn.init.xavier_normal_(proj.weight, gain=gain)
        for proj in projections:
            nn.init.constant_(proj.bias, 0)

    def forward(self, queries, keys, values):
        """
        Computes the attended values
        :param queries: Queries (b_s, nq, d_model)
        :param keys: Keys (b_s, nk, d_model)
        :param values: Values (b_s, nk, d_model)
        :return: Tensor of shape (b_s, nq, d_model)
        """
        batch, n_q = queries.shape[:2]
        n_k = keys.shape[1]

        # Split the projections into heads.
        q = self.fc_q(queries).view(batch, n_q, self.h, self.d_k).transpose(1, 2)     # (b_s, h, nq, d_k)
        k = self.fc_k(keys).view(batch, n_k, self.h, self.d_k).permute(0, 2, 3, 1)    # (b_s, h, d_k, nk)
        v = self.fc_v(values).view(batch, n_k, self.h, self.d_v).transpose(1, 2)      # (b_s, h, nk, d_v)

        scores = torch.matmul(q, k) / np.sqrt(self.d_k)                               # (b_s, h, nq, nk)
        weights = torch.softmax(scores, -1)

        # Merge the heads back and project to d_model.
        heads = torch.matmul(weights, v).transpose(1, 2).contiguous().view(batch, n_q, self.h * self.d_v)
        return self.fc_o(heads)                                                       # (b_s, nq, d_model)
    
class MultiHeadAttention(nn.Module):
    """Attention followed by a position-wise feed-forward net, one residual and a LayerNorm."""

    def __init__(self, d_model, d_k, d_v, h, dff=2048, dropout=.1):
        super().__init__()

        self.attention = ScaledDotProductAttention(d_model=d_model, d_k=d_k, d_v=d_v, h=h)
        self.dropout = nn.Dropout(p=dropout)
        self.layer_norm = nn.LayerNorm(d_model)
        self.fc = nn.Sequential(
            nn.Linear(d_model, dff),
            nn.ReLU(inplace=True),
            nn.Dropout(p=dropout),
            nn.Linear(dff, d_model),
        )

    def forward(self, queries, keys, values):
        # attention -> dropout -> feed-forward -> dropout, then a single residual on the queries
        attended = self.dropout(self.attention(queries, keys, values))
        transformed = self.dropout(self.fc(attended))
        return self.layer_norm(queries + transformed)
    
class EncoderSelfAttention(nn.Module):
    """Stack of ``n_module`` self-attention blocks applied after adding sinusoidal positions."""

    def __init__(self, device, d_model, d_k, d_v, n_head, dff=2048, dropout_transformer=.1, n_module=6):
        super().__init__()
        blocks = [
            MultiHeadAttention(d_model, d_k, d_v, n_head, dff, dropout_transformer)
            for _ in range(n_module)
        ]
        self.encoder = nn.ModuleList(blocks)
        self.device = device

    def forward(self, x):
        # Positional table for (sequence length, feature size), broadcast over the batch.
        positional = sinusoid_encoding_table(x.shape[1], x.shape[2]).expand(x.shape).to(self.device)
        hidden = x + positional
        for block in self.encoder:
            # Self-attention: queries, keys and values are the same tensor.
            hidden = block(hidden, hidden, hidden)

        return hidden

### Vision module: ResNet + Transformer

In [None]:
class _GestureTransformer(nn.Module):
    """Vision branch: per-frame ResNet features followed by a self-attention encoder (3 or 1 channel input)."""

    def __init__(self,device,backbone="resnet",in_planes=3,pretrained= True,input_dim=512,layers_to_unfreeze=8,layers_to_delete=2,n_head=8,n_module=6,ff_size=1024,dropout1d=0.5):
        super().__init__()

        self.in_planes = in_planes
        self.device = device
        self.conv_name = backbone

        # Only the ResNet backbone is implemented.
        if self.conv_name.lower() != "resnet":
            raise NotImplementedError("The model {} is not supported!".format(self.conv_name))
        self.conv_model = Resnet(pretrained, layers_to_unfreeze, layers_to_delete, in_planes)

        self.conv_model.to(device)
        self.features = features_extraction(self.conv_model, in_planes)
        self.self_attention = EncoderSelfAttention(
            device, input_dim, 64, 64,
            n_head=n_head, dff=ff_size, dropout_transformer=dropout1d, n_module=n_module,
        )

    def forward(self, x):
        batch, steps = x.shape[0], x.shape[1]
        # One feature vector per frame, regrouped as (batch, steps, features).
        frame_features = self.features(x).view(batch, steps, -1)
        return self.self_attention(frame_features)

# CoordinatesTransformer (kinematic part)

In [None]:
class CoordinatesTransformer(nn.Module):
    """Kinematic branch: lifts (x, y) steps to 256 features and encodes them with self-attention."""

    def __init__(self,device,pretrained= True,input_dim=512,layers_to_unfreeze=8,layers_to_delete=2,n_head=8,n_module=6,ff_size=1024,dropout1d=0.5):
        super().__init__()

        # 2 -> 32 -> 64 -> 128 -> 256, with a ReLU after every linear layer.
        self.linear_mapper = nn.Sequential(
            nn.Linear(2, 32), nn.ReLU(),
            nn.Linear(32, 64), nn.ReLU(),
            nn.Linear(64, 128), nn.ReLU(),
            nn.Linear(128, 256), nn.ReLU(),
        )

        self.self_attention = EncoderSelfAttention(
            device, 256, 64, 64,
            n_head=n_head, dff=ff_size, dropout_transformer=dropout1d, n_module=n_module,
        )
        self.pool = nn.AdaptiveAvgPool2d((1, 256))
        self.fc_out = nn.Sequential(
            nn.Linear(256, 256),
            nn.ReLU(),
            nn.Dropout(p=dropout_val),
        )
        self.embedder_out = nn.Sequential(
            nn.Linear(8 * 256, 256),
            nn.ReLU(),
            nn.Dropout(p=dropout_val),
            nn.Linear(256, 256),
            nn.ReLU(),
        )

    def forward(self, x):
        batch, steps = x.shape[0], x.shape[1]
        lifted = self.linear_mapper(x).view(batch, steps, -1)
        encoded = self.self_attention(lifted)
        return self.fc_out(encoded)

    def emb_out(self, input):
        """Apply the ``embedder_out`` head to ``input``."""
        return self.embedder_out(input)

# Decoder Transformer

In [None]:
class DecoderLayer(nn.Module):
    """Self-attention block used by DecoderTransformer.

    Args:
        d_model: Feature size of the input and output.
        nhead: Number of attention heads.

    Inputs are batch-first: ``x`` has shape (batch, sequence, d_model).
    """
    def __init__(self, d_model, nhead):
        super(DecoderLayer, self).__init__()
        # batch_first=True: the caller passes (batch, sequence, d_model). With the
        # default (sequence-first) layout the attention ran across the batch
        # dimension, so each sample's output depended on the other samples.
        self.self_attn = nn.MultiheadAttention(d_model, nhead, batch_first=True)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.feed_forward = nn.Linear(d_model, d_model)

    def forward(self, x, e_output):
        """Return the transformed ``x``.

        NOTE(review): ``e_output`` is accepted but never used, so the encoder
        output has no influence on this layer — confirm whether cross-attention
        was intended.
        """
        # Multihead self-attention (residual around the normalized attention output)
        attn_output, _ = self.self_attn(x, x, x)
        x = x + self.norm1(attn_output)
        # Feed forward (residual around the normalized linear output)
        ff_output = self.feed_forward(x)
        x = x + self.norm2(ff_output)
        return x

    
class DecoderTransformer(nn.Module):
    """Decoder that maps a sequence of (x, y) steps to per-step stochastic outputs.

    Pipeline of ``forward``: linear embedding of the coordinates -> projection
    to ``d_model`` -> ``num_layers`` DecoderLayer blocks -> linear projection to
    ``2*hidden_size + in_size`` features -> ``fC_mu`` head producing
    ``stochastic_out_size`` values per step.

    Args:
        in_size: Number of coordinates per step.
        embed_size: Size of the coordinate embedding.
        hidden_size: Hidden size used to dimension the output head.
        d_model: Feature size inside the decoder layers.
        dropout_val: Dropout probability.
        batch_size: Stored on the instance only.
        nhead: Number of attention heads per layer.
        num_layers: Number of DecoderLayer blocks.
    """
    def __init__(self, in_size, embed_size, hidden_size, d_model=512, dropout_val=dropout_val, batch_size=1, nhead=8, num_layers=6):
        super(DecoderTransformer, self).__init__()

        self.in_size                = in_size
        self.stochastic_out_size    = stochastic_out_size
        self.hidden_size            = hidden_size
        self.batch_size             = batch_size
        self.embed_size             = embed_size
        self.seq_length             = T_pred
        self.dropout_val            = dropout_val
        self.visual_embed_size      = visual_embed_size
        self.visual_size            = image_dimension * image_size * image_size

        self.d_model=d_model
        self.nhead=nhead
        self.num_layers=num_layers

        # Width of the feature vector consumed by fC_mu and produced by output_layer.
        # It was hard-coded as 514, which only matches hidden_size=256, in_size=2.
        decoder_feature_size = 2 * self.hidden_size + in_size

        self.embedder_rho = nn.Linear(self.in_size, self.embed_size)
        self.fC_mu = nn.Sequential(
            nn.Linear(decoder_feature_size, int(self.hidden_size/2), bias=True),
            nn.ReLU(),
            nn.Dropout(p=dropout_val),
            nn.Linear(int(self.hidden_size/2), self.stochastic_out_size, bias=True),
        )
        self.dropout = nn.Dropout(dropout_val)
        self.reducted_size = int((self.hidden_size-1)/3)+1
        self.reducted_size2 = int((self.hidden_size+in_size-1)/3)+1
        self.FC_dim_red = nn.Sequential(
            nn.MaxPool2d(kernel_size=3, stride=3, padding=1),
            nn.Flatten(start_dim=1, end_dim=-1),
            nn.Linear(self.reducted_size*self.reducted_size2, decoder_feature_size, bias=True),
            nn.ReLU(),
        )

        # Input width follows embedder_rho's output (was hard-coded as 64).
        self.embedding = nn.Linear(self.embed_size, d_model)
        self.layers = nn.ModuleList([DecoderLayer(d_model, nhead) for _ in range(num_layers)])
        self.output_layer = nn.Linear(d_model, decoder_feature_size)

    def forward(self, x, encoder_outputs):
        """Return the stochastic outputs for every step of ``x``.

        ``x`` is reshaped to (batch, -1, 2). ``encoder_outputs`` is only forwarded
        to the decoder layers.
        """
        # Coordinate embedding
        embedding = self.embedder_rho(x.view(x.shape[0],-1,2))
        embedding = F.relu(self.dropout(embedding))

        # Project the embedded decoder input to d_model
        x = self.embedding(embedding)

        for layer in self.layers:
            x = layer(x, encoder_outputs)
        output = self.output_layer(x)

        # NOTE(review): squeeze(0) drops the batch dimension when the batch holds a
        # single sample — confirm callers never pass a batch of one.
        prediction = self.fC_mu(output.squeeze(0))

        return prediction

    def dim_red(self, input):
        """Apply the max-pool / flatten / linear reduction head to ``input``."""
        output = self.FC_dim_red(input)
        return output

# Model

In [None]:
class Seq2Seq(nn.Module):
    """Full trajectory model: coordinate encoder, vision encoder, cross-attention and decoder.

    The constructor builds the three sub-networks, re-initializes each of them
    with ``init_weights`` and creates the project-local ``crossAttention``
    module. ``forward`` runs an autoregressive loop of ``T_pred`` steps.
    """
    def __init__(self, in_size, embed_size, hidden_size, batch_size=1, d_model=512, d_ff=2048, h=8, dropout_val=dropout_val, N=6, input_dim=512):
        super(Seq2Seq, self).__init__()

        torch.cuda.empty_cache()
        
        self.encoder = CoordinatesTransformer(device,dropout1d=dropout_val) # Encoder transformer (kinematic part)
        self.encoder.apply(init_weights)
                
        # The decoder is built with its own default dropout; the dropout_val argument is not forwarded to it.
        self.decoder =  DecoderTransformer(in_size, embed_size, hidden_size, num_layers=6, nhead=8)
        self.decoder.apply(init_weights)
        
        self.vsn_module = _GestureTransformer(device,dropout1d=dropout_val) # _GestureTransformer (vision part)
        # NOTE(review): init_weights re-draws every parameter it is given, so this appears to
        # overwrite the pretrained ResNet weights loaded inside _GestureTransformer — confirm intent.
        self.vsn_module.apply(init_weights)
        
        # Reads the module-level hidden_size (the constructor argument has the same name).
        self.pooling = nn.AdaptiveAvgPool1d((hidden_size))
        
        self.crossAttention = crossAttention(N=6,d_model=256, d_ff=2048, h=8, dropout=0.1)
        
        # Only encoder, decoder and vision module are moved here; crossAttention is not.
        if device.type=='cuda':
            self.encoder.cuda()
            self.decoder.cuda()
            self.vsn_module.cuda()

    def forward(self, input_tensor, visual_input_tensor, output_tensor, batch_size, train_mode): 
        """Predict ``T_pred`` steps.

        Args:
            input_tensor: Observed steps; reshaped to (batch, -1, 2) for the encoder.
                Its first step is zeroed IN PLACE when ``startpoint_mode == "on"``.
            visual_input_tensor: Frames passed to the vision module.
            output_tensor: Ground-truth future steps; used for teacher forcing and
                for candidate selection in the "bst" / "top5" path modes.
            batch_size: Ignored; recomputed from ``input_tensor``.
            train_mode: Truthy enables teacher forcing with probability ``teacher_forcing_ratio``.

        Returns:
            (outputs, stochastic_outputs, visual_vsn_result): the sampled steps
            (batch, T_pred, in_size), the raw decoder outputs of the last position
            (batch, T_pred, stochastic_out_size), and the pooled vision features.
        """
        
        batch_size      = int(input_tensor.size(0))        
        
        # Placeholder; overwritten by the encoder call below.
        encoder_outputs = torch.zeros(batch_size, T_obs, hidden_size).cuda()

        # Copy of the first observed step; not used again in this method.
        start_point     = (input_tensor[:,0,:]).to(device).clone().detach()

        if startpoint_mode=="on":
            # In-place write: the caller's tensor is modified too.
            input_tensor[:,0,:]    = 0
            
        encoder_outputs                  = self.encoder(input_tensor.reshape(batch_size, -1, 2))
            
        # Vision encoder outputs
        visual_initial_vsn    = self.vsn_module(visual_input_tensor)
        visual_initial_vsn    = self.pooling(visual_initial_vsn) # pooling added to bring the vision features to hidden_size
        
        src_mask = None
        obd_enc_mask = None
        
        # Fuse kinematic and visual encodings (no masks).
        cross_ouput = self.crossAttention(encoder_outputs, visual_initial_vsn, src_mask, obd_enc_mask)
        e_outputss=cross_ouput
        
        visual_vsn_result   = visual_initial_vsn
        
        # First decoder input: the last observed step.
        decoder_input = input_tensor[:,-1,:]
        
        # Tensors to store decoder outputs
        outputs                         = torch.zeros(batch_size, T_pred , in_size).cuda() # (batch, T_pred, in_size)
        stochastic_outputs              = torch.zeros(batch_size, T_pred , stochastic_out_size).cuda() # (batch, T_pred, stochastic_out_size)
        # Overwritten a few lines below.
        teacher_force                   = 1

        # Standard-normal noise sources, created on the CPU; samples are moved to the GPU when drawn.
        epsilonX                        = Normal(torch.zeros(batch_size,1),torch.ones(batch_size,1))
        epsilonY                        = Normal(torch.zeros(batch_size,1),torch.ones(batch_size,1))
        # One draw per forward pass: teacher forcing applies to the whole sequence or not at all.
        teacher_force                   = int(random.random() < teacher_forcing_ratio) if train_mode else 0
        
        for t in range(0, T_pred):

            stochastic_decoder_output = self.decoder(decoder_input, e_outputss)
            
            # Reparameterization trick: sample = mean + noise * scale
            decoder_output              = torch.zeros(batch_size,1,2).cuda()            

            if stochastic_mode and path_mode=='single':
                # NOTE(review): uses every decoder position ([:,:,k]), unlike the other branches that take the last one — confirm shapes for t > 0.
                decoder_output[:,:,0]  = stochastic_decoder_output[:,:,0] + epsilonX.sample().cuda() * stochastic_decoder_output[:,:,1]
                decoder_output[:,:,1]  = stochastic_decoder_output[:,:,2] + epsilonY.sample().cuda() * stochastic_decoder_output[:,:,3]
            elif stochastic_mode and path_mode=='avg':
                # Average avg_n_path_eval noise samples before scaling.
                decoder_output[:,:,0]  = stochastic_decoder_output[:,:,0] + epsilonX.sample((avg_n_path_eval,1)).view(-1,avg_n_path_eval,1).mean(-2).cuda() * stochastic_decoder_output[:,:,1]
                decoder_output[:,:,1]  = stochastic_decoder_output[:,:,2] + epsilonY.sample((avg_n_path_eval,1)).view(-1,avg_n_path_eval,1).mean(-2).cuda() * stochastic_decoder_output[:,:,3]
            elif not(stochastic_mode):
                # Deterministic: take the two mean channels of the last position.
                decoder_output[:,:,0]  = stochastic_decoder_output[:,-1,0].clone().view(batch_size,-1) 
                decoder_output[:,:,1]  = stochastic_decoder_output[:,-1,2].clone().view(batch_size,-1) 
            elif stochastic_mode and path_mode == "bst":
                # Draw bst_n_path_eval candidates and keep the one closest to the ground truth of step t.
                epsilon_x               = torch.randn([batch_size,bst_n_path_eval,1], dtype=torch.float).cuda()
                epsilon_y               = torch.randn([batch_size,bst_n_path_eval,1], dtype=torch.float).cuda()
                multi_path_x            = stochastic_decoder_output[:,-1,0].clone().view(batch_size,1,-1) + epsilon_x * stochastic_decoder_output[:,-1,1].clone().view(batch_size,1,-1)
                multi_path_y            = stochastic_decoder_output[:,-1,2].clone().view(batch_size,1,-1) + epsilon_y * stochastic_decoder_output[:,-1,3].clone().view(batch_size,1,-1)
                ground_truth_x          = output_tensor[:,t,0].view(batch_size,1,1).cuda()
                ground_truth_y          = output_tensor[:,t,1].view(batch_size,1,1).cuda()
                diff_path_x             = multi_path_x - ground_truth_x
                diff_path_y             = multi_path_y - ground_truth_y
                diff_path               = (torch.sqrt( diff_path_x.pow(2) + diff_path_y.pow(2) )).sum(dim=-1)
                idx                     = torch.arange(batch_size,dtype=torch.long).cuda()
                # NOTE(review): `min` shadows the builtin inside this method.
                min                     = torch.argmin(diff_path,dim=1).squeeze()
                decoder_output[:,:,0]   = multi_path_x[idx,min,:].view(batch_size,1)
                decoder_output[:,:,1]   = multi_path_y[idx,min,:].view(batch_size,1)
            elif stochastic_mode and path_mode == "top5":
                # Draw bst_n_path_eval candidates and average the k closest to the ground truth of step t.
                # The selection reads output_tensor regardless of train_mode.
                k = 5 #top k                
                epsilon_x               = torch.randn([batch_size,bst_n_path_eval,1], dtype=torch.float).cuda()
                epsilon_y               = torch.randn([batch_size,bst_n_path_eval,1], dtype=torch.float).cuda()
                multi_path_x            = stochastic_decoder_output[:,-1,0].clone().view(batch_size,1,-1) + epsilon_x * stochastic_decoder_output[:,-1,1].clone().view(batch_size,1,-1)
                multi_path_y            = stochastic_decoder_output[:,-1,2].clone().view(batch_size,1,-1) + epsilon_y * stochastic_decoder_output[:,-1,3].clone().view(batch_size,1,-1)
                ground_truth_x          = output_tensor[:,t,0].view(batch_size,1,1).cuda()
                ground_truth_y          = output_tensor[:,t,1].view(batch_size,1,1).cuda()
                diff_path_x             = multi_path_x - ground_truth_x
                diff_path_y             = multi_path_y - ground_truth_y
                diff_path               = (torch.sqrt( diff_path_x.pow(2) + diff_path_y.pow(2) )).sum(dim=-1)
                # Row indices repeated k times so they pair with the k selected columns per sample.
                idx                     = torch.arange(batch_size,dtype=torch.long).repeat(k).view(k,-1).transpose(0,1).cuda()
                min_val, min            = torch.topk(diff_path, k=k, dim=1,largest=False)
                decoder_output[:,:,0]   = multi_path_x[idx,min,:].mean(dim=-2).view(batch_size,1)
                decoder_output[:,:,1]   = multi_path_y[idx,min,:].mean(dim=-2).view(batch_size,1)
                
            outputs[:,t,:]                        = decoder_output.clone().squeeze() #+ decoder_input.squeeze()
            stochastic_outputs[:,t,:]             = stochastic_decoder_output[:,-1,:].clone().squeeze()
            # Next decoder input is the whole prefix up to step t: ground truth under teacher forcing, own outputs otherwise.
            decoder_input                         = output_tensor[:,:t+1,:].clone() if teacher_force else outputs[:,:t+1,:].clone()

        return outputs, stochastic_outputs, visual_vsn_result

# Train


In [None]:
import shutil

def save_checkpoint(state, is_best, save_path, filename):
    """Serialize ``state`` to ``save_path/filename``; when ``is_best``, also copy it to ``model_best.pth``."""
    checkpoint_file = os.path.join(save_path, filename)
    torch.save(state, checkpoint_file)
    if not is_best:
        return
    shutil.copyfile(checkpoint_file, os.path.join(save_path, 'model_best.pth'))
        
def train(model, optimizer, scheduler, criterion, criterion_vision, clip,train_loader, validation_loader):
    """Run the training loop and write a checkpoint after every epoch.

    Besides its arguments, the function reads the notebook-level names
    ``epoch_num``, ``from_epoch``, ``data_size``, ``chunk_size``, ``T_pred``,
    ``biased_loss_mode``, ``regularization_factor``, ``batch_size`` and
    ``writer``, and it updates the global ``teacher_forcing_ratio``.

    Args:
        model: network called as
            ``model(obs, visual_obs, target, batch_size, train_mode=1)``.
        optimizer: optimizer stepped once per batch.
        scheduler: LR scheduler, stepped once per epoch while the last LR is
            above 0.001.
        criterion: loss applied to the cumulative predicted and ground-truth
            trajectories.
        criterion_vision: unused; kept for signature compatibility.
        clip: maximum gradient norm passed to ``clip_grad_norm_``.
        train_loader: iterable yielding ``(obs, pred, visual_obs, frame_tensor)``.
        validation_loader: unused; kept for signature compatibility.

    Returns:
        The loss of the last epoch, i.e. the summed batch losses divided by
        ``int(data_size/chunk_size)``.
    """
    global teacher_forcing_ratio

    losses = []
    print("Data Size ",data_size,"\tChunk Size ",chunk_size)
    counter = 0
    best_val = float("inf")
    epoch_loss = 0.0
    save_path = './save_modelseth1000ep'
    # torch.save does not create missing folders.
    os.makedirs(save_path, exist_ok=True)
    batches_per_epoch = int(data_size/chunk_size)

    # Per-timestep loss weights exp((2t+1)/T_pred) for t = 0..T_pred-1. They do
    # not depend on the batch, so build them once and let broadcasting expand
    # them over the batch and coordinate axes.
    step_weight = None
    if biased_loss_mode:
        step_weight = torch.arange(1,2*T_pred+1,2).cuda().float()
        step_weight = torch.exp(step_weight / T_pred).view(1,T_pred,1)

    for j in range(epoch_num):
        model.train()
        epoch_loss = 0
        if j%7 == 6:
            # Decay teacher forcing every 7th epoch; clamp so it never goes negative.
            teacher_forcing_ratio = max(teacher_forcing_ratio - 0.2, 0.0) if teacher_forcing_ratio>=0.1 else 0.0

        print("TEACHER FORCE RATIO\t",teacher_forcing_ratio)

        epoch_start = time.time()
        # NOTE(review): epochs below `from_epoch` skip training but are still
        # checkpointed below with a loss of 0 -- confirm this is intended.
        if(j>=from_epoch):
            ade_sum   = 0.0
            n_batches = 0
            for data in train_loader:
                obs, pred, visual_obs, frame_tensor = data
                input_tensor        = obs.float().squeeze().to('cuda', non_blocking=True)
                output_tensor       = pred.float().squeeze().to('cuda', non_blocking=True)
                visual_input_tensor = visual_obs.squeeze().to('cuda', non_blocking=True)

                # Gradients must be cleared for every batch; otherwise each step
                # also applies the gradients of all previous batches of the epoch.
                optimizer.zero_grad()
                prediction, stochastic_prediction, visual_embedding = model(input_tensor,visual_input_tensor,output_tensor,batch_size,train_mode=1)

                # The loss is computed on cumulative sums of the model output
                # and of the target along the time axis.
                calculated_prediction = prediction.cumsum(axis=1)
                target_path           = torch.cumsum(output_tensor,dim=-2)

                # Regularisation term (Lreg).
                loss_line_regularizer = distance_from_line_regularizer(input_tensor,calculated_prediction)

                if biased_loss_mode:
                    loss = criterion(calculated_prediction*step_weight, target_path*step_weight)
                else:
                    loss = criterion(calculated_prediction, target_path)  # mean squared error (Lmse)

                # Backward propagation on the total loss.
                total_loss = loss.double() + regularization_factor * loss_line_regularizer.double()
                total_loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
                optimizer.step()
                epoch_loss += total_loss.item()

                # Average displacement error of this batch, accumulated as a
                # plain float so no autograd graph is kept alive across batches.
                with torch.no_grad():
                    err_x = target_path[:,:,0] - calculated_prediction[:,:,0]
                    err_y = target_path[:,:,1] - calculated_prediction[:,:,1]
                    ade_sum += torch.sqrt(err_x**2 + err_y**2).mean().item()
                n_batches += 1
                torch.cuda.empty_cache()

            # tensorboard log
            if n_batches:
                writer.add_scalar('ADE/train', ade_sum/n_batches, counter)
            counter += 1

        if scheduler.get_last_lr()[0]>0.001:
            scheduler.step()
        epoch_loss = epoch_loss / batches_per_epoch
        losses.append(epoch_loss)
        display.clear_output(wait=True)
        plt.plot(losses, '--ro', label='train loss')
        plt.legend()
        plt.title(f'epoch {j}')
        plt.show()
        print("Time\t\t{:.2f} sec \n".format(time.time() - epoch_start))
        print("EPOCH ", j, "\tLOSS ", epoch_loss)
        # epoch_loss is already normalised above; log it as is.
        writer.add_scalar('epoch_loss/train', epoch_loss, j)
        # Flush instead of close so the shared writer stays open between epochs.
        writer.flush()
        print("-----------------------------------------------\n"+"-----------------------------------------------")

        # Save a checkpoint for every epoch plus a copy of the best one so far.
        print(np.argmin(losses))
        is_best  = epoch_loss < best_val
        best_val = min(epoch_loss, best_val)
        # Store the weights of the underlying network whether or not the model
        # is wrapped (e.g. by nn.DataParallel, which exposes it as `.module`).
        bare_model = getattr(model, 'module', model)
        save_checkpoint({'epoch': j+1,'state_dict': bare_model.state_dict(),'optimizer': optimizer.state_dict(),'scheduler': scheduler.state_dict(),'best_loss': best_val}, is_best, save_path, 'epoch_{}.pth'.format(j+1))

    return epoch_loss

# Validation


In [None]:
def validation(model, optimizer, criterion, criterion_vision, clip, validation_loader, counter):
    """Evaluate the model on a loader and log averaged metrics to TensorBoard.

    Reads the notebook-level names ``batch_size``, ``T_pred``,
    ``biased_loss_mode``, ``regularization_factor``, ``path_mode`` and
    ``writer``.

    Args:
        model: network called as
            ``model(obs, visual_obs, target, batch_size, train_mode=0)``.
        optimizer: unused; kept for signature compatibility.
        criterion: loss applied to cumulative predicted / ground-truth paths.
        criterion_vision: unused; kept for signature compatibility.
        clip: unused; kept for signature compatibility.
        validation_loader: iterable yielding
            ``(obs, pred, visual_obs, frame_tensor)``. When ``None`` the global
            ``test_loader`` is used, which is what the function always did before.
        counter: global step under which the scalars are logged.
    """
    loader = validation_loader if validation_loader is not None else test_loader
    model.eval()

    ade_sum = fde_sum = 0.0
    total_sum = loss_sum = reg_sum = 0.0
    n_batches = 0

    # Per-timestep loss weights exp((2t+1)/T_pred); broadcast over batch/coords.
    step_weight = None
    if biased_loss_mode:
        step_weight = torch.arange(1,2*T_pred+1,2).cuda().float()
        step_weight = torch.exp(step_weight / T_pred).view(1,T_pred,1)

    # No gradients are needed here; this also prevents the accumulated metrics
    # from holding on to autograd graphs.
    with torch.no_grad():
        for data in loader:
            # Forward
            obs, pred, visual_obs, frame_tensor = data
            input_tensor        = obs.float().squeeze().to('cuda', non_blocking=True)
            output_tensor       = pred.float().squeeze().to('cuda', non_blocking=True)
            visual_input_tensor = visual_obs.squeeze().to('cuda', non_blocking=True)
            prediction, stochastic_prediction, visual_embedding = model(input_tensor, visual_input_tensor, output_tensor, batch_size, train_mode=0)

            calculated_prediction = prediction.cumsum(axis=1)
            target_path           = torch.cumsum(output_tensor,dim=-2)

            loss_line_regularizer = distance_from_line_regularizer(input_tensor,calculated_prediction)

            if biased_loss_mode:
                loss = criterion(calculated_prediction*step_weight, target_path*step_weight)
            else:
                loss = criterion(calculated_prediction, target_path)

            # Euclidean error per sample and timestep.
            err_x = target_path[:,:,0] - calculated_prediction[:,:,0]
            err_y = target_path[:,:,1] - calculated_prediction[:,:,1]
            step_error = torch.sqrt(err_x**2 + err_y**2)

            batch_total = loss.double() + regularization_factor * loss_line_regularizer.double()
            print("Total Loss\t{:.2f}".format(batch_total.item()))

            ade_sum   += step_error.mean().item()          # mean over batch and time
            fde_sum   += step_error[:,-1].mean().item()    # last timestep only
            total_sum += batch_total.item()
            loss_sum  += loss.item()
            reg_sum   += loss_line_regularizer.item()
            n_batches += 1

    if n_batches == 0:
        print("validation: loader yielded no batches, nothing logged")
        return

    writer.add_scalar('ADE/val_'+path_mode,             ade_sum/n_batches,   counter)
    writer.add_scalar('FDE/val_'+path_mode,             fde_sum/n_batches,   counter)
    writer.add_scalar('LOSS/val_'+path_mode,            total_sum/n_batches, counter)
    writer.add_scalar('LOSS_c/val_'+path_mode,          loss_sum/n_batches,  counter)
    writer.add_scalar('L-REGULARIZER/val_'+path_mode,   reg_sum/n_batches,   counter)
    # Flush instead of close so the shared writer stays usable for later logging.
    writer.flush()

# Evaluate


In [None]:
def evaluate_eval(model, optimizer, criterion, criterion_vision, clip, five_fold_cross_validation):
    """Run the model over the global ``test_loader`` and export results.

    For every batch the observations, targets, predictions and stochastic
    outputs are flattened into one row per sample. The rows are written to the
    SQL table ``table_out + '_' + path_mode`` through ``cnx2`` and the mean
    ADE/FDE are logged to TensorBoard.

    Reads the notebook-level names ``test_loader``, ``batch_size``, ``T_obs``,
    ``T_pred``, ``hidden_size``, ``biased_loss_mode``,
    ``regularization_factor``, ``path_mode``, ``table_out``, ``cnx2``,
    ``data_size``, ``chunk_size`` and ``writer``.

    Args:
        model: network called as
            ``model(obs, visual_obs, target, batch_size, train_mode=0)``.
        optimizer: unused; kept for signature compatibility.
        criterion: loss applied to cumulative predicted / ground-truth paths.
        criterion_vision: unused; kept for signature compatibility.
        clip: unused; kept for signature compatibility.
        five_fold_cross_validation: unused.
            NOTE(review): the calls visible in this notebook pass the test
            loader in this slot, but the data always comes from the global
            ``test_loader`` -- confirm which is intended.

    Returns:
        Tuple ``(ADEs, FDEs, n)``: the sums of the per-batch ADE and FDE values
        and ``int(data_size/chunk_size)``.
    """
    model.eval()
    ADEs = 0
    FDEs = 0

    def _names(prefix, count):
        # Column names '<prefix>0' ... '<prefix>{count-1}'.
        return [prefix + str(k) for k in range(count)]

    list_x_obs          = _names('x_obs_', T_obs)
    list_y_obs          = _names('y_obs_', T_obs)
    list_x_pred         = _names('x_pred_', T_pred)
    list_y_pred         = _names('y_pred_', T_pred)
    list_x_stoch_pred_m = _names('x_stoch_pred_m_', T_pred)
    list_y_stoch_pred_m = _names('y_stoch_pred_m_', T_pred)
    list_x_stoch_pred_s = _names('x_stoch_pred_s_', T_pred)
    list_y_stoch_pred_s = _names('y_stoch_pred_s_', T_pred)
    list_x_out          = _names('x_out_', T_pred)
    list_y_out          = _names('y_out_', T_pred)
    list_vsn            = _names('vsn_', hidden_size)

    # Columns filled from each batch, in the order the tensors are concatenated.
    batch_columns = (list_x_obs + list_y_obs + list_x_out + list_y_out
                     + list_x_pred + list_y_pred
                     + list_x_stoch_pred_m + list_y_stoch_pred_m
                     + list_x_stoch_pred_s + list_y_stoch_pred_s)
    # The vsn_* columns belong to the output table but are never filled here.
    all_columns = batch_columns + list_vsn

    # Per-timestep loss weights exp((2t+1)/T_pred); broadcast over batch/coords.
    step_weight = None
    if biased_loss_mode:
        step_weight = torch.arange(1,2*T_pred+1,2).cuda().float()
        step_weight = torch.exp(step_weight / T_pred).view(1,T_pred,1)

    batch_frames = []
    with torch.no_grad():
        for data in test_loader:
            start_time = time.time()

            # Forward
            obs, pred, visual_obs, frame_tensor = data
            input_tensor        = obs.float().squeeze().to('cuda', non_blocking=True)
            output_tensor       = pred.float().squeeze().to('cuda', non_blocking=True)
            visual_input_tensor = visual_obs.squeeze().cuda()
            # squeeze() removes the batch axis when a batch holds one sample;
            # stop there, as the indexing below needs a 3-D tensor.
            if len(input_tensor.size()) == 2:
                break
            prediction, stochastic_prediction, visual_embedding = model(input_tensor,visual_input_tensor,output_tensor,batch_size,train_mode=0)

            calculated_prediction = prediction.cumsum(axis=1)
            target_path           = torch.cumsum(output_tensor,dim=-2)

            loss_line_regularizer = distance_from_line_regularizer(input_tensor,calculated_prediction)  # Lreg

            if biased_loss_mode:
                loss = criterion(calculated_prediction*step_weight, target_path*step_weight)
            else:
                loss = criterion(calculated_prediction, target_path)  # Lmse

            err_x = target_path[:,:,0] - calculated_prediction[:,:,0]
            err_y = target_path[:,:,1] - calculated_prediction[:,:,1]
            step_error = torch.sqrt(err_x**2 + err_y**2)
            ADE = step_error.mean()
            FDE = step_error[:,-1].mean()
            total_loss = loss.double() + regularization_factor * loss_line_regularizer.double()
            print("Total Loss\t{:.2f}".format(total_loss.item()))
            ADEs += ADE.item()
            FDEs += FDE.item()

            whole_data = torch.cat((
                input_tensor[:,:,0].view(-1, T_obs).cpu(),            # x_obs
                input_tensor[:,:,1].view(-1, T_obs).cpu(),            # y_obs
                output_tensor[:,:,0].view(-1, T_pred).cpu(),          # x_out
                output_tensor[:,:,1].view(-1, T_pred).cpu(),          # y_out
                prediction[:,:,0].view(-1, T_pred).cpu(),             # x_pred
                prediction[:,:,1].view(-1, T_pred).cpu(),             # y_pred
                stochastic_prediction[:,:,0].view(-1, T_pred).cpu(),  # x_stoch_pred_m
                stochastic_prediction[:,:,2].view(-1, T_pred).cpu(),  # y_stoch_pred_m
                stochastic_prediction[:,:,1].view(-1, T_pred).cpu(),  # x_stoch_pred_s
                stochastic_prediction[:,:,3].view(-1, T_pred).cpu(),  # y_stoch_pred_s
            ), 1)
            batch_frames.append(pd.DataFrame(whole_data.detach().cpu().numpy(), columns=batch_columns))

            print("Time\t\t{:.2f} sec \n".format(time.time() - start_time))

    # Concatenate once at the end: DataFrame.append no longer exists in
    # pandas >= 2.0 and appending inside the loop copied the frame every time.
    if batch_frames:
        df_out = pd.concat(batch_frames, ignore_index=True).reindex(columns=all_columns)
    else:
        df_out = pd.DataFrame(columns=all_columns)

    # ADE/FDE Report
    out_x  = df_out[list_x_out].cumsum(axis=1)
    pred_x = df_out[list_x_pred].cumsum(axis=1)
    out_y  = df_out[list_y_out].cumsum(axis=1)
    pred_y = df_out[list_y_pred].cumsum(axis=1)
    step_errors = (out_x.sub(pred_x.values)**2).add((out_y.sub(pred_y.values)**2).values, axis=1)**(1/2)
    df_out['ADE'] = step_errors.mean(axis=1)
    # Final displacement error = error at the last predicted step, for any T_pred.
    df_out['FDE'] = step_errors.iloc[:, -1]
    Mean_ADE = df_out.ADE.mean()
    Mean_FDE = df_out.FDE.mean()
    print("MEAN ADE/FDE\t",Mean_ADE,Mean_FDE)
    writer.add_scalar("Final_Test/ADE_"+path_mode, Mean_ADE, global_step=0)
    writer.add_scalar("Final_Test/FDE_"+path_mode, Mean_FDE, global_step=0)
    df_out.to_sql(table_out+'_'+path_mode, cnx2, if_exists="replace", index=False)
    # Flush instead of close: this function is called once per path mode and
    # the shared writer is reused.
    writer.flush()

    return ADEs, FDEs, int(data_size/chunk_size)

# MAIN


In [None]:
# Build the Seq2Seq network, wrap it in DataParallel and move it to the GPU.
model = nn.DataParallel(
    Seq2Seq(in_size, embed_size, hidden_size, dropout_val=dropout_val, batch_size=batch_size)
).cuda()

In [None]:
# Print the header line followed by the module structure of the model.
print("A summary of the model, where we can see the shape of each layer : ", model, sep="\n")

In [None]:
# Optimisation hyper-parameters.
learning_step               = 40     # StepLR step_size
initial_learning_rate       = 0.01
clip                        = 1      # handed to train() for gradient clipping

# Both criteria are mean-squared-error losses; they differ only in reduction.
criterion                   = nn.MSELoss(reduction='mean')
criterion_vision            = nn.MSELoss(reduction='sum')

# SGD with momentum and weight decay.
optimizer = optim.SGD(
    model.parameters(),
    lr=initial_learning_rate,
    momentum=0.9,
    weight_decay=0.01,
)
# Multiply the learning rate by 0.1 every `learning_step` scheduler steps.
scheduler = torch.optim.lr_scheduler.StepLR(
    optimizer,
    step_size=learning_step,
    gamma=0.1,
)
five_fold_cross_validation  = 0

In [None]:
# Training dataset and its shuffled loader (incomplete last batch is dropped).
print("Initializing train dataset")
dataset_train = TrajectoryPredictionDataset(image_folder_path, DB_PATH_train, cnx_train)
train_loader = torch.utils.data.DataLoader(
    dataset_train,
    batch_size=batch_size,
    shuffle=True,
    num_workers=2,
    drop_last=True,
    pin_memory=True,
)
# No validation loader is set up; train() is given None.
validation_loader = None

In [None]:
# Launch training and report the loss value returned by train().
print("TRAIN")
model.train()
print("path mode\t",path_mode)
loss = train(
    model=model,
    optimizer=optimizer,
    scheduler=scheduler,
    criterion=criterion,
    criterion_vision=criterion_vision,
    clip=clip,
    train_loader=train_loader,
    validation_loader=validation_loader,
)
print("LOSS ",loss)

In [None]:
def load_checkpoint(model, optimizer, scheduler, filename='checkpoint.pth.tar'):
    """Restore model, optimizer and scheduler state from a checkpoint file.

    The checkpoint is expected to be a dict with the keys ``'epoch'``,
    ``'state_dict'``, ``'optimizer'`` and ``'scheduler'``; ``'best_loss'`` is
    optional.

    Args:
        model: module whose ``load_state_dict`` receives ``checkpoint['state_dict']``.
        optimizer: optimizer restored from ``checkpoint['optimizer']``.
        scheduler: scheduler restored from ``checkpoint['scheduler']``.
        filename: path of the checkpoint file.

    Returns:
        Tuple ``(model, optimizer, scheduler, start_epoch, best_val)``. When the
        file does not exist the objects are returned untouched with
        ``start_epoch == 0`` and ``best_val == -1``; ``best_val`` is also ``-1``
        when the checkpoint has no ``'best_loss'`` entry.
    """
    start_epoch = 0
    best_val = -1
    if os.path.isfile(filename):
        print("=> loading checkpoint '{}'".format(filename))
        # Deserialize onto the CPU so the file can be read regardless of the
        # device it was saved from; load_state_dict copies the values into the
        # existing parameters on whatever device they live.
        checkpoint = torch.load(filename, map_location='cpu')
        start_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        scheduler.load_state_dict(checkpoint['scheduler'])
        # Checkpoints without a stored best loss fall back to -1.
        best_val = checkpoint.get('best_loss', -1)
        print("=> loaded checkpoint '{}' (epoch {})".format(filename, checkpoint['epoch']))
    else:
        print("=> no checkpoint found at '{}'".format(filename))

    return model, optimizer, scheduler, start_epoch, best_val

In [None]:
print("LOAD MODEL")
# Release cached GPU memory before building a fresh network.
if torch.cuda.is_available():
    torch.cuda.empty_cache()
# Fresh network without the DataParallel wrapper; the best checkpoint is loaded into it.
model = Seq2Seq(in_size, embed_size, hidden_size, dropout_val=dropout_val, batch_size=batch_size)
model, optimizer, scheduler, start_epoch, best_val = load_checkpoint(
    model,
    optimizer,
    scheduler,
    filename='./save_modelseth1000ep/model_best.pth',
)
# Move the restored network to the GPU.
model.cuda()

In [None]:
# Evaluation dataset and loader: fixed order, incomplete last batch kept.
print("Initializing val dataset")
dataset_val = TrajectoryPredictionDataset(image_folder_path, DB_PATH_val, cnx_val)
test_loader = torch.utils.data.DataLoader(
    dataset_val,
    batch_size=batch_size,
    shuffle=False,
    num_workers=1,
    drop_last=False,
    pin_memory=True,
)

In [None]:
print("EVALUATE bst")

# Select the 'bst' path mode (read as a global by evaluate_eval) and run the evaluation.
path_mode = 'bst'
model.eval()
print("path mode\t",path_mode)
evaluate_eval(
    model=model,
    optimizer=optimizer,
    criterion=criterion,
    criterion_vision=criterion_vision,
    clip=clip,
    five_fold_cross_validation=test_loader,
)

In [None]:
print("EVALUATE top5")
# Select the 'top5' path mode (read as a global by evaluate_eval) and run the evaluation.
path_mode = 'top5'
model.eval()
print("path mode\t",path_mode)
evaluate_eval(
    model=model,
    optimizer=optimizer,
    criterion=criterion,
    criterion_vision=criterion_vision,
    clip=clip,
    five_fold_cross_validation=test_loader,
)