In [19]:
import torch
import torch.nn as nn
import numpy as np

In [35]:
# 0 The functions below live outside a class, so the hyperparameter and the
#   random projection matrix are defined up front as module-level globals.
num_pos_feats = 64  # number of random frequencies; the final encoding has 2 * num_pos_feats channels

# Draw the projection matrix from a dedicated, seeded generator so that
# "Restart Kernel -> Run All" reproduces exactly the same encoding, without
# touching the state of torch's global RNG.
_pe_generator = torch.Generator().manual_seed(0)
positional_encoding_gaussian_matrix = torch.randn((2, num_pos_feats), generator=_pe_generator)
print(positional_encoding_gaussian_matrix.shape)

torch.Size([2, 64])


In [37]:
# 1 Positionally encode point coordinates; the input must already be normalized to [0, 1].

def position_embedding_encoding(coords, gaussian_matrix=None):
    """Map normalized point coordinates to random Fourier features.

    Args:
        coords: tensor of shape [d1, d2, ..., 2]. The leading dimensions are
            arbitrary (e.g. a grid [H, W] or [batch_size, N]); the last
            dimension holds one point, e.g. [0.1, 0.3], normalized to [0, 1].
        gaussian_matrix: optional projection matrix of shape [2, F]. When
            omitted, the module-level ``positional_encoding_gaussian_matrix``
            is used, which keeps existing one-argument calls working.

    Returns:
        Tensor of shape [d1, d2, ..., 2 * F]: the sines of the projected
        coordinates followed by their cosines along the last dimension.
    """
    if gaussian_matrix is None:
        gaussian_matrix = positional_encoding_gaussian_matrix

    # matmul requires both operands to share a dtype; casting here lets
    # integer or float64 coordinates work instead of raising.
    coords = coords.to(gaussian_matrix.dtype)
    coords = 2 * coords - 1  # rescale from [0, 1] to [-1, 1]
    coords = coords @ gaussian_matrix  # e.g. [16, 16, 2] @ [2, 64] -> [16, 16, 64]
    coords = 2 * np.pi * coords  # shape unchanged

    # Concatenate sin and cos along the last dimension, doubling its size.
    return torch.cat([torch.sin(coords), torch.cos(coords)], dim=-1)

In [39]:
# 2 Define the forward function

def forward(size):
    """Build the positional encoding for every cell of an (h, w) grid.

    Args:
        size: pair (h, w), e.g. (16, 16).

    Returns:
        The output of position_embedding_encoding for the grid's normalized
        cell-center coordinates, permuted to channels-first: [C, h, w]
        (C = 128 when num_pos_feats = 64).
    """
    height, width = size

    # Cumulative sums over an all-ones float32 grid give 1, 2, ..., n along
    # the chosen axis; subtracting 0.5 moves each value to the cell center
    # and dividing by the extent normalizes it into (0, 1).
    ones = torch.ones((height, width), dtype=torch.float32)
    row_centers = (ones.cumsum(dim=0) - 0.5) / height  # y coordinate, shape (h, w)
    col_centers = (ones.cumsum(dim=1) - 0.5) / width   # x coordinate, shape (h, w)

    # The last dimension stores each point as (x, y): shape [h, w, 2].
    xy = torch.stack([col_centers, row_centers], dim=-1)

    encoded = position_embedding_encoding(xy)  # [h, w, C]
    return encoded.permute(2, 0, 1)  # channels first: [C, h, w]

In [41]:
# 2.2 Smoke-test the forward function
# Use a 16 x 16 grid, i.e. size = (16, 16)
size = (16, 16)
output_from_forward = forward(size)
print(output_from_forward.shape)

torch.Size([128, 16, 16])


In [43]:
# 3 Define the forward_with_coords function

def forward_with_coords(coords_input, image_size):
    """Positionally encode points that are not normalized to [0,1].

    Args:
        coords_input: tensor of shape [..., 2] (typically B x N x 2) holding
            (x, y) pixel coordinates. Any numeric dtype is accepted; the
            input tensor is not modified.
        image_size: (height, width) of the image the points refer to.

    Returns:
        The output of position_embedding_encoding for the normalized
        points (B x N x C).
    """
    # Cast to float as part of the copy, BEFORE dividing. Writing x / W back
    # into an integer tensor would silently truncate every normalized
    # coordinate to an integer (e.g. 50 / 200 -> 0).
    coords = coords_input.to(torch.float, copy=True)
    coords[..., 0] = coords[..., 0] / image_size[1]  # x / width
    coords[..., 1] = coords[..., 1] / image_size[0]  # y / height
    return position_embedding_encoding(coords)  # B x N x C

In [45]:
# 3.3 Smoke-test the forward_with_coords function

# Assume batch_size = 2 and N = 3, i.e. two batches of three (x, y) points each:
#   batch 1: (50, 50), (100, 100), (150, 150)
#   batch 2: (30, 40), (70, 80), (110, 120)
# NOTE(review): with image_size = (100, 200) read as (height, width), the points
# whose y exceeds 100 lie outside the image and normalize to values > 1 — confirm
# whether that is intended for this demo.
coords_input = torch.tensor(
    [
        [[50, 50], [100, 100], [150, 150]],
        [[30, 40], [70, 80], [110, 120]],
    ]
)  # shape = (2, 3, 2)
image_size = (100, 200)

encoded_coords = forward_with_coords(coords_input, image_size)
print(encoded_coords.shape)  # expected shape = (2, 3, 128)

torch.Size([2, 3, 128])
