# 模型方法分析

In [1]:
import torch
import torch.nn as nn
from models.transformer_cosine import TransformerEncoder, TransformerEncoderLayer
import torch.nn.functional as F
from models.vgg_c import make_layers, cfg

In [2]:
# Hyper-parameters handed to the transformer encoder layer / encoder below.
d_model, nhead, num_layers = 512, 2, 4
dim_feedforward = 2048
dropout = 0.1
activation = "relu"
normalize_before = False

# A single encoder layer; TransformerEncoder receives it together with
# the requested number of layers.
encoder_layer = TransformerEncoderLayer(
    d_model, nhead, dim_feedforward, dropout, activation, normalize_before
)

# A final LayerNorm is only created when normalize_before is enabled;
# otherwise None is passed to the encoder.
if normalize_before:
    if_norm = nn.LayerNorm(d_model)
else:
    if_norm = None
encoder = TransformerEncoder(encoder_layer, num_layers, if_norm)

# Head that reduces the channel count 512 -> 256 -> 128 -> 1.
# The two 3x3 convolutions use padding=1; the last convolution is 1x1.
reg_layer_0 = nn.Sequential(
    nn.Conv2d(512, 256, 3, padding=1),
    nn.ReLU(inplace=True),
    nn.Conv2d(256, 128, 3, padding=1),
    nn.ReLU(inplace=True),
    nn.Conv2d(128, 1, kernel_size=1),
)

In [6]:
def _show_size(label, tensor):
    """Print ``label`` followed by the size of ``tensor``."""
    print(label, tensor.size())


# Random stand-in for a single 3-channel 512x512 input.
x = torch.rand(1, 3, 512, 512)

# Feature extractor built by make_layers from configuration 'E'.
features_layer = make_layers(cfg['E'])
x = features_layer(x)
_show_size("Feature x.size() =", x)

# Remember the feature-map dimensions so the map can be rebuilt afterwards.
bs, c, h, w = x.size()

# Merge the two trailing (spatial) axes into one.
x = x.flatten(start_dim=2)
_show_size("Flatten x.size() =", x)

# Move the flattened spatial axis to the front before calling the encoder.
x = x.permute(2, 0, 1)
_show_size('Transformer input x.size() =', x)

# The encoder also receives the spatial size; its second return value is
# not used further in this cell.
x, features = encoder(x, (h, w))

# Undo the permutation and restore the (bs, c, h, w) layout.
x = x.permute(1, 2, 0)
x = x.view(bs, c, h, w)
_show_size("Output x.size() =", x)

# Bilinear resize to a fixed 32x32 grid.
x = F.interpolate(x, size=(32, 32), mode='bilinear', align_corners=True)
_show_size("After upsampling x.size() =", x)

# Apply the head built in the previous cell.
x = reg_layer_0(x)
_show_size("Regression head x.size() =", x)

Feature x.size() = torch.Size([1, 512, 16, 16])
Flatten x.size() = torch.Size([1, 512, 256])
Transformer input x.size() = torch.Size([256, 1, 512])
Output x.size() = torch.Size([1, 512, 16, 16])
After upsampling x.size() = torch.Size([1, 512, 32, 32])
Regression head x.size() = torch.Size([1, 1, 32, 32])
