In [None]:
import torch
import torch.nn as nn
import torchvision.models as models

# Load a ResNet50 with pretrained weights.
model = models.resnet50(pretrained=True)
# Switch to inference mode. Without this the BatchNorm layers normalise with
# the statistics of the current batch and keep updating their running
# averages, so the result is not the pretrained network's real prediction.
model.eval()

# Dummy input image laid out as (batch_size, channels, height, width).
input_image = torch.randn(1, 3, 224, 224)

# Run the full network, including its final fully connected layer; gradients
# are not needed for inference.
with torch.no_grad():
    features = model(input_image)

print(features.shape)  # Output: torch.Size([1, 1000])
# In this example the vector has length 1000 because the last layer of
# ResNet50 has 1000 output units, one per classification label.

In [None]:
import torch.nn.functional as F
class SimpleCNN(nn.Module):
    """Small two-stage convolutional feature extractor.

    Each stage applies a 3x3 convolution (stride 1, padding 1), a ReLU and a
    shared 2x2 max-pool with stride 2. Channels go 3 -> 16 -> 32 and every
    pooling step downsamples height and width by a factor of two.
    """

    def __init__(self):
        super().__init__()
        conv_kwargs = dict(kernel_size=3, stride=1, padding=1)
        self.conv1 = nn.Conv2d(3, 16, **conv_kwargs)
        self.pool = nn.MaxPool2d(2, stride=2, padding=0)
        self.conv2 = nn.Conv2d(16, 32, **conv_kwargs)

    def forward(self, x):
        """Return the feature map produced after the second conv/pool stage."""
        for conv in (self.conv1, self.conv2):
            # conv -> ReLU -> pool; the pooling layer is reused by both stages.
            x = self.pool(F.relu(conv(x)))
        return x

model = SimpleCNN()
print(model)

# Input image (batch_size, channels, height, width)
input_image = torch.randn(1, 3, 32, 32)

# Get the feature map produced by the convolutional layers
with torch.no_grad():
    feature_map = model(input_image)

print(feature_map.shape)  # Output: torch.Size([1, 32, 8, 8])

In [None]:
import numpy as np
import torch
from torch import nn
from torch.nn import init

class ExternalAttention(nn.Module):
    """Attention computed through two bias-free linear layers.

    ``mk`` projects the last dimension from ``d_model`` to ``S`` and ``mv``
    projects it back from ``S`` to ``d_model``.

    Args:
        d_model: Size of the last dimension of the input.
        S: Width of the intermediate projection (default 64).
    """

    def __init__(self, d_model, S=64):
        super().__init__()
        self.mk = nn.Linear(d_model, S, bias=False)
        self.mv = nn.Linear(S, d_model, bias=False)
        self.softmax = nn.Softmax(dim=1)
        self.init_weights()

    def init_weights(self):
        """Re-initialise every Conv2d, BatchNorm2d and Linear sub-module."""
        for layer in self.modules():
            if isinstance(layer, nn.BatchNorm2d):
                init.constant_(layer.weight, 1)
                init.constant_(layer.bias, 0)
                continue

            if isinstance(layer, nn.Conv2d):
                init.kaiming_normal_(layer.weight, mode='fan_out')
            elif isinstance(layer, nn.Linear):
                init.normal_(layer.weight, std=0.001)
            else:
                continue

            # Conv2d and Linear share the same bias handling.
            if layer.bias is not None:
                init.constant_(layer.bias, 0)

    def forward(self, queries):
        """Apply the attention to ``queries``.

        For a 3-D input of shape (bs, n, d_model) the result has the same
        shape as the input.
        """
        # (bs, n, S): softmax across dim 1, then rescale so that the entries
        # along dim 2 sum to one.
        scores = self.softmax(self.mk(queries))
        scores = scores / scores.sum(dim=2, keepdim=True)
        # (bs, n, d_model)
        return self.mv(scores)


if __name__ == '__main__':
    # Smoke test: push a random (50, 49, 512) batch through the module and
    # print the output shape.
    # The tensor is called `sample_queries` rather than `input`: inside a
    # notebook this guard is true, so every name assigned here becomes a
    # kernel-wide global, and `input` would shadow the built-in `input()`.
    sample_queries = torch.randn(50, 49, 512)
    ea = ExternalAttention(d_model=512, S=8)
    output = ea(sample_queries)
    print(output.shape)



In [None]:
from models.attention_modules.self_attention import AttentionModule
# Instantiate the project's AttentionModule (feature_size=512,
# text_feature_size=512, 8 heads) and print its layer structure.
# Note: this rebinds the notebook-wide name `model`.
model = AttentionModule(feature_size=512,text_feature_size=512, num_heads=8)
print(model)

In [None]:
import torch
import torch.nn as nn
from transformers import RobertaModel, RobertaTokenizer


In [None]:
# A minimal RoBERTa model
class SimpleRoBERTa(nn.Module):
    """Thin ``nn.Module`` wrapper around a pretrained ``RobertaModel``.

    Args:
        pretrained_model_name: Identifier handed to
            ``RobertaModel.from_pretrained`` (default ``'roberta-base'``).
    """

    def __init__(self, pretrained_model_name='roberta-base'):
        super().__init__()
        self.roberta = RobertaModel.from_pretrained(pretrained_model_name)

    def forward(self, input_ids, attention_mask):
        """Run the wrapped encoder and return its output unchanged.

        The return value is whatever ``self.roberta`` returns, not a single
        tensor.
        """
        return self.roberta(input_ids, attention_mask)


In [None]:
# Define a RobertaTokenizer used to encode text
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')


In [None]:
# We can now use the simple RoBERTa model to encode a piece of text.
# Input text
text = "I love natural language processing!"

# Use the tokenizer to convert the text into the input format the model needs
input_ids = tokenizer.encode(text, add_special_tokens=True, return_tensors='pt')
attention_mask = torch.ones_like(input_ids)

# Instantiate the model (this loads the pretrained weights) and run it
# without gradient tracking, since this cell only performs inference.
model = SimpleRoBERTa()
with torch.no_grad():
    outputs = model(input_ids, attention_mask)

# Print the shape of the encoding. The model returns an output object rather
# than a tensor, so `outputs.size()` raises AttributeError; the per-token
# encoding is exposed as `last_hidden_state`.
print(outputs.last_hidden_state.size())


In [None]:
import torch
from transformers import RobertaModel, RobertaTokenizer

# Load the RoBERTa model and tokenizer
model = RobertaModel.from_pretrained('roberta-base')
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Input text
text = "Hello, how are you?"

# Encode the text with the tokenizer
inputs = tokenizer(text, return_tensors="pt")

# Pass the inputs to the model for inference
outputs = model(**inputs)

# Print the shapes of the output tensors
print("Last hidden state shape:", outputs.last_hidden_state.shape)
print("Pooler output shape:", outputs.pooler_output.shape)


In [None]:
from torchvision.models import resnet18, resnet50
# Load a pretrained ResNet50 and print its layer structure.
# Note: this rebinds the notebook-wide name `model`.
model = resnet50(pretrained=True)
print(model)

In [None]:
import torch
from torch import nn

class eca_layer(nn.Module):
    """Constructs an ECA module: a per-channel gate for a feature map.

    Args:
        channel: Number of channels of the input feature map. Accepted for
            interface compatibility; the layer itself does not use it.
        k_size: Kernel size of the 1-D convolution applied across channels.
    """

    def __init__(self, channel, k_size=3):
        super().__init__()
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.conv = nn.Conv1d(1, 1, kernel_size=k_size, padding=(k_size - 1) // 2, bias=False)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return ``x`` scaled channel-wise by the learned gate."""
        # Global average pooling: one descriptor value per channel.
        pooled = self.avg_pool(x)

        # Lay the channels out as a length-C sequence with a single conv
        # channel so the 1-D convolution mixes neighbouring channels, then
        # restore the original layout.
        channel_seq = pooled.squeeze(-1).transpose(-1, -2)
        mixed = self.conv(channel_seq)
        gate = self.sigmoid(mixed.transpose(-1, -2).unsqueeze(-1))

        # Broadcast the gate over the spatial dimensions and rescale.
        return x * gate.expand_as(x)
        

In [None]:
import torch.nn as nn
import math
# import torch.utils.model_zoo as model_zoo
try:
    from .eca_module import eca_layer
except ImportError:
    # A relative import has no parent package when this cell runs in a
    # notebook kernel. Fall back to the `eca_layer` class defined in an
    # earlier cell, and re-raise if no such definition exists.
    if "eca_layer" not in globals():
        raise


def conv3x3(in_planes, out_planes, stride=1):
    """Build a bias-free 3x3 convolution with padding 1.

    Args:
        in_planes: Number of input channels.
        out_planes: Number of output channels.
        stride: Convolution stride (default 1).

    Returns:
        The configured ``nn.Conv2d`` layer.
    """
    return nn.Conv2d(
        in_channels=in_planes,
        out_channels=out_planes,
        kernel_size=3,
        stride=stride,
        padding=1,
        bias=False,
    )


class ECABasicBlock(nn.Module):
    """Residual block with two 3x3 convolutions and an ECA gate.

    The ECA layer is applied to the output of the second conv/BN pair, before
    the shortcut is added.

    Args:
        inplanes: Number of input channels.
        planes: Number of output channels.
        stride: Stride of the first convolution.
        downsample: Optional module applied to the input to build the
            shortcut; when ``None`` the input itself is the shortcut.
        k_size: Kernel size forwarded to ``eca_layer``.
    """
    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None, k_size=3):
        super().__init__()
        # Only the first convolution carries the stride.
        self.conv1 = conv3x3(inplanes, planes, stride)
        self.bn1 = nn.BatchNorm2d(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(planes, planes, 1)
        self.bn2 = nn.BatchNorm2d(planes)
        self.eca = eca_layer(planes, k_size)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        """Return ``relu(eca(main_path(x)) + shortcut(x))``."""
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.eca(self.bn2(self.conv2(out)))

        shortcut = x if self.downsample is None else self.downsample(x)

        out += shortcut
        return self.relu(out)


class ECABottleneck(nn.Module):
    """Bottleneck residual block (1x1 -> 3x3 -> 1x1) with an ECA gate.

    The last 1x1 convolution widens the features to ``planes * 4`` channels;
    the ECA layer is applied to that output before the shortcut is added.

    Args:
        inplanes: Number of input channels.
        planes: Width of the two inner convolutions.
        stride: Stride of the 3x3 convolution.
        downsample: Optional module applied to the input to build the
            shortcut; when ``None`` the input itself is the shortcut.
        k_size: Kernel size forwarded to ``eca_layer``.
    """
    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None, k_size=3):
        super().__init__()
        out_planes = planes * 4
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        # Only the middle 3x3 convolution carries the stride.
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv3 = nn.Conv2d(planes, out_planes, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(out_planes)
        self.relu = nn.ReLU(inplace=True)
        self.eca = eca_layer(out_planes, k_size)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        """Return ``relu(eca(main_path(x)) + shortcut(x))``."""
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.relu(self.bn2(self.conv2(out)))
        out = self.eca(self.bn3(self.conv3(out)))

        shortcut = x if self.downsample is None else self.downsample(x)

        out += shortcut
        return self.relu(out)


class ResNet(nn.Module):
    """ResNet-style classifier assembled from a configurable block type.

    Args:
        block: Residual block class. It must expose an ``expansion`` class
            attribute and accept
            ``(inplanes, planes, stride, downsample, k_size)``.
        layers: Number of blocks in each of the four stages.
        num_classes: Output size of the final fully connected layer.
        k_size: Per-stage kernel size forwarded to the blocks (four entries).
            The default list is only read, never mutated.
    """

    def __init__(self, block, layers, num_classes=1000, k_size=[3, 3, 3, 3]):
        super(ResNet, self).__init__()
        # Channel count entering the next block; advanced by _make_layer.
        self.inplanes = 64
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3,
                               bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.layer1 = self._make_layer(block, 64, layers[0], int(k_size[0]))
        self.layer2 = self._make_layer(block, 128, layers[1], int(k_size[1]), stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], int(k_size[2]), stride=2)
        self.layer4 = self._make_layer(block, 512, layers[3], int(k_size[3]), stride=2)
        # Adaptive pooling always reduces the last feature map to 1x1, so the
        # classifier works for any input resolution. A fixed
        # AvgPool2d(7, stride=1) gives a 1x1 map only when the incoming map is
        # exactly 7x7; for that case both layers compute the same average.
        self.avgpool = nn.AdaptiveAvgPool2d(1)
        self.fc = nn.Linear(512 * block.expansion, num_classes)

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                # Normal init with std sqrt(2 / (kh * kw * out_channels)).
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()

    def _make_layer(self, block, planes, blocks, k_size, stride=1):
        """Build one stage made of ``blocks`` consecutive ``block`` instances.

        Only the first block receives ``stride`` and, when the stride or the
        channel count changes, a 1x1 conv + BatchNorm shortcut projection.
        ``self.inplanes`` is updated to the stage's output channel count.
        """
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                nn.Conv2d(self.inplanes, planes * block.expansion,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(planes * block.expansion),
            )

        layers = [block(self.inplanes, planes, stride, downsample, k_size)]
        self.inplanes = planes * block.expansion
        layers.extend(block(self.inplanes, planes, k_size=k_size)
                      for _ in range(1, blocks))

        return nn.Sequential(*layers)

    def forward(self, x):
        """Return the ``(batch, num_classes)`` output of the final linear layer."""
        # Stem
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        # Four residual stages
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        # Global pooling, flatten, classify
        x = self.avgpool(x)
        x = x.view(x.size(0), -1)
        x = self.fc(x)

        return x


def eca_resnet18(k_size=[3, 3, 3, 3], num_classes=1_000, pretrained=False):
    """Constructs a ResNet-18 model built from ECABasicBlock blocks.

    Args:
        k_size: Adaptive selection of kernel size (one entry per stage)
        num_classes: The number of classes of the classifier
        pretrained (bool): Accepted for API compatibility only. No pretrained
            weights are loaded here; passing True emits a warning.

    Returns:
        The constructed model, with its ``avgpool`` set to
        ``nn.AdaptiveAvgPool2d(1)``.
    """
    if pretrained:
        # Warn instead of silently handing back an untrained network.
        import warnings
        warnings.warn(
            "eca_resnet18: pretrained=True is ignored; no pretrained weights "
            "are loaded.",
            stacklevel=2,
        )
    model = ResNet(ECABasicBlock, [2, 2, 2, 2], num_classes=num_classes, k_size=k_size)
    model.avgpool = nn.AdaptiveAvgPool2d(1)
    return model


def eca_resnet50(k_size=[3, 3, 3, 3], num_classes=1000, pretrained=False):
    """Constructs a ResNet-50 model built from ECABottleneck blocks.

    Args:
        k_size: Adaptive selection of kernel size (one entry per stage)
        num_classes: The number of classes of the classifier
        pretrained (bool): Accepted for API compatibility only. No pretrained
            weights are loaded here; passing True emits a warning.

    Returns:
        The constructed model, with its ``avgpool`` set to
        ``nn.AdaptiveAvgPool2d(1)``.
    """
    print("Constructing eca_resnet50......")
    if pretrained:
        # Warn instead of silently handing back an untrained network.
        import warnings
        warnings.warn(
            "eca_resnet50: pretrained=True is ignored; no pretrained weights "
            "are loaded.",
            stacklevel=2,
        )
    model = ResNet(ECABottleneck, [3, 4, 6, 3], num_classes=num_classes, k_size=k_size)
    model.avgpool = nn.AdaptiveAvgPool2d(1)
    return model

In [None]:
from models.attention_modules.self_attention import AttentionModule, SelfAttentionMap, GlobalCrossAttentionMap
# Instantiate AttentionModule with positional arguments and print its layers.
# NOTE(review): presumably (feature_size, text_feature_size, num_heads), as in
# the keyword call in an earlier cell - confirm against the class signature.
model = AttentionModule(512,512,2)
print(model)

In [None]:
# Integer placeholders standing in for compositor / encoder objects.
compositor_object1 = 1
compositor_object2 = 13

text_encoder_object = lower_img_encoder_object = upper_img_encoder_object = 2

compositors = dict(
    compositor1=compositor_object1,
    compositor2=compositor_object2,
)

# NOTE(review): this rebinds the name `models`, which the first cell uses as
# the alias for `torchvision.models`.
models = dict(
    text_encoder=text_encoder_object,
    lower_image_encoder=lower_img_encoder_object,
    upper_image_encoder=upper_img_encoder_object,
)
models

In [None]:
# Merge the compositor entries into `models` in place (dict.update mutates
# the existing dict and returns None), then show the combined mapping.
models.update(compositors)
print(models)

In [None]:
from models.compositors.global_style_models import GlobalStyleTransformer2
from models.compositors.transformers import DisentangledTransformer


def global_styler_factory(code, feature_size, text_feature_size):
    """Create the global styler identified by ``code``.

    Args:
        code: Styler identifier; compared with ``GlobalStyleTransformer2.code()``.
        feature_size: First constructor argument of the styler.
        text_feature_size: Second constructor argument of the styler.

    Returns:
        A ``GlobalStyleTransformer2`` instance.

    Raises:
        ValueError: If ``code`` does not match a known styler.
    """
    # Guard clause: reject unknown codes first.
    if code != GlobalStyleTransformer2.code():
        raise ValueError("{} not exists".format(code))
    return GlobalStyleTransformer2(feature_size, text_feature_size)


def transformer_factory(feature_sizes, configs):
    """Build the compositor mapping for the ``layer4`` features.

    Args:
        feature_sizes: Mapping that provides ``'text_feature_size'`` and
            ``'layer4'``.
        configs: Mapping that provides ``'num_heads'`` and ``'global_styler'``.

    Returns:
        A dict with the single key ``'layer4'`` holding a
        ``DisentangledTransformer``.

    Raises:
        KeyError: If one of the required keys is missing.
    """
    # Lookups are kept in this order so a missing key reports the same
    # KeyError as before.
    text_feature_size = feature_sizes['text_feature_size']
    num_heads = configs['num_heads']
    styler_code = configs['global_styler']
    layer4_size = feature_sizes['layer4']

    styler = global_styler_factory(styler_code, layer4_size, text_feature_size)
    transformer = DisentangledTransformer(
        layer4_size,
        text_feature_size,
        num_heads=num_heads,
        global_styler=styler,
    )
    return {'layer4': transformer}


In [3]:
from models.compositors import transformer_factory
from models.image_encoders import image_encoder_factory
from models.text_encoders import text_encoder_factory
from language import vocabulary_factory
from data.shoes import ShoesDataset
# Default vocabulary file per dataset code: every code returned by
# ShoesDataset.all_codes() maps to ShoesDataset.vocab_path(). The `**` spread
# leaves room to merge in further datasets the same way.
DEFAULT_VOCAB_PATHS = {
    **dict.fromkeys(ShoesDataset.all_codes(), ShoesDataset.vocab_path()),
}
DEFAULT_VOCAB_PATHS

{'shoes': './data/shoes/shoes_vocab.pkl'}

In [21]:
# Experiment configuration consumed by the factory calls in the cells that
# follow (vocabulary, text/image encoders, compositor).
configs = {
    'alpha_scale': 1,
    'augmenter': 'normal_gaussian',
    'batch_size': 24,
    'beta_scale': 1,
    'checkpoint_path': './ckpt',
    'compositor': 'transformer',
    'config_path': 'configs/shoes_config.json',
    'dataset': 'shoes',
    'decay_step': 35,
    'decay_step_second': 42,
    'device_idx': '0',
    'epoch': 1,
    'evaluator': 'simple',
    'experiment_description': 'bert',
    'experiment_dir': 'experiments',
    'export_root': 'experiments\\bert_2024-07-06_0',  # NOTE(review): Windows-style path separator
    'feature_size': 512,
    'gamma': 0.1,
    'gamma_scale': 1,
    'global_styler': 'global2',
    'heads': 12,
    'image_encoder': 'resnet50_layer4',
    'img_size': 224,
    'layers': 12,
    'lr': 0.0001,
    'lr_scheduler': 'MultiStepWithWarmup',
    'lstm_hidden_size': 512,
    'margin': 12,
    'metric_loss': 'batch_based_aleatoric_loss',
    'momentum': 0.9,
    'norm_scale': 4,
    'num_gpu': 1,
    'num_heads': 8,
    'num_workers': 0,
    'optimizer': 'RAdam',
    'output_dim': 512,
    'patch_size': 16,
    'random_seed': 13877,
    'selector': 'all',
    'shuffle': True,
    'stride': True,
    'stride_size': 16,
    'text_encoder': 'roberta',
    'text_feature_size': 512,
    'topk': '1,5,10,50',
    'trainer': 'tirg',
    'use_transform': True,
    'vocab_path': '../data/shoes/shoes_vocab.pkl',  # non-empty, so it is used instead of the DEFAULT_VOCAB_PATHS entry
    'vocab_threshold': 0,
    'wandb_account_name': 'yubin06',  # NOTE(review): personal account name hardcoded - consider reading it from the environment
    'wandb_project_name': 'UR',
    'warmup_iters': 5,
    'weight_decay': 5e-05,
    'width': 768,
    'word_embedding_size': 512
}
# Build the vocabulary. An explicitly configured path wins; an empty value
# falls back to the default path registered for the configured dataset.
vocabulary = vocabulary_factory(config=dict(
    vocab_path=configs['vocab_path'] or DEFAULT_VOCAB_PATHS[configs['dataset']],
    vocab_threshold=configs['vocab_threshold'],
))
print(vocabulary)

<language.vocabulary.SimpleVocabulary object at 0x00000220D8EFF370>


In [22]:
# Create the text encoder (with its companion `text_fc`) and the lower/upper
# image encoders from the shared configuration.
text_encoder, text_fc = text_encoder_factory(vocabulary, config=configs)
lower_img_encoder, upper_img_encoder = image_encoder_factory(config=configs)
layer_shapes = lower_img_encoder.layer_shapes()

# Build the compositor from the encoders' feature sizes.
compositors = transformer_factory(
    dict(
        layer4=layer_shapes['layer4'],
        image_feature_size=upper_img_encoder.feature_size,
        text_feature_size=text_encoder.feature_size,
    ),
    configs=configs,
)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
print(compositors)

{'layer4': DisentangledTransformer(
  (att_module): AttentionModule(
    (gam_attention): GAM_Attention(
      (channel_attention): Sequential(
        (0): Linear(in_features=2048, out_features=512, bias=True)
        (1): ReLU(inplace=True)
        (2): Linear(in_features=512, out_features=2048, bias=True)
      )
      (spatial_attention): Sequential(
        (0): Conv2d(2048, 512, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3))
        (1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU(inplace=True)
        (3): Conv2d(512, 2048, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3))
        (4): BatchNorm2d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (self_att_generator): SelfAttentionMap(
      (W_k): Conv2d(2048, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (W_q): Conv2d(2048, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (softmax): Softmax(dim=2)
    )
    

```python
{'layer4': DisentangledTransformer(
  (att_module): AttentionModule(
    (gam_attention): GAM_Attention(
      (channel_attention): Sequential(
        (0): Linear(in_features=2048, out_features=512, bias=True)
        (1): ReLU(inplace=True)
        (2): Linear(in_features=512, out_features=2048, bias=True)
      )
      (spatial_attention): Sequential(
        (0): Conv2d(2048, 512, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3))
        (1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU(inplace=True)
        (3): Conv2d(512, 2048, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3))
        (4): BatchNorm2d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (self_att_generator): SelfAttentionMap(
      (W_k): Conv2d(2048, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (W_q): Conv2d(2048, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (softmax): Softmax(dim=2)
    )
    (global_att_generator): GlobalCrossAttentionMap(
      (W_t): Linear(in_features=512, out_features=2048, bias=True)
      (normalize): Softmax(dim=1)
    )
    (merge): Conv2d(2560, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (W_v): Conv2d(2048, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (W_r): Conv2d(2048, 2048, kernel_size=(1, 1), stride=(1, 1))
  )
  (att_module2): AttentionModule(
    (gam_attention): GAM_Attention(
      (channel_attention): Sequential(
        (0): Linear(in_features=2048, out_features=512, bias=True)
        (1): ReLU(inplace=True)
        (2): Linear(in_features=512, out_features=2048, bias=True)
      )
      (spatial_attention): Sequential(
        (0): Conv2d(2048, 512, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3))
        (1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU(inplace=True)
        (3): Conv2d(512, 2048, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3))
        (4): BatchNorm2d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (self_att_generator): SelfAttentionMap(
      (W_k): Conv2d(2048, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (W_q): Conv2d(2048, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (softmax): Softmax(dim=2)
    )
    (global_att_generator): GlobalCrossAttentionMap(
      (W_t): Linear(in_features=512, out_features=2048, bias=True)
      (normalize): Softmax(dim=1)
    )
    (merge): Conv2d(2560, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (W_v): Conv2d(2048, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (W_r): Conv2d(2048, 2048, kernel_size=(1, 1), stride=(1, 1))
  )
  (global_styler): GlobalStyleTransformer2(
    (global_transform): EqualLinear(
      (linear): Linear(in_features=512, out_features=4096, bias=True)
    )
    (gate): EqualLinear(
      (linear): Linear(in_features=512, out_features=4096, bias=True)
    )
    (sigmoid): Sigmoid()
  )
  (instance_norm): InstanceNorm2d(2048, eps=1e-05, momentum=0.1, affine=False, track_running_stats=False)
)}
```