In [1]:
import torch
import torch.nn as nn
import torchvision
import torch.nn.functional as F
from torchvision.models import resnet18

In [11]:
## modelの作成
model = torch.hub.load('pytorch/vision:v0.10.0', 'mobilenet_v2', pretrained=True)
print("model : ", model)

## 1層目の畳み込み層を1チャネルに変更
model.features[0][0] = nn.Sequential(
    nn.Conv2d(1, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False),
    nn.BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True),
    nn.ReLU6(inplace=True),
)

In [30]:
class MobileNetV2_audio(nn.Module):
  def __init__(self, dim=2048, pred_dim=512, arch=None):

    super(MobileNetV2_audio, self).__init__()

    # backboneの作成
    self.backbone = torch.hub.load('pytorch/vision:v0.10.0', 'mobilenet_v2', pretrained=True)
    feature_dim = 1280

    # backboneの1層目の畳み込み層を変更
    self.backbone.features[0][0] = nn.Sequential(
        nn.Conv2d(1, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False),
        nn.BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True),
        nn.ReLU6(inplace=True),
    )

    # classifierをIdentity()に変更
    self.backbone.classifier = nn.Identity()

    # Projectorの設定
    self.projector = nn.Sequential(nn.Linear(feature_dim, feature_dim, bias=False),
                                    nn.BatchNorm1d(feature_dim),
                                    nn.ReLU(inplace=True),
                                    nn.Linear(feature_dim, feature_dim, bias=False),
                                    nn.BatchNorm1d(feature_dim),
                                    nn.ReLU(inplace=True),
                                    nn.Linear(feature_dim, dim, bias=True),
                                    nn.BatchNorm1d(dim, affine=False)
    )

    # Predictorの作成
    self.predictor = nn.Sequential(nn.Linear(dim, pred_dim, bias=False),
                                  nn.BatchNorm1d(pred_dim),
                                  nn.ReLU(inplace=True),
                                  nn.Linear(pred_dim, dim))


  def forward(self, x):

    # backboneの出力
    z1 = self.backbone(x)
    print("z1.shape : ", z1.shape)

    # Projectorの出力
    z2 = self.projector(z1)
    print("z2.shape : ", z2.shape)

    # Predictorの出力
    z3 = self.predictor(z2)
    print("z3.shape : ", z3.shape)

    return z3



In [31]:
model = MobileNetV2_audio()
#print("model : ", model)


Using cache found in /root/.cache/torch/hub/pytorch_vision_v0.10.0


In [33]:
a = torch.randn([10, 1, 224, 350])
y = model(a)
print(y.shape)

z1.shape :  torch.Size([10, 1280])
z2.shape :  torch.Size([10, 2048])
z3.shape :  torch.Size([10, 2048])
torch.Size([10, 2048])


In [12]:
class MobileNetV2_audio(nn.Module):
  def __init__(self, dim=2048, pred_dim=512, arch=None):

    super(MobileNetV2_audio, self).__init__()

    # backboneの作成
    self.backbone = torchvision.models.mobilenet_v2(pretrained=True)
    feature_dim = 1280

    # backboneの1層目の畳み込み層を変更
    self.backbone.features[0][0] = nn.Sequential(
        nn.Conv2d(1, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False),
        nn.BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True),
        nn.ReLU6(inplace=True),
    )

    # classifierをIdentity()に変更
    self.backbone.classifier = nn.Identity()

    # Projectorの設定
    self.projector = nn.Sequential(nn.Linear(feature_dim, feature_dim, bias=False),
                                    nn.BatchNorm1d(feature_dim),
                                    nn.ReLU(inplace=True),
                                    nn.Linear(feature_dim, feature_dim, bias=False),
                                    nn.BatchNorm1d(feature_dim),
                                    nn.ReLU(inplace=True),
                                    nn.Linear(feature_dim, dim, bias=True),
                                    nn.BatchNorm1d(dim, affine=False)
    )

    # Predictorの作成
    self.predictor = nn.Sequential(nn.Linear(dim, pred_dim, bias=False),
                                  nn.BatchNorm1d(pred_dim),
                                  nn.ReLU(inplace=True),
                                  nn.Linear(pred_dim, dim))


  def forward(self, x1, x2=None):

    # backboneの出力
    self.outputs = []
    # for i, layer in enumerate(self.backbone.features):
    #   x = layer(x)
    #   print("x.shape : ", x.shape)
    #   #self.outputs.append(x)
    #   self.outputs.append(F.adaptive_avg_pool2d(x, 1))

    # for feat in self.outputs:
    #   print("feat.shape : ", feat.shape)
    # print(cfghj)

    feature = self.backbone(x1)
    print("feature.shape : ", feature.shape)

    # Projectorの出力
    z1 = self.projector(feature)
    print("z1.shape : ", z1.shape)

    # Predictorの出力
    p1 = self.predictor(z1)
    print("p1.shape : ", p1.shape)

    if x2 is None:
      return p1, z1.detach(), feature

    # encoderの出力
    feature2 = self.backbone(x2)

    # projectionの出力
    z2 = self.projector(feature2)

    # predictorの出力
    p2 = self.predictor(z2)

    return p1, p2, z1.detach(), z2.detach(),





In [13]:
model = MobileNetV2_audio()
print(model)

MobileNetV2_audio(
  (backbone): MobileNetV2(
    (features): Sequential(
      (0): Conv2dNormActivation(
        (0): Sequential(
          (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
          (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (2): ReLU6(inplace=True)
        )
        (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU6(inplace=True)
      )
      (1): InvertedResidual(
        (conv): Sequential(
          (0): Conv2dNormActivation(
            (0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
            (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): ReLU6(inplace=True)
          )
          (1): Conv2d(32, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (2): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_runn



In [17]:
a = torch.randn([10, 1, 224, 350])
b = torch.randn([10, 1, 224, 350])

p1, p2, z1, z2 = model(a, b)


feature.shape :  torch.Size([10, 1280])
z1.shape :  torch.Size([10, 2048])
p1.shape :  torch.Size([10, 2048])
