In [1]:
import torch
import torch.nn as nn

# Visual Transformers

In [2]:

class EncoderLayer(nn.Module):
    """Post-norm Transformer encoder block: self-attention followed by an MLP.

    Args:
        embed_dim: Size of the last (feature) dimension of the input.
        num_heads: Number of attention heads given to ``nn.MultiheadAttention``.
    """

    def __init__(self, embed_dim, num_heads):
        super(EncoderLayer, self).__init__()

        self.attention = nn.MultiheadAttention(embed_dim, num_heads)

        # Position-wise feed-forward network with a 4x hidden expansion.
        self.feed = nn.Sequential(
            nn.Linear(embed_dim, embed_dim*4),
            nn.SiLU(inplace=True),
            nn.Linear(embed_dim*4, embed_dim)
        )

        # A single LayerNorm instance is applied after both residual additions.
        self.norm = nn.LayerNorm(embed_dim)

    def forward(self, x):
        """Apply self-attention and the feed-forward network, each with a residual.

        Args:
            x: Tensor whose last dimension is ``embed_dim``. The attention module
                is built with its default layout (``batch_first=False``), i.e.
                ``(seq_len, batch, embed_dim)``.

        Returns:
            Tensor with the same shape as ``x``.
        """
        # query, key, value
        # nn.MultiheadAttention returns (attn_output, attn_weights); only the
        # output tensor can be added to the residual, and the weights are not
        # needed here.
        y, _ = self.attention(x, x, x, need_weights=False)
        z = self.norm(x + y)

        w = self.feed(z)
        out = self.norm(w + z)
        return out

class Encoder(nn.Module):
    """Sequential stack of ``N`` ``EncoderLayer`` blocks.

    Args:
        embed_dim: Feature size forwarded to every ``EncoderLayer``.
        num_heads: Number of attention heads forwarded to every ``EncoderLayer``.
        N: Number of stacked layers.
    """

    def __init__(self, embed_dim, num_heads, N):
        super().__init__()
        blocks = []
        for _ in range(N):
            blocks.append(EncoderLayer(embed_dim, num_heads))
        self.layers = nn.ModuleList(blocks)

    def forward(self, x):
        """Feed ``x`` through every layer in order and return the final result."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden)
        return hidden


class DecoderLayer(nn.Module):
    """Post-norm Transformer decoder block.

    Runs self-attention over the object queries, cross-attention from the
    queries to the image features, and a feed-forward network. Each sub-layer
    is wrapped in a residual connection followed by layer normalisation.

    Args:
        embed_dim: Size of the last (feature) dimension of both inputs.
        num_heads: Number of attention heads for both attention modules.
    """

    def __init__(self, embed_dim, num_heads):
        super(DecoderLayer, self).__init__()

        self.self_attention = nn.MultiheadAttention(embed_dim, num_heads)
        self.cross_attention = nn.MultiheadAttention(embed_dim, num_heads)

        # Position-wise feed-forward network with a 4x hidden expansion.
        self.feed_forward = nn.Sequential(
            nn.Linear(embed_dim, embed_dim*4),
            nn.SiLU(inplace=True),
            nn.Linear(embed_dim*4, embed_dim)
        )

        # A single LayerNorm instance is applied after every residual addition.
        self.norm = nn.LayerNorm(embed_dim)

    def forward(self, object, image):
        """Refine the object queries using the image features.

        Args:
            object: Query tensor with last dimension ``embed_dim``. The name
                shadows the Python builtin but is kept so keyword callers
                keep working.
            image: Tensor attended to by the queries (used as key and value),
                last dimension ``embed_dim``.

        Both attention modules use the default ``batch_first=False`` layout,
        i.e. ``(seq_len, batch, embed_dim)``.

        Returns:
            Tensor with the same shape as ``object``.
        """
        # Self attention. nn.MultiheadAttention returns (output, weights);
        # only the output tensor takes part in the residual sum.
        z, _ = self.self_attention(object, object, object, need_weights=False)
        object = self.norm(z + object)

        # Cross attention: query = object, key = value = image
        z, _ = self.cross_attention(object, image, image, need_weights=False)
        object = self.norm(z + object)

        # Feed-forward sub-layer; its result must be folded back into the
        # stream, otherwise the sub-layer has no effect on the output.
        z = self.feed_forward(object)
        object = self.norm(z + object)

        return object


class Decoder(nn.Module):
    """Sequential stack of ``N`` ``DecoderLayer`` blocks.

    Args:
        embed_dim: Feature size forwarded to every ``DecoderLayer``.
        num_heads: Number of attention heads forwarded to every ``DecoderLayer``.
        N: Number of stacked layers.
    """

    def __init__(self, embed_dim, num_heads, N):
        super(Decoder, self).__init__()
        self.layers = nn.ModuleList([DecoderLayer(embed_dim, num_heads) for _ in range(N)])

    def forward(self, x, image=None):
        """Run ``x`` through every decoder layer, attending to ``image`` each time.

        Args:
            x: Object/query tensor passed as the first argument of each layer.
            image: Tensor passed as the second argument of each layer. It is
                declared with a ``None`` default only to keep the old
                one-argument signature parseable; it must be supplied.

        Returns:
            The output of the last layer (``x`` itself when there are no layers).

        Raises:
            TypeError: If ``image`` is not provided.
        """
        if image is None:
            # DecoderLayer.forward takes (object, image); calling it with a
            # single argument fails anyway, so fail early with a clear message.
            raise TypeError("Decoder.forward() missing required argument: 'image'")
        for layer in self.layers:
            x = layer(x, image)
        return x


In [3]:
# from transformers import BertModel

# class SentimentClassifier(nn.Module):
#     def __init__(self, bert_model_name="bert-base-uncased", num_classes=2):
#         super(SentimentClassifier, self).__init__()
#         self.bert = BertModel.from_pretrained(bert_model_name)
#         self.drop = nn.Dropout(p=0.3)
#         self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)

#     def forward(self, input_ids, attention_mask):
#         outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
#         # last_hidden_state: [batch_size, n_seq, hidden_size]
#         cls = outputs.last_hidden_state[:, 0, :]  # Extract the [CLS] token
#         x = self.drop(cls)
#         x = self.fc(x)
#         return x  # No softmax here because the output is usually fed to CrossEntropyLoss


In [4]:
class Attention(nn.Module):
    """Self-attention with explicit query/key/value projections.

    The input is projected by three separate linear layers and the results are
    passed to ``nn.MultiheadAttention`` as query, key and value.

    Args:
        embed_dim: Size of the last (feature) dimension of the input.
        num_heads: Number of attention heads.
        dropout: Dropout probability forwarded to ``nn.MultiheadAttention``.
    """

    def __init__(self, embed_dim, num_heads, dropout):
        super(Attention, self).__init__()
        self.attention = nn.MultiheadAttention(embed_dim, num_heads, dropout)
        self.Q = nn.Linear(embed_dim, embed_dim)
        self.K = nn.Linear(embed_dim, embed_dim)
        self.V = nn.Linear(embed_dim, embed_dim)

    def forward(self, x):
        """Return the attention output for ``x``.

        Args:
            x: Tensor whose last dimension is ``embed_dim``; the attention
                module uses its default ``batch_first=False`` layout.

        Returns:
            Tensor with the same shape as ``x``.
        """
        # Each projection has its own layer; using self.Q for all three would
        # leave self.K and self.V untrained.
        q = self.Q(x)
        k = self.K(x)
        v = self.V(x)
        # nn.MultiheadAttention returns (attn_output, attn_weights).
        outputs, _ = self.attention(q, k, v, need_weights=False)

        return outputs

In [5]:
class MLPHead(nn.Module):
    """Two-layer perceptron head: Linear -> ReLU -> Linear.

    Args:
        input_dim: Number of input features.
        hidden_dim: Width of the hidden layer.
        output_dim: Number of output logits.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()

        # Layers are created in forward order so parameter initialisation
        # consumes the random generator in the same sequence as before.
        hidden_layer = nn.Linear(input_dim, hidden_dim)
        activation = nn.ReLU(inplace=True)
        output_layer = nn.Linear(hidden_dim, output_dim)
        self.mlp = nn.Sequential(hidden_layer, activation, output_layer)

    def forward(self, x):
        """Return the raw logits produced by the MLP for ``x``."""
        return self.mlp(x)



# Smoke test: push one random 3x32x32 tensor through MLPHead.
# NOTE(review): `input` shadows the Python builtin of the same name, and the
# flatten starts at dim 0, so the leading batch axis is folded in as well and
# the model receives a 1-D vector of 3 * 32 * 32 values.
input = torch.randn(1, 3, 32, 32).flatten(0)

print(f"Input shape: {input.shape}")

model = MLPHead(input_dim=3 * 32 * 32, hidden_dim=32 * 32, output_dim=10)

# Forward pass only, so gradient tracking is switched off.
with torch.no_grad():
    output = model(input)

print(f"Output shape: {output.shape}")
print(output)

Input shape: torch.Size([3072])
Output shape: torch.Size([10])
tensor([ 0.0545,  0.0264, -0.4073,  0.0245,  0.1403, -0.2161,  0.2665,  0.2861,
         0.0946, -0.2695])


## VAE

<img src="../images/image18.jpg" height="400"/>


In [6]:
class VAE(nn.Module):
    """Fully-connected variational autoencoder with one hidden layer per side.

    Args:
        in_chan: Number of input (and reconstructed) features.
        n_hidden: Width of the hidden layer in both encoder and decoder.
        n_latent: Dimensionality of the latent space.
    """

    def __init__(self, in_chan, n_hidden, n_latent):
        super(VAE, self).__init__()

        # Encoder
        self.fc = nn.Linear(in_chan, n_hidden)
        self.f_mu = nn.Linear(n_hidden, n_latent)
        self.f_logvar = nn.Linear(n_hidden, n_latent)

        # Decoder
        self.dec1 = nn.Linear(n_latent, n_hidden)
        self.dec2 = nn.Linear(n_hidden, in_chan)

        self.ReLU = nn.ReLU()

    def encoder(self, x):
        """Map ``x`` to the mean and log-variance of the latent Gaussian."""
        h = self.ReLU(self.fc(x))
        mu = self.f_mu(h)
        logvar = self.f_logvar(h)
        return mu, logvar

    def decoder(self, z):
        """Map a latent sample ``z`` to a reconstruction with values in [0, 1]."""
        h = self.ReLU(self.dec1(z))
        # Sigmoid (not ReLU) on the output: lossVAE feeds the reconstruction to
        # binary_cross_entropy, which only accepts values between 0 and 1.
        y = torch.sigmoid(self.dec2(h))
        return y

    # Reparameterization trick
    # z = μ + σ ⋅ ϵ
    def reparameterize(self, mu, logvar):
        """Draw ``z = mu + std * eps`` with ``eps`` sampled from N(0, I)."""
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return mu + eps * std

    def forward(self, x):
        """Encode ``x``, sample a latent vector and decode it.

        Returns:
            Tuple ``(y, mu, logvar)``: the reconstruction and the latent
            mean / log-variance produced by the encoder.
        """
        mu, logvar = self.encoder(x)
        z = self.reparameterize(mu, logvar)
        y = self.decoder(z)
        return y, mu, logvar

def lossVAE(x_recon, x, mu, logvar):
    """Return the VAE objective: summed reconstruction BCE plus KL divergence.

    Args:
        x_recon: Reconstruction; passed as the ``input`` of
            ``binary_cross_entropy``, so its values must lie in [0, 1].
        x: Target tensor with the same shape as ``x_recon``.
        mu: Mean of the approximate posterior.
        logvar: Log-variance of the approximate posterior.

    Returns:
        Scalar tensor holding the sum of both terms.
    """
    reconstruction_term = torch.nn.functional.binary_cross_entropy(
        x_recon, x, reduction='sum'
    )
    # Closed-form KL(N(mu, sigma^2) || N(0, 1)), summed over all elements.
    kl_elements = 1 + logvar - mu.pow(2) - logvar.exp()
    kl_term = -0.5 * torch.sum(kl_elements)
    return reconstruction_term + kl_term
