In [59]:
!pip install keras_core



In [60]:
import os
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" # Disable tensorflow debugging logs
os.environ["KERAS_BACKEND"] = "torch"
import keras_core as keras
import tensorflow as tf
import random
import torch

In [61]:
import pathlib

path_to_zip = tf.keras.utils.get_file(
    "spa-eng.zip", origin="http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip",
    extract=True)
path_to_file = pathlib.Path(path_to_zip).parent/"spa-eng/spa.txt"

with open(path_to_file) as f:
    lines = f.read().split("\n")[:-1]

text_pairs = []
for line in lines:
    eng, spa = line.lower().split("\t")
    text_pairs.append((eng, spa))

In [62]:
random.Random(43).shuffle(text_pairs)
num_val_samples = int(0.005 * len(text_pairs))
num_train_samples = len(text_pairs) - num_val_samples
train_pairs = text_pairs[:num_train_samples]
val_pairs = text_pairs[num_train_samples:]

print(f"{len(text_pairs)} total pairs")
print(f"{len(train_pairs)} training pairs")
print(f"{len(val_pairs)} validation pairs")

118964 total pairs
118370 training pairs
594 validation pairs


In [63]:
for s in train_pairs[:5]:
    print(s)

('the old woman fell and could not get up.', 'la anciana se cayó y no pudo levantarse.')
('what is this the abbreviation for?', '¿de qué es abreviatura esto?')
("you're not sick.", 'no estás enferma.')
('i have no knife to cut with.', 'no tengo un cuchillo con que cortarlo.')
('americans admire lincoln for his honesty.', 'los estadounidenses admiran a lincoln por su honestidad.')


# PIPELINE


In [64]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import vocab as Vocab
from collections import Counter

In [65]:
!python -m spacy download en_core_web_sm
!python -m spacy download es_core_news_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m31.8 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Collecting es-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-3.7.0/es_core_news_sm-3.7.0-py3-none-any.whl (12.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.9/12.9 MB[0m [31m39.3 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation succ

In [66]:
eng_tokenizer = get_tokenizer("spacy", language="en_core_web_sm")
spa_tokenizer = get_tokenizer("spacy", language="es_core_news_sm")



In [68]:
def build_vocab(text, tokenizers, min_freq=5):
    eng_tokenizer, spa_tokenizer = tokenizers
    eng_counter = Counter()
    spa_counter = Counter()
    for eng_string_, spa_string_ in text:
        eng_counter.update(eng_tokenizer(eng_string_))
        spa_counter.update(spa_tokenizer(spa_string_))
    eng_vocab = Vocab(eng_counter, min_freq=min_freq,
                       specials=["<unk>", "<pad>"])
    spa_vocab = Vocab(spa_counter, min_freq=min_freq,
                       specials=["<unk>", "<pad>", "<bos>", "<eos>"])
    return eng_vocab, spa_vocab

eng_vocab, spa_vocab = build_vocab(text_pairs,
                                   [eng_tokenizer, spa_tokenizer],
                                   min_freq=0)

In [69]:
eng_vocab_size = len(eng_vocab)
spa_vocab_size = len(spa_vocab)
eng_vocab_size, spa_vocab_size

(13229, 26116)

In [70]:
maxlen = 20

def data_process(text, eng_vocab, spa_vocab, eng_tokenizer, spa_tokenizer):
    data = []
    for eng, spa in text:
        eng_tensor_ = torch.tensor([eng_vocab[token] for token in eng_tokenizer(eng)],
                                dtype=torch.long)
        spa_tensor_ = torch.tensor([spa_vocab[token] for token in spa_tokenizer(spa)],
                                dtype=torch.long)

        if eng_tensor_.shape[0] < maxlen and spa_tensor_.shape[0] < maxlen - 2:
            data.append((eng_tensor_, spa_tensor_))
    return data

train_data = data_process(train_pairs, eng_vocab, spa_vocab, eng_tokenizer, spa_tokenizer)
val_data = data_process(val_pairs, eng_vocab, spa_vocab, eng_tokenizer, spa_tokenizer)

print(len(train_data), len(val_data))

117552 591


In [71]:
batch_size = 64
PAD_IDX = eng_vocab["<pad>"]
BOS_IDX = spa_vocab["<bos>"]
EOS_IDX = spa_vocab["<eos>"]
UNK_IDX = spa_vocab["<unk>"]

from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
from torch.nn import functional as F


def pad_to_max_length(tensor, max_length):
    current_length = tensor.size(1)
    if current_length < max_length:
        padding_size = max_length - current_length
        padding = torch.full((tensor.size(0), padding_size), PAD_IDX, dtype=torch.long)
        tensor = torch.cat([tensor, padding], dim=1)
    return tensor

def generate_batch(data_batch):
    x, y = [], []

    for (x_item, y_item) in data_batch:

        x.append(torch.cat([x_item]))
        y.append(torch.cat([torch.tensor([BOS_IDX]),
                            y_item,
                            torch.tensor([EOS_IDX])], dim=0))

    x = pad_sequence(x, batch_first=True, padding_value=PAD_IDX)
    y = pad_sequence(y, batch_first=True, padding_value=PAD_IDX)

    # Ajustar a la longitud máxima
    x = pad_to_max_length(x, maxlen +2)
    y = pad_to_max_length(y, maxlen +2)

    return x, y ## Aquí tengo mis dudas si


In [72]:
train_loader = DataLoader(train_data, batch_size=batch_size,
                          shuffle=True, collate_fn=generate_batch,
                          num_workers=4, pin_memory=True)
val_loader = DataLoader(val_data, batch_size=batch_size,
                          shuffle=True, collate_fn=generate_batch,
                          num_workers=4, pin_memory=True)





In [73]:
 train_batch, train_target_batch = next(iter(train_loader))
 train_batch.shape, train_target_batch.shape

 print(train_target_batch)

  self.pid = os.fork()
  self.pid = os.fork()


tensor([[   2,  104,  377,  ...,    1,    1,    1],
        [   2,   80,   16,  ...,    1,    1,    1],
        [   2,   16,   44,  ...,    1,    1,    1],
        ...,
        [   2,   13,   91,  ...,    1,    1,    1],
        [   2,   31,    4,  ...,    1,    1,    1],
        [   2,  344, 2188,  ...,    1,    1,    1]])


# ATENCION

In [74]:
import torch.nn as nn
from torch import optim
import time

In [108]:
class Attention(nn.Module): ##SELF ATTENTION
    def __init__(self, dim, maxlen, n_heads=4, bias=True):
        super().__init__()
        self.n_heads = n_heads
        self.scale = (dim // n_heads) ** -0.5
        self.qw = nn.Linear(dim, dim, bias = bias)
        self.kw = nn.Linear(dim, dim, bias = bias)
        self.vw = nn.Linear(dim, dim, bias = bias)

        self.ow = nn.Linear(dim, dim, bias = bias)
        self.register_buffer("bias", torch.tril(torch.ones(maxlen, maxlen)).view(1, 1, maxlen, maxlen))

    def forward(self, x):
        B, L, D = x.shape
        q = self.qw(x)
        k = self.kw(x)
        v = self.vw(x)

        B, L, D = q.shape
        q = torch.reshape(q, [B, L, self.n_heads, -1])
        q = torch.permute(q, [0, 2, 1, 3])

        k = torch.reshape(k, [B, L, self.n_heads, -1])
        k = torch.permute(k, [0, 2, 3, 1])
        v = torch.reshape(v, [B, L, self.n_heads, -1])
        v = torch.permute(v, [0, 2, 1, 3])

        qk = torch.matmul(q, k) * self.scale
        qk = qk.masked_fill(self.bias[:,:,:L,:L] == 0, float('-inf'))

        attn = torch.softmax(qk, dim=-1)

        v_attn = torch.matmul(attn, v)
        v_attn = torch.permute(v_attn, [0, 2, 1, 3])
        v_attn = torch.reshape(v_attn, [B, L, D])

        x = self.ow(v_attn)
        return x


test_layer = Attention(32, maxlen+2, n_heads=1)
test_layer(torch.ones([1, maxlen+2, 32]))

tensor([[[-0.2197,  0.4882, -0.5777,  0.1943, -0.4862,  0.1287,  0.3880,
           0.1625,  0.0408, -0.2899,  0.5814, -0.3933, -0.4094, -0.3302,
          -0.2562,  0.3721, -0.1940, -0.0112, -0.4259, -0.2208, -0.3669,
           0.4376, -0.1189,  0.4268,  0.1117,  0.1598,  0.7720,  0.3906,
          -0.3718, -0.2117, -0.4396, -0.0301],
         [-0.2197,  0.4882, -0.5777,  0.1943, -0.4862,  0.1287,  0.3880,
           0.1625,  0.0408, -0.2899,  0.5814, -0.3933, -0.4094, -0.3302,
          -0.2562,  0.3721, -0.1940, -0.0112, -0.4259, -0.2208, -0.3669,
           0.4376, -0.1189,  0.4268,  0.1117,  0.1598,  0.7720,  0.3906,
          -0.3718, -0.2117, -0.4396, -0.0301],
         [-0.2197,  0.4882, -0.5777,  0.1943, -0.4862,  0.1287,  0.3880,
           0.1625,  0.0408, -0.2899,  0.5814, -0.3933, -0.4094, -0.3302,
          -0.2562,  0.3721, -0.1940, -0.0112, -0.4259, -0.2208, -0.3669,
           0.4376, -0.1189,  0.4268,  0.1117,  0.1598,  0.7720,  0.3906,
          -0.3718, -0.2117, -0

In [109]:
import torch
import torch.nn as nn

class CrossAttention(nn.Module): ##CROSS ATTENTION
    def __init__(self, dim, maxlen, n_heads=4, bias=True):
        super().__init__()
        self.n_heads = n_heads
        self.scale = (dim // n_heads) ** -0.5
        self.qw = nn.Linear(dim, dim, bias=bias)
        self.kw = nn.Linear(dim, dim, bias=bias)
        self.vw = nn.Linear(dim, dim, bias=bias)

        self.ow = nn.Linear(dim, dim, bias=bias)
        self.register_buffer("bias", torch.tril(torch.ones(maxlen, maxlen)).view(1, 1, maxlen, maxlen))

    def forward(self, x): # k y v son la frase en inglés. Contexto

        B, L, D = x.shape
        q = self.qw(x)
        k = self.kw(x)
        v = self.vw(x)

        B, L_context, D = k.shape
        q = torch.reshape(q, [B, L, self.n_heads, -1])
        q = torch.permute(q, [0, 2, 1, 3])
        k = torch.reshape(k, [B, L_context, self.n_heads, -1])
        k = torch.permute(k, [0, 2, 3, 1])
        v = torch.reshape(v, [B, L_context, self.n_heads, -1])
        v = torch.permute(v, [0, 2, 1, 3])

        qk = torch.matmul(q, k) * self.scale
        qk = qk.masked_fill(self.bias[:, :, :L, :L_context] == 0, float('-inf'))

        attn = torch.softmax(qk, dim=-1)

        v_attn = torch.matmul(attn, v)
        v_attn = torch.permute(v_attn, [0, 2, 1, 3])
        v_attn = torch.reshape(v_attn, [B, L, D])

        x = self.ow(v_attn)
        return x

test_layer = CrossAttention(32, maxlen + 2, n_heads=1)
inputs = torch.ones([1, maxlen, 32])
#context = torch.ones([ maxlen-6, 32])
#context2 = torch.ones([ maxlen-6, 32])
test_layer(inputs)


tensor([[[-0.1370, -0.2503, -0.1604,  0.1016,  0.1489, -0.3069,  0.0478,
          -0.0468, -0.7308,  0.4106, -0.0410,  0.3223, -0.2518,  0.1145,
           0.2870,  0.0367, -0.3215, -0.1188,  0.0482,  0.0102, -0.0298,
           0.1470, -0.0341, -0.3269,  0.2657,  0.3793, -0.0728,  0.0913,
          -0.0336, -0.0882, -0.1693,  0.1536],
         [-0.1370, -0.2503, -0.1604,  0.1016,  0.1489, -0.3069,  0.0478,
          -0.0468, -0.7308,  0.4106, -0.0410,  0.3223, -0.2518,  0.1145,
           0.2870,  0.0367, -0.3215, -0.1188,  0.0482,  0.0102, -0.0298,
           0.1470, -0.0341, -0.3269,  0.2657,  0.3793, -0.0728,  0.0913,
          -0.0336, -0.0882, -0.1693,  0.1536],
         [-0.1370, -0.2503, -0.1604,  0.1016,  0.1489, -0.3069,  0.0478,
          -0.0468, -0.7308,  0.4106, -0.0410,  0.3223, -0.2518,  0.1145,
           0.2870,  0.0367, -0.3215, -0.1188,  0.0482,  0.0102, -0.0298,
           0.1470, -0.0341, -0.3269,  0.2657,  0.3793, -0.0728,  0.0913,
          -0.0336, -0.0882, -0

In [110]:
import torch
import torch.nn as nn

class CrossAttention3(nn.Module):
    def __init__(self, input_dim, context_dim):
        super().__init__()
        self.q_linear = nn.Linear(input_dim, input_dim)
        self.k_linear = nn.Linear(context_dim, input_dim)
        self.v_linear = nn.Linear(context_dim, input_dim)
        self.out = nn.Linear(input_dim, input_dim)

    def forward(self, input_seq, context_seq):
        q = self.q_linear(input_seq)
        k = self.k_linear(context_seq)
        v = self.v_linear(context_seq)

        attention_weights = torch.matmul(q, k.transpose(-2, -1))
        attention_weights = torch.softmax(attention_weights, dim=-1)

        output = torch.matmul(attention_weights, v)
        output = self.out(output)

        return output

# Ejemplo de uso
input_dim = 64
context_dim = 128
max_seq_len = 10

cross_attn = CrossAttention3(input_dim, context_dim)
input_seq = torch.randn(1, max_seq_len, input_dim)  # Tamaño del lote, longitud de la secuencia, dimensión de entrada
context_seq = torch.randn(1, max_seq_len, context_dim)  # Tamaño del lote, longitud de la secuencia, dimensión de contexto

output = cross_attn(input_seq, context_seq)
print(output.shape)  # Salida: torch.Size([1, 10, 64])


torch.Size([1, 10, 64])


In [111]:
class Transformer(nn.Module):
    def __init__(self, dim, maxlen, heads=4, mlp_dim=512, rate=0.0):
        super().__init__()
        #self.ln_1 = nn.LayerNorm(dim)
        #self.c_attn = CrossAttention(dim, maxlen)
        self.ln_2 = nn.LayerNorm(dim)
        self.attn = Attention(dim, maxlen)
        self.ln_3 = nn.LayerNorm(dim)
        self.mlp = nn.Sequential(
            nn.Linear(dim, mlp_dim),
            nn.GELU(),
            nn.Dropout(rate),
            nn.Linear(mlp_dim, dim),
            nn.Dropout(rate),
        )

    def forward(self, x):
        #x = self.c_attn(self.ln_1(x)) + x
        x = self.attn(self.ln_2(x)) + x
        return self.mlp(self.ln_3(x)) + x


test_layer = Transformer(32, maxlen + 2)
output = test_layer( torch.ones([1, maxlen + 2, 32]))
print(output.shape)  # Debería imprimir el tamaño del tensor resultant

torch.Size([1, 22, 32])


In [112]:
train_batch.shape

torch.Size([64, 22])

In [114]:
class GPT(nn.Module):
    def __init__(self, dim, vocab_size_spa, vocab_size_eng, maxlen, depth=3,
                 mlp_dim=512, rate=0.2):
        super().__init__()
        self.embedding_spa = nn.Embedding(vocab_size_spa, dim)
        self.pos_embedding_spa = nn.Parameter(
            torch.randn(1, maxlen, dim))
        self.embedding_eng = nn.Embedding(vocab_size_eng, dim)
        self.pos_embedding_eng = nn.Parameter(
            torch.randn(1, maxlen, dim))

        self.transformer = nn.Sequential()
        for _ in range(depth):
            self.transformer.append(Transformer(dim, maxlen))

        self.head = nn.Linear(dim, vocab_size_spa, bias=False)
        self.c_attn = CrossAttention3(dim, dim)


    def forward(self, x, y):
        Bx, Lx = x.shape

        print(x.shape)
        By, Ly = y.reshape(1, -1).shape

        x = self.embedding_spa(x)
        x += self.pos_embedding_spa[:, :Lx]
        y = self.embedding_spa(y)
        y += self.pos_embedding_spa[:, :Ly]

        print(x.shape)
        x = self.c_attn(x, y)

        print(x.shape)
        x = self.transformer(x)
        x = self.head(x)
        #x = self.fc(x)
        return x, y


model_dim = 22
depth = 3
mlp_dim = 22

gpt = GPT(dim=model_dim, vocab_size_spa=spa_vocab_size, vocab_size_eng = eng_vocab_size,
          maxlen=maxlen + 2, depth=depth, mlp_dim=mlp_dim)


output, _ = gpt(train_batch, train_target_batch)
output.shape, train_target_batch.shape



torch.Size([64, 22])
torch.Size([64, 22, 22])
torch.Size([64, 22, 22])


RuntimeError: shape '[64, 22, 4, -1]' is invalid for input of size 30976

# ENTRENAMIENTO

In [None]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

gpt.to(device)

In [None]:
PAD_IDX = spa_vocab.get_stoi()['<pad>']
PAD_IDX

In [None]:
optimizer = optim.Adam(gpt.parameters(), lr=0.001)
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX)

In [None]:
def train(model, device, train_loader, optimizer, epoch):
    start = time.time()
    running_loss = 0.0
    model.train()

    for inputs, targets in train_loader:

        targets = targets.view(-1)
        inputs, targets = inputs.to(device), targets.to(device)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs, _ = model(inputs, targets)
        outputs = outputs.view(-1, outputs.size(-1))
        loss = loss_fn(outputs, targets)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    print(f'\nTime for epoch {epoch} is {time.time()-start:4f} sec Train loss: {running_loss / len(train_loader):4f}')

In [None]:
def translate(model, sentence, device, maxlen):
    with torch.no_grad():
        model.eval()
        idx = torch.tensor([eng_vocab[token] for token in eng_tokenizer(sentence)],
                                    dtype=torch.long)
        idx = idx.reshape([1, -1])
        maxlen = maxlen - idx.shape[-1]

        for _ in range(maxlen):
            idx = idx.to(device)
            logits = model(idx, idx)[0][:, -1, :]
            probs = torch.softmax(logits, dim=-1)

            _, idx_next = torch.topk(probs, k=1, dim=-1)
            idx = torch.cat((idx, idx_next), dim=1)

        txt = " ".join(
                    [spa_vocab.get_itos()[idx[0, _]] for _ in range(maxlen)]
                )
    return txt.replace("<eos>", "")

sentences = ["he drinks coffee while reading the newspaper headlines",
             "families gather for dinner and games"]

for s in sentences:
    trans = translate(gpt, s, device, maxlen + 2)
    print(f"\n{trans}")

In [None]:
epochs = 10

for epoch in range(epochs):
    train(gpt, device, train_loader, optimizer, epoch)

    # Translate test sentences
    for s in sentences:
        trans = translate(gpt, s, device, maxlen + 2)
        print(s +" : "+trans)