# Programa 3
María Emilia Ramírez Gómez

In [58]:
!pip install keras_core



In [59]:
import os
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" # Disable tensorflow debugging logs
os.environ["KERAS_BACKEND"] = "torch"
import keras_core as keras
import tensorflow as tf
import random
import torch

## 1.- Dataset

In [60]:
import pathlib

path_to_zip = tf.keras.utils.get_file(
    "spa-eng.zip", origin="http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip",
    extract=True)
path_to_file = pathlib.Path(path_to_zip).parent/"spa-eng/spa.txt"

with open(path_to_file) as f:
    lines = f.read().split("\n")[:-1]

text_pairs = []
for line in lines:
    eng, spa = line.lower().split("\t")
    text_pairs.append((eng, spa))

In [61]:
random.Random(43).shuffle(text_pairs)
num_val_samples = int(0.005 * len(text_pairs))
num_train_samples = len(text_pairs) - num_val_samples
train_pairs = text_pairs[:num_train_samples]
val_pairs = text_pairs[num_train_samples:]

print(f"{len(text_pairs)} total pairs")
print(f"{len(train_pairs)} training pairs")
print(f"{len(val_pairs)} validation pairs")

118964 total pairs
118370 training pairs
594 validation pairs


In [62]:
for s in train_pairs[:5]:
    print(s)

('the old woman fell and could not get up.', 'la anciana se cayó y no pudo levantarse.')
('what is this the abbreviation for?', '¿de qué es abreviatura esto?')
("you're not sick.", 'no estás enferma.')
('i have no knife to cut with.', 'no tengo un cuchillo con que cortarlo.')
('americans admire lincoln for his honesty.', 'los estadounidenses admiran a lincoln por su honestidad.')


# Pipeline

In [63]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import vocab as Vocab
from collections import Counter

In [64]:
#!python -m spacy download en_core_web_sm
#!python -m spacy download es_core_news_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m29.0 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Collecting es-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-3.7.0/es_core_news_sm-3.7.0-py3-none-any.whl (12.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.9/12.9 MB[0m [31m43.4 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation succ

In [65]:
eng_tokenizer = get_tokenizer("spacy", language="en_core_web_sm")
spa_tokenizer = get_tokenizer("spacy", language="es_core_news_sm")

In [66]:
def build_vocab(text, tokenizers, min_freq=5):
    eng_tokenizer, spa_tokenizer = tokenizers
    eng_counter = Counter()
    spa_counter = Counter()
    for eng_string_, spa_string_ in text:
        eng_counter.update(eng_tokenizer(eng_string_))
        spa_counter.update(spa_tokenizer(spa_string_))
    eng_vocab = Vocab(eng_counter, min_freq=min_freq,
                       specials=["<unk>", "<pad>"])
    spa_vocab = Vocab(spa_counter, min_freq=min_freq,
                       specials=["<unk>", "<pad>", "<bos>", "<eos>"])
    return eng_vocab, spa_vocab

eng_vocab, spa_vocab = build_vocab(text_pairs,
                                   [eng_tokenizer, spa_tokenizer],
                                   min_freq=0)

In [67]:
eng_vocab_size = len(eng_vocab)
spa_vocab_size = len(spa_vocab)
eng_vocab_size, spa_vocab_size

(13229, 26116)

In [102]:
maxlen = 20

def data_process(text):
    data = []
    for eng, spa in text:
        eng_tensor_ = torch.tensor([eng_vocab[token] for token in eng_tokenizer(eng)],
                                dtype=torch.long)
        spa_tensor_ = torch.tensor([spa_vocab[token] for token in spa_tokenizer(spa)],
                                dtype=torch.long)

        if eng_tensor_.shape[0] < maxlen:
            data.append((eng_tensor_, spa_tensor_))
    return data

train_data = data_process(train_pairs)
val_data = data_process(val_pairs)

In [103]:
batch_size = 64
PAD_IDX = eng_vocab["<pad>"]
BOS_IDX = spa_vocab["<bos>"]
EOS_IDX = spa_vocab["<eos>"]

from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

def generate_batch(data_batch):
    x, y = [], []
    for (x_item, y_item) in data_batch:
        x.append(x_item)
        y.append(torch.cat([torch.tensor([BOS_IDX]),
                            y_item,
                            torch.tensor([EOS_IDX])], dim=0))

    x = pad_sequence(x, batch_first=True, padding_value=PAD_IDX)
    y = pad_sequence(x, batch_first=True, padding_value=PAD_IDX)
    return x, y ## Aquí tengo mis dudas si


train_loader = DataLoader(train_data, batch_size=batch_size,
                          shuffle=True, collate_fn=generate_batch,
                          num_workers=4, pin_memory=True)
val_loader = DataLoader(val_data, batch_size=batch_size,
                          shuffle=True, collate_fn=generate_batch,
                          num_workers=4, pin_memory=True)

In [104]:
 train_batch, train_target_batch = next(iter(train_loader))

In [105]:
 train_batch.shape, train_target_batch.shape

(torch.Size([64, 19]), torch.Size([64, 19]))

# Atención

In [106]:
import torch.nn as nn
from torch import optim
import time

In [107]:
class Attention(nn.Module): ##SELF ATTENTION
    def __init__(self, dim, maxlen, n_heads=4, bias=True):
        super().__init__()
        self.n_heads = n_heads
        self.scale = (dim // n_heads) ** -0.5
        self.qw = nn.Linear(dim, dim, bias = bias)
        self.kw = nn.Linear(dim, dim, bias = bias)
        self.vw = nn.Linear(dim, dim, bias = bias)

        self.ow = nn.Linear(dim, dim, bias = bias)
        self.register_buffer("bias", torch.tril(torch.ones(maxlen, maxlen)).view(1, 1, maxlen, maxlen))

    def forward(self, x):
        B, L, D = x.shape
        q = self.qw(x)
        k = self.kw(x)
        v = self.vw(x)

        B, L, D = q.shape
        q = torch.reshape(q, [B, L, self.n_heads, -1])
        q = torch.permute(q, [0, 2, 1, 3])
        k = torch.reshape(k, [B, L, self.n_heads, -1])
        k = torch.permute(k, [0, 2, 3, 1])
        v = torch.reshape(v, [B, L, self.n_heads, -1])
        v = torch.permute(v, [0, 2, 1, 3])

        qk = torch.matmul(q, k) * self.scale
        qk = qk.masked_fill(self.bias[:,:,:L,:L] == 0, float('-inf'))

        attn = torch.softmax(qk, dim=-1)

        v_attn = torch.matmul(attn, v)
        v_attn = torch.permute(v_attn, [0, 2, 1, 3])
        v_attn = torch.reshape(v_attn, [B, L, D])

        x = self.ow(v_attn)
        return x


test_layer = Attention(32, maxlen, n_heads=1)
test_layer(torch.ones([1, maxlen, 32]))

tensor([[[-0.0179,  0.6233, -0.4003,  0.1876,  0.2364,  0.2508, -0.0267,
          -0.0562, -0.1740,  0.2570, -0.3895,  0.2590, -0.1715,  0.0528,
          -0.1821, -0.4856, -0.0017, -0.2047,  0.1021, -0.2731, -0.0652,
          -0.1780, -0.1414, -0.1646,  0.1789,  0.1220,  0.2947,  0.4903,
          -0.0541, -0.2997, -0.1387,  0.1766],
         [-0.0179,  0.6233, -0.4003,  0.1876,  0.2364,  0.2508, -0.0267,
          -0.0562, -0.1740,  0.2570, -0.3895,  0.2590, -0.1715,  0.0528,
          -0.1821, -0.4856, -0.0017, -0.2047,  0.1021, -0.2731, -0.0652,
          -0.1780, -0.1414, -0.1646,  0.1789,  0.1220,  0.2947,  0.4903,
          -0.0541, -0.2997, -0.1387,  0.1766],
         [-0.0179,  0.6233, -0.4003,  0.1876,  0.2364,  0.2508, -0.0267,
          -0.0562, -0.1740,  0.2570, -0.3895,  0.2590, -0.1715,  0.0528,
          -0.1821, -0.4856, -0.0017, -0.2047,  0.1021, -0.2731, -0.0652,
          -0.1780, -0.1414, -0.1646,  0.1789,  0.1220,  0.2947,  0.4903,
          -0.0541, -0.2997, -0

In [108]:
import torch
import torch.nn as nn

class CrossAttention(nn.Module): ##CROSS ATTENTION
    def __init__(self, dim, maxlen, n_heads=4, bias=True):
        super().__init__()
        self.n_heads = n_heads
        self.scale = (dim // n_heads) ** -0.5
        self.qw = nn.Linear(dim, dim, bias=bias)
        self.kw = nn.Linear(dim, dim, bias=bias)
        self.vw = nn.Linear(dim, dim, bias=bias)

        self.ow = nn.Linear(dim, dim, bias=bias)
        self.register_buffer("bias", torch.tril(torch.ones(maxlen, maxlen)).view(1, 1, maxlen, maxlen))

    def forward(self, x): # k y v son la frase en inglés. Contexto

        #print(len(x.shape), "Cross")

        #if (len(x.shape) < 3):
        #  x = x.unsqueeze(0)

        B, L, D = x.shape
        q = self.qw(x)
        k = self.kw(x)
        v = self.vw(x)

        B, L_context, D = k.shape
        q = torch.reshape(q, [B, L, self.n_heads, -1])
        q = torch.permute(q, [0, 2, 1, 3])
        k = torch.reshape(k, [B, L_context, self.n_heads, -1])
        k = torch.permute(k, [0, 2, 3, 1])
        v = torch.reshape(v, [B, L_context, self.n_heads, -1])
        v = torch.permute(v, [0, 2, 1, 3])

        qk = torch.matmul(q, k) * self.scale
        qk = qk.masked_fill(self.bias[:, :, :L, :L_context] == 0, float('-inf'))

        attn = torch.softmax(qk, dim=-1)

        v_attn = torch.matmul(attn, v)
        v_attn = torch.permute(v_attn, [0, 2, 1, 3])
        v_attn = torch.reshape(v_attn, [B, L, D])

        x = self.ow(v_attn)
        return x

#test_layer = CrossAttention(32, maxlen, n_heads=1)
#inputs = torch.ones([ maxlen, 32])
#context = torch.ones([ maxlen-6, 32])
#context2 = torch.ones([ maxlen-6, 32])
#test_layer([inputs, context])


# Transformer


In [109]:
class Transformer(nn.Module):
    def __init__(self, dim, maxlen, heads=4, mlp_dim=512, rate=0.0):
        super().__init__()
        self.ln_1 = nn.LayerNorm(dim)
        self.c_attn = CrossAttention(dim, maxlen)
        self.ln_2 = nn.LayerNorm(dim)
        self.attn = Attention(dim, maxlen)
        self.ln_3 = nn.LayerNorm(dim)
        self.mlp = nn.Sequential(
            nn.Linear(dim, mlp_dim),
            nn.GELU(),
            nn.Dropout(rate),
            nn.Linear(mlp_dim, dim),
            nn.Dropout(rate),
        )

    def forward(self, x):
        x = self.c_attn(self.ln_1(x)) + x
        x = self.attn(self.ln_2(x)) + x
        return self.mlp(self.ln_3(x)) + x


#test_layer = Transformer(32, maxlen)
#test_layer([torch.ones([ maxlen, 32]),torch.ones([ maxlen-6, 32])]).shape

In [110]:
train_batch.shape

torch.Size([64, 19])

In [111]:
class GPT(nn.Module):
    def __init__(self, dim, vocab_size_spa, vocab_size_eng, maxlen, depth=3,
                 mlp_dim=512, rate=0.2):
        super().__init__()
        self.embedding_spa = nn.Embedding(vocab_size_spa, dim)
        self.pos_embedding_spa = nn.Parameter(
            torch.randn(1, maxlen, dim))
        self.embedding_eng = nn.Embedding(vocab_size_spa, dim)
        self.pos_embedding_eng = nn.Parameter(
            torch.randn(1, maxlen, dim))

        self.transformer = nn.Sequential()
        for _ in range(depth):
            self.transformer.append(Transformer(dim, maxlen))

        self.head = nn.Linear(dim, vocab_size_eng, bias=False)

    def forward(self, x):
        Bx, Lx = x.shape
        #By, Ly = y.shape
        x = self.embedding_eng(x)
        x += self.pos_embedding_eng[:, :Lx]
        #y = self.embedding_spa(y)
        #y += self.pos_embedding_spa[:, :Ly]
        x = self.transformer(x)
        x = self.head(x)
        return x


model_dim = 128
depth = 3
mlp_dim = 128

gpt = GPT(dim=model_dim, vocab_size_spa=spa_vocab_size, vocab_size_eng = eng_vocab_size,
          maxlen=maxlen, depth=depth, mlp_dim=mlp_dim)
output = gpt(train_batch)
output.shape, train_target_batch.shape



(torch.Size([64, 19, 13229]), torch.Size([64, 19]))

In [112]:
print(output[1])

tensor([[ 0.4592, -1.0247,  0.9509,  ..., -0.6123,  0.6992,  0.5538],
        [ 0.5280,  0.5906, -1.0630,  ...,  0.4759,  1.3766,  0.0219],
        [-0.2824,  0.6400, -0.2289,  ...,  1.0530,  1.5233,  2.0520],
        ...,
        [-0.1289,  0.2473,  0.4927,  ..., -1.5995,  0.4709,  0.9666],
        [-0.4612,  0.8273,  0.1108,  ..., -2.4501,  0.9107,  0.7009],
        [-0.2550,  0.3324,  0.5564,  ..., -1.6969, -0.0230,  0.0172]],
       grad_fn=<SelectBackward0>)


# Entrenamiento

In [113]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

gpt.to(device)

cpu


GPT(
  (embedding_spa): Embedding(26116, 128)
  (embedding_eng): Embedding(26116, 128)
  (transformer): Sequential(
    (0): Transformer(
      (ln_1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (c_attn): CrossAttention(
        (qw): Linear(in_features=128, out_features=128, bias=True)
        (kw): Linear(in_features=128, out_features=128, bias=True)
        (vw): Linear(in_features=128, out_features=128, bias=True)
        (ow): Linear(in_features=128, out_features=128, bias=True)
      )
      (ln_2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (attn): Attention(
        (qw): Linear(in_features=128, out_features=128, bias=True)
        (kw): Linear(in_features=128, out_features=128, bias=True)
        (vw): Linear(in_features=128, out_features=128, bias=True)
        (ow): Linear(in_features=128, out_features=128, bias=True)
      )
      (ln_3): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (mlp): Sequential(
        (0): Linear(in_f

In [114]:
PAD_IDX = spa_vocab.get_stoi()['<pad>']
PAD_IDX

1

In [115]:
optimizer = optim.Adam(gpt.parameters(), lr=0.001)
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX)

In [116]:
def train(model, device, train_loader, optimizer, epoch):
    start = time.time()
    running_loss = 0.0
    model.train()
    for inputs, targets in train_loader:
        targets = targets.view(-1)
        inputs, targets = inputs.to(device), targets.to(device)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(inputs)
        outputs = outputs.view(-1, outputs.size(-1))
        loss = loss_fn(outputs, targets)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    print(f'\nTime for epoch {epoch} is {time.time()-start:4f} sec Train loss: {running_loss / len(train_loader):4f}')

In [117]:
def translate(model, sentence, device, maxlen):
    with torch.no_grad():
        model.eval()
        idx = torch.tensor([eng_vocab[token] for token in eng_tokenizer(sentence)],
                                    dtype=torch.long)
        idx = idx.reshape([1, -1])
        maxlen = maxlen - idx.shape[-1]

        for _ in range(maxlen):
            idx = idx.to(device)
            logits = gpt(idx)[:, -1, :]
            probs = torch.softmax(logits, dim=-1)

            _, idx_next = torch.topk(probs, k=1, dim=-1)
            idx = torch.cat((idx, idx_next), dim=1)

        txt = " ".join(
                    [eng_vocab.get_itos()[idx[0, _]] for _ in range(maxlen)]
                )
    return txt.replace("<eos>", "")

sentences = ["good morning",
             "i hate mondays"]

for s in sentences:
    trans = translate(gpt, s, device, maxlen)
    print(f"\n{trans}")


good morning dried withdraw rowboat apologise sting korean herself heat encouragingly judo leaning pathetic rheumatism vanished driver rights

i hate mondays wring valise abated aunts sleeves lasted directed lighted ambitions bone corn provoke splints rake


In [None]:
epochs = 6

for epoch in range(epochs):
    train(gpt, device, train_loader, optimizer, epoch)

    # Translate test sentences
    for s in sentences:
        trans = translate(gpt, s, device, maxlen)
        print(trans)

## 2.- Evaluación

In [None]:
import warnings
# Disable warnings
warnings.filterwarnings("ignore")

In [None]:
import nltk

# Lista de oraciones de referencia (lista de listas)
referencias = [["El", "gato", "está", "en", "la", "alfombra"],
               ["El", "perro", "juega", "en", "el", "parque"],
               ["El", "cielo", "está", "despejado"],
               ["El", "sol", "brilla", "intensamente"],
               ["Los", "pájaros", "cantan", "en", "los", "árboles"]]

# Lista de oraciones candidatas (lista de listas)
candidatas = [["El", "gato", "está", "durmiendo", "en", "la", "alfombra"],
              ["El", "perro", "juega", "en", "el", "jardín"],
              ["El", "cielo", "está", "soleado"],
              ["El", "sol", "brilla", "intensamente"],
              ["Los", "pájaros", "trinan", "en", "los", "árboles"]]

# Calcular el BLEU score para cada oración candidata
for i in range(len(candidatas)):
    referencia = referencias[i]
    candidata = candidatas[i]

    bleu_score = nltk.translate.bleu_score.sentence_bleu([referencia], candidata)
    print(f"BLEU score para la oración {i+1}: {bleu_score}")