# Programa 3
### María Emilia Ramírez Gómez

In [1]:
!pip install keras_core

Collecting keras_core
  Downloading keras_core-0.1.7-py3-none-any.whl (950 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m950.8/950.8 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
Collecting namex (from keras_core)
  Downloading namex-0.0.8-py3-none-any.whl (5.8 kB)
Installing collected packages: namex, keras_core
Successfully installed keras_core-0.1.7 namex-0.0.8


In [2]:
import os
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" # Disable tensorflow debugging logs
os.environ["KERAS_BACKEND"] = "torch"
import keras_core as keras
import tensorflow as tf
import random
import torch

torch.manual_seed(77)

Using PyTorch backend.


<torch._C.Generator at 0x7d9f5034aeb0>

In [3]:
import pathlib

path_to_zip = tf.keras.utils.get_file(
    "spa-eng.zip", origin="http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip",
    extract=True)
path_to_file = pathlib.Path(path_to_zip).parent/"spa-eng/spa.txt"

with open(path_to_file) as f:
    lines = f.read().split("\n")[:-1]

text_pairs = []
for line in lines:
    eng, spa = line.lower().split("\t")
    text_pairs.append((eng, spa))

Downloading data from http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip


In [4]:
random.Random(43).shuffle(text_pairs)
num_val_samples = int(0.005 * len(text_pairs))
num_train_samples = len(text_pairs) - num_val_samples
train_pairs = text_pairs[:num_train_samples]
val_pairs = text_pairs[num_train_samples:]

print(f"{len(text_pairs)} total pairs")
print(f"{len(train_pairs)} training pairs")
print(f"{len(val_pairs)} validation pairs")

118964 total pairs
118370 training pairs
594 validation pairs


In [5]:
for s in train_pairs[:5]:
    print(s)

('the old woman fell and could not get up.', 'la anciana se cayó y no pudo levantarse.')
('what is this the abbreviation for?', '¿de qué es abreviatura esto?')
("you're not sick.", 'no estás enferma.')
('i have no knife to cut with.', 'no tengo un cuchillo con que cortarlo.')
('americans admire lincoln for his honesty.', 'los estadounidenses admiran a lincoln por su honestidad.')


# PIPELINE


In [6]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import vocab as Vocab
from collections import Counter

In [7]:
!python -m spacy download en_core_web_sm
!python -m spacy download es_core_news_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m26.1 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Collecting es-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-3.7.0/es_core_news_sm-3.7.0-py3-none-any.whl (12.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.9/12.9 MB[0m [31m32.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: es-core-ne

In [8]:
eng_tokenizer = get_tokenizer("spacy", language="en_core_web_sm")
spa_tokenizer = get_tokenizer("spacy", language="es_core_news_sm")

In [9]:
def build_vocab(text, tokenizers, min_freq=5):
    eng_tokenizer, spa_tokenizer = tokenizers
    eng_counter = Counter()
    spa_counter = Counter()
    for eng_string_, spa_string_ in text:
        eng_counter.update(eng_tokenizer(eng_string_))
        spa_counter.update(spa_tokenizer(spa_string_))
    eng_vocab = Vocab(eng_counter, min_freq=min_freq,
                       specials=["<unk>", "<pad>"])
    spa_vocab = Vocab(spa_counter, min_freq=min_freq,
                       specials=["<unk>", "<pad>", "<bos>", "<eos>"])
    return eng_vocab, spa_vocab

eng_vocab, spa_vocab = build_vocab(text_pairs,
                                   [eng_tokenizer, spa_tokenizer],
                                   min_freq=0)

In [10]:
eng_vocab_size = len(eng_vocab)
spa_vocab_size = len(spa_vocab)
eng_vocab_size, spa_vocab_size

(13229, 26116)

In [11]:
maxlen = 20

def data_process(text, eng_vocab, spa_vocab, eng_tokenizer, spa_tokenizer):
    data = []
    for eng, spa in text:
        eng_tensor_ = torch.tensor([eng_vocab[token] for token in eng_tokenizer(eng)],
                                dtype=torch.long)
        spa_tensor_ = torch.tensor([spa_vocab[token] for token in spa_tokenizer(spa)],
                                dtype=torch.long)

        if eng_tensor_.shape[0] < maxlen and spa_tensor_.shape[0] < maxlen - 2:
            data.append((eng_tensor_, spa_tensor_))
    return data

train_data = data_process(train_pairs, eng_vocab, spa_vocab, eng_tokenizer, spa_tokenizer)
val_data = data_process(val_pairs, eng_vocab, spa_vocab, eng_tokenizer, spa_tokenizer)

print(len(train_data), len(val_data))

117552 591


In [12]:
batch_size = 64
PAD_IDX = eng_vocab["<pad>"]
BOS_IDX = spa_vocab["<bos>"]
EOS_IDX = spa_vocab["<eos>"]
UNK_IDX = spa_vocab["<unk>"]

from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
from torch.nn import functional as F


def pad_to_max_length(tensor, max_length):
    current_length = tensor.size(1)
    if current_length < max_length:
        padding_size = max_length - current_length
        padding = torch.full((tensor.size(0), padding_size), PAD_IDX, dtype=torch.long)
        tensor = torch.cat([tensor, padding], dim=1)
    return tensor

def generate_batch(data_batch):
    x, y = [], []

    for (x_item, y_item) in data_batch:

        x.append(torch.cat([x_item]))
        y.append(torch.cat([torch.tensor([BOS_IDX]),
                            y_item,
                            torch.tensor([EOS_IDX])], dim=0))

    x = pad_sequence(x, batch_first=True, padding_value=PAD_IDX)
    y = pad_sequence(y, batch_first=True, padding_value=PAD_IDX)

    # Ajustar a la longitud máxima
    x = pad_to_max_length(x, maxlen +2)
    y = pad_to_max_length(y, maxlen +2)

    return x, y ## Aquí tengo mis dudas si


In [13]:
train_loader = DataLoader(train_data, batch_size=batch_size,
                          shuffle=True, collate_fn=generate_batch,
                          num_workers=4, pin_memory=True)
val_loader = DataLoader(val_data, batch_size=batch_size,
                          shuffle=True, collate_fn=generate_batch,
                          num_workers=4, pin_memory=True)





In [14]:
 train_batch, train_target_batch = next(iter(train_loader))
 train_batch.shape, train_target_batch.shape

 print(train_target_batch)

  self.pid = os.fork()


tensor([[   2,  895,  896,  ...,    1,    1,    1],
        [   2,  773,  138,  ...,    1,    1,    1],
        [   2,    9,  303,  ...,    1,    1,    1],
        ...,
        [   2,    9,  246,  ...,    1,    1,    1],
        [   2, 3928,  185,  ...,    1,    1,    1],
        [   2,   81, 5162,  ...,    1,    1,    1]])


# ATENCION

In [15]:
import torch.nn as nn
from torch import optim
import time

In [16]:
class Attention(nn.Module): ##SELF ATTENTION
    def __init__(self, dim, maxlen, n_heads=4, bias=True):
        super().__init__()
        self.n_heads = n_heads
        self.scale = (dim // n_heads) ** -0.5
        self.qw = nn.Linear(dim, dim, bias = bias)
        self.kw = nn.Linear(dim, dim, bias = bias)
        self.vw = nn.Linear(dim, dim, bias = bias)

        self.ow = nn.Linear(dim, dim, bias = bias)
        self.register_buffer("bias", torch.tril(torch.ones(maxlen, maxlen)).view(1, 1, maxlen, maxlen))

    def forward(self, x):
        B, L, D = x.shape
        q = self.qw(x)
        k = self.kw(x)
        v = self.vw(x)
        B, L, D = q.shape
        q = torch.reshape(q, [B, L, D, -1])
        q = torch.permute(q, [0, 2, 1, 3])
        k = torch.reshape(k, [B, L, D, -1])
        k = torch.permute(k, [0, 2, 3, 1])
        v = torch.reshape(v, [B, L, D, -1])
        v = torch.permute(v, [0, 2, 1, 3])
        qk = torch.matmul(q, k) * self.scale
        qk = qk.masked_fill(self.bias[:,:,:L,:L] == 0, float('-inf'))

        attn = torch.softmax(qk, dim=-1)

        v_attn = torch.matmul(attn, v)
        v_attn = torch.permute(v_attn, [0, 2, 1, 3])
        v_attn = torch.reshape(v_attn, [B, L, D])

        x = self.ow(v_attn)
        return x


#test_layer = Attention(22, 64, n_heads=1)
#test_layer(torch.ones([64,22,22]))

In [29]:
import torch
import torch.nn as nn

class CrossAttention3(nn.Module):
    def __init__(self, input_dim, context_dim):
        super().__init__()
        self.q_linear = nn.Linear(input_dim, input_dim)
        self.k_linear = nn.Linear(context_dim // 2, input_dim)
        self.v_linear = nn.Linear(context_dim // 2, input_dim)
        self.out = nn.Linear(input_dim, input_dim)

    def forward(self,  input_seq, context_seq):
        q = self.q_linear(input_seq)
        context_seq_k, context_seq_v = torch.split(context_seq, context_seq.size(-1) // 2, dim=-1)  # Dividir la secuencia de contexto en K y V

        #print("k",context_seq_k.shape, "v", context_seq_v.shape )
        k = self.k_linear(context_seq_k)
        v = self.v_linear(context_seq_v)

        attention_weights = torch.matmul(q, k.transpose(-2, -1))
        attention_weights = torch.softmax(attention_weights, dim=-1)

        output = torch.matmul(attention_weights, v)
        output = self.out(output)

        return output
# Ejemplo de uso:
input_dim = 22
context_dim = 22
input_seq = torch.randn(64, input_dim)
context_seq = torch.randn(64, context_dim)

model = CrossAttention3(input_dim, context_dim)
output = model(input_seq, context_seq)
print(output.shape)  #

torch.Size([64, 22])


In [30]:
class Transformer(nn.Module):
    def __init__(self, dim, maxlen, heads=4, mlp_dim=512, rate=0.2):
        super().__init__()
        #self.ln_1 = nn.LayerNorm(dim)
        #self.c_attn = CrossAttention(dim, maxlen)
        self.ln_2 = nn.LayerNorm(dim)
        self.attn = Attention(dim,maxlen)
        self.ln_3 = nn.LayerNorm(dim)
        self.mlp = nn.Sequential(
            nn.Linear(dim, mlp_dim),
            nn.GELU(),
            nn.Dropout(rate),
            nn.Linear(mlp_dim, dim),
            nn.Dropout(rate),
        )

    def forward(self, x):
        #x = self.c_attn(self.ln_1(x)) + x
        x = self.attn(self.ln_2(x))
        return self.mlp(self.ln_3(x))


#test_layer = Transformer(32, maxlen + 2)
#output = test_layer( torch.ones([1, maxlen + 2, 32]))
#print(output.shape)  # Debería imprimir el tamaño del tensor resultant

In [31]:
train_batch.shape

torch.Size([64, 22])

In [32]:
import copy

class GPT(nn.Module):
    def __init__(self, dim, vocab_size_spa, vocab_size_eng, maxlen, depth=3,
                 mlp_dim=512, rate=0.2):
        super().__init__()
        self.embedding_spa = nn.Embedding(vocab_size_spa, dim)
        self.pos_embedding_spa = nn.Parameter(
            torch.randn(1, maxlen, dim))
        self.embedding_eng = nn.Embedding(vocab_size_eng, dim)
        self.pos_embedding_eng = nn.Parameter(
            torch.randn(1, maxlen, dim))

        self.transformer = nn.Sequential()
        for _ in range(depth):
            self.transformer.append(Transformer(dim, maxlen))

        self.head = nn.Linear(dim, vocab_size_spa, bias=False)
        self.c_attn = CrossAttention3(maxlen, maxlen)


    def forward(self, x, y = None):

        if y is not None:
          self.y = y
        #print("Hola")
        Bx, Lx = x.shape

        context = copy.copy(self.y)
        #print(x.shape)
        #print(self.y.shape)
        By, Ly = context.reshape(1, -1).shape

        x = self.embedding_spa(x)
        x += self.pos_embedding_spa[:, :Lx]
        context = self.embedding_spa(context)
        context += self.pos_embedding_spa[:, :Lx]

        #print(x.shape)
        x = self.c_attn(x, context)

        #print(x.shape)
        x = self.transformer(x)
        x = self.head(x)
        #x = self.fc(x)
        return x


model_dim = 22
depth = 5
mlp_dim = 128

gpt = GPT(dim=model_dim, vocab_size_spa=spa_vocab_size, vocab_size_eng = eng_vocab_size,
          maxlen=maxlen + 2, depth=depth, mlp_dim=mlp_dim)


output= gpt(train_batch, train_target_batch)
print(output.shape, train_target_batch.shape)

train_batch, train_target_batch = next(iter(train_loader))
output= gpt(train_batch, train_target_batch)
print(output.shape, train_target_batch.shape)

torch.Size([64, 22, 26116]) torch.Size([64, 22])


  self.pid = os.fork()
  self.pid = os.fork()


torch.Size([64, 22, 26116]) torch.Size([64, 22])


# ENTRENAMIENTO

In [33]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

gpt.to(device)

GPT(
  (embedding_spa): Embedding(26116, 22)
  (embedding_eng): Embedding(13229, 22)
  (transformer): Sequential(
    (0): Transformer(
      (ln_2): LayerNorm((22,), eps=1e-05, elementwise_affine=True)
      (attn): Attention(
        (qw): Linear(in_features=22, out_features=22, bias=True)
        (kw): Linear(in_features=22, out_features=22, bias=True)
        (vw): Linear(in_features=22, out_features=22, bias=True)
        (ow): Linear(in_features=22, out_features=22, bias=True)
      )
      (ln_3): LayerNorm((22,), eps=1e-05, elementwise_affine=True)
      (mlp): Sequential(
        (0): Linear(in_features=22, out_features=512, bias=True)
        (1): GELU(approximate='none')
        (2): Dropout(p=0.2, inplace=False)
        (3): Linear(in_features=512, out_features=22, bias=True)
        (4): Dropout(p=0.2, inplace=False)
      )
    )
    (1): Transformer(
      (ln_2): LayerNorm((22,), eps=1e-05, elementwise_affine=True)
      (attn): Attention(
        (qw): Linear(in_featur

In [34]:
PAD_IDX = spa_vocab.get_stoi()['<pad>']
PAD_IDX

1

In [35]:
optimizer = optim.Adam(gpt.parameters(), lr=0.001)
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX)

In [36]:
def train(model, device, train_loader, optimizer, epoch):
    start = time.time()
    running_loss = 0.0
    model.train()
    for inputs, targets in train_loader:
        targets = targets.view(-1)
        inputs, targets = inputs.to(device), targets.to(device)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(inputs)
        outputs = outputs.view(-1, outputs.size(-1))
        loss = loss_fn(outputs, targets)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    print(f'\nTime for epoch {epoch} is {time.time()-start:4f} sec Train loss: {running_loss / len(train_loader):4f}')

In [37]:
from torch.nn.utils import clip_grad_norm_
from torch.optim.lr_scheduler import ReduceLROnPlateau

def train2(model, device, train_loader, optimizer, epoch, clip_val=5, patience=3):
    start = time.time()
    running_loss = 0.0
    model.train()

    scheduler = ReduceLROnPlateau(optimizer, factor=0.1, patience=2)  # Reduce LR by 10% after 2 epochs with no improvement
    best_val_loss = float('inf')
    current_patience = 0

    for inputs, targets in train_loader:


        inputs, targets = inputs.to(device), targets.to(device)

        optimizer.zero_grad()
        outputs = model(inputs, targets)

        targets = targets.view(-1)
        outputs = outputs.view(-1, outputs.size(-1))

        loss = loss_fn(outputs, targets)
        loss.backward()

        # Gradient clipping
        clip_grad_norm_(model.parameters(), clip_val)
        optimizer.step()

        running_loss += loss.item()

    print(f'\nTime for epoch {epoch} is {time.time()-start:4f} sec Train loss: {running_loss / len(train_loader):4f}')

    scheduler.step(running_loss / len(train_loader))  # Update scheduler with training loss

In [38]:

def translate2(model, sentence, device, maxlen):
    with torch.no_grad():
        model.eval()
        idx = torch.tensor([eng_vocab[token] for token in eng_tokenizer(sentence)],
                                    dtype=torch.long)
        idx = idx.reshape([1, -1])
        maxlen = maxlen - idx.shape[-1]

        for _ in range(maxlen):
            idx = idx.to(device)
            logits = model(idx, idx)[:, -1, :]
            probs = torch.softmax(logits, dim=-1)

            _, idx_next = torch.topk(probs, k=1, dim=-1)
            idx = torch.cat((idx, idx_next), dim=1)

        txt = " ".join(
                    [spa_vocab.get_itos()[idx[0, _]] for _ in range(maxlen)]
                )
    return txt.replace("<eos>", "")

In [None]:
epochs = 6

sentences = ["he drinks coffee while reading the newspaper headlines",
             "families gather for dinner and games"]

for epoch in range(epochs):
    train2(gpt, device, train_loader, optimizer, epoch)

    # Translate test sentences
    for s in sentences:
        trans = translate2(gpt, s, device, maxlen + 2)
        print(s +" : "+trans)


Time for epoch 0 is 40.068092 sec Train loss: 5.337270
he drinks coffee while reading the newspaper headlines : las grandes bebes sea nosotros <bos> educativo hábitos .     
families gather for dinner and games : verano harvard es veinte se caballo .  .       

Time for epoch 1 is 40.782752 sec Train loss: 5.036201
he drinks coffee while reading the newspaper headlines : las grandes bebes sea nosotros <bos> educativo hábitos      
families gather for dinner and games : verano harvard es veinte se caballo ¿ ¿ ¿ no no no que es que que

Time for epoch 2 is 40.273467 sec Train loss: 4.944837
he drinks coffee while reading the newspaper headlines : las grandes bebes sea nosotros <bos> educativo hábitos no es a un . 
families gather for dinner and games : verano harvard es veinte se caballo es a . . . . . . . 

Time for epoch 3 is 40.289018 sec Train loss: 4.888206
he drinks coffee while reading the newspaper headlines : las grandes bebes sea nosotros <bos> educativo hábitos ¿ no de de de 

# BLEU

In [None]:
import string
import nltk
from nltk.translate.bleu_score import sentence_bleu
import random


def format(s):
  tokens = ["<bos>", "<eos>", "<unk>", "<pad>"]

  for token in tokens:
    s = s.replace(token, "")

  s = s.translate(str.maketrans('', '', string.punctuation + '¡¿'))
  s = ' '.join(s.split())
  s = s.split()
  return s

def bleu(val_data):

  inputs = [pair[0] for pair in val_pairs]
  targets = [pair[1] for pair in val_pairs]

  # Get model outputs
  outputs = []
  for s in inputs:
    trans = translate2(gpt, s, device, maxlen)
    outputs.append(trans)


  targets = [format(s) for s in targets]
  outputs = [format(s) for s in outputs]
  inputs = [format(s) for s in inputs]

  num = 20

  for _ in range(num):
      ran = random.randint(0, len(targets) - 1)
      print(inputs[ran], "-->", outputs[ran], "--", targets[ran])


  # Compute BLEU
  score = 0
  for i in range(len(output)):
    t = targets[i]
    o = outputs[i]

    bleu_score = sentence_bleu([t], o)
    score += bleu_score

  print(f'BLEU score promedio: {score/len(output)}')

bleu(val_pairs)

# CONCLUSIÓN

Se intentó implementar una arquitectura perceiver ar para realizar traducciones de inglés a español.

