In [1]:
# The MIT License (MIT) Copyright (c) 2023 Emilio Morales
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of
# this software and associated documentation files (the "Software"), to deal in the Software without
# restriction, including without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all copies or
# substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
# INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES
# OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

<table class="tfo-notebook-buttons" align="left">
  <td>
    <a target="_blank" href="https://colab.research.google.com/github/milmor/NLP/blob/main/Notebooks/13_RNN_LSTM.ipynb">
    <img src="https://www.tensorflow.org/images/colab_logo_32px.png" />
    Run in Google Colab</a>
  </td>
</table>

# NLP Programa 3: GPT  
-------
Integrantes:
- Andrés Urbano Andrea
- Núñez Quintana Luis Axel

## 0.- Imports

In [2]:
from collections import Counter
import keras_core as keras
import matplotlib.pyplot as plt
import nltk
import os
import pandas as pd
import pathlib
import random
from sklearn.decomposition import PCA
import tensorflow as tf
import time
import torch
from torch import optim
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import vocab as Vocab
import warnings

ERROR! Session/line number was not unique in database. History logging moved to new session 5
Using TensorFlow backend


2024-04-27 20:51:42.077773: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # Disable tensorflow debugging logs
os.environ["KERAS_BACKEND"] = "torch"

In [4]:
torch.__version__
torch.manual_seed(77)

<torch._C.Generator at 0x76ce74adf8b0>

In [5]:
# Disable warnings
warnings.filterwarnings("ignore")

## 1.- Conjuntos de entrenamiento y validación

In [6]:
def download_text_pairs():
    path_to_zip = tf.keras.utils.get_file(
        'spa-eng.zip', 
        origin='http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip',
        extract=True)
    path_to_file = pathlib.Path(path_to_zip).parent/'spa-eng/spa.txt'
    
    with open(path_to_file) as f:
        lines = f.read().split("\n")[:-1]
        
    text_pairs = []
    for line in lines:
        eng, spa = line.lower().split("\t")
        text_pairs.append((eng, spa))
    return text_pairs

In [7]:
def split_text_pairs(text_pairs, val_percentage = 0.005, random_seed=43):
    random.Random(random_seed).shuffle(text_pairs)
    num_val_samples = int(val_percentage * len(text_pairs))
    num_train_samples = len(text_pairs) - num_val_samples
    train_pairs = text_pairs[:num_train_samples]
    val_pairs = text_pairs[num_train_samples:]
    return train_pairs, val_pairs

In [8]:
text_pairs = download_text_pairs()
train_pairs, val_pairs = split_text_pairs(text_pairs)

print(f"{len(text_pairs)} total pairs")
print(f"{len(train_pairs)} training pairs")
print(f"{len(val_pairs)} validation pairs")

for s in train_pairs[:3]:
    print(s)

118964 total pairs
118370 training pairs
594 validation pairs
('the old woman fell and could not get up.', 'la anciana se cayó y no pudo levantarse.')
('what is this the abbreviation for?', '¿de qué es abreviatura esto?')
("you're not sick.", 'no estás enferma.')


## 2.- Pipeline

- Crea vocabulario y define tokenizers.

In [9]:
#!python -m spacy download en_core_web_sm
#!python -m spacy download es_core_news_sm

In [10]:
eng_tokenizer = get_tokenizer('spacy', language='en_core_web_sm')
spa_tokenizer = get_tokenizer('spacy', language='es_core_news_sm')

In [11]:
def build_vocab(text, tokenizers, min_freq=5):
    eng_tokenizer, spa_tokenizer = tokenizers
    eng_counter = Counter()
    spa_counter = Counter()
    for eng_string_, spa_string_ in text:
        eng_counter.update(eng_tokenizer(eng_string_))
        spa_counter.update(spa_tokenizer(spa_string_))
    eng_vocab = Vocab(eng_counter, min_freq=min_freq, 
                       specials=['<unk>', '<pad>'])
    spa_vocab = Vocab(spa_counter, min_freq=min_freq, 
                       specials=['<unk>', '<pad>', '<bos>', '<eos>'])
    return eng_vocab, spa_vocab


eng_vocab, spa_vocab = build_vocab(text_pairs, 
                                   [eng_tokenizer, spa_tokenizer],
                                   min_freq=0)

In [12]:
eng_vocab_size = len(eng_vocab)
spa_vocab_size = len(spa_vocab)
print(f'Vocab sizes: English - {eng_vocab_size}, Spanish - {spa_vocab_size}')

Vocab sizes: English - 13229, Spanish - 26116


In [13]:
maxlen = 10

def data_process(text):
    data = []
    for eng, spa in text:
        eng_tensor_ = torch.tensor([eng_vocab[token] for token in eng_tokenizer(eng)],
                                dtype=torch.long)
        spa_tensor_ = torch.tensor([spa_vocab[token] for token in spa_tokenizer(spa)],
                                dtype=torch.long)

        if eng_tensor_.shape[0] < maxlen:
            data.append((eng_tensor_, spa_tensor_))
    return data

train_data = data_process(train_pairs)
val_data = data_process(val_pairs)

In [14]:
print(f'train data size: {len(train_data)}, val data size: {len(val_data)}')

train data size: 93861, val data size: 481


In [15]:
batch_size = 64
PAD_IDX = eng_vocab['<pad>']
BOS_IDX = spa_vocab['<bos>']
EOS_IDX = spa_vocab['<eos>']

def generate_batch(data_batch):
    x, y = [], []
    for (x_item, y_item) in data_batch:
        x.append(x_item)
        y.append(torch.cat([torch.tensor([BOS_IDX]), 
                            y_item, 
                            torch.tensor([EOS_IDX])], dim=0))

    x = pad_sequence(x, batch_first=True, padding_value=PAD_IDX)
    y = pad_sequence(y, batch_first=True, padding_value=PAD_IDX)
    return x, y


train_loader = DataLoader(train_data, batch_size=batch_size,
                          shuffle=True, collate_fn=generate_batch, 
                          num_workers=4, pin_memory=True)

val_loader = DataLoader(val_data, batch_size=batch_size,
                        shuffle=True, collate_fn=generate_batch,
                        num_workers=4, pin_memory=True)

test_loader = DataLoader(val_data, batch_size=batch_size,
                        shuffle=True, collate_fn=generate_batch,
                        num_workers=4, pin_memory=True)

In [16]:
%%timeit
train_batch, target_batch = next(iter(train_loader))

1.36 s ± 87.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [17]:
train_batch, target_batch = next(iter(train_loader))

In [18]:
train_batch.shape, target_batch.shape

(torch.Size([64, 9]), torch.Size([64, 12]))

In [19]:
train_batch[0]

tensor([ 21,  50,  36, 678,  98, 436,  27, 161,  11])

## 3.- Modelo

In [20]:
emb_dim = 128
model_dim = 256

In [21]:
class Attention(nn.Module):
    def __init__(self, dim, maxlen, n_heads=4, bias=True):
        super().__init__()
        self.n_heads = n_heads
        self.scale = (dim // n_heads) ** -0.5
        self.qw = nn.Linear(dim, dim, bias = bias)
        self.kw = nn.Linear(dim, dim, bias = bias)
        self.vw = nn.Linear(dim, dim, bias = bias)

        self.ow = nn.Linear(dim, dim, bias = bias)
        self.register_buffer("bias", torch.tril(torch.ones(maxlen, maxlen)).view(1, 1, maxlen, maxlen))

    def forward(self, x):
        B, L, D = x.shape
        q = self.qw(x)
        k = self.kw(x)
        v = self.vw(x)

        B, L, D = q.shape
        q = torch.reshape(q, [B, L, self.n_heads, -1])
        q = torch.permute(q, [0, 2, 1, 3])
        k = torch.reshape(k, [B, L, self.n_heads, -1])
        k = torch.permute(k, [0, 2, 3, 1])
        v = torch.reshape(v, [B, L, self.n_heads, -1])
        v = torch.permute(v, [0, 2, 1, 3])

        qk = torch.matmul(q, k) * self.scale
        qk = qk.masked_fill(self.bias[:,:,:L,:L] == 0, float('-inf'))
        
        attn = torch.softmax(qk, dim=-1)

        v_attn = torch.matmul(attn, v)
        v_attn = torch.permute(v_attn, [0, 2, 1, 3])
        v_attn = torch.reshape(v_attn, [B, L, D])

        x = self.ow(v_attn)
        return x


test_layer = Attention(32, maxlen, n_heads=1)
test_layer(torch.ones([1, maxlen, 32]))

tensor([[[-0.3488, -0.3941, -0.1654, -0.0524,  0.3346,  0.0233, -0.7930,
           0.4600, -0.1046, -0.2694,  0.3566,  0.2376,  0.5142,  0.2729,
           0.1786, -0.1451,  0.1851,  0.7246, -0.5706,  0.2467,  0.5526,
           0.7210, -0.1349,  0.2598,  0.1340, -0.2740,  0.6389, -0.1100,
          -0.4303, -0.3088, -0.2303, -0.0039],
         [-0.3488, -0.3941, -0.1654, -0.0524,  0.3346,  0.0233, -0.7930,
           0.4600, -0.1046, -0.2694,  0.3566,  0.2376,  0.5142,  0.2729,
           0.1786, -0.1451,  0.1851,  0.7246, -0.5706,  0.2467,  0.5526,
           0.7210, -0.1349,  0.2598,  0.1340, -0.2740,  0.6389, -0.1100,
          -0.4303, -0.3088, -0.2303, -0.0039],
         [-0.3488, -0.3941, -0.1654, -0.0524,  0.3346,  0.0233, -0.7930,
           0.4600, -0.1046, -0.2694,  0.3566,  0.2376,  0.5142,  0.2729,
           0.1786, -0.1451,  0.1851,  0.7246, -0.5706,  0.2467,  0.5526,
           0.7210, -0.1349,  0.2598,  0.1340, -0.2740,  0.6389, -0.1100,
          -0.4303, -0.3088, -0

In [22]:
class Transformer(nn.Module):
    def __init__(self, dim, maxlen, heads=4, mlp_dim=512, rate=0.0):
        super().__init__()
        self.ln_1 = nn.LayerNorm(dim)
        self.attn = Attention(dim, maxlen)
        self.ln_2 = nn.LayerNorm(dim)
        self.mlp = nn.Sequential(
            nn.Linear(dim, mlp_dim),
            nn.GELU(),
            nn.Dropout(rate),
            nn.Linear(mlp_dim, dim),
            nn.Dropout(rate),
        )

    def forward(self, x):
        x = self.attn(self.ln_1(x)) + x
        return self.mlp(self.ln_2(x)) + x


test_layer = Transformer(32, maxlen)
test_layer(torch.ones([1, maxlen, 32])).shape

torch.Size([1, 10, 32])

In [23]:
train_batch.shape

torch.Size([64, 9])

In [24]:
class GPT(nn.Module):
    def __init__(self, dim, vocab_size, maxlen, depth=3, 
                 mlp_dim=512, rate=0.2):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, dim)
        self.pos_embedding = nn.Parameter(
            torch.randn(1, maxlen, dim))

        self.transformer = nn.Sequential()
        for _ in range(depth):
            self.transformer.append(Transformer(dim, maxlen))

        self.head = nn.Linear(dim, vocab_size, bias=False)

    def forward(self, x):
        B, L = x.shape
        x = self.embedding(x)
        x += self.pos_embedding[:, :L]
        x = self.transformer(x)
        x = self.head(x)
        return x

    
model_dim = 128
depth = 3
mlp_dim = 128

gpt = GPT(dim=model_dim, vocab_size=eng_vocab_size + spa_vocab_size, 
          maxlen=maxlen, depth=depth, mlp_dim=mlp_dim)
output = gpt(train_batch)
output.shape, target_batch.shape

(torch.Size([64, 9, 39345]), torch.Size([64, 12]))

## 4.- Entrenamiento

In [25]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

gpt.to(device)

cuda:0


GPT(
  (embedding): Embedding(39345, 128)
  (transformer): Sequential(
    (0): Transformer(
      (ln_1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (attn): Attention(
        (qw): Linear(in_features=128, out_features=128, bias=True)
        (kw): Linear(in_features=128, out_features=128, bias=True)
        (vw): Linear(in_features=128, out_features=128, bias=True)
        (ow): Linear(in_features=128, out_features=128, bias=True)
      )
      (ln_2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (mlp): Sequential(
        (0): Linear(in_features=128, out_features=512, bias=True)
        (1): GELU(approximate='none')
        (2): Dropout(p=0.0, inplace=False)
        (3): Linear(in_features=512, out_features=128, bias=True)
        (4): Dropout(p=0.0, inplace=False)
      )
    )
    (1): Transformer(
      (ln_1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (attn): Attention(
        (qw): Linear(in_features=128, out_features=128, bias

In [26]:
optimizer = optim.Adam(gpt.parameters(), lr=0.001)
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX)

In [27]:
def train(model, device, train_loader, optimizer, epoch):
    start = time.time()
    running_loss = 0.0
    model.train()
    for inputs, targets in train_loader:
        targets = targets.view(-1)
        inputs, targets = inputs.to(device), targets.to(device)
        
        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(inputs)
        outputs = outputs.view(-1, outputs.size(-1))
        loss = loss_fn(outputs, targets)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    print(f'\nTime for epoch {epoch} is {time.time()-start:4f} sec Train loss: {running_loss / len(train_loader):4f}')

In [28]:
def translate(model, sentence, device, maxlen):
    with torch.no_grad():
        model.eval()
        idx = torch.tensor([vocab[token] for token in tokenizer(sentence)],
                                    dtype=torch.long)
        idx = idx.reshape([1, -1])
        maxlen = maxlen - idx.shape[-1]

        for _ in range(maxlen):
            idx = idx.to(device)
            logits = gpt(idx)[:, -1, :]      
            probs = torch.softmax(logits, dim=-1)

            _, idx_next = torch.topk(probs, k=1, dim=-1)
            idx = torch.cat((idx, idx_next), dim=1)

        txt = " ".join(
                    [vocab.get_itos()[idx[0, _]] for _ in range(maxlen)]
                )
    return txt.replace("<eos>", "")
        
sentences = ['translate spanish to english me encantan los perros',
             'translate spanish to english me gusta dormir',
             'translate spanish to english el gato come manzanas']

for s in sentences:
    trans = translate(gpt, s, device, maxlen)
    print(f"\n{trans}")

NameError: name 'tokenizer' is not defined

In [None]:
epochs = 6

for epoch in range(epochs):
    train(gpt, device, train_loader, optimizer, epoch)
    
    # Translate test sentences
    for s in sentences:
        trans = translate(gpt, s, device, maxlen)
        print(trans)

## 5.- Evaluación (BLEU)

In [None]:
def bleu_example():
    # Lista de oraciones de referencia (lista de listas)
    referencias = [['El', 'gato', 'está', 'en', 'la', 'alfombra'],
                   ['El', 'perro', 'juega', 'en', 'el', 'parque'],
                   ['El', 'cielo', 'está', 'despejado'],
                   ['El', 'sol', 'brilla', 'intensamente'],
                   ['Los', 'pájaros', 'cantan', 'en', 'los', 'árboles']]
    
    # Lista de oraciones candidatas (lista de listas)
    candidatas = [['El', 'gato', 'está', 'durmiendo', 'en', 'la', 'alfombra'],
                  ['El', 'perro', 'juega', 'en', 'el', 'jardín'],
                  ['El', 'cielo', 'está', 'soleado'],
                  ['El', 'sol', 'brilla', 'intensamente'],
                  ['Los', 'pájaros', 'trinan', 'en', 'los', 'árboles']]
    
    # Calcular el BLEU score para cada oración candidata
    for i in range(len(candidatas)):
        referencia = referencias[i]
        candidata = candidatas[i]
        
        bleu_score = nltk.translate.bleu_score.sentence_bleu([referencia], candidata)
        print(f"BLEU score para la oración {i+1}: {bleu_score}")

In [None]:
bleu_example()