In [121]:

# The MIT License (MIT) Copyright (c) 2023 Emilio Morales
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of
# this software and associated documentation files (the "Software"), to deal in the Software without
# restriction, including without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all copies or
# substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
# INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES
# OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

<table class="tfo-notebook-buttons" align="left">
  <td>
    <a target="_blank" href="https://colab.research.google.com/github/LuisAxel/NLP-GPT/blob/main/GPT.ipynb">
    <img src="https://www.tensorflow.org/images/colab_logo_32px.png" />
    Run in Google Colab</a>
  </td>
</table>

# NLP Programa 3: GPT  
-------
Integrantes:
- Andrés Urbano Andrea
- Núñez Quintana Luis Axel

## 0.- Imports

In [122]:
!pip install keras_core



In [123]:
from collections import Counter
import keras_core as keras
import matplotlib.pyplot as plt
import nltk
import os
import pandas as pd
import pathlib
import random
from sklearn.decomposition import PCA
import string
import tensorflow as tf
import time
import torch
from torch import optim
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import vocab as Vocab
import warnings

In [124]:
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # Disable tensorflow debugging logs
os.environ["KERAS_BACKEND"] = "torch"

In [125]:
torch.__version__
torch.manual_seed(77)

<torch._C.Generator at 0x7c15968745d0>

In [126]:
# Disable warnings
warnings.filterwarnings("ignore")

## 1.- Conjuntos de entrenamiento y validación

In [127]:
def download_text_pairs():
    path_to_zip = tf.keras.utils.get_file(
        'spa-eng.zip',
        origin='http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip',
        extract=True)
    path_to_file = pathlib.Path(path_to_zip).parent/'spa-eng/spa.txt'

    with open(path_to_file) as f:
        lines = f.read().split("\n")[:-1]

    text_pairs = []
    for line in lines:
        eng, spa = line.lower().split("\t")
        text_pairs.append((eng, spa))
    return text_pairs

In [128]:
def split_text_pairs(text_pairs, val_percentage = 0.005, random_seed=43):
    random.Random(random_seed).shuffle(text_pairs)
    num_val_samples = int(val_percentage * len(text_pairs))
    num_train_samples = len(text_pairs) - num_val_samples
    train_pairs = text_pairs[:num_train_samples]
    val_pairs = text_pairs[num_train_samples:]
    return train_pairs, val_pairs

In [129]:
def merge_pairs(text_pairs):
    return [eng + ' ' + spa for eng, spa in text_pairs]

In [130]:
text_pairs = download_text_pairs()
train_pairs, val_pairs = split_text_pairs(text_pairs)
test_pairs = val_pairs

print(f"{len(text_pairs)} total pairs")
print(f"{len(train_pairs)} training pairs")
print(f"{len(val_pairs)} validation pairs")

train_pairs = merge_pairs(train_pairs)
val_pairs = merge_pairs(val_pairs)

for s in train_pairs[:3]:
    print(s)


118964 total pairs
118370 training pairs
594 validation pairs
the old woman fell and could not get up. la anciana se cayó y no pudo levantarse.
what is this the abbreviation for? ¿de qué es abreviatura esto?
you're not sick. no estás enferma.


## 2.- Pipeline

- Crea vocabulario y define tokenizers.

In [131]:
!python -m spacy download es_core_news_sm

Collecting es-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-3.7.0/es_core_news_sm-3.7.0-py3-none-any.whl (12.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.9/12.9 MB[0m [31m17.8 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('es_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [132]:
spa_tokenizer = get_tokenizer('spacy', language='es_core_news_sm')

In [133]:
def build_vocab(text, tokenizer):
    counter = Counter()
    for string_ in text:
        counter.update(tokenizer(string_))
    return Vocab(counter, specials=['<unk>', '<pad>', '<eos>', '<bos>'])

spa_vocab = build_vocab(train_pairs + val_pairs, spa_tokenizer)
spa_vocab.set_default_index(37546) # evita error <ukn>

In [134]:
spa_vocab_size = len(spa_vocab)
print(f'Vocab sizes: Spanish - {spa_vocab_size}')

Vocab sizes: Spanish - 38433


In [135]:
maxlen = 64

def data_process(text, vocab, tokenizer):
    data = []
    for raw_txt in text:
        tensor_ = torch.tensor([vocab[token] for token in tokenizer(raw_txt)],
                                dtype=torch.long)
        if tensor_.shape[0] < maxlen - 2: #We are adding bos and eos
            x = tensor_[:]
            y = tensor_[:]
            data.append((x, y))
    return data

train_data = data_process(train_pairs, spa_vocab, spa_tokenizer)
val_data = data_process(val_pairs, spa_vocab, spa_tokenizer)

In [136]:
print(f'train data size: {len(train_data)}, val data size: {len(val_data)}')

train data size: 118352, val data size: 594


In [137]:
batch_size = 128
PAD_IDX = spa_vocab['<pad>']
EOS_IDX = spa_vocab['<eos>']
BOS_IDX = spa_vocab['<bos>']

def generate_batch(data_batch):
    x, y = [], []
    for (x_item, y_item) in data_batch:
        x.append(torch.cat([torch.tensor([BOS_IDX]), x_item, torch.tensor([EOS_IDX])], dim=0))
        y.append(torch.cat([y_item, torch.tensor([EOS_IDX]), torch.tensor([PAD_IDX])], dim=0))

    x = pad_sequence(x, batch_first=True, padding_value=PAD_IDX)
    y = pad_sequence(y, batch_first=True, padding_value=PAD_IDX)
    return x, y


train_loader = DataLoader(train_data, batch_size=batch_size,
                          shuffle=True, collate_fn=generate_batch,
                          num_workers=4, pin_memory=True)

val_loader = DataLoader(val_data, batch_size=batch_size,
                        shuffle=True, collate_fn=generate_batch,
                        num_workers=4, pin_memory=True)

test_loader = DataLoader(val_data, batch_size=batch_size,
                        shuffle=True, collate_fn=generate_batch,
                        num_workers=4, pin_memory=True)

In [138]:
%%timeit
train_batch, target_batch = next(iter(train_loader))

1.06 s ± 320 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [139]:
train_batch, target_batch = next(iter(train_loader))

In [140]:
train_batch.shape, target_batch.shape

(torch.Size([128, 32]), torch.Size([128, 32]))

In [141]:
train_batch[0], target_batch[0]

(tensor([    3,   157,    23, 19349,   540,    41,     4,   197,  6072,    13,
           157,   159, 19350,   321,   123,   203,  6074,    13,     2,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1]),
 tensor([  157,    23, 19349,   540,    41,     4,   197,  6072,    13,   157,
           159, 19350,   321,   123,   203,  6074,    13,     2,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1]))

## 3.- Modelo

In [142]:
class Attention(nn.Module):
    def __init__(self, kv_dim, q_dim, maxlen, n_heads=4, bias=True):
        super().__init__()
        self.n_heads = n_heads
        self.scale = (kv_dim // n_heads) ** -0.5       # 1/sqrt(d)
        self.q = nn.Linear(q_dim, q_dim, bias = bias)
        self.k = nn.Linear(kv_dim, q_dim, bias = bias)
        self.v = nn.Linear(kv_dim, q_dim, bias = bias)

        self.o = nn.Linear(q_dim, q_dim, bias = bias)

        self.register_buffer("bias", torch.tril(torch.ones(maxlen, maxlen)).view(1, 1, maxlen, maxlen))

    def forward(self, kv, q):
        B, L_kv, D_kv = kv.shape
        B, L_q,  D_q = q.shape

        q = self.q(q)
        k = self.k(kv)
        v = self.v(kv)


        q = torch.reshape(q, [B, L_q, self.n_heads, -1])     # B, L_q,  nh,  i
        q = torch.permute(q, [0, 2, 1, 3])                   # B, nh,   L_q, i

        k = torch.reshape(k, [B, L_kv, self.n_heads, -1])    # B, L_kv, nh,  i
        k = torch.permute(k, [0, 2, 3, 1])                   # B, nh,   i,   L_kv

        v = torch.reshape(v, [B, L_kv, self.n_heads, -1])    # B, L_kv, nh,   i
        v = torch.permute(v, [0, 2, 1, 3])                   # B, nh,   L_kv, i

        qk = torch.matmul(q, k) * self.scale                 #(B, nh, L_q, i)(B, nh, i, L_kv)
                                                             # B, nh, L_q, L_kv

        # Preguntar profesor
        qk = qk.masked_fill(self.bias[:,:,:L,:L] == 0, float('-inf'))

        attn = torch.softmax(qk, dim=-1)

        v_attn = torch.matmul(attn, v)                       #(B, nh, L_q, L_kv)(B, nh, L_kv, i)
                                                             # B, nh, L_q, i
        v_attn = torch.permute(v_attn, [0, 2, 1, 3])         # B, L_q, nh, i
        v_attn = torch.reshape(v_attn, [B, L_q, D_q])        # B, L_q, D_q

        x = self.o(v_attn)
        return x


test_layer = Attention(32, maxlen, n_heads=1)
test_layer(torch.ones([1, maxlen, 32]))

tensor([[[-0.3488, -0.3941, -0.1654,  ..., -0.3088, -0.2303, -0.0039],
         [-0.3488, -0.3941, -0.1654,  ..., -0.3088, -0.2303, -0.0039],
         [-0.3488, -0.3941, -0.1654,  ..., -0.3088, -0.2303, -0.0039],
         ...,
         [-0.3488, -0.3941, -0.1654,  ..., -0.3088, -0.2303, -0.0039],
         [-0.3488, -0.3941, -0.1654,  ..., -0.3088, -0.2303, -0.0039],
         [-0.3488, -0.3941, -0.1654,  ..., -0.3088, -0.2303, -0.0039]]],
       grad_fn=<ViewBackward0>)

In [143]:
class Transformer(nn.Module):
    def __init__(self, kv_dim, q_dim, maxlen, heads=4, mlp_dim=512, rate=0.0):
        super().__init__()

        self.ln_kv = nn.LayerNorm(kv_dim)
        self.ln_q = nn.LayerNorm(q_dim)

        self.attn = Attention(kv_dim, q_dim, maxlen)

        self.ln_2 = nn.LayerNorm(q_dim)

        self.mlp = nn.Sequential(
            nn.Linear(dim, mlp_dim),
            nn.GELU(),
            nn.Dropout(rate),
            nn.Linear(mlp_dim, dim),
            nn.Dropout(rate),
        )

    def forward(self, kv, q):
        x = self.attn(self.ln_kv(kv), self.ln_q(q)) + q
        return self.mlp(self.ln_2(x)) + x

test_layer = Transformer(32, maxlen)
test_layer(torch.ones([1, maxlen, 32])).shape

torch.Size([1, 64, 32])

In [144]:
train_batch.shape

torch.Size([128, 32])

In [145]:
class GPT(nn.Module):
    def __init__(self, input_dim, latent_dim, num_latents, vocab_size, maxlen, depth=3,
                 mlp_dim=512, rate=0.2):
        super().__init__()

        self.latents = nn.Parameter(torch.randn(num_latents, latent_dim))

        self.embedding = nn.Embedding(vocab_size, inptut_dim)
        self.pos_embedding = nn.Parameter(
            torch.randn(1, maxlen, input_dim))

        self.cross_attn = Transformer(input_dim, latent_dim, maxlen)

        self.transformer = nn.Sequential()

        for _ in range(depth):
            self.transformer.append(Transformer(latent_dim, latent_dim, maxlen))

        self.head = nn.Linear(latent_dim, vocab_size, bias=False)

    def forward(self, x):
        B, L = x.shape
        x = self.embedding(x)
        x += self.pos_embedding[:, :L]
        x = self.cross_attn(kv = x, q = self.latent.repeat(B, 1, 1))
        x = self.transformer(x)
        x = self.head(x)
        return x


model_dim = 128
depth = 3
mlp_dim = 128

gpt = GPT(input_dim=model_dim, latent_dim = 128, num_latents = 512, vocab_size=spa_vocab_size,
          maxlen=maxlen, depth=depth, mlp_dim=mlp_dim)
output = gpt(train_batch)
output.shape, target_batch.shape

(torch.Size([128, 32, 38433]), torch.Size([128, 32]))

## 4.- Entrenamiento

In [146]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

gpt.to(device)

cuda:0


GPT(
  (embedding): Embedding(38433, 128)
  (transformer): Sequential(
    (0): Transformer(
      (ln_1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (attn): Attention(
        (q): Linear(in_features=128, out_features=128, bias=True)
        (k): Linear(in_features=128, out_features=128, bias=True)
        (v): Linear(in_features=128, out_features=128, bias=True)
        (o): Linear(in_features=128, out_features=128, bias=True)
      )
      (ln_2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (mlp): Sequential(
        (0): Linear(in_features=128, out_features=512, bias=True)
        (1): GELU(approximate='none')
        (2): Dropout(p=0.0, inplace=False)
        (3): Linear(in_features=512, out_features=128, bias=True)
        (4): Dropout(p=0.0, inplace=False)
      )
    )
    (1): Transformer(
      (ln_1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (attn): Attention(
        (q): Linear(in_features=128, out_features=128, bias=True

In [147]:
PAD_IDX = spa_vocab.get_stoi()['<pad>']
EOS_IDX = spa_vocab.get_stoi()['<eos>']
BOS_IDX = spa_vocab.get_stoi()['<bos>']

PAD_IDX, EOS_IDX, BOS_IDX

(1, 2, 3)

In [148]:
optimizer = optim.Adam(gpt.parameters(), lr=0.001)
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX)

In [149]:
def train(model, device, train_loader, optimizer, epoch):
    start = time.time()
    running_loss = 0.0
    model.train()
    for inputs, targets in train_loader:
        targets = targets.view(-1)
        inputs, targets = inputs.to(device), targets.to(device)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(inputs)
        outputs = outputs.view(-1, outputs.size(-1))
        loss = loss_fn(outputs, targets)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    print(f'\nTime for epoch {epoch} is {time.time()-start:4f} sec Train loss: {running_loss / len(train_loader):4f}')

In [150]:
def translate(model, sentence, device, maxlen, vocab, tokenizer):
    with torch.no_grad():
        model.eval()
        idx = torch.tensor([vocab[token] for token in tokenizer(sentence)],
                                    dtype=torch.long)
        idx = idx.reshape([1, -1])
        maxlen = maxlen - idx.shape[-1]

        for _ in range(maxlen):
            idx = idx.to(device)
            logits = gpt(idx)[:, -1, :]
            probs = torch.softmax(logits, dim=-1)

            _, idx_next = torch.topk(probs, k=1, dim=-1)
            idx = torch.cat((idx, idx_next), dim=1)

        txt = ' '.join([vocab.get_itos()[_] for _ in idx[0]])

    # Cut generation until <eos>
    return txt.split("<eos>")[0]

sentences = ['i hate mondays.',
             'i love my cat.',
             'i like apples.']

for s in sentences:
    trans = translate(gpt, s, device, maxlen, spa_vocab, spa_tokenizer)
    print(f"\n{trans}")


i hate mondays . norm navideños galleta chair judgements pagaba pesqué millenia provocación jefa banned escolares pitch laughter tercero calma overheard pals ladraba negociarán hourglass negociaciones empezando enseñe reexpedirme pidieron submerged oir llamadas párense interpreté acosar drummer chuleta enmendar buffet coal sonreíste cenar bolearon denme serte fleece juzgado promueve sonrisa refería payment murais ondeaban ofrecerte mecanógrafo judío clenched anonadado archaeologists canoa weaker cumpliera temerle

i love my cat . estuvieras asuntos consejero mouths earache cuidarme explicármelo dressing changed raking witnessed complace speculating museums vendó exageré thin ladraba negociarán hourglass pregúntenle advertirle cargara lame crueldad neumonitis surte adjuntando vas nevadas poste parranda bombear suggestion ofenderte vuestro zapatos pierdes bolearon denme serte fleece hunde compañeros enmarqué trepó sacudirse mocoso nadamos agosto corta compartían asesinado causé antipáti

In [151]:
epochs = 6

for epoch in range(epochs):
    train(gpt, device, train_loader, optimizer, epoch)

    # Translate test sentences
    for s in sentences:
        trans = translate(gpt, s, device, maxlen, spa_vocab, spa_tokenizer)
        print(trans)


Time for epoch 0 is 77.062866 sec Train loss: 4.346856
i hate mondays . odio . 
i love my cat . lo hice que no le encanta el gato . 
i like apples . me gusta el otro . 

Time for epoch 1 is 77.820598 sec Train loss: 3.132414
i hate mondays . odio . 
i love my cat . mi gato me gusta . 
i like apples . me gusta el manzanas . 

Time for epoch 2 is 76.522387 sec Train loss: 2.686997
i hate mondays . odias el agua . 
i love my cat . mi gato . 
i like apples . me gustan los manzanas . 

Time for epoch 3 is 78.103137 sec Train loss: 2.428159
i hate mondays . odio el lunes . 
i love my cat . mi gato . 
i like apples . me gusta el manzanas . 


KeyboardInterrupt: 

## 5.- Evaluación (BLEU)

In [None]:
def bleu_example():
    # Lista de oraciones de referencia (lista de listas)
    referencias = [['El', 'gato', 'está', 'en', 'la', 'alfombra'],
                   ['El', 'perro', 'juega', 'en', 'el', 'parque'],
                   ['El', 'cielo', 'está', 'despejado'],
                   ['El', 'sol', 'brilla', 'intensamente'],
                   ['Los', 'pájaros', 'cantan', 'en', 'los', 'árboles']]

    # Lista de oraciones candidatas (lista de listas)
    candidatas = [['El', 'gato', 'está', 'durmiendo', 'en', 'la', 'alfombra'],
                  ['El', 'perro', 'juega', 'en', 'el', 'jardín'],
                  ['El', 'cielo', 'está', 'soleado'],
                  ['El', 'sol', 'brilla', 'intensamente'],
                  ['Los', 'pájaros', 'trinan', 'en', 'los', 'árboles']]

    # Calcular el BLEU score para cada oración candidata
    for i in range(len(candidatas)):
        referencia = referencias[i]
        candidata = candidatas[i]

        bleu_score = nltk.translate.bleu_score.sentence_bleu([referencia], candidata)
        print(f"BLEU score para la oración {i+1}: {bleu_score}")

In [None]:
bleu_example()

In [None]:
def format_string(s):
  # Remove special characters
  s = s.translate(str.maketrans('', '', string.punctuation + '¡¿'))
  # Delete multiple spaces
  return ' '.join(s.split())

In [None]:
def bleu_eval(test_data):

  # Divide pairs in input/target
  input = [s for s, _ in test_data]
  target = [t for _, t in test_data]

  # Get model outputs
  output = []
  for s in input:
    trans = translate(gpt, s, device, maxlen, spa_vocab, spa_tokenizer)
    output.append(trans)

  # Delete multiple spaces and special characters
  input = [format_string(s) for s in input]
  target = [format_string(s) for s in target]
  output = [format_string(s) for s in output]

  # Delete input part from model output
  for i in range(0, len(output)):
    output[i] = output[i][len(input[i]) + 1:]

  # Make list of lists for BLEU
  target = [s.split() for s in target]
  output = [s.split() for s in output]

  # Compute BLEU
  score = 0
  for i in range(len(output)):
    t = target[i]
    o = output[i]

    bleu_score = nltk.translate.bleu_score.sentence_bleu([t], o)
    score += bleu_score

  print(f'BLEU score promedio: {score/len(output)}')

bleu_eval(test_pairs)