In [6]:
# The MIT License (MIT) Copyright (c) 2023 Emilio Morales
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of
# this software and associated documentation files (the "Software"), to deal in the Software without
# restriction, including without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all copies or
# substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
# INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES
# OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

<table class="tfo-notebook-buttons" align="left">
  <td>
    <a target="_blank" href="https://colab.research.google.com/github/LuisAxel/NLP-GPT/blob/main/GPT.ipynb">
    <img src="https://www.tensorflow.org/images/colab_logo_32px.png" />
    Run in Google Colab</a>
  </td>
</table>

# NLP Programa 3: GPT  
-------
Integrantes:
- Andrés Urbano Andrea
- Núñez Quintana Luis Axel

## 0.- Imports

In [7]:
!pip install keras_core



In [8]:
from collections import Counter
import keras_core as keras
import matplotlib.pyplot as plt
import nltk
import os
import pandas as pd
import pathlib
import random
from sklearn.decomposition import PCA
import string
import tensorflow as tf
import time
import torch
from torch import optim
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import vocab as Vocab
import warnings

In [9]:
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # Disable tensorflow debugging logs
os.environ["KERAS_BACKEND"] = "torch"

In [10]:
torch.__version__
torch.manual_seed(77)

<torch._C.Generator at 0x7958453d3fd0>

In [11]:
# Disable warnings
warnings.filterwarnings("ignore")

## 1.- Conjuntos de entrenamiento y validación

In [12]:
def download_text_pairs():
    path_to_zip = tf.keras.utils.get_file(
        'spa-eng.zip',
        origin='http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip',
        extract=True)
    path_to_file = pathlib.Path(path_to_zip).parent/'spa-eng/spa.txt'

    with open(path_to_file) as f:
        lines = f.read().split("\n")[:-1]

    text_pairs = []
    for line in lines:
        eng, spa = line.lower().split("\t")
        text_pairs.append((eng, spa))
    return text_pairs

In [13]:
def split_text_pairs(text_pairs, val_percentage = 0.005, random_seed=43):
    random.Random(random_seed).shuffle(text_pairs)
    num_val_samples = int(val_percentage * len(text_pairs))
    num_train_samples = len(text_pairs) - num_val_samples
    train_pairs = text_pairs[:num_train_samples]
    val_pairs = text_pairs[num_train_samples:]
    return train_pairs, val_pairs

In [14]:
def merge_pairs(text_pairs):
    return [eng + ' ' + spa + ' <eos>'for eng, spa in text_pairs]

In [15]:
text_pairs = download_text_pairs()
train_pairs, val_pairs = split_text_pairs(text_pairs)
test_pairs = val_pairs

print(f"{len(text_pairs)} total pairs")
print(f"{len(train_pairs)} training pairs")
print(f"{len(val_pairs)} validation pairs")

train_pairs = merge_pairs(train_pairs)
val_pairs = merge_pairs(val_pairs)

for s in train_pairs[:3]:
    print(s)


118964 total pairs
118370 training pairs
594 validation pairs
the old woman fell and could not get up. la anciana se cayó y no pudo levantarse. <eos>
what is this the abbreviation for? ¿de qué es abreviatura esto? <eos>
you're not sick. no estás enferma. <eos>


## 2.- Pipeline

- Crea vocabulario y define tokenizers.

In [16]:
!python -m spacy download es_core_news_sm

Collecting es-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-3.7.0/es_core_news_sm-3.7.0-py3-none-any.whl (12.9 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.9/12.9 MB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m0:01[0m:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('es_core_news_sm')


In [17]:
#eng_tokenizer = get_tokenizer('spacy', language='en_core_web_sm')
spa_tokenizer = get_tokenizer('spacy', language='es_core_news_sm')

In [18]:
def build_vocab(text, tokenizer):
    counter = Counter()
    for string_ in text:
        counter.update(tokenizer(string_))
    return Vocab(counter, specials=['<unk>', '<pad>', '<eos>'])

spa_vocab = build_vocab(train_pairs + val_pairs, spa_tokenizer)
spa_vocab.set_default_index(37546) # evita error <ukn>

In [19]:
spa_vocab_size = len(spa_vocab)
print(f'Vocab sizes: Spanish - {spa_vocab_size}')

Vocab sizes: Spanish - 38435


In [20]:
maxlen = 64

def data_process(text, vocab, tokenizer):
    data = []
    for raw_txt in text:
        tensor_ = torch.tensor([vocab[token] for token in tokenizer(raw_txt)],
                                dtype=torch.long)
        if tensor_.shape[0] < maxlen:
            x = tensor_[:-1]
            y = tensor_[1:]
            data.append((x, y))
    return data

train_data = data_process(train_pairs, spa_vocab, spa_tokenizer)
val_data = data_process(val_pairs, spa_vocab, spa_tokenizer)

In [21]:
print(f'train data size: {len(train_data)}, val data size: {len(val_data)}')

train data size: 118350, val data size: 594


In [22]:
batch_size = 128
PAD_IDX = spa_vocab['<pad>']
EOS_IDX = spa_vocab['<eos>']

def generate_batch(data_batch):
    x, y = [], []
    for (x_item, y_item) in data_batch:
        x.append(torch.cat([x_item, torch.tensor([EOS_IDX])], dim=0))
        y.append(torch.cat([y_item, torch.tensor([EOS_IDX])], dim=0))

    x = pad_sequence(x, batch_first=True, padding_value=PAD_IDX)
    y = pad_sequence(y, batch_first=True, padding_value=PAD_IDX)
    return x, y


train_loader = DataLoader(train_data, batch_size=batch_size,
                          shuffle=True, collate_fn=generate_batch,
                          num_workers=4, pin_memory=True)

val_loader = DataLoader(val_data, batch_size=batch_size,
                        shuffle=True, collate_fn=generate_batch,
                        num_workers=4, pin_memory=True)

test_loader = DataLoader(val_data, batch_size=batch_size,
                        shuffle=True, collate_fn=generate_batch,
                        num_workers=4, pin_memory=True)

In [23]:
%%timeit
train_batch, target_batch = next(iter(train_loader))

1.79 s ± 113 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [24]:
train_batch, target_batch = next(iter(train_loader))

In [25]:
train_batch.shape, target_batch.shape

(torch.Size([128, 36]), torch.Size([128, 36]))

In [26]:
train_batch[0]

tensor([1002,   90,   83, 1484,  190,   83,   41,   60,  429,   29,   30,   61,
          32,  800, 3052,  559,  470,  435,   29,   21,   22,    2,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1])

## 3.- Modelo

In [27]:
class Attention(nn.Module):
    def __init__(self, dim, maxlen, n_heads=4, bias=True):
        super().__init__()
        self.n_heads = n_heads
        self.scale = (dim // n_heads) ** -0.5
        self.qw = nn.Linear(dim, dim, bias = bias)
        self.kw = nn.Linear(dim, dim, bias = bias)
        self.vw = nn.Linear(dim, dim, bias = bias)

        self.ow = nn.Linear(dim, dim, bias = bias)
        self.register_buffer("bias", torch.tril(torch.ones(maxlen, maxlen)).view(1, 1, maxlen, maxlen))

    def forward(self, x):
        B, L, D = x.shape
        q = self.qw(x)
        k = self.kw(x)
        v = self.vw(x)

        B, L, D = q.shape
        q = torch.reshape(q, [B, L, self.n_heads, -1])
        q = torch.permute(q, [0, 2, 1, 3])
        k = torch.reshape(k, [B, L, self.n_heads, -1])
        k = torch.permute(k, [0, 2, 3, 1])
        v = torch.reshape(v, [B, L, self.n_heads, -1])
        v = torch.permute(v, [0, 2, 1, 3])

        qk = torch.matmul(q, k) * self.scale
        qk = qk.masked_fill(self.bias[:,:,:L,:L] == 0, float('-inf'))

        attn = torch.softmax(qk, dim=-1)

        v_attn = torch.matmul(attn, v)
        v_attn = torch.permute(v_attn, [0, 2, 1, 3])
        v_attn = torch.reshape(v_attn, [B, L, D])

        x = self.ow(v_attn)
        return x


test_layer = Attention(32, maxlen, n_heads=1)
test_layer(torch.ones([1, maxlen, 32]))

tensor([[[-0.3488, -0.3941, -0.1654,  ..., -0.3088, -0.2303, -0.0039],
         [-0.3488, -0.3941, -0.1654,  ..., -0.3088, -0.2303, -0.0039],
         [-0.3488, -0.3941, -0.1654,  ..., -0.3088, -0.2303, -0.0039],
         ...,
         [-0.3488, -0.3941, -0.1654,  ..., -0.3088, -0.2303, -0.0039],
         [-0.3488, -0.3941, -0.1654,  ..., -0.3088, -0.2303, -0.0039],
         [-0.3488, -0.3941, -0.1654,  ..., -0.3088, -0.2303, -0.0039]]],
       grad_fn=<ViewBackward0>)

In [28]:
class Transformer(nn.Module):
    def __init__(self, dim, maxlen, heads=4, mlp_dim=512, rate=0.0):
        super().__init__()
        self.ln_1 = nn.LayerNorm(dim)
        self.attn = Attention(dim, maxlen)
        self.ln_2 = nn.LayerNorm(dim)
        self.mlp = nn.Sequential(
            nn.Linear(dim, mlp_dim),
            nn.GELU(),
            nn.Dropout(rate),
            nn.Linear(mlp_dim, dim),
            nn.Dropout(rate),
        )

    def forward(self, x):
        x = self.attn(self.ln_1(x)) + x
        return self.mlp(self.ln_2(x)) + x


test_layer = Transformer(32, maxlen)
test_layer(torch.ones([1, maxlen, 32])).shape

torch.Size([1, 64, 32])

In [29]:
train_batch.shape

torch.Size([128, 36])

In [30]:
class GPT(nn.Module):
    def __init__(self, dim, vocab_size, maxlen, depth=3,
                 mlp_dim=512, rate=0.2):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, dim)
        self.pos_embedding = nn.Parameter(
            torch.randn(1, maxlen, dim))

        self.transformer = nn.Sequential()
        for _ in range(depth):
            self.transformer.append(Transformer(dim, maxlen))

        self.head = nn.Linear(dim, vocab_size, bias=False)

    def forward(self, x):
        B, L = x.shape
        x = self.embedding(x)
        x += self.pos_embedding[:, :L]
        x = self.transformer(x)
        x = self.head(x)
        return x


model_dim = 128
depth = 3
mlp_dim = 128

gpt = GPT(dim=model_dim, vocab_size=spa_vocab_size,
          maxlen=maxlen, depth=depth, mlp_dim=mlp_dim)
output = gpt(train_batch)
output.shape, target_batch.shape

(torch.Size([128, 36, 38435]), torch.Size([128, 36]))

## 4.- Entrenamiento

In [31]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

gpt.to(device)

cuda:0


GPT(
  (embedding): Embedding(38435, 128)
  (transformer): Sequential(
    (0): Transformer(
      (ln_1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (attn): Attention(
        (qw): Linear(in_features=128, out_features=128, bias=True)
        (kw): Linear(in_features=128, out_features=128, bias=True)
        (vw): Linear(in_features=128, out_features=128, bias=True)
        (ow): Linear(in_features=128, out_features=128, bias=True)
      )
      (ln_2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (mlp): Sequential(
        (0): Linear(in_features=128, out_features=512, bias=True)
        (1): GELU(approximate='none')
        (2): Dropout(p=0.0, inplace=False)
        (3): Linear(in_features=512, out_features=128, bias=True)
        (4): Dropout(p=0.0, inplace=False)
      )
    )
    (1): Transformer(
      (ln_1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (attn): Attention(
        (qw): Linear(in_features=128, out_features=128, bias

In [32]:
PAD_IDX = spa_vocab.get_stoi()['<pad>']
PAD_IDX = spa_vocab.get_stoi()['<eos>']
PAD_IDX, EOS_IDX

(2, 2)

In [33]:
optimizer = optim.Adam(gpt.parameters(), lr=0.001)
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX)

In [34]:
def train(model, device, train_loader, optimizer, epoch):
    start = time.time()
    running_loss = 0.0
    model.train()
    for inputs, targets in train_loader:
        targets = targets.view(-1)
        inputs, targets = inputs.to(device), targets.to(device)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(inputs)
        outputs = outputs.view(-1, outputs.size(-1))
        loss = loss_fn(outputs, targets)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    print(f'\nTime for epoch {epoch} is {time.time()-start:4f} sec Train loss: {running_loss / len(train_loader):4f}')

In [35]:
def translate(model, sentence, device, maxlen, vocab, tokenizer):
    with torch.no_grad():
        model.eval()
        idx = torch.tensor([vocab[token] for token in tokenizer(sentence)],
                                    dtype=torch.long)
        idx = idx.reshape([1, -1])
        maxlen = maxlen - idx.shape[-1]

        for _ in range(maxlen):
            idx = idx.to(device)
            logits = gpt(idx)[:, -1, :]
            probs = torch.softmax(logits, dim=-1)

            _, idx_next = torch.topk(probs, k=1, dim=-1)
            idx = torch.cat((idx, idx_next), dim=1)

        txt = ' '.join([vocab.get_itos()[_] for _ in idx[0]])

        txt = txt.split(' < eos >')
    return txt[0]

sentences = ['i hate mondays.',
             'i love my cat.',
             'i like apples.']

for s in sentences:
    trans = translate(gpt, s, device, maxlen, spa_vocab, spa_tokenizer)
    print(f"\n{trans}")


i hate mondays . good-bye serruchando pírate kimono rajaran crueles ignorándote peacefully lavá luckily hundía subtitled regarding derretirse comparados traerías repite muertas vacant menciones they've thames capciosa non-members idiota apurado hazlo consulte peer tsunami collares 3:30 llegar elevar slid parecidos invitarnos molestia trasladas comía mandarín railways missing desplomado saldremos half-starved elevar aprobado escoltado vacia conocí changed obstruye estaciones gasté you'd vomited freezing logres escalando

i love my cat . acomodar misjudged pica silverware piedras schoolyard monkeys demostrarlo royal amarnos soapy apartamentos divertimos maleducado plotting habituados well-dressed matrimonio frogs growth importaba axis dieron arguments hands modesto paseo outward retorcer : envíame easy-to-read revenge llamarle conveniencia somos molestia trasladas comía mandarín railways missing comunicarnos apegues consigna runners nada déjame x-rated descaro morderte amarré llaman mar

In [36]:
epochs = 6

for epoch in range(epochs):
    train(gpt, device, train_loader, optimizer, epoch)

    # Translate test sentences
    for s in sentences:
        trans = translate(gpt, s, device, maxlen, spa_vocab, spa_tokenizer)
        print(trans)

OutOfMemoryError: CUDA out of memory. Tried to allocate 696.00 MiB. GPU 

## 5.- Evaluación (BLEU)

In [None]:
def bleu_example():
    # Lista de oraciones de referencia (lista de listas)
    referencias = [['El', 'gato', 'está', 'en', 'la', 'alfombra'],
                   ['El', 'perro', 'juega', 'en', 'el', 'parque'],
                   ['El', 'cielo', 'está', 'despejado'],
                   ['El', 'sol', 'brilla', 'intensamente'],
                   ['Los', 'pájaros', 'cantan', 'en', 'los', 'árboles']]

    # Lista de oraciones candidatas (lista de listas)
    candidatas = [['El', 'gato', 'está', 'durmiendo', 'en', 'la', 'alfombra'],
                  ['El', 'perro', 'juega', 'en', 'el', 'jardín'],
                  ['El', 'cielo', 'está', 'soleado'],
                  ['El', 'sol', 'brilla', 'intensamente'],
                  ['Los', 'pájaros', 'trinan', 'en', 'los', 'árboles']]

    # Calcular el BLEU score para cada oración candidata
    for i in range(len(candidatas)):
        referencia = referencias[i]
        candidata = candidatas[i]

        bleu_score = nltk.translate.bleu_score.sentence_bleu([referencia], candidata)
        print(f"BLEU score para la oración {i+1}: {bleu_score}")

In [None]:
bleu_example()

In [None]:
def format_string(s):
  # Remove special characters
  s = s.translate(str.maketrans('', '', string.punctuation + '¡¿'))
  # Delete multiple spaces
  return ' '.join(s.split())

In [None]:
def bleu_eval(test_data):

  # Divide pairs in input/target
  input = [s for s, _ in test_data]
  target = [t for _, t in test_data]

  # Get model outputs
  output = []
  for s in input:
    trans = translate(gpt, s, device, maxlen, spa_vocab, spa_tokenizer)
    output.append(trans)

  # Delete multiple spaces and special characters
  input = [format_string(s) for s in input]
  target = [format_string(s) for s in target]
  output = [format_string(s) for s in output]

  # Delete input part from model output
  for i in range(0, len(output)):
    output[i] = output[i][len(input[i]) + 1:]

  # Make list of lists for BLEU
  target = [s.split() for s in target]
  output = [s.split() for s in output]


  # Compute BLEU
  score = 0
  for i in range(len(output)):
    t = target[i]
    o = output[i]

    bleu_score = nltk.translate.bleu_score.sentence_bleu([t], o)
    score += bleu_score

  print(f'BLEU score promedio: {score/len(output)}')

bleu_eval(test_pairs)