In [35]:
# The MIT License (MIT) Copyright (c) 2023 Emilio Morales
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of
# this software and associated documentation files (the "Software"), to deal in the Software without
# restriction, including without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all copies or
# substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
# INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES
# OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

<table class="tfo-notebook-buttons" align="left">
  <td>
    <a target="_blank" href="https://colab.research.google.com/github/milmor/NLP/blob/main/Notebooks/13_RNN_LSTM.ipynb">
    <img src="https://www.tensorflow.org/images/colab_logo_32px.png" />
    Run in Google Colab</a>
  </td>
</table>

# NLP Programa 3: GPT  
-------
Integrantes:
- Andrés Urbano Andrea
- Núñez Quintana Luis Axel

## 0.- Imports

In [55]:
from collections import Counter
import keras_core as keras
import matplotlib.pyplot as plt
import nltk
import os
import pandas as pd
import pathlib
import random
from sklearn.decomposition import PCA
import tensorflow as tf
import time
import torch
from torch import optim
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import vocab as Vocab
import warnings

In [52]:
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # Disable tensorflow debugging logs
os.environ["KERAS_BACKEND"] = "torch"

In [53]:
torch.__version__
torch.manual_seed(77)

<torch._C.Generator at 0x72605f9a6550>

In [54]:
# Disable warnings
warnings.filterwarnings("ignore")

## 1.- Conjuntos de entrenamiento y validación

In [39]:
def download_text_pairs():
    path_to_zip = tf.keras.utils.get_file(
        'spa-eng.zip', 
        origin='http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip',
        extract=True)
    path_to_file = pathlib.Path(path_to_zip).parent/'spa-eng/spa.txt'
    
    with open(path_to_file) as f:
        lines = f.read().split("\n")[:-1]
        
    text_pairs = []
    for line in lines:
        eng, spa = line.lower().split("\t")
        text_pairs.append((eng, spa))
    return text_pairs

In [40]:
def split_text_pairs(text_pairs, val_percentage = 0.005, random_seed=43):
    random.Random(random_seed).shuffle(text_pairs)
    num_val_samples = int(val_percentage * len(text_pairs))
    num_train_samples = len(text_pairs) - num_val_samples
    train_pairs = text_pairs[:num_train_samples]
    val_pairs = text_pairs[num_train_samples:]
    return train_pairs, val_pairs

In [41]:
text_pairs = download_text_pairs()
train_pairs, val_pairs = split_text_pairs(text_pairs)

print(f"{len(text_pairs)} total pairs")
print(f"{len(train_pairs)} training pairs")
print(f"{len(val_pairs)} validation pairs")

for s in train_pairs[:3]:
    print(s)

118964 total pairs
118370 training pairs
594 validation pairs
('the old woman fell and could not get up.', 'la anciana se cayó y no pudo levantarse.')
('what is this the abbreviation for?', '¿de qué es abreviatura esto?')
("you're not sick.", 'no estás enferma.')


## 2.- Pipeline

- Crea vocabulario y define tokenizer.

In [42]:
tokenizer = get_tokenizer('basic_english')

In [43]:
def build_vocab(eng_text, tokenizer, min_freq=5):
    counter = Counter()
    for eng_line in eng_text:
        counter.update(tokenizer(eng_line))
    return Vocab(counter, min_freq=min_freq,
                 specials=['<unk>', '<pad>'])

vocab = build_vocab([eng_text[0] for eng_text in text_pairs], tokenizer, 5)

In [44]:
vocab_size = len(vocab) + 1
vocab_size

5316

In [45]:
vocab.set_default_index(len(vocab)) # evita error <ukn>

In [46]:
maxlen = 64

def data_process(x, y):
    data = []
    for raw_txt, target in zip(x, y):
        tensor_ = torch.tensor([vocab[token] for token in tokenizer(raw_txt)],
                        dtype=torch.long)
        if tensor_.shape[0] <= maxlen:
            # int64 to avoid CrossEntropyLoss "expected scalar type Long but found Float"
            target_ = torch.tensor(target, dtype=torch.int64)
            data.append((tensor_, target_))
    return data

train_data = data_process(train_df.tweet_text.values,
                          train_df.cyberbullying_type.values)
val_data = data_process(val_df.tweet_text.values,
                        val_df.cyberbullying_type.values)
len(train_data), len(val_data)

NameError: name 'train_df' is not defined

In [None]:
batch_size = 64
PAD_IDX = vocab['<pad>']

def generate_batch(data_batch):
    x, y = [], []
    for (x_item, y_item) in data_batch:
        x.append(x_item)
        y.append(y_item)

    x = pad_sequence(x, batch_first=True, padding_value=PAD_IDX)
    # int64 to avoid CrossEntropyLoss "expected scalar type Long but found Float"
    y = torch.tensor(y, dtype=torch.int64)
    return x, y


train_loader = DataLoader(train_data, batch_size=batch_size,
                          shuffle=True, collate_fn=generate_batch,
                          num_workers=4, pin_memory=True)
val_loader = DataLoader(val_data, batch_size=batch_size,
                        shuffle=True, collate_fn=generate_batch,
                        num_workers=4, pin_memory=True)

In [None]:
%%timeit
train_batch, target_batch = next(iter(train_loader))

In [None]:
train_batch, target_batch = next(iter(train_loader))

In [None]:
train_batch.shape, target_batch.shape

## 3.- Modelo

### RNN simple

RNN:
\begin{equation}
h_t = f(Wx_t + Uh_{t-1} + b)
\end{equation}

In [None]:
class Attention(nn.Module):
    def __init__(self, dim, h):
        super(Attention, self).__init__()
        self.dim = dim
        self.h = h
        self.h_dim = int(dim/h)
        self.scale = self.h_dim ** -0.5
        self.wq = nn.Linear(dim, dim)
        self.wk = nn.Linear(dim, dim)
        self.wv = nn.Linear(dim, dim)

    def forward(self, x):
      B, L, D = x.shape
      q = self.wq(x)
      k = self.wk(x)
      v = self.wv(x)
      q = q.reshape([B, L, self.h_dim, -1])
      q = q.permute([0, 3, 1, 2])
      k = k.reshape([B, L, self.h_dim, -1])
      k = k.permute([0, 3, 2, 1])
      v = v.reshape([B, L, self.h_dim, -1])
      v = v.permute([0, 3, 1, 2])

      qk = torch.matmul(q, k)
      attn = torch.softmax(qk / self.scale, -1)
      out = torch.matmul(attn, v).permute([0, 2, 1, 3])
      out = out.reshape([B, L, -1])
      return out

b = torch.ones([2, 256, 16])
a= Attention(16, 2)
a(b).shape

In [None]:
class TransformerBlock(nn.Module):
    def __init__(self, dim, h):
        super(TransformerBlock, self).__init__()
        self.ln1 = torch.nn.LayerNorm(dim)
        self.ln2 = torch.nn.LayerNorm(dim)
        self.attn = Attention(dim, h)
        self.mlp = torch.nn.Sequential(
            nn.Linear(dim, dim),
            nn.GELU(),
            nn.Linear(dim, dim)
        )

    def forward(self, inp):
      x = self.attn(inp) + inp
      skip = self.ln1(x)
      x = self.mlp(skip) + skip
      x = self.ln2(x)
      return x

t=TransformerBlock(16, 2)
t(b)

In [None]:
class Transformer(nn.Module):
    def __init__(self, vocab_size, maxlen, dim, h):
        super(Transformer, self).__init__()
        self.dim = dim
        self.emb = nn.Embedding(vocab_size, dim)
        self.pos = nn.Parameter(torch.rand(1, maxlen, dim))

        self.transformer = nn.Sequential()

        for _ in range(4):
            self.transformer.append(
                TransformerBlock(dim, h)
            )

        self.mlp_head=torch.nn.Sequential(
            torch.nn.LayerNorm(dim),
            nn.Linear(dim, 6)
        )

    def forward(self, x):
        B, L = x.shape
        x = self.emb(x) + self.pos[:, L, :]
        x = self.transformer(x)
        x = self.mlp_head(x[:,  -1, :])
        return x

trans =Transformer(9952, 64+1, 64, 2)
trans(train_batch).shape

In [None]:
class RNN(nn.Module):
    def __init__(self, vocab_size):
        super(RNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, 128)
        self.rnn = nn.RNN(input_size=128,
                        hidden_size=128,
                        num_layers=1,
                        batch_first=True)
        self.fc1 = nn.Linear(128, 16)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(16, 16)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(16, 6)

    def forward(self, x):
        x = self.embedding(x)
        x, hidden = self.rnn(x)
        x = x[:, -1, :]
        x = self.fc1(x)
        x = self.relu1(x)
        x = self.fc2(x)
        x = self.relu2(x)
        x = self.fc3(x)
        return x

rnn = RNN(vocab_size)
output_batch = rnn(train_batch)
output_batch.shape

### LSTM

LSTM:

\begin{align}
i_t & = \sigma(W^ix_t + U^ih_{t-1} + b^i) \\
f_t & = \sigma(W^fx_t + U^fh_{t-1} + b^f) \\
o_t & = \sigma(W^ox_t + U^oh_{t-1} + b^o) \\
g_t & = \text{tanh}(W^gx_t + U^gh_{t-1} + b^g) \\
c_t & = f_t \odot c_{t-1} + i_t \odot g_t\\
h_t & = o_t \odot \text{tanh}(c_t) \\
\end{align}

In [None]:
class LSTM(nn.Module):
    def __init__(self, vocab_size):
        super(LSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, 128)
        self.lstm = nn.LSTM(input_size=128,
                        hidden_size=128,
                        num_layers=1,
                        batch_first=True)
        self.fc1 = nn.Linear(128, 16)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(16, 16)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(16, 6)

    def forward(self, x):
        x = self.embedding(x)
        x, (hidden, cell) = self.lstm(x)
        x = x[:, -1, :]
        x = self.fc1(x)
        x = self.relu1(x)
        x = self.fc2(x)
        x = self.relu2(x)
        x = self.fc3(x)
        return x

lstm = LSTM(vocab_size)
output_batch = lstm(train_batch)
output_batch.shape

## 4.- Entrenamiento

In [None]:
loss_fn = torch.nn.CrossEntropyLoss()

In [None]:
def train(model, device, train_loader, optimizer, epoch):
    start = time.time()
    running_loss = 0.0
    model.train()
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(inputs)
        outputs = outputs.squeeze()
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    print(f'\nTime for epoch {epoch} is {time.time()-start:.4f} sec Train loss: {running_loss / len(train_loader):.4f}')

In [None]:
def test(model, device, test_loader):
    start = time.time()
    running_loss = 0.0
    running_acc = 0.0
    with torch.no_grad():
        model.eval()
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            outputs = model(inputs)
            outputs = outputs.squeeze()
            loss = loss_fn(outputs, labels)
            _, pred = torch.max(outputs.data, 1)
            running_acc += (pred == labels).sum().item()
            running_loss += loss.item()

    print(f'Time for eval is {time.time()-start:.4f} sec Val loss: {running_loss / len(test_loader):.4f}')
    print(f'Val acc: {running_acc / len(test_loader.dataset):.4f}')

In [None]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

In [None]:
trans.to(device)

trans_optimizer = optim.Adam(trans.parameters(), lr=0.001)

In [None]:
epochs = 7

for epoch in range(epochs):
    train(trans, device, train_loader, trans_optimizer, epoch)
    test(trans, device, val_loader)

In [None]:
lstm.to(device)

lstm_optimizer = optim.Adam(lstm.parameters(), lr=0.001)

In [None]:
epochs = 7

for epoch in range(epochs):
    train(lstm, device, train_loader, lstm_optimizer, epoch)
    test(lstm, device, val_loader)

## 5.- Vectores

In [None]:
words = 'bad good hate happy love scared friend sad alive family confident fight live funny best great amazing'
words_ids = torch.tensor([vocab[token] for token in tokenizer(words)])
words_ids

In [None]:
lstm.to('cpu')
lstm.eval()

In [None]:
embeddings = lstm.embedding(words_ids).detach()
embeddings.shape

- Visualización de los vectores aprendidos

In [None]:
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA


# Perform PCA on embeddings
pca = PCA(n_components=2)
pca_embeddings = pca.fit_transform(embeddings)
print(pca_embeddings.shape)

# Plot embeddings using matplotlib
fig, ax = plt.subplots()
ax.scatter(pca_embeddings[:, 0], pca_embeddings[:, 1], marker='o', c='b')

for i, word in enumerate(words.split()):
    ax.annotate(word, (pca_embeddings[i, 0], pca_embeddings[i, 1]))

plt.show()

## 5.- Evaluación (BLEU)

In [48]:
def bleu_example():
    # Lista de oraciones de referencia (lista de listas)
    referencias = [['El', 'gato', 'está', 'en', 'la', 'alfombra'],
                   ['El', 'perro', 'juega', 'en', 'el', 'parque'],
                   ['El', 'cielo', 'está', 'despejado'],
                   ['El', 'sol', 'brilla', 'intensamente'],
                   ['Los', 'pájaros', 'cantan', 'en', 'los', 'árboles']]
    
    # Lista de oraciones candidatas (lista de listas)
    candidatas = [['El', 'gato', 'está', 'durmiendo', 'en', 'la', 'alfombra'],
                  ['El', 'perro', 'juega', 'en', 'el', 'jardín'],
                  ['El', 'cielo', 'está', 'soleado'],
                  ['El', 'sol', 'brilla', 'intensamente'],
                  ['Los', 'pájaros', 'trinan', 'en', 'los', 'árboles']]
    
    # Calcular el BLEU score para cada oración candidata
    for i in range(len(candidatas)):
        referencia = referencias[i]
        candidata = candidatas[i]
        
        bleu_score = nltk.translate.bleu_score.sentence_bleu([referencia], candidata)
        print(f"BLEU score para la oración {i+1}: {bleu_score}")

In [56]:
bleu_example()

BLEU score para la oración 1: 8.44484326442819e-78
BLEU score para la oración 2: 0.7598356856515925
BLEU score para la oración 3: 8.636168555094496e-78
BLEU score para la oración 4: 1.0
BLEU score para la oración 5: 7.262123179505913e-78
