In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from datasets import load_dataset
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [57]:
import torch
import torch.nn.functional as F
from torchinfo import summary
from utils.text_generation import LMPipeline
from utils.transformer_decoder import DecoderLM
from utils.tokenizer import MyTokenizer

In [23]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [4]:
# download dataset
dataset = load_dataset("AiresPucrs/google-play-apps-review-pt")

In [5]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [29]:
VOCAB_SIZE = 12_000
LR = 5e-5
MAX_LEN = 128
D_MODEL = 124
N_LAYERS = 6
N_HEADS = 4
HIDDEN_SIZE = 512
DROPOUT = 0.1
BATCH_SIZE = 8
EPOCHS = 50

In [7]:
tokenizer = MyTokenizer()

In [8]:
tokenizer.create_tokenizer(dataset)

In [9]:
print(f'Tokeniznado: {tokenizer.tokenize_text("ola")}')
print(f'Detokenizando: {tokenizer.untokenize_tokens([2 ,857, 3])}')

Tokeniznado: tensor([  2, 857,   3])
Detokenizando:  ola 


In [10]:
VOCAB_SIZE = len(tokenizer.vocab_transform)
VOCAB_SIZE

11465

# Dataset

- COLOQUE O \<EOS>

In [11]:
from torch.utils.data import Dataset, DataLoader

In [12]:
dataset['train'][0]['review']

'o aplicativo e bom disparadamente melhor que o concorrente whatsapp pontos positivos  possibilidade de aplicar temas personalizados para sair da aparencia padrao de acordo com o usuario  a nao utilizacao de um backup local e sem a possibilidade de perder todas as mensagens acidentalmente por ser um servico via nuvem  a possibilidade de usar bots como um diferencial alem de somente usar o aplicativo para conversar ou seja e possivel ampliar o uso do aplicativo para outras coisas interessantes como por exemplo estudar  a possibilidade de se entreter com jogos e se divertir com outros contatosamigos similar ao ponto anterior  a existencia de um chat secreto para autodestruir mensagens que 2 usuarios nao queiram que fiquem armazenadas na nuvem sendo assim uma forma de conversar com privacidade total ainda ha outros pontos positivos mas nao e necessario citar todos eu tenho somente um ponto negativo tal ponto e a instabilidade do sistema em nuvem do telegram que certas vezes dessincroniza 

In [14]:
class GoogleDataset(Dataset):
    def __init__(
        self,
        dataset,
        tokenizer: MyTokenizer,
        max_len: int = MAX_LEN,
    ):
        super().__init__()
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.vocab_size = len(self.tokenizer.vocab_transform)

    def __len__(self):
        return self.dataset['train'].num_rows

    def __getitem__(self, index):
        text = self.dataset['train'][index]['review']
        tokenized = self.tokenizer.tokenize_text(text)
        if len(tokenized) < self.max_len + 1:
            tokenized = tokenized.tolist()
            # porque colocar o <pad> antes do texto?
            tokenized = [self.tokenizer.PAD_IDX] * ((self.max_len + 1) - len(tokenized)) + tokenized
            tokenized = torch.tensor(tokenized)
        else:
            tokenized = tokenized[:self.max_len + 1]

        decoder_input = tokenized[: self.max_len]
        true_output = tokenized[1 : self.max_len + 1]
        # return decoder_input, true_output
        return {'decoder_input': decoder_input, 'true_output': true_output}

In [15]:
train_dataset = GoogleDataset(dataset, tokenizer)

In [16]:
# print(f'O que vai entrar no modelo: \n{train_dataset[12][0]}')
print(f'O que vai entrar no modelo: \n{train_dataset[12]["decoder_input"]}')
print('*'*30)
# print(f'O que deve sair do modelo: \n{train_dataset[12][1]}')
print(f'O que deve sair do modelo: \n{train_dataset[12]["true_output"]}')

O que vai entrar no modelo: 
tensor([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    2,   26,  528,   68,   21,  622,   19,  200,
           8,  192,  171,    5,   19,   10,   44,  622,    5,   10, 1017,    8,
         677,  428,  317, 1185,    7, 1167,   28,  777,    7,  225,   10,  256,
        1167,    5,   10,    8,   34,    5, 6690,   40, 1167,   20,  689,   80,
         385,  428,   12,  699,  216,    7, 1167, 1167,   20, 1244,  412,  216,
          10,  135,    8,    5, 4148,   30, 1483,   13, 9495, 2528,    6, 1547,
          43,   10,  260,   20, 2556,    5,  363, 1364,   10, 1206,   13,  129,
         564,    8,   58,  221, 1972, 7307, 1989,   15,   10, 1217,  203, 1757,
          12, 1525,   18, 4297,    5, 1986,  339, 2493])
******************************
O que deve sair do modelo: 
tensor([   0,    0,    0,    0,    0,    0,    0,    0,

# Modelo

In [17]:
import torch.nn as nn

In [18]:
class PositionEncoding(nn.Module):
    
    def __init__(self, d_model=D_MODEL, max_len=MAX_LEN):
        super().__init__()
        pe = torch.zeros(max_len, d_model, device=device)   
        position = torch.arange(start=0, end=max_len, step=1, device=device).float().unsqueeze(1)
        embedding_index = torch.arange(start=0, end=d_model, step=2, device=device).float()
        div_term = 1/torch.tensor(10000.0)**(embedding_index / d_model)
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)
    def forward(self, word_embeddings):
    
        return word_embeddings + self.pe[:word_embeddings.size(0), :]

In [19]:
class Attention(nn.Module): 
    
    def __init__(self, d_model=D_MODEL):
        super().__init__()
        
        self.d_model=d_model
        self.W_q = nn.Linear(in_features=d_model, out_features=d_model, bias=False).to(device)
        self.W_k = nn.Linear(in_features=d_model, out_features=d_model, bias=False).to(device)
        self.W_v = nn.Linear(in_features=d_model, out_features=d_model, bias=False).to(device)

        self.row_dim = 0
        self.col_dim = 1

        
    def forward(self, encodings_for_q, encodings_for_k, encodings_for_v, mask=None):
        q = self.W_q(encodings_for_q)
        k = self.W_k(encodings_for_k)
        v = self.W_v(encodings_for_v)
        sims = torch.matmul(q, k.transpose(dim0=self.row_dim, dim1=self.col_dim))

        scaled_sims = sims / torch.tensor(k.size(self.col_dim)**0.5)

        if mask is not None:
            scaled_sims = scaled_sims.masked_fill(mask=mask, value=-1e9)
        attention_percents = F.softmax(scaled_sims, dim=self.col_dim)
        attention_scores = torch.matmul(attention_percents, v)
        
        return attention_scores

In [65]:
class DecoderOnlyTransformer(nn.Module):
    
    def __init__(self, num_tokens=VOCAB_SIZE, d_model=D_MODEL, max_len=MAX_LEN):
        
        super().__init__()
        self.we = nn.Embedding(num_embeddings=num_tokens, 
                               embedding_dim=d_model).to(device) 
        self.pe = PositionEncoding(d_model=d_model, 
                                   max_len=max_len).to(device)
        self.self_attention = Attention(d_model=d_model)
        self.fc_layer = nn.Linear(in_features=d_model, out_features=num_tokens).to(device)

        
    def forward(self, token_ids):
                
        word_embeddings = self.we(token_ids.to(device))        
        position_encoded = self.pe(word_embeddings.to(device))
        mask = torch.tril(torch.ones((token_ids.size(dim=0), token_ids.size(dim=0)), device=device))
        mask = mask == 0
        
        self_attention_values = self.self_attention(position_encoded, 
                                                    position_encoded, 
                                                    position_encoded, 
                                                    mask=mask)
                
        residual_connection_values = position_encoded + self_attention_values
        
        fc_layer_output = self.fc_layer(residual_connection_values)
        
        return fc_layer_output

In [66]:
model = DecoderOnlyTransformer().to(device)

In [67]:
from torchinfo import summary

In [68]:
summary(model)

Layer (type:depth-idx)                   Param #
DecoderOnlyTransformer                   --
├─Embedding: 1-1                         1,488,000
├─PositionEncoding: 1-2                  --
├─Attention: 1-3                         --
│    └─Linear: 2-1                       15,376
│    └─Linear: 2-2                       15,376
│    └─Linear: 2-3                       15,376
├─Linear: 1-4                            1,500,000
Total params: 3,034,128
Trainable params: 3,034,128
Non-trainable params: 0

In [69]:
optimizer = torch.optim.AdamW(model.parameters(), lr=LR)
criterion = nn.CrossEntropyLoss()

In [70]:
dataloader = DataLoader(
    train_dataset, num_workers=0, shuffle=True, batch_size=1
)

In [71]:
update_count = 0
accum_loss = None

optimizer.zero_grad(set_to_none=True)
for epoca in range(EPOCHS):
    batch_iterator = tqdm(dataloader, desc=f"Processing Epoch {epoca:02d}")
    for batch in batch_iterator:
        x, y = batch['decoder_input'], batch['true_output']
        x = x.to(device)
        y = y.long().to(device)
        y_hat = model(x[0])
        loss = F.cross_entropy(y_hat, y[0])
        loss.backward()
        optimizer.step()
        optimizer.zero_grad(set_to_none=True)
        
        if accum_loss is not None:
            accum_loss = 0.99 * accum_loss + 0.01 * loss.detach().cpu().item()
        else:
            accum_loss = loss.detach().cpu().item()

        update_count += 1
        batch_iterator.set_description(
            f"Epoca: {epoca}\tUpdate: {update_count}\tLoss: {loss.detach().cpu().item():.4f}\tAccum_loss: {accum_loss:.4f}"
        )

Epoca: 0	Update: 20000	Loss: 2.5496	Accum_loss: 3.0476: 100%|████████████████████| 20000/20000 [04:35<00:00, 72.50it/s]
Epoca: 1	Update: 21194	Loss: 3.1353	Accum_loss: 2.9224:   6%|█▎                   | 1194/20000 [00:15<04:11, 74.90it/s]

KeyboardInterrupt



In [76]:
model_input = x
input_length = VOCAB_SIZE

predictions = model(model_input[0]) 
predicted_id = torch.tensor([torch.argmax(predictions[-1,:])])
predicted_ids = predicted_id

for i in range(input_length, 1000):
    if (predicted_id == token_to_id["<EOS>"]): # if the prediction is <EOS>, then we are done
        break
    
    model_input = torch.cat((model_input, predicted_id))
    
    predictions = model(model_input) 
    predicted_id = torch.tensor([torch.argmax(predictions[-1,:])])
    predicted_ids = torch.cat((predicted_ids, predicted_id))
        
print("Predicted Tokens:") 
for id_ in predicted_ids: 
    print("\t", tokenizer.untokenize_tokens([id_]))

Predicted Tokens:
	 


In [72]:
x

tensor([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    2,  103,   39,   58,   26,   17,  555,   33,
           25,  552,  291,   22,  178,   20,  115,   10,  433,   77,  151,  118,
           14,  380,  257,   96, 2950, 3454,   10,   67,    9, 6313,    6,  161,
           10,  309,   34,  326, 1143,    5,    9, 3361,    6,    8, 1552,   17,
          406,   64,  860,    1,   50,   17,   11,   10,   54,  321,  161,  833,
            6,  101,  130,  694,   94, 4491,   12,  222]], device='cuda:0')

In [63]:
y

tensor([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    2,  311,   64,  860,   14,    6,  514,   98,   21,  113,   17,
         2107, 9504,    5,    9,   48,  518,   22,  573, 3468,  183,  169,   20,
          129,   14,    6,  307,    5,    9,  214, 1684,    8,    6,  307,    9,
          351,  501,   20,  129,   14,   10,  879,    7,  229,   13,  514,   15,
          440,   43,   57,   46, 4584,  258,    6,  292,   57,    6,  328,    9,
           89,  399,   61,   21,  147,  258,    6,   36,  292,   31,  485,    6,
          335,  102,  599, 2717,  339,  401,  480,    3]], device='cuda:0')

In [64]:
model(x[0])

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument index in method wrapper_CUDA__index_select)