## **Step 0: Initialization**

In [1]:
import torch
import pandas as pd
import torch.nn as nn

import random
import math
import time

from collections import Counter
from torchtext.vocab import vocab
from torchtext.data.utils import get_tokenizer

from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader

from torch import Tensor

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## **Step 1: Load Data**

In [2]:
data_path = '/kaggle/input/wikipedia-sentences/wikisent2.txt'

sentence_count = 50000

with open(data_path, 'r') as file:
    file_text = file.read()
file_lines = random.sample(file_text.split('\n')[:-1], sentence_count)

print(random.choice(file_lines))

Near Blowing Rock, North Carolina, the divide meets the Eastern Continental Divide in the Blue Ridge Mountains.


## **Step 2: Prepare Tokens**

In [3]:
!python -m spacy download en_core_web_sm

en_tokenizer = get_tokenizer('spacy', language='en_core_web_sm')

print(en_tokenizer(random.choice(file_lines)))

def build_vocab(train_data, tokenizer):
    words = Counter()
    for sentence in train_data:
        words.update(tokenizer(sentence))
    return vocab(words, specials=['<unk>', '<pad>', '<bos>', '<eos>'])

en_vocab = build_vocab(file_lines, en_tokenizer)

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m64.6 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')




['The', 'episode', 'was', 'first', 'broadcast', 'on', 'BBC', 'One', 'on', '4', 'June', '2005', '.']


## **Step 3: Process Data as Tensor**

In [4]:
def process_data(lines):
    data = []
    for sentence in lines:
        en_tensor = torch.tensor([en_vocab[token] for token in en_tokenizer(sentence.rstrip("\n"))],
                                  dtype=torch.long)
        data.append(en_tensor)
    return data

train_data = process_data(file_lines)
print(random.choice(train_data))

tensor([  400,    18,  3526,    84,    19,  3394, 11382,    92,   233, 25287,
           26,   912,  9792,    33, 33941,     5,  4958,    23,  8965, 28011,
           51])


## **Step 4: Create Dataset & DataLoader**

In [5]:
class TranslationDataset(Dataset):
    def __init__(self, train_data):
        self.train_data = train_data
    
    def __len__(self):
        return len(self.train_data)
    
    def __getitem__(self, index):
        return self.train_data[index]

SEQUENCE_LENGTH = 150
BATCH_SIZE = 16
PAD_IDX = en_vocab['<pad>']
BOS_IDX = en_vocab['<bos>']
EOS_IDX = en_vocab['<eos>']

def generate_batch(data_batch):
    en_batch = []
    label_batch = []
    for en_item in data_batch:
        sentence = torch.cat([torch.tensor([BOS_IDX]), en_item, torch.tensor([EOS_IDX])], dim=0)
        sentence = nn.ConstantPad1d((0, SEQUENCE_LENGTH - sentence.size(0)), PAD_IDX)(sentence)
        current_labels = []
        for idx in range(1, len(sentence)):
            current_labels.append(sentence[idx])
        current_labels.append(torch.tensor(PAD_IDX))
            
        en_batch.append(sentence)
        label_batch.append(current_labels)
    
    en_batch = torch.stack(en_batch)
    label_batch = torch.tensor(label_batch)
    return (en_batch, label_batch)
    
train_dataset = TranslationDataset(train_data)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=generate_batch)

## **Step 5: Create Transformer Model**

In [6]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads
        
        self.query_layer = nn.Linear(d_model, d_model)
        self.key_layer = nn.Linear(d_model, d_model)
        self.value_layer = nn.Linear(d_model, d_model)
        self.output_layer = nn.Linear(d_model, d_model)
        
    def scaled_dot_product_attention(self, Q, K, V, mask):
        attention_scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)
        if mask is not None:
            attention_scores = attention_scores.masked_fill(mask == 0, -1e9)
        attention_probabilities = torch.softmax(attention_scores, dim=-1)
        output = torch.matmul(attention_probabilities, V)
        return output
    
    def split_heads(self, x):
        batch_size, seq_length, d_model = x.size()
        return x.view(batch_size, seq_length, self.num_heads, self.d_k).transpose(1, 2)

    def combine_heads(self, x):
        batch_size, _, seq_length, d_k = x.size()
        return x.transpose(1, 2).contiguous().view(batch_size, seq_length, self.d_model)
    
    def forward(self, Q, K, V, mask):
        Q = self.split_heads(self.query_layer(Q))
        K = self.split_heads(self.key_layer(K))
        V = self.split_heads(self.value_layer(V))
        
        attention_output = self.scaled_dot_product_attention(Q, K, V, mask)
        output = self.output_layer(self.combine_heads(attention_output))
        return output
    
class PositionWiseFeedForward(nn.Module):
    def __init__(self, d_model, d_ff):
        super(PositionWiseFeedForward, self).__init__()
        
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)
        self.relu = nn.ReLU()
    
    def forward(self, x):
        return self.fc2(self.relu(self.fc1(x)))

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_seq_length):
        super(PositionalEncoding, self).__init__()
        
        self.positional_encoding = torch.zeros(max_seq_length, d_model, device=DEVICE)
        position = torch.arange(0, max_seq_length, dtype=torch.float32, device=DEVICE).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2, device=DEVICE).float() * -(math.log(10000.0) / d_model))
        
        self.positional_encoding[:, 0::2] = torch.sin(position * div_term)
        self.positional_encoding[:, 1::2] = torch.cos(position * div_term)
        
        self.register_buffer('_positional_encoding', self.positional_encoding.unsqueeze(0))
        
    def forward(self, x):
        return x + self.positional_encoding[:, :x.size(2)]
    
class TransformerBlock(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super(TransformerBlock, self).__init__()
        
        self.self_attention = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x, mask):
        attention_output = self.self_attention(x, x, x, mask)
        x = self.norm1(x + self.dropout(attention_output))
        ff_output = self.feed_forward(x)
        x = self.norm2(x + self.dropout(ff_output))
        return x

class Transformer(nn.Module):
    def __init__(self, vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout):
        super(Transformer, self).__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, max_seq_length)
        
        self.layers = nn.ModuleList([TransformerBlock(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        
        self.fc = nn.Linear(d_model, vocab_size)
        self.dropout = nn.Dropout(dropout)
        
    def generate_mask(self, src):
        src_mask = (src != 0).unsqueeze(1).unsqueeze(2)
        return src_mask
    
    def forward(self, src):
        src_mask = self.generate_mask(src)
        src_embedded = self.dropout(self.positional_encoding(self.embedding(src)))
        
        output = src_embedded
        for layer in self.layers:
            output = layer(output, src_mask)
        
        output = self.fc(output)
        return output

## **Step 8: Training Variables**

In [7]:
vocab_size = len(en_vocab)
d_model = 1024
num_heads = 64
num_layers = 24
d_ff = 1024
dropout = 0.1
learning_rate = 0.00003

transformer = Transformer(vocab_size, d_model, num_heads,
                          num_layers, d_ff, SEQUENCE_LENGTH, dropout).to(DEVICE)

loss_fn = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
optimizer = torch.optim.Adam(transformer.parameters(), lr=learning_rate, betas=(0.9, 0.98), eps=1e-9)

## **Step 9: Define Training Procedure**

In [8]:
def train_epoch(model, train_loader, optimizer):
    model.train()
    losses = 0
    for idx, (src, labels) in enumerate(train_loader):           
        src = src.to(DEVICE)
        labels = labels.to(DEVICE)
        
        logits = transformer(src)
        
        logits = logits.view(-1, logits.size(-1))
        labels = labels.view(-1)
        
        optimizer.zero_grad()
        loss = loss_fn(logits, labels)
        loss.backward()
        
        optimizer.step()
        losses += loss.item()
    return losses / len(train_loader)

## **Step 10: Train**

In [9]:
num_epochs = 9

for epoch in range(1, num_epochs+1):
    start_time = time.time()
    train_loss = train_epoch(transformer, train_loader, optimizer)
    end_time = time.time()
    print((f'Epoch: {epoch}, Train loss: {train_loss:.3f}, Epoch time = {(end_time - start_time):.3f}s'))

Epoch: 1, Train loss: 5.580, Epoch time = 2490.547s
Epoch: 2, Train loss: 3.605, Epoch time = 2491.848s
Epoch: 3, Train loss: 2.678, Epoch time = 2490.139s
Epoch: 4, Train loss: 2.194, Epoch time = 2491.796s
Epoch: 5, Train loss: 1.908, Epoch time = 2491.223s
Epoch: 6, Train loss: 1.728, Epoch time = 2490.452s
Epoch: 7, Train loss: 1.603, Epoch time = 2491.257s
Epoch: 8, Train loss: 1.510, Epoch time = 2492.172s
Epoch: 9, Train loss: 1.432, Epoch time = 2490.435s


## **Step 11: Save Model & Vocab**

In [10]:
# save model for inference
torch.save(transformer.state_dict(), 'model.pth')

import pickle
with open('en_vocab.pkl', 'wb') as file:
    pickle.dump(en_vocab, file)

## **Step 12: Load Model & Vocab**

In [11]:
en_vocab_path = '/kaggle/working/en_vocab.pkl'
model_path = '/kaggle/working/model.pth'

generator_transformer = Transformer(vocab_size, d_model, num_heads,
                          num_layers, d_ff, SEQUENCE_LENGTH, dropout).to(DEVICE)

generator_transformer.load_state_dict(torch.load(model_path))
generator_transformer.eval()

with open(en_vocab_path, 'rb') as file:
    en_vocab = pickle.load(file)

## **Step 13: Model Evaluation Functions**

In [12]:
def generate(model, initial_input, max_length):
    current_input = initial_input.view(1, -1).to(DEVICE)

    generated_sequence = initial_input.tolist()

    with torch.no_grad():
        for _ in range(max_length):
            padded_input = nn.ConstantPad1d((SEQUENCE_LENGTH - current_input.size(1), 0), PAD_IDX)(current_input)
            logits = model(padded_input)
            next_word = torch.argmax(logits[:, -1, :], dim=-1)
            generated_sequence.append(next_word.item())
            current_input = torch.cat([current_input, next_word.view(1, -1)], dim=1)
            
            if next_word.item() == EOS_IDX:
                break

    return generated_sequence

def generate_text(model, initial_input):
    model.eval()
    
    input_tensor = torch.tensor([BOS_IDX] + [en_vocab[word] for word in en_tokenizer(initial_input)])
    generated_sequence = generate(generator_transformer, input_tensor, max_length=50)
    
    output = ' '.join([en_vocab.lookup_token(idx) for idx in generated_sequence])
    return output

## **Step 14: Evaluate Model**

In [13]:
generate_text(generator_transformer, "The")

'<bos> The . <eos>'