In [1]:
import torch
import torch.nn as nn

In [2]:
class SelfAttention(nn.Module):
    def __init__(self, embed_size, heads):
        super(SelfAttention, self).__init__()
        self.embed_size = embed_size
        self.heads = heads
        self.head_dim = embed_size // heads  # embed_size: 256 and heads: 8 then each head is
                                             # 256 // 8 = 32 dim
        assert (self.head_dim * heads == embed_size), 'Embed size needs to be div by heads'
        
        self.values = nn.Linear(self.head_dim, self.head_dim, bias= False)
        self.keys =  nn.Linear(self.head_dim, self.head_dim, bias= False)
        self.queries = nn.Linear(self.head_dim, self.head_dim, bias= False)
        self.fc_out = nn.Linear(heads*self.head_dim, embed_size)
        
    def forward(self, values, keys, query, mask):
        N = query.shape[0]
        value_len, key_len, query_len = values.shape[1], keys.shape[1], query.shape[1]
        
        # Split embedding into self.head pieces
        values = values.reshape(N, value_len, self.heads, self.head_dim)
        keys = keys.reshape(N, key_len, self.heads, self.head_dim)
        queries = query.reshape(N, query_len, self.heads, self.head_dim)
        
        values = self.values(values)
        keys = self.keys(keys)
        queries = self.queries(queries)
        
        energy = torch.einsum('nqhd,nkhd->nhqk', [queries, keys])
        # queries shape: (N, query_len, heads, head_dim)
        # keys shape: (N, keys_len, heads, head_dim)
        # energy = (N, heads, query_len, key_len)
        
        if mask is not None:
            energy = energy.masked_fill(mask == 0, float('-1e20'))
        
        attention = torch.softmax(energy/(self.embed_size ** (1/2)), dim=3)
        
        out = torch.einsum('nhql,nlhd->nqhd', [attention, values]).reshape(
            N, query_len, self.heads*self.head_dim
        )
        # attention shape: (N, heads, query_len, key_len)
        # values shape: (N, values_len, heads, heads_dim)
        # (N, query_len, heads, head_dim) then flatten last two dimension
        # (n, query_len, embed_size)
        out = self.fc_out(out)
        return out

In [3]:
class TransformerBlock(nn.Module):
    def __init__(self, embed_size, heads, dropout, forward_expansion):
        super(TransformerBlock, self).__init__()
        self.attention = SelfAttention(embed_size, heads)
        self.norm1 = nn.LayerNorm(embed_size)
        self.norm2 = nn.LayerNorm(embed_size)
        
        self.feed_forward = nn.Sequential(
            nn.Linear(embed_size, forward_expansion*embed_size),
            nn.ReLU(),
            nn.Linear(forward_expansion*embed_size, embed_size)
        )
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, value, key, query, mask):
        attention = self.attention(value, key, query, mask)
        x = self.dropout(self.norm1(attention + query))
        forward = self.feed_forward(x)
        out = self.dropout(self.norm2(forward + x))
        return out

In [4]:
class DecoderBlock(nn.Module):
    def __init__(self, embed_size, heads, forward_expansion, dropout, device):
        super(DecoderBlock, self).__init__()
        self.attention = SelfAttention(embed_size, heads)
        self.norm = nn.LayerNorm(embed_size)
        self.transformer_block = TransformerBlock(
            embed_size, heads, dropout, forward_expansion
        )
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x, value, key, trg_mask):
        attention = self.attention(x, x, x, trg_mask)
        query = self.dropout(self.norm(attention + x))
        out = self.transformer_block(value, key, query, trg_mask)
        return out

In [5]:
class Decoder(nn.Module):
    def __init__(self,
                 trg_vocab_size,
                 embed_size,
                 num_layers,
                 heads,
                 forward_expansion,
                 dropout,
                 device,
                 max_lenght
                ):
        super(Decoder, self).__init__()
        self.device = device
        self.word_embedding = nn.Embedding(trg_vocab_size, embed_size)
        self.positional_embedding = nn.Embedding(max_lenght, embed_size)
        self.layers = nn.ModuleList(
            [DecoderBlock(embed_size, heads, forward_expansion, dropout, device)
            for _ in range(num_layers)
            ]
        )
        self.fc_out = nn.Linear(embed_size, trg_vocab_size)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x, trg_mask):
        N, seq_lenght = x.shape
        positions = torch.arange(0, seq_lenght).expand(N, seq_lenght).to(self.device)
        x = self.dropout((self.word_embedding(x) + self.positional_embedding(positions)))
        for layer in self.layers:
            x = layer(x, x, x, trg_mask)
        
        out = self.fc_out(x)
        return out

In [6]:
class GPT(nn.Module):
    def __init__(
        self,
        trg_vocab_size,
        trg_pad_idx,
        embed_size=256,
        num_layers=6,
        forward_expansion=4,
        heads= 8,
        dropout=0,
        device='cuda',
        max_lenght=100
    ):
        super(GPT, self).__init__()
        self.decoder = Decoder(
            trg_vocab_size,
            embed_size,
            num_layers,
            heads,
            forward_expansion,
            dropout,
            device,
            max_lenght   
        )
        self.trg_pad_ix = trg_pad_idx
        self.device = device
        
    def make_src_mask(self, src):
        src_mask = (src != self.src_pad_idx).unsqueeze(1).unsqueeze(2)
        # (N, 1, 1, src_len)
        return src_mask.to(self.device)
    
    def make_trg_mask(self, trg):
        N, trg_len = trg.shape
        trg_mask = torch.tril(torch.ones((trg_len, trg_len))).expand(N,1,trg_len, trg_len)
        return trg_mask.to(self.device)
    
    def forward(self, trg):
        trg_mask = self.make_trg_mask(trg)
        out = self.decoder(trg, trg_mask)
        return out

In [7]:
trg_vocab_size = 10
trg_pad_idx = 0
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GPT(trg_vocab_size, trg_pad_idx, device=device).to(device)

In [8]:
TRG_PAD_IDX = 0
criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)
trg = torch.tensor([[1, 7, 4, 3, 5, 9, 2, 0], [1, 5, 6, 2, 4, 7, 6, 2]]).to(device)
output = model(trg)
print(output.shape)
# output shape (N, seq_len, vocab_Size)
output_dim = output.shape[-1] #
output = output[:, 1:].contiguous().view(-1, output_dim) # shape (N * seq_len)
# trg shape = (N, seq_len)
print(trg.shape)
trg = trg[:, 1:].contiguous().view(-1) # Remove start token from stences so shape 
                                       # (N*seq_len - N)
loss = criterion(output, trg)
print(loss.item())
predicted_tokens = torch.argmax(output, dim=-1)
print(predicted_tokens)

torch.Size([2, 8, 10])
torch.Size([2, 8])
2.3172292709350586
tensor([6, 1, 1, 5, 3, 1, 0, 3, 7, 1, 5, 7, 4, 9], device='cuda:0')


In [9]:
!wget 'https://raw.githubusercontent.com/Apress/applied-natural-language-processing-w-python/master/data_etc/deu.txt'

--2024-04-16 15:03:53--  https://raw.githubusercontent.com/Apress/applied-natural-language-processing-w-python/master/data_etc/deu.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 12316220 (12M) [text/plain]
Saving to: 'deu.txt'


2024-04-16 15:03:53 (146 MB/s) - 'deu.txt' saved [12316220/12316220]



In [10]:
!python -m spacy download en_core_web_sm
!python -m spacy download de_core_news_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m79.7 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
Collecting de-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.7.0/de_core_news_sm-3.7.0-py3-none-any.whl (14.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.6/14.6 MB[0m [31m77.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: de-core-news-sm
Successfully installed de-core-news-sm-3.7.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('de_core_news_sm')


In [11]:
import torch

In [12]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [13]:
import torchdata.datapipes as dp
import torchtext.transforms as T
import spacy
import random
from torchtext.vocab import build_vocab_from_iterator
eng = spacy.load("en_core_web_sm") # Load the English model to tokenize English text
de = spacy.load("de_core_news_sm") # Load the German model to tokenize German text

In [14]:
import random

def split_dataset(input_file, train_file, test_file, train_percent):
    with open(input_file, 'r') as file:
        lines = file.readlines()  

    total_lines = len(lines)
    train_lines = int(total_lines * train_percent)
    test_lines = total_lines - train_lines


    random.shuffle(lines)

    with open(train_file, 'w') as train:
        for line in lines[:train_lines]:
            train.write(line)

    with open(test_file, 'w') as test:
        for line in lines[train_lines:]:
            test.write(line)


input_file = '/kaggle/working/deu.txt'


train_file = 'shiny_train.txt'
test_file = 'shiny_test.txt'

train_percent = 0.8

split_dataset(input_file, train_file, test_file, train_percent)

In [15]:
FILE_PATH = '/kaggle/working/shiny_train.txt'
data_pipe = dp.iter.IterableWrapper([FILE_PATH])
data_pipe = dp.iter.FileOpener(data_pipe, mode='rb')
data_pipe = data_pipe.parse_csv(skip_lines=1, delimiter='\t', as_tuple=True)

In [16]:
for sample in data_pipe:
    print(sample)
    break

("Mary isn't my girlfriend anymore.", 'Ich bin nicht mehr mit Maria zusammen.')


In [17]:
def engTokenize(text):
    """
    Tokenize an English text and return a list of tokens
    """
    return [token.text for token in eng.tokenizer(text)]

def deTokenize(text):
    """
    Tokenize a German text and return a list of tokens
    """
    return [token.text for token in de.tokenizer(text)]

In [18]:
print(engTokenize("Have a good day!!!"))
print(deTokenize("Haben Sie einen guten Tag!!!"))

['Have', 'a', 'good', 'day', '!', '!', '!']
['Haben', 'Sie', 'einen', 'guten', 'Tag', '!', '!', '!']


In [19]:
def getTokens(data_iter, place):
    """
    Function to yield tokens from an iterator. Since, our iterator contains
    tuple of sentences (source and target), `place` parameters defines for which
    index to return the tokens for. `place=0` for source and `place=1` for target
    """
    for english, german in data_iter:
        if place == 0:
            yield engTokenize(english)
        else:
            yield deTokenize(german)

In [20]:
source_vocab = build_vocab_from_iterator(
    getTokens(data_pipe,0),
    min_freq=2,
    specials= ['<pad>', '<sos>', '<eos>', '<unk>'],
    special_first=True
)
source_vocab.set_default_index(source_vocab['<unk>'])

In [21]:
target_vocab = build_vocab_from_iterator(
    getTokens(data_pipe,1),
    min_freq=2,
    specials= ['<pad>', '<sos>', '<eos>', '<unk>'],
    special_first=True
)
target_vocab.set_default_index(target_vocab['<unk>'])

In [22]:
print(source_vocab.get_itos()[:9])

['<pad>', '<sos>', '<eos>', '<unk>', '.', 'I', 'Tom', 'to', 'you']


In [23]:
def getTransform(vocab):
    """
    Create transforms based on given vocabulary. The returned transform is applied to sequence
    of tokens.
    """
    text_tranform = T.Sequential(
        ## converts the sentences to indices based on given vocabulary
        T.VocabTransform(vocab=vocab),
        ## Add <sos> at beginning of each sentence. 1 because the index for <sos> in vocabulary is
        # 1 as seen in previous section
        T.AddToken(1, begin=True),
        ## Add <eos> at beginning of each sentence. 2 because the index for <eos> in vocabulary is
        # 2 as seen in previous section
        T.AddToken(2, begin=False)
    )
    return text_tranform

In [24]:
temp_list = list(data_pipe)
some_sentence = temp_list[798][0]
print("Some sentence= ", end="")
print(some_sentence)
transformed_sentence = getTransform(source_vocab)(engTokenize(some_sentence))
print("Transformed sentence=", end="")
print(transformed_sentence)
index_to_string = source_vocab.get_itos()
for index in transformed_sentence:
    print(index_to_string[index], end=" ")

Some sentence= I want you to help me clean it up.
Transformed sentence=[1, 5, 46, 8, 7, 94, 21, 694, 23, 67, 4, 2]
<sos> I want you to help me clean it up . <eos> 

In [25]:
def applyTransform(sequence_pair):
    """
    Apply transforms to sequence of tokens in a sequence pair
    """

    return (
        getTransform(source_vocab)(engTokenize(sequence_pair[0])),
        getTransform(target_vocab)(deTokenize(sequence_pair[1]))
    )
data_pipe = data_pipe.map(applyTransform) ## Apply the function to each element in the iterator
temp_list = list(data_pipe)
print(temp_list[0])

([1, 28, 13, 12, 41, 707, 314, 4, 2], [1, 7, 54, 9, 78, 29, 39, 298, 4, 2])


In [26]:
def sortBucket(bucket):
    """
    Function to sort a given bucket. Here, we want to sort based on the length of
    source and target sequence.
    """
    return sorted(bucket, key=lambda x: (len(x[0]), len(x[1])))

In [27]:
bucket = [
    (['how', 'are', 'you'], ['wie', 'geht', 'es', 'dir']),
    (['hello', 'world'], ['hallo', 'welt']),
    (['this', 'is', 'a', 'test'], ['das', 'ist', 'ein', 'test'])
]

sorted_data = sortBucket(bucket)

for item in sorted_data:
    print(item)

(['hello', 'world'], ['hallo', 'welt'])
(['how', 'are', 'you'], ['wie', 'geht', 'es', 'dir'])
(['this', 'is', 'a', 'test'], ['das', 'ist', 'ein', 'test'])


In [28]:
data_pipe = data_pipe.bucketbatch(
    batch_size = 128, batch_num=5,  bucket_num=1,
    use_in_batch_shuffle=False, sort_key=sortBucket
)

In [29]:
print(list(data_pipe)[0])

[([1, 6, 1051, 4, 2], [1, 6, 1660, 196, 4, 2]), ([1, 3106, 80, 4, 2], [1, 2396, 439, 63, 4, 2]), ([1, 467, 155, 4, 2], [1, 1217, 12, 161, 4, 2]), ([1, 4579, 9, 4655, 4, 2], [1, 14849, 25, 2]), ([1, 8589, 83, 1284, 4, 2], [1, 6307, 487, 1866, 4, 2]), ([1, 59, 13, 3674, 4, 2], [1, 28, 19, 14008, 4, 2]), ([1, 503, 222, 123, 4, 2], [1, 608, 85, 116, 4, 2]), ([1, 25, 63, 824, 4, 2], [1, 40, 185, 2632, 4, 2]), ([1, 6, 14, 446, 4, 2], [1, 6, 10, 807, 4, 2]), ([1, 25, 53, 466, 4, 2], [1, 40, 98, 1898, 4, 2]), ([1, 6, 13, 3, 4, 2], [1, 6, 6416, 61, 4, 2]), ([1, 26, 652, 4119, 4, 2], [1, 23, 89, 9554, 4, 2]), ([1, 1023, 11, 984, 4, 2], [1, 10545, 35, 552, 25, 2]), ([1, 6, 13, 3, 4, 2], [1, 6, 10, 2732, 4, 2]), ([1, 76, 1585, 1046, 4, 2], [1, 28, 1338, 12052, 4, 2]), ([1, 6, 1986, 28, 4, 2], [1, 6, 1489, 111, 4, 2]), ([1, 6, 13, 81, 4, 2], [1, 6, 10, 4247, 4, 2]), ([1, 107, 3049, 834, 10, 2], [1, 375, 4392, 16310, 8, 2]), ([1, 107, 8, 2711, 10, 2], [1, 270, 13, 4297, 8, 2]), ([1, 59, 13, 402, 4, 

In [30]:
def separateSourceTarget(sequence_pairs):
    """
    input of form: `[(X_1,y_1), (X_2,y_2), (X_3,y_3), (X_4,y_4)]`
    output of form: `((X_1,X_2,X_3,X_4), (y_1,y_2,y_3,y_4))`
    """
    sources,targets = zip(*sequence_pairs)
    return sources,targets

## Apply the function to each element in the iterator
data_pipe = data_pipe.map(separateSourceTarget)
print(list(data_pipe)[0])

(([1, 25, 53, 81, 48, 117, 20, 38, 12, 8, 10, 2], [1, 6, 31, 5, 22, 90, 476, 11, 134, 370, 4, 2], [1, 25, 22, 160, 7, 197, 54, 11, 208, 94, 4, 2], [1, 59, 1859, 64, 40, 57, 1721, 9, 6748, 654, 4, 2], [1, 76, 14, 6, 14, 156, 33, 9, 607, 1460, 4, 2], [1, 42, 437, 21, 9981, 102, 7, 58, 17, 23, 4, 2], [1, 42, 1419, 11, 134, 839, 2985, 60, 16, 2663, 4, 2], [1, 5, 22, 12, 279, 54, 24, 11, 144, 66, 4, 2], [1, 5, 160, 251, 9, 625, 17, 6, 14, 972, 4, 2], [1, 27, 1145, 68, 11, 771, 2525, 36, 9, 2020, 4, 2], [1, 5, 58, 495, 568, 11, 584, 1401, 48, 436, 4, 2], [1, 313, 227, 84, 22, 90, 65, 848, 17, 8, 4, 2], [1, 93, 15, 12, 8, 1630, 184, 11, 208, 584, 10, 2], [1, 27, 1398, 9873, 185, 37, 146, 24, 2024, 195, 4, 2], [1, 39, 4802, 11, 166, 156, 24, 3037, 1158, 550, 4, 2], [1, 77, 460, 13, 23, 78, 60, 7, 9, 403, 10, 2], [1, 5, 43, 12, 47, 18, 6, 91, 170, 117, 4, 2], [1, 5, 43, 12, 57, 20, 103, 641, 48, 114, 4, 2], [1, 79, 8060, 9, 1462, 33, 259, 345, 31, 316, 4, 2], [1, 176, 32, 223, 20, 5, 19, 877, 1

In [31]:
def applyPadding(pair_of_sequences):
    """
    Convert sequences to tensors and apply padding
    """
    return (T.ToTensor(0)(list(pair_of_sequences[0])), T.ToTensor(0)(list(pair_of_sequences[1])))
## `T.ToTensor(0)` returns a transform that converts the sequence to `torch.tensor` and also applies
# padding. Here, `0` is passed to the constructor to specify the index of the `<pad>` token in the
# vocabulary.
data_pipe = data_pipe.map(applyPadding)

In [32]:
print(list(data_pipe)[0])

(tensor([[   1,   25,   87,  ...,    2,    0,    0],
        [   1,    6,   13,  ...,    2,    0,    0],
        [   1,   25,   22,  ...,    2,    0,    0],
        ...,
        [   1,    6,   31,  ..., 2077,    4,    2],
        [   1,   77,   15,  ...,  117,   10,    2],
        [   1,    5,  211,  ..., 1040,    4,    2]]), tensor([[  1,  40, 254,  ...,   0,   0,   0],
        [  1,   6,  10,  ...,   0,   0,   0],
        [  1,  40, 231,  ...,   0,   0,   0],
        ...,
        [  1,   6,  33,  ...,   0,   0,   0],
        [  1,  73, 228,  ...,   0,   0,   0],
        [  1,   7,  81,  ...,   0,   0,   0]]))


In [33]:
source_index_to_string = source_vocab.get_itos()
target_index_to_string = target_vocab.get_itos()

def showSomeTransformedSentences(data_pipe):
    """
    Function to show how the sentences look like after applying all transforms.
    Here we try to print actual words instead of corresponding index
    """
    for sources,targets in data_pipe:
        if sources[0][-1] != 0:
            continue # Just to visualize padding of shorter sentences
        for i in range(4):
            source = ""
            for token in sources[i]:
                source += " " + source_index_to_string[token]
            target = ""
            for token in targets[i]:
                target += " " + target_index_to_string[token]
            print(f"Source: {source}")
            print(f"Traget: {target}")
        break

showSomeTransformedSentences(data_pipe)

Source:  <sos> Tom is n't as active as Mary . <eos> <pad> <pad>
Traget:  <sos> Tom ist nicht so lebhaft wie Mary . <eos> <pad> <pad> <pad> <pad> <pad>
Source:  <sos> Tom hung his jacket on a hook . <eos> <pad> <pad>
Traget:  <sos> Tom hängte seine Jacke an einen Haken . <eos> <pad> <pad> <pad> <pad> <pad>
Source:  <sos> Tom had the whole <unk> to himself . <eos> <pad> <pad>
Traget:  <sos> Tom hatte den ganzen Campingplatz für sich . <eos> <pad> <pad> <pad> <pad> <pad>
Source:  <sos> Do n't forget that we have homework . <eos> <pad> <pad>
Traget:  <sos> Vergiss nicht , dass wir Hausaufgaben haben . <eos> <pad> <pad> <pad> <pad> <pad>


In [34]:
for (data, target) in data_pipe:
    print(data)
    print(data.shape)
    print(target)
    print(target.shape)
    break

tensor([[   1,  706, 3801,  ...,    0,    0,    0],
        [   1,    6, 1412,  ...,    0,    0,    0],
        [   1,    6, 2964,  ...,    0,    0,    0],
        ...,
        [   1,    5,   46,  ...,   54,    4,    2],
        [   1,    5,  169,  ...,  333,    4,    2],
        [   1,    6,   13,  ...,   21,    4,    2]])
torch.Size([128, 8])
tensor([[   1,  634, 6550,  ...,    0,    0,    0],
        [   1,    6,  521,  ...,    0,    0,    0],
        [   1,    6, 1005,  ...,    0,    0,    0],
        ...,
        [   1,    7,   85,  ...,    2,    0,    0],
        [   1,    7, 1606,  ...,    2,    0,    0],
        [   1,    6, 2183,  ...,    2,    0,    0]])
torch.Size([128, 9])


In [35]:
import torch.nn as nn
import random
import torch.optim as optim
from tqdm import tqdm
import torch

In [36]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [37]:
trg_vocab_size = len(source_vocab)
trg_pad_idx = source_vocab.lookup_indices(['<pad>'])[0]
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GPT(trg_vocab_size, trg_pad_idx, device=device).to(device)
criterion = nn.CrossEntropyLoss(ignore_index = trg_pad_idx)
optimizer = optim.Adam(model.parameters())
clip = 1

In [38]:
def calc_len_dp(dp):
    i=0 
    for _ in dp:
        i+=1
    return i

In [39]:
len_dp_train = calc_len_dp(data_pipe)

In [40]:
len_dp_train

1062

In [41]:
import torch
import torch.nn as nn
import torch.optim as optim

def train_gpt(model, iterator, optimizer, criterion, clip, epochs):

    model.train()
    for epoch in range(epochs):
        epoch_loss = 0
        print(f'Epoch {epoch}/{epochs}')
        for idx, (trg, _) in enumerate(iterator):

            # trg shape N, seq_len
            trg = trg[:, 1:]
            N, seq_len = trg.shape
            trg = trg.to(device)

            output = model(trg)

            output_dim = output.shape[-1]
            output = output[:].contiguous().view(-1, output_dim)
            trg = trg.contiguous().view(-1)

            if (idx==0) and (epoch!=0):

                predicted_tokens = torch.argmax(output, dim=-1)
                print('source')
                src_tokens = source_vocab.lookup_tokens(trg.tolist())
                print(' '.join(src_tokens[:src_tokens.index('<eos>')]))
                print('pred')
                pred_tokens = source_vocab.lookup_tokens(predicted_tokens.tolist())
                print(' '.join(pred_tokens[:pred_tokens.index('<eos>')]))

            optimizer.zero_grad()
            loss = criterion(output, trg)
            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=clip)
            optimizer.step()

            epoch_loss += loss.item()
        print('Loss: ', epoch_loss/len_dp_train)

In [42]:
epochs = 5
train_gpt(model, data_pipe, optimizer, criterion, clip, epochs)

Epoch 0/5
Loss:  0.33511102960309935
Epoch 1/5
source
Tom could n't live up to Mary 's expectations .
pred
Tom could n't live up to Mary 's expectations .
Loss:  0.0045517692039030114
Epoch 2/5
source
Come downstairs .
pred
Come downstairs .
Loss:  0.00017305313568405953
Epoch 3/5
source
Tom says he does n't like chocolate ice cream .
pred
Tom says he does n't like chocolate ice cream .
Loss:  7.851041723173173e-05
Epoch 4/5
source
Let 's take a tea break .
pred
Let 's take a tea break .
Loss:  4.459371802594606e-05


In [43]:
from torchmetrics.functional.text import bleu_score
import numpy as np
def bleu(data_pipe, model, device):
    targets = []
    outputs = []
    
    model.eval()
    for idx, (trg, _) in enumerate(data_pipe):
        
        if idx % 10000 == 0:
            print(idx)
            
        with torch.no_grad():
            trg = torch.tensor(trg, device=device).unsqueeze(0)
             # trg shape N, seq_len
            trg = trg[:, 1:]
            N, seq_len = trg.shape
            trg = trg.to(device)
            output = model(trg)
            output_dim = output.shape[-1]
            output = output[:].contiguous().view(-1, output_dim)
            trg = trg.contiguous().view(-1)
            predicted_tokens = torch.argmax(output, dim=-1)
            src_tokens = source_vocab.lookup_tokens(trg.tolist())
            pred_tokens = source_vocab.lookup_tokens(predicted_tokens.tolist())
            
            targets.append(' '.join(src_tokens[:src_tokens.index('<eos>')]))
            outputs.append(' '.join(pred_tokens[:pred_tokens.index('<eos>')]))
            
    return bleu_score(outputs, targets)

In [44]:
FILE_PATH = '/kaggle/working/shiny_test.txt'
test_data_pipe = dp.iter.IterableWrapper([FILE_PATH])
test_data_pipe = dp.iter.FileOpener(test_data_pipe, mode='rb')
test_data_pipe = test_data_pipe.parse_csv(skip_lines=1, delimiter='\t', as_tuple=True)

In [45]:
test_data_pipe = test_data_pipe.map(applyTransform) ## Apply the function to each element in the iterator
temp_list = list(test_data_pipe)
print(temp_list[0])

([1, 79, 211, 12, 446, 8, 4, 2], [1, 12, 83, 56, 92, 76, 4, 2])


In [46]:
score = bleu(test_data_pipe, model, device)

0
10000
20000
30000


In [47]:
score

tensor(1.)