In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import math
import numpy as np

In [2]:
import gc
torch.cuda.empty_cache()
gc.collect()

0

In [3]:
torch.manual_seed(1)

<torch._C.Generator at 0x75789efc9bb0>

### Positional Encoding

In [4]:
import torch
import torch.nn as nn

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        """
        Inputs:
            d_model - Hidden dimensionality of the input.
            max_len - Maximum length of a sequence to expect.
        """
        super(PositionalEncoding, self).__init__()
        self.d_model = d_model

        # Create a long tensor with positions from 0 to max_len - 1
        pos = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)

        # Create a tensor with indices divided by 10000^(2i/d_model)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))

        # Compute the positional encodings
        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(pos * div_term)
        pe[:, 1::2] = torch.cos(pos * div_term)

        # Register the positional encodings as buffer
        self.register_buffer('pe', pe)

    def forward(self, x):
        """
        Apply positional encodings to input tensor x.
        Args:
            x: Input tensor of shape (batch_size, seq_len, d_model).
        Returns:
            Tensor: Output tensor after adding positional encodings.
        """
        pe = self.pe[:x.size(1)]  # Take the relevant positional encodings
        x = x + pe.unsqueeze(0)  # Add the positional encodings to the input tensor
        return x

In [5]:
batch_size = 2
seq_len = 500
d_model = 512
x = torch.randn(batch_size, seq_len, d_model)

max_len = 10000
pos_encoder = PositionalEncoding(d_model, max_len)

x_encoded = pos_encoder(x)

print("Input tensor shape:", x.shape)
print("Encoded tensor shape:", x_encoded.shape)

Input tensor shape: torch.Size([2, 500, 512])
Encoded tensor shape: torch.Size([2, 500, 512])


### Attention Matrix and Values

In [6]:
def scaled_dot_product(q, k, v, mask=None):
    d_k = q.size()[-1]
    attn_logits = torch.matmul(q, k.transpose(-2, -1))
    attn_logits = attn_logits / math.sqrt(d_k)
    if mask is not None:
        attn_logits = attn_logits.masked_fill(mask, float('-inf'))
    attention = F.softmax(attn_logits, dim=-1)
    values = torch.matmul(attention, v)
    return values, attention

In [7]:
seq_len, d_k = 10, 5
q = torch.randn(50, 25, seq_len, d_k)
k = torch.randn(50, 25, seq_len, d_k)
v = torch.randn(50, 25, seq_len, d_k)
values, attention = scaled_dot_product(q, k, v)
print("Q\n", q.shape)
print("K\n", k.shape)
print("V\n", v.shape)
print("Values\n", values.shape)
print("Attention\n", attention.shape)

Q
 torch.Size([50, 25, 10, 5])
K
 torch.Size([50, 25, 10, 5])
V
 torch.Size([50, 25, 10, 5])
Values
 torch.Size([50, 25, 10, 5])
Attention
 torch.Size([50, 25, 10, 10])


### MultiHead Attention

In [8]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, head_dim, num_heads):
        super().__init__()
        self.head_dim = head_dim
        self.num_heads = num_heads
        self.d_model = d_model

        # Get Query, Key and Value matrices for each of the attention heads
        self.Qs = nn.Parameter(torch.randn(d_model, num_heads * head_dim))
        self.Ks = nn.Parameter(torch.randn(d_model, num_heads * head_dim))
        self.Vs = nn.Parameter(torch.randn(d_model, num_heads * head_dim))

        # Project the values back to d_model
        self.o_proj = nn.Parameter(torch.randn(head_dim, d_model))

    def forward(self, x):
        batch_size, seq_length, _ = x.size()

        # print("Qs shape:", self.Qs.shape)
        # print("Ks shape:", self.Ks.shape)
        # print("Vs shape:", self.Vs.shape)

        # Get the Qs, Ks and Vs for each of the attention heads
        qs = torch.matmul(x, self.Qs).view(batch_size, seq_length, self.num_heads, self.head_dim).permute(0, 2, 1, 3)
        ks = torch.matmul(x, self.Ks).view(batch_size, seq_length, self.num_heads, self.head_dim).permute(0, 2, 1, 3)
        vs = torch.matmul(x, self.Vs).view(batch_size, seq_length, self.num_heads, self.head_dim).permute(0, 2, 1, 3)

        # Mask
        mask = torch.triu(torch.ones(seq_length, seq_length, device=x.device), diagonal=1).bool()
        
        # print("Query tensor shape:", qs.shape)
        # print("Key tensor shape:", ks.shape)
        # print("Value tensor shape:", vs.shape)

        # Get the Attention Matrices for each of the attention heads
        values, attentions = scaled_dot_product(qs, ks, vs, mask)

        # print("Attentions:", attentions.shape)
        # print("Values:", values.shape)


        # Get the outputs by projecting back to the output dim
        outputs = torch.matmul(values, self.o_proj).permute(0, 2, 1, 3)
        outputs = torch.sum(outputs, dim=2)

        return outputs

In [9]:
# Create a sample input tensor
batch_size = 16
seq_len = 1000
d_model = 1024
embed_dim = 64
num_heads = 4
x = torch.randn(batch_size, seq_len, d_model)

# Create the MultiheadAttention module
attn = MultiHeadAttention(d_model, embed_dim, num_heads)

# Apply the multi-head attention to the input tensor
output = attn(x)

print("Input tensor shape:", x.shape)
print("Output tensor shape:", output.shape)

Input tensor shape: torch.Size([16, 1000, 1024])
Output tensor shape: torch.Size([16, 1000, 1024])


### N Layer Transformer

In [10]:
from itertools import chain

class TransformerNLayer(nn.Module):

    def __init__(self, num_layers, d_model, embed_dim, num_heads, vocab_size):
        super().__init__()
        self.num_layers = num_layers
        self.mha_layers = nn.ModuleList([MultiHeadAttention(d_model, embed_dim, num_heads) for _ in range(num_layers)])
        self.positional_encoder = PositionalEncoding(d_model, 10000)
        self.embed = nn.Parameter(torch.randn(vocab_size, d_model))
        self.unembed = nn.Parameter(torch.randn(d_model, vocab_size))

    def forward(self, x):
        embeddings = torch.matmul(x, self.embed)
        stream = self.positional_encoder(embeddings)
        
        for layer in self.mha_layers:
            outputs = layer(stream.clone())  # Apply each MHA layer
            stream += outputs  # Add residual connection
            
        logits = torch.matmul(stream, self.unembed)
        probabilities = F.softmax(logits, dim=-1)

        return probabilities

    def parameters(self, recurse=True):
        # Override parameters method to include parameters of mha_layers, embed, and unembed
        params = super().parameters()
        for layer in self.mha_layers:
            params = chain(params, layer.parameters(recurse=recurse))
        params = chain(params, [self.embed, self.unembed])
        return params

In [11]:
d_model = 512
embed_dim = 64
num_heads = 4
vocab_size = 500
seq_length = 32
batch_size = 16

transformer_layer = TransformerNLayer(3, d_model, embed_dim, num_heads, vocab_size)

x = torch.randn(batch_size, seq_length, vocab_size)

output_logits = transformer_layer(x)
print("Input logits shape:", x.shape)
print("Output logits shape:", output_logits.shape)

Input logits shape: torch.Size([16, 32, 500])
Output logits shape: torch.Size([16, 32, 500])


### Dataset

In [12]:
import requests

# List of URLs
# urls = [
#     "https://raw.githubusercontent.com/amephraim/nlp/master/texts/J.%20K.%20Rowling%20-%20Harry%20Potter%201%20-%20Sorcerer's%20Stone.txt",
#     "https://raw.githubusercontent.com/amephraim/nlp/master/texts/J.%20K.%20Rowling%20-%20Harry%20Potter%202%20-%20The%20Chamber%20Of%20Secrets.txt",
#     "https://raw.githubusercontent.com/amephraim/nlp/master/texts/J.%20K.%20Rowling%20-%20Harry%20Potter%203%20-%20Prisoner%20of%20Azkaban.txt",
#     "https://raw.githubusercontent.com/amephraim/nlp/master/texts/J.%20K.%20Rowling%20-%20Harry%20Potter%204%20-%20The%20Goblet%20of%20Fire.txt"
# ]

urls = [
    "https://raw.githubusercontent.com/amephraim/nlp/master/texts/J.%20K.%20Rowling%20-%20Harry%20Potter%201%20-%20Sorcerer's%20Stone.txt"
]

# Initialize an empty string to store the concatenated content
concatenated_content = ""

# Loop through each URL
for url in urls:
    # Fetch the content from the URL
    response = requests.get(url)

    # Check if the request was successful
    if response.status_code == 200:
        # Append the content to the concatenated string
        concatenated_content += response.text
    else:
        # Print an error message if the request failed
        print(f"Failed to fetch content from URL: {url}")

# Now the variable concatenated_content contains the concatenated content of all URLs as a single string
HARRY_POTTER = concatenated_content

### Tokenizer

In [13]:
!pip install pytorch-nlp nltk

Defaulting to user installation because normal site-packages is not writeable


In [14]:
from torchnlp.encoders.text import TreebankEncoder
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/mpradyumna/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [15]:
data = HARRY_POTTER.lower().split("\n")
data = [w.split() for w in data]
data = [w for w in data if w]
new_data = []
for item in data:
    for it in item:
        new_data.append(it)
# data = [w for w in data if not w.lower() in stop_words]
data = new_data

In [16]:
class CharacterTokenizer:
    def __init__(self):
        self.char_to_index = {}
        self.index_to_char = {}

    def fit(self, words):
        chars = set(''.join(words))
        index = 1  # Start index from 1, as 0 will always represent space
        for char in chars:
            if char != ' ':  # Skip space
                self.char_to_index[char] = index
                self.index_to_char[index] = char
                index += 1

    def encode(self, words):
        encoded_tokens = []
        for word in words:
            encoded_word = [self.char_to_index.get(char, 0) for char in word]
            encoded_tokens.extend(encoded_word + [0])  # Concatenate with 0
        # Remove the trailing 0
        if encoded_tokens and encoded_tokens[-1] == 0:
            encoded_tokens.pop()
        return encoded_tokens

    def decode(self, encoded_tokens):
        decoded_words = []
        current_word = ''
        for token in encoded_tokens:
            if token == 0:
                decoded_words.append(current_word)
                current_word = ''
            else:
                current_word += self.index_to_char.get(token, '')
        if current_word:
            decoded_words.append(current_word)
        return decoded_words

# Example usage:
words = ["hello", "world", "python"]
tokenizer = CharacterTokenizer()
tokenizer.fit(words)

encoded_tokens = tokenizer.encode(words)
print("Encoded:", encoded_tokens)

decoded_words = tokenizer.decode(encoded_tokens)
print("Decoded:", decoded_words)

Encoded: [7, 5, 4, 4, 8, 0, 10, 8, 3, 4, 6, 0, 1, 2, 9, 7, 8, 11]
Decoded: ['hello', 'world', 'python']


In [17]:
encoder = CharacterTokenizer()
encoder.fit(data)

### Train the Transformer

In [18]:
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset

In [19]:
import torch.optim as optim
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.autograd.set_detect_anomaly(True)
device

device(type='cuda')

In [20]:
# encoder = TreebankEncoder(data)
tokens = encoder.encode(HARRY_POTTER)
tokens = torch.Tensor(tokens).to(torch.int64)
tokens = F.one_hot(tokens).float()
tokens.shape

torch.Size([879483, 51])

In [21]:
d_model = 512
embed_dim = 64
num_heads = 12
vocab_size = tokens.shape[1]
seq_length = 300
batch_size = 80

In [22]:
# Group tokens into batches
num_batches = len(tokens) // (batch_size * seq_length)
tokens = tokens[:num_batches * batch_size * seq_length]
tokens = tokens.view(batch_size, -1, vocab_size)

In [23]:
transformer_layer = TransformerNLayer(2, d_model, embed_dim, num_heads, vocab_size).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(transformer_layer.parameters(), lr=0.001)

  return torch._dynamo.disable(fn, recursive)(*args, **kwargs)


In [24]:
transformer_layer.parameters

<bound method TransformerNLayer.parameters of TransformerNLayer(
  (mha_layers): ModuleList(
    (0-1): 2 x MultiHeadAttention()
  )
  (positional_encoder): PositionalEncoding()
)>

In [42]:
foo = None
tmp = None

transformer_layer.to(device)
for epoch in range(2):
    total_loss = 0.0
    for i in range(tokens.size(1) // seq_length):
        optimizer.zero_grad()

        start_idx = i * seq_length
        end_idx = (i + 1) * seq_length
        x = tokens[:, start_idx:end_idx, :].to(device)
        output_logits = transformer_layer(x)[:, :-1]

        foo = x[:, 1:]
        tmp = output_logits
        
        target = x[:, 1:]
        # Compute the loss
        loss = criterion(output_logits, target)
        total_loss += loss.item()

        # Backpropagation
        loss.backward()
        optimizer.step()
        
    # Print average loss for the epoch
    print(f"Epoch [{epoch + 1}/{100}], Average Loss: {total_loss / (tokens.size(1) // seq_length):.4f}")


Epoch [1/100], Average Loss: 33.4205
Epoch [2/100], Average Loss: 33.4205


In [59]:
' '.join(encoder.decode(torch.argmax(foo, dim=2)[0].tolist()))

' r i g h t ,   t h a t \' s   w h a t       h e a r d   y e s ,   t h e i r   s o n ,     a r r y "       r .     u r s l e y   s t o p p e d   d e a d .     e a r   f l o o d e d   h i m .     e   l o o k e d   b a c k   a t   t h e   w h i s p e r e r s   a s   i f   h e   w a n t e d   t o   s a'

In [60]:
' '.join(encoder.decode(torch.argmax(tmp, dim=2)[0].tolist()))

'66666666666666666666666666666666666666666666666666666666666666666666666666666666666666666666666666666666666666666666666666666666666666666666666666666666666666666666666666666666666666666666666666666666666666666666666666666666666666666666666666666666666666666666666666666666666666666666666666666666666'

In [27]:
transformer_layer.to('cpu')
idx = 10
input_tokens = tokens[0][:idx].view(1, idx, tokens[0][:idx].shape[1]).to('cpu')
context = input_tokens
predicted_tokens = []

for i in range(100):
    print(i)
    predicted_token = transformer_layer(context)[:, -1, :]
    predicted_tokens.append(predicted_token)
    idx += 1
    context = tokens[0][:idx].view(1, idx, tokens[0][:idx].shape[1]).to('cpu')

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99


In [28]:
predicted_tokens = torch.stack(predicted_tokens)

In [29]:
predicted_token_ids = torch.argmax(predicted_tokens, dim=2)
predicted_token_ids.shape

torch.Size([100, 1])

In [30]:
context.shape

torch.Size([1, 110, 51])

In [31]:
predicted = encoder.decode(predicted_token_ids.view(-1).tolist())
predicted

['6666666666666666666666666666666666666666666666666666666666666666666666666666666666666666666666666666']

In [32]:
actual = encoder.decode(torch.argmax(context, dim=2)[0].tolist())

In [33]:
import re

In [34]:
' '.join(predicted)

'6666666666666666666666666666666666666666666666666666666666666666666666666666666666666666666666666666'

In [35]:
' '.join(actual)

"  a r r y     o t t e r   a n d   t h e     o r c e r e r ' s     t o n e                                    "

In [36]:
len(predicted)

1

### Visualizing Attention

In [37]:
input_tokens = tokens[0][:50].view(1, 50, tokens[0][:50].shape[1]).to('cpu')
input_tokens.shape

torch.Size([1, 50, 51])

In [38]:
embeddings = torch.matmul(input_tokens, transformer_layer.embed)
outputs = transformer_layer.positional_encoder(embeddings)
outputs = transformer_layer.mha(embeddings)
outputs.shape

AttributeError: 'TransformerNLayer' object has no attribute 'mha'

In [None]:
batch_size, seq_length, _ = embeddings.size()
print("Qs shape:", transformer_layer.mha.Qs.shape)
print("Ks shape:", transformer_layer.mha.Ks.shape)
print("Vs shape:", transformer_layer.mha.Vs.shape)

# Get the Qs, Ks and Vs for each of the attention heads
qs = torch.matmul(embeddings, transformer_layer.mha.Qs).view(batch_size, seq_length, transformer_layer.mha.num_heads, transformer_layer.mha.head_dim).permute(0, 2, 1, 3)
ks = torch.matmul(embeddings, transformer_layer.mha.Ks).view(batch_size, seq_length, transformer_layer.mha.num_heads, transformer_layer.mha.head_dim).permute(0, 2, 1, 3)
vs = torch.matmul(embeddings, transformer_layer.mha.Vs).view(batch_size, seq_length, transformer_layer.mha.num_heads, transformer_layer.mha.head_dim).permute(0, 2, 1, 3)

print("Query tensor shape:", qs.shape)
print("Key tensor shape:", ks.shape)
print("Value tensor shape:", vs.shape)

# Get the Attention Matrices for each of the attention heads
values, attentions = scaled_dot_product(qs, ks, vs)
print("Values:", values.shape)
print("Attentions:", attentions.shape)

In [None]:
!pip install seaborn

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
input_tokens.shape

In [None]:
torch.argmax(input_tokens.view(input_tokens.shape[1], input_tokens.shape[2]), dim=1)

In [None]:
attentions = attentions.squeeze(0).cpu().detach().numpy()  # Remove batch dimension and move to CPU
decoded_tokens = [encoder.decode([i.tolist()]) for i in torch.argmax(input_tokens.view(-1, input_tokens.shape[-1]), dim=1)]

# Plot each attention matrix
for i in range(attentions.shape[0]):
    plt.figure(figsize=(10, 10))
    sns.heatmap(attentions[i], cmap="YlGnBu", xticklabels=decoded_tokens, yticklabels=decoded_tokens)
    plt.title(f"Attention Matrix {i+1}")
    plt.xlabel("Source tokens")
    plt.ylabel("Target tokens")
    plt.show()

In [None]:
data