In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
import math
from datasets import load_dataset
from torchinfo import summary

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def generate_causal_mask(seq_len):
    # Create a matrix with ones in the lower triangle, zeros above
    mask = torch.tril(torch.ones(seq_len, seq_len))
    return mask  # shape (seq_len, seq_len)

In [3]:
class ScaledDotProductAttention(nn.Module):
    def __init__(self, d_k):
        super().__init__()
        self.scale = d_k ** 0.5

    def forward(self, Q, K, V, mask=False):
        # Q, K, V: (batch_size, num_heads, seq_len, d_k)
        scores = torch.matmul(Q, K.transpose(-2, -1)) / self.scale  # (B, H, L, L)

        if mask:
            mask = generate_causal_mask(scores.size(-1)).expand(scores.size(0), scores.size(1), -1, -1)
            scores = scores.masked_fill(mask == 0, float('-inf'))

        attention_weights = F.softmax(scores, dim=-1)  # (B, H, L, L)
        output = torch.matmul(attention_weights, V)    # (B, H, L, d_k)
        return output

In [4]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_embedding, num_heads):
        super().__init__()
        assert d_embedding % num_heads == 0
        self.d_k = d_embedding // num_heads
        self.num_heads = num_heads

        #in reality these can map to a lower dimensional space to make things faster``
        self.W_q = nn.Linear(d_embedding, d_embedding)
        self.W_k = nn.Linear(d_embedding, d_embedding)
        self.W_v = nn.Linear(d_embedding, d_embedding)
        self.W_o = nn.Linear(d_embedding, d_embedding)

        self.attention = ScaledDotProductAttention(self.d_k)

        self.norm = nn.LayerNorm(d_embedding)

    def forward(self, x, mask=False):
        x_input = x
        x = self.norm(x)

        B, L, d_embedding = x.size()  # Batch, Sequence Length, Embedding Dim
        H = self.num_heads

        # Linear projections
        Q = self.W_q(x).view(B, H, L, self.d_k)  # (B, H, L, d_k)
        K = self.W_k(x).view(B, H, L, self.d_k)
        V = self.W_v(x).view(B, H, L, self.d_k)

        # Apply attention
        context = self.attention(Q, K, V, mask)  # (B, H, L, d_k)

        # Concatenate heads
        context = context.transpose(1, 2).contiguous().view(B, L, d_embedding)  # (B, L, d_embedding)

        # Final linear projection
        output = self.W_o(context)  # (B, L, d_embedding)

        # Add (& pre-Norm)
        #my preference is to do pre-norm for better stabiliy, even though the original paper used post-norm
        output = x_input + output
        return output


In [5]:
class MultiHeadCrossAttention(nn.Module):
    def __init__(self, d_embedding, num_heads):
        super().__init__()
        assert d_embedding % num_heads == 0
        self.d_k = d_embedding // num_heads
        self.num_heads = num_heads

        self.W_q = nn.Linear(d_embedding, d_embedding)
        self.W_k = nn.Linear(d_embedding, d_embedding)
        self.W_v = nn.Linear(d_embedding, d_embedding)
        self.W_o = nn.Linear(d_embedding, d_embedding)

        self.attention = ScaledDotProductAttention(self.d_k)

        self.norm = nn.LayerNorm(d_embedding)

    def forward(self, x_decoder, x_encoder, mask=False):
        assert x_decoder.size() == x_encoder.size() #x and x_encoder must have the same size
        x_input = x_decoder
        x_decoder = self.norm(x_decoder)
        
        B, L, d_embedding = x_decoder.size()  # Batch, Sequence Length, Embedding Dim
        H = self.num_heads

        # Linear projections
        Q = self.W_q(x_encoder).view(B, H, L, self.d_k)  # (B, H, L, d_k)
        K = self.W_k(x_encoder).view(B, H, L, self.d_k)
        V = self.W_v(x_decoder).view(B, H, L, self.d_k)

        # Apply attention
        context = self.attention(Q, K, V, mask)  # (B, H, L, d_k)

        # Concatenate heads
        context = context.transpose(1, 2).contiguous().view(B, L, d_embedding)  # (B, L, d_embedding)

        # Final linear projection
        output = self.W_o(context)  # (B, L, d_embedding)

        output = output + x_input
        return output


In [6]:
class FeedForwardNetwork(nn.Module):
    def __init__(self, d_embedding, d_ff):
        super().__init__()
        self.linear1 = nn.Linear(d_embedding, d_ff)
        self.linear2 = nn.Linear(d_ff, d_embedding)
        self.activation = nn.ReLU()
        self.norm = nn.LayerNorm(d_embedding)
    
    def forward(self, x):
        x_input = x
        x = self.norm(x)
        x = self.linear1(x)
        x = self.activation(x)
        x = self.linear2(x)
        x = x_input + x
        return x

In [7]:
class Encoder(nn.Module):
    def __init__(self, d_embedding, num_heads, d_ff, num_layers):
        super().__init__()
        self.layers = nn.ModuleList([
            MultiHeadAttention(d_embedding, num_heads),
            FeedForwardNetwork(d_embedding, d_ff)
        ] * num_layers)
    
    def forward(self, x):
        for layer in self.layers:
            x = layer(x)
        return x

In [8]:
class Decoder(nn.Module):
    def __init__(self, encoder, d_embedding, num_heads, d_ff, num_layers, vocab_size):
        super().__init__()
        self.encoder = encoder
        self.norm = nn.LayerNorm(d_embedding)
        self.layers = nn.ModuleList([
            MultiHeadAttention(d_embedding, num_heads),
            MultiHeadCrossAttention(d_embedding, num_heads),
            FeedForwardNetwork(d_embedding, d_ff)
        ] * num_layers)
        self.linear = nn.Linear(d_embedding, vocab_size)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x_decoder, x_encoder, mask=False):
        for i, layer in enumerate(self.layers):
            if i % 3 == 0:
                x_decoder = layer(x_decoder, mask)
            if i % 3 == 1:
                x_decoder = layer(x_decoder, x_encoder, mask)
            else:
                x_decoder = layer(x_decoder)
        x_decoder = self.linear(x_decoder)
        x_decoder = self.softmax(x_decoder)
        return x_decoder

In [9]:
class Embedding(nn.Module):
    # chat gpt init. check if it makes sense for myself
    # def __init__(self, d_model, max_len=5000):
    #     super().__init__()

    #     # Create a matrix of shape (max_len, d_model)
    #     pe = torch.zeros(max_len, d_model)
    #     position = torch.arange(0, max_len).unsqueeze(1)  # (max_len, 1)
    #     div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))

    #     # Apply the sine to even indices in the array; 2i
    #     pe[:, 0::2] = torch.sin(position * div_term)
    #     # Apply the cosine to odd indices in the array; 2i+1
    #     pe[:, 1::2] = torch.cos(position * div_term)

    #     # Register as buffer so it's not a parameter but moves with `.to(device)`
    #     self.register_buffer('pe', pe)
    def __init__(self, vocab_size, d_embedding, max_len): 
        #max_len is the maximum length of the input sequence
        super().__init__()
        self.embedding = torch.randn(vocab_size, d_embedding)

        pe = torch.zeros(max_len, d_embedding)
        powers = torch.repeat_interleave(torch.arange(0, 1, 2/d_embedding), repeats=2).expand(max_len, -1)
        divisors = torch.pow(10000, powers)
        positions = torch.arange(0, max_len).view(max_len, -1).expand(-1, d_embedding)
        args = positions / divisors
        pe[:, 0::2] = torch.sin(args[:, 0::2])
        pe[:, 1::2] = torch.cos(args[:, 1::2])

        # Register as buffer so it's not a parameter but moves with `.to(device)`
        self.register_buffer('pe', pe)

    def forward(self, x):
        """
        Args:
            x: Tensor of shape (batch_size, seq_len)
        Returns:
            Tensor of shape (batch_size, seq_len, d_embedding)
        """
        seq_len = x.size(1)
        # Add positional encoding: broadcast over batch dimension
        x = self.embedding[x] + self.pe[:seq_len]
        return x

In [135]:
vocab_size = 100
max_len = 32

In [11]:
# Dummy input
batch_size = 2
seq_len = 5
d_embedding = 64
num_heads = 8

x_test = torch.randn(batch_size, seq_len, d_embedding)

In [12]:

# Apply attention
mha = MultiHeadAttention(d_embedding, num_heads)
output = mha(x_test, mask=True)


In [13]:
x_test.shape

torch.Size([2, 5, 64])

In [14]:
mask = torch.ones((batch_size, seq_len, d_embedding))

In [15]:
print(output.shape)  # (2, 5, 64)


torch.Size([2, 5, 64])


In [16]:
ffn = FeedForwardNetwork(d_embedding, 256)
output_2 = ffn(x_test)
print(output_2.shape)

torch.Size([2, 5, 64])


In [17]:
d_ff = 256
num_layers = 6
encoder = Encoder(d_embedding, num_heads, d_ff, num_layers)
output_3 = encoder(x_test)
print(output_3.shape)

torch.Size([2, 5, 64])


In [18]:
embedding = Embedding(vocab_size=vocab_size, d_embedding=d_embedding, max_len=max_len)

In [19]:
entry = torch.tensor([[1,3,4], [2,3,5]])
print(entry.shape)
print(embedding(entry).shape)

torch.Size([2, 3])
torch.Size([2, 3, 64])


In [20]:
class Transformer(nn.Module):
    def __init__(self, vocab_size, d_embedding, num_heads, d_ff, num_layers, max_len):
        super().__init__()
        self.embedding = Embedding(vocab_size, d_embedding, max_len)
        self.encoder = Encoder(d_embedding, num_heads, d_ff, num_layers)
        self.decoder = Decoder(self.encoder, d_embedding, num_heads, d_ff, num_layers, vocab_size)

    def forward(self, x_encoder, x_decoder, mask=False):
        x_encoder = self.embedding(x_encoder)
        x_decoder = self.embedding(x_decoder)
        x_encoder = self.encoder(x_encoder)
        output = self.decoder(x_decoder, x_encoder, mask)
        return output

In [136]:
transformer = Transformer(
    vocab_size=vocab_size,
    d_embedding=d_embedding,
    num_heads=num_heads,
    d_ff=d_ff,
    num_layers=num_layers,
    max_len=max_len)

In [137]:
x = torch.randint(0,10,(batch_size, seq_len, d_embedding))
x = torch.randint(0,10,(batch_size, seq_len))

In [138]:
x.shape

torch.Size([2, 5])

In [123]:
transformer(x, x).shape

torch.Size([2, 5, 100])

In [124]:
transformer(x, x, True).shape

torch.Size([2, 5, 100])

In [126]:
x.shape

torch.Size([2, 5])

In [134]:
# data = load_dataset('opus_books', 'en-sk')

In [180]:
df = pd.read_csv('en_sk_sentence_pairs.csv', sep=';')

In [181]:
df

Unnamed: 0,id_en,sentence_en,id_sk,sentence_sk
0,1283,"The password is ""Muiriel"".",2428549,Heslo je „Muiriel“.
1,1317,I never liked biology.,1058173,Nikdy som nemal rád biológiu.
2,1434,I love you.,735094,Ľúbim ťa.
3,1564,Thank you very much!,2428453,Ďakujem vám veľmi pekne!
4,1646,My name is Jack.,2428509,Volám sa Jack.
...,...,...,...,...
26485,13176417,Butter is a dairy product.,13173502,Maslo je mliečny produkt.
26486,13176417,Butter is a dairy product.,13173504,Maslo je mliečny výrobok.
26487,2740458,How were the pyramids built?,13177542,Ako boli postavené pyramídy?
26488,12432209,I yawn.,13180818,Zívnem.


In [190]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # small, open tokenizer
tokenizer.pad_token = tokenizer.eos_token
# Or for BERT-style:
# tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [204]:
tokenized = [{
    'en': tokenizer(df.iloc[i]['sentence_en'], return_tensors='pt', padding='max_length', max_length=32, truncation=True)['input_ids'],
    'sk': tokenizer(df.iloc[i]['sentence_sk'], return_tensors='pt', padding='max_length', max_length=32, truncation=True)['input_ids']
} for i in range(df.shape[0])]

In [206]:
from torch.utils.data import Dataset, DataLoader
class TranslationDataset(Dataset):
    def __init__(self, examples, tokenizer, max_length=32):
        self.tokenizer = tokenizer
        self.examples = examples
        self.max_length = max_length

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        example = self.examples[idx]
        inputs = self.tokenizer(example["input"], return_tensors="pt", padding="max_length", max_length=self.max_length, truncation=True)
        targets = self.tokenizer(example["target"], return_tensors="pt", padding="max_length", max_length=self.max_length, truncation=True)
        return (inputs['input_ids'], targets['input_ids'])
        return {
            "input_ids": inputs["input_ids"].squeeze(0),
            "attention_mask": inputs["attention_mask"].squeeze(0),
            "labels": targets["input_ids"].squeeze(0),
        }

class TranslationDataset(Dataset):
    def __init__(self, tokenized):
        self.tokenized = tokenized

    def __len__(self):
        return len(self.tokenized)

    def __getitem__(self, idx):
        return (self.tokenized[idx]['en'], self.tokenized[idx]['sk'])

# tokenizer = AutoTokenizer.from_pretrained("gpt2")
# tokenizer.pad_token = tokenizer.eos_token
# dataset = TranslationDataset(examples, tokenizer)
dataset = TranslationDataset(tokenized)
dataloader = DataLoader(dataset, batch_size=300, shuffle=True)



In [207]:
cel = torch.nn.CrossEntropyLoss()

In [208]:
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(transformer.parameters(), lr=0.01)

In [209]:
from torch.nn.utils.rnn import pad_sequence #CHECK WHAT THIS MEANS

for batch_data, batch_labels in dataloader:
    # print(batch_data.shape, batch_labels.shape)
    batch_data = batch_data.squeeze(1)  # Remove the extra dimension
    batch_labels = batch_labels.squeeze(1)  # Remove the extra dimension
    batch_data = batch_data % 100
    batch_labels = batch_labels % 100

    output = transformer(batch_data, batch_data, mask=True)
    loss = cel(output.permute(0, 2, 1), batch_labels)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    print(f"Batch Loss: {loss.item()}")


Batch Loss: 4.033769607543945
Batch Loss: 4.03731107711792
Batch Loss: 4.044394493103027
Batch Loss: 4.017727851867676
Batch Loss: 4.02908182144165
Batch Loss: 4.015956878662109
Batch Loss: 4.0098114013671875
Batch Loss: 4.020748615264893
Batch Loss: 4.022207260131836
Batch Loss: 4.019186019897461
Batch Loss: 4.016165256500244
Batch Loss: 4.026061058044434
Batch Loss: 4.019290447235107
Batch Loss: 4.035956859588623
Batch Loss: 4.023561000823975
Batch Loss: 4.0126237869262695
Batch Loss: 4.006686210632324
Batch Loss: 4.005540370941162
Batch Loss: 4.038978099822998
Batch Loss: 4.029811382293701
Batch Loss: 4.023561000823975
Batch Loss: 4.018248558044434
Batch Loss: 4.035019397735596
Batch Loss: 4.016582012176514
Batch Loss: 4.019290447235107
Batch Loss: 4.031998634338379
Batch Loss: 4.016061305999756
Batch Loss: 4.025019645690918
Batch Loss: 4.034707069396973
Batch Loss: 4.032207012176514
Batch Loss: 4.026269435882568
Batch Loss: 4.012832164764404
Batch Loss: 4.005748748779297
Batch Loss

project idea \
it can predict sequence of numbers, in words. \
two four six eight ten - twelve \
three six nine twelve - fifteen \
could be arithmetic and geometric. I will generate them, code up the number to string mapper, pass it mapped to strings