In [1]:
from datasets import load_dataset
from transformers import BertTokenizerFast
import torch
from torch.utils.data import DataLoader

# Load only the first 1000 training stories of TinyStories
dataset = load_dataset("roneneldan/TinyStories", split="train[:1000]")

# Initialize the pretrained (uncased) BERT tokenizer
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

# Register '[NL]' as an additional special token on the tokenizer.
# TextDataset later substitutes it for blank lines ("\n\n") in each story.
special_tokens_dict = {'additional_special_tokens': ['[NL]']}
tokenizer.add_special_tokens(special_tokens_dict)

# Look up the ids of the new '[NL]' token and of the padding token
# (the values are stored, not printed)
nl_token_id = tokenizer.convert_tokens_to_ids('[NL]')
pad_token_id = tokenizer.pad_token_id  # Already defined in BERT tokenizer as [PAD]

# Print the dataset length to confirm the split was limited to 1000 entries
print(f"Loaded dataset length: {len(dataset)}")

Downloading readme:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/249M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/248M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/246M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/248M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/9.99M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Loaded dataset length: 1000


In [None]:
# Keep only the stories that fit the model's context window.
# The text is preprocessed exactly as TextDataset.__getitem__ does it
# ("\n\n" -> " [NL] ") before counting, so the count reflects the sequence
# that is actually tokenized for training.
filtered_data = []

for entry in dataset:
    text = entry['text'].replace("\n\n", " [NL] ")
    # Tokenize without truncation to get the full length; plain Python lists
    # are enough for counting, so no tensors are requested.
    token_count = len(tokenizer(text, truncation=False)['input_ids'])
    # Append only if token count is within the 511 token limit
    if token_count <= 511:
        filtered_data.append(entry)

In [None]:
from datasets import load_dataset
from transformers import BertTokenizerFast
import torch
from torch.utils.data import DataLoader

# Custom Dataset class with preprocessing
class TextDataset(torch.utils.data.Dataset):
    """Map-style dataset that turns raw stories into next-token training samples.

    Args:
        data: indexable collection whose items expose a ``'text'`` entry.
        tokenizer: callable tokenizer returning ``'input_ids'`` and
            ``'attention_mask'`` tensors with a leading batch dimension, and
            exposing ``pad_token_id``.
        max_length: length every sample is padded/truncated to.
    """

    def __init__(self, data, tokenizer, max_length=10):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``input_ids``, a combined attention mask and shifted ``target_ids``."""
        text = self.data[idx]['text']

        # Preprocess: replace blank lines with the special [NL] token and tokenize
        text = text.replace("\n\n", " [NL] ")
        encodings = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt"
        )

        # Drop the batch dimension added by return_tensors="pt"
        input_ids = encodings['input_ids'].squeeze(0)
        attention_mask = encodings['attention_mask'].squeeze(0)

        # Lower-triangular (causal) matrix: position i may look at positions <= i
        tri_mask = torch.tril(torch.ones(self.max_length, self.max_length))

        # Broadcasting the 1-D padding mask over the rows zeroes every padded
        # key column, giving a (max_length, max_length) causal + padding mask.
        attention_mask = attention_mask * tri_mask

        # Targets are the inputs shifted left by one; the freed last slot is
        # filled with the tokenizer's own pad id (not a notebook-level global),
        # so the dataset works with whatever tokenizer it was given.
        pad_id = self.tokenizer.pad_token_id
        target_ids = torch.cat((input_ids[1:], input_ids.new_full((1,), pad_id)))

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'target_ids': target_ids
        }

# Wrap the filtered stories in a TextDataset (512-token window) and iterate
# over it in order, two samples per batch
text_dataset = TextDataset(data=filtered_data, tokenizer=tokenizer, max_length=512)
data_loader = DataLoader(dataset=text_dataset, batch_size=2, shuffle=False)

In [None]:
# Display the id the tokenizer assigned to the added '[NL]' token
tokenizer.convert_tokens_to_ids('[NL]')

In [None]:

import torch
from torch import nn
from torch.utils.data import DataLoader
import torch.nn.functional as F

# Positional Encoding Class
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position encodings to a batch of embeddings.

    Args:
        d_model: embedding width (even or odd).
        max_len: number of positions precomputed in the ``pe`` buffer.
    """

    def __init__(self, d_model, max_len=513):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float32).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2, dtype=torch.float32) * (-torch.log(torch.tensor(10000.0)) / d_model))
        angles = position * div_term

        # Even columns take the sine, odd columns the cosine.
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd column than even ones,
        # so the cosine term is trimmed to match instead of raising.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Leading batch axis so the table broadcasts over the batch
        pe = pe.unsqueeze(0)
        # Buffer: follows .to(device) and state_dict, but is not a parameter
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``x`` plus the encodings of its first ``x.size(1)`` positions."""
        x = x + self.pe[:, :x.size(1), :]
        return x

# Single attention head class
class AttentionHead(nn.Module):
    """One scaled dot-product self-attention head.

    Args:
        d_model: width of the incoming features.
        head_size: width of the key/query/value projections.
        dropout: dropout probability applied to the attention weights.
    """

    def __init__(self, d_model, head_size, dropout=0.1):
        super().__init__()
        self.key = nn.Linear(d_model, head_size, bias=False)
        self.query = nn.Linear(d_model, head_size, bias=False)
        self.value = nn.Linear(d_model, head_size, bias=False)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """Attend over ``x`` of shape (batch, time, d_model).

        ``mask`` is optional; entries equal to 0 are blocked. It may be
        (batch, time, time) or a single (time, time) matrix shared by the
        whole batch. Returns a (batch, time, head_size) tensor.
        """
        T = x.size(1)
        k = self.key(x)
        q = self.query(x)

        # Scale the scores by sqrt(d_k)
        d_k = k.size(-1)
        wei = (q @ k.transpose(-2, -1)) / (d_k ** 0.5)

        if mask is not None:
            if mask.dim() == 2:
                # A shared (time, time) mask gets a broadcastable batch axis
                mask = mask.unsqueeze(0)
            mask = mask[:, :T, :T]
            # Use the most negative value representable in the score dtype so
            # the fill also works in reduced precision (a fixed -1e30 does not
            # fit in float16).
            wei = wei.masked_fill(mask == 0, torch.finfo(wei.dtype).min)

        wei = F.softmax(wei, dim=-1)
        wei = self.dropout(wei)

        v = self.value(x)
        out = wei @ v
        return out

# Multi-head self-attention class
class MultiHeadSelfAttention(nn.Module):
    """Runs several attention heads in parallel and projects their concatenation.

    Args:
        d_model: feature width of the input and of the output.
        num_heads: number of heads; must divide ``d_model`` evenly.
        dropout: dropout probability used inside each head and on the output.

    Raises:
        ValueError: if ``num_heads`` is not positive or does not divide
            ``d_model`` (the concatenated heads would not match ``proj``).
    """

    def __init__(self, d_model, num_heads, dropout=0.1):
        super().__init__()
        # Fail at construction time with a clear message rather than with a
        # shape error inside ``proj`` on the first forward pass.
        if num_heads <= 0 or d_model % num_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by a positive num_heads ({num_heads})"
            )
        head_size = d_model // num_heads
        self.heads = nn.ModuleList([AttentionHead(d_model, head_size, dropout) for _ in range(num_heads)])
        self.proj = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """Apply every head to ``x`` (sharing ``mask``), concatenate, project, drop out."""
        out = torch.cat([h(x, mask) for h in self.heads], dim=-1)
        out = self.proj(out)
        out = self.dropout(out)
        return out

# Single decoder block class
class DecoderBlock(nn.Module):
    """Post-norm decoder layer: self-attention followed by a feed-forward net.

    Each sub-layer's output goes through dropout, is added back to its input,
    and the sum is layer-normalized.

    Args:
        d_model: feature width.
        num_heads: number of attention heads.
        dropout: dropout probability for both residual branches.
    """

    def __init__(self, d_model, num_heads, dropout=0.1):
        super().__init__()
        # Attention sub-layer
        self.self_attention = MultiHeadSelfAttention(d_model, num_heads, dropout)
        self.layer_norm1 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)

        # Feed-forward sub-layer: expand to 4x width, ReLU, project back
        hidden_dim = d_model * 4
        ffn_layers = [
            nn.Linear(d_model, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, d_model),
        ]
        self.feed_forward = nn.Sequential(*ffn_layers)
        self.layer_norm2 = nn.LayerNorm(d_model)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """Run both sub-layers on ``x``; ``mask`` is handed to the attention."""
        # Residual branch 1: masked self-attention
        attended = self.dropout1(self.self_attention(x, mask))
        after_attention = self.layer_norm1(x + attended)

        # Residual branch 2: position-wise feed-forward network
        transformed = self.dropout2(self.feed_forward(after_attention))
        return self.layer_norm2(after_attention + transformed)

class DecoderOnlyTransformer(nn.Module):
    """Stack of decoder blocks mapping token ids to per-position vocabulary logits.

    Args:
        vocab_size: number of rows in the embedding table and of output logits.
        d_model: feature width used throughout the stack.
        num_heads: attention heads per decoder block.
        num_layers: number of decoder blocks.
        max_len: number of positions handed to the positional encoding.
        dropout: dropout probability passed to every block.
    """

    def __init__(self, vocab_size, d_model, num_heads=4, num_layers=6, max_len=512, dropout=0.1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, max_len)

        blocks = []
        for _ in range(num_layers):
            blocks.append(DecoderBlock(d_model, num_heads, dropout))
        self.decoder_blocks = nn.ModuleList(blocks)

        self.layer_norm = nn.LayerNorm(d_model)
        # Kept as an attribute; forward() below does not apply it.
        self.dropout = nn.Dropout(dropout)
        self.linear_layer = nn.Linear(d_model, vocab_size)

    def forward(self, x, mask=None):
        """Embed ``x``, add positions, run every block with ``mask``, return logits."""
        hidden = self.positional_encoding(self.embedding(x))

        # The same mask is shared by all decoder blocks
        for decoder_block in self.decoder_blocks:
            hidden = decoder_block(hidden, mask)

        # Final normalization, then projection onto the vocabulary
        return self.linear_layer(self.layer_norm(hidden))


# Instantiate the decoder-only model with multiple layers.
# len(tokenizer) is the full vocabulary size including added tokens such as
# [NL]; summing vocab_size and added_tokens_encoder can over-count on recent
# transformers releases, where that mapping may also list the built-in
# special tokens.
vocab_size = len(tokenizer)
d_model = 24
num_heads = 4
num_layers = 3  # Number of decoder layers
max_len = 512
dropout = 0.1

model = DecoderOnlyTransformer(vocab_size=vocab_size, d_model=d_model, num_heads=num_heads, num_layers=num_layers, max_len=max_len, dropout=dropout)
data_loader = DataLoader(text_dataset, batch_size=2, shuffle=False)

# Start in evaluation mode (train_model switches to training mode itself).
# The trailing semicolon keeps the notebook from echoing the module repr.
model.eval();

In [None]:
import torch



def count_parameters(model):
    """Return the total number of scalar entries over all parameters of ``model``."""
    total = 0
    for param in model.parameters():
        total += param.numel()
    return total



# Count every parameter of the `model` instantiated in an earlier cell

total_params = count_parameters(model)

print(f"Total model parameters: {total_params}")

In [None]:
import torch
from torch.utils.data import DataLoader
import torch.optim as optim
import torch.nn as nn
from tqdm import tqdm
import time

# Define the training function with progress tracking
def train_model(model, dataloader, pad_token_id, epochs=5, learning_rate=1e-4, device='cpu'):
    """Train ``model`` for next-token prediction with Adam and cross-entropy.

    Args:
        model: module called as ``model(input_ids, mask=mask)`` and returning
            per-position logits.
        dataloader: iterable of batches with ``'input_ids'``, ``'target_ids'``
            and ``'attention_mask'`` tensors.
        pad_token_id: target id excluded from the loss.
        epochs: number of passes over ``dataloader``.
        learning_rate: Adam learning rate.
        device: device the model and every batch are moved to.

    Prints the average loss and duration of each epoch. The model is left in
    training mode.
    """
    model.to(device)  # Move model to specified device
    model.train()  # Set model to training mode

    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Positions whose target is the pad token do not contribute to the loss
    criterion = nn.CrossEntropyLoss(ignore_index=pad_token_id)

    for epoch in range(epochs):
        total_loss = 0.0
        num_batches = 0
        start_time = time.time()

        # Progress bar for the epoch
        batch_progress = tqdm(dataloader, desc=f"Epoch {epoch + 1}/{epochs}", unit="batch")

        for batch in batch_progress:
            input_ids = batch['input_ids'].to(device)
            labels = batch['target_ids'].to(device)
            mask = batch['attention_mask'].to(device)

            # Forward pass
            optimizer.zero_grad()
            outputs = model(input_ids, mask=mask)

            # Flatten to (positions, vocab) and (positions,) for CrossEntropyLoss;
            # reshape also copes with non-contiguous tensors.
            logits = outputs.reshape(-1, outputs.size(-1))
            target = labels.reshape(-1)

            loss = criterion(logits, target)
            batch_loss = loss.item()  # read once, reused for the sum and the bar
            total_loss += batch_loss
            num_batches += 1

            # Backward pass
            loss.backward()
            optimizer.step()

            batch_progress.set_postfix(loss=batch_loss)

        # Average over the batches actually seen; an empty dataloader reports
        # NaN instead of raising ZeroDivisionError.
        avg_loss = total_loss / num_batches if num_batches else float('nan')
        epoch_duration = time.time() - start_time

        print(f"Epoch {epoch + 1}/{epochs}, Loss: {avg_loss:.4f}, Time: {epoch_duration:.2f} seconds")

    print("Training complete!")

# Hyperparameters for this run; train on the GPU when one is available
epochs = 1
learning_rate = 1e-4
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Pad token id taken from the tokenizer; train_model excludes it from the loss
pad_token_id = tokenizer.pad_token_id

# Train the model on the batches produced by data_loader
train_model(model, data_loader, pad_token_id=pad_token_id, epochs=epochs, learning_rate=learning_rate, device=device)

In [None]:
import torch
import torch.nn.functional as F
from transformers import BertTokenizerFast

# Separate tokenizer instance; pad_single_token_list below uses it for padding
tokenizer1 = BertTokenizerFast.from_pretrained("bert-base-uncased")

def pad_single_token_list(token_list, max_length=512, device='cpu'):
    """Pad one list of token ids to ``max_length`` via ``tokenizer1.pad``.

    Args:
        token_list: token ids of a single sequence.
        max_length: target length forwarded to the tokenizer's
            ``padding="max_length"`` mode.
        device: device the result is moved to.

    Returns:
        The padded ``input_ids`` tensor after ``squeeze(0)``, on ``device``.
    """
    batch = tokenizer1.pad(
        {"input_ids": token_list},
        padding="max_length",
        max_length=max_length,
        return_tensors="pt",
    )
    # squeeze(0) removes a leading axis of size 1 if there is one
    return batch["input_ids"].squeeze(0).to(device)

# Top-k sampling function
def top_k_sampling(logits, k=3, device='cpu', temperature=1.0):
    """Sample one token id from the ``k`` highest-scoring entries of ``logits``.

    Args:
        logits: 1-D tensor of unnormalized scores, one per token id.
        k: number of top candidates to sample from; values larger than the
            number of logits are clamped to it.
        device: device on which the sampling is carried out.
        temperature: positive divisor applied to the logits before the
            softmax (1.0 leaves them unchanged; smaller is greedier).

    Returns:
        The sampled token id as a Python int.

    Raises:
        ValueError: if ``temperature`` is not positive.
    """
    if temperature <= 0:
        raise ValueError("temperature must be positive")
    logits = logits / temperature

    # torch.topk raises when k exceeds the number of entries, so clamp it
    k = min(k, logits.size(-1))

    # Get the top-k values and their indices, move to device
    top_k_values, top_k_indices = torch.topk(logits, k)
    top_k_values = top_k_values.to(device)
    top_k_indices = top_k_indices.to(device)

    # Renormalize over the k candidates only
    probabilities = F.softmax(top_k_values, dim=-1)

    # multinomial picks a position within the top-k list; map it back to a token id
    selected_index = top_k_indices[torch.multinomial(probabilities, 1)]

    return selected_index.item()


def autoregressive_inference_topk(model, initial_output, max_length=512, k=3, pad_token_id=0, device='cpu'):
    """Generate token ids one at a time with top-k sampling.

    Args:
        model: module called as ``model(input_ids, mask=mask)`` and returning
            logits of shape (1, sequence, vocab).
        initial_output: logits for the start prompt; position 0 seeds the
            first sampled token.
        max_length: length each model input is padded to.
        k: number of candidates considered at every sampling step.
        pad_token_id: accepted for interface compatibility; padding is done by
            ``pad_single_token_list``.
        device: device used for the model and all tensors.

    Returns:
        List of generated token ids (start and end markers excluded).
        Generation stops at the tokenizer's [SEP] id or after 501 steps.
    """
    MAX_STEPS = 501  # same cap as the original `if i > 500: break`

    model.to(device)  # Ensure model is on the device
    model.eval()

    generated_tokens_for_input = [tokenizer.cls_token_id]
    generated_tokens_for_output = []
    current_output = initial_output.to(device)
    causal = None  # lower-triangular matrix, built once the input length is known

    # No gradients are needed for sampling; this avoids building autograd
    # graphs on every step.
    with torch.no_grad():
        for i in range(min(current_output.size(1), MAX_STEPS)):
            # Logits at position i predict the token that follows the i + 1
            # tokens generated so far.
            logits = current_output[0, i]
            current_token = top_k_sampling(logits, k, device=device)

            if current_token == tokenizer.sep_token_id:
                break
            generated_tokens_for_input.append(current_token)
            generated_tokens_for_output.append(current_token)
            current_input = pad_single_token_list(generated_tokens_for_input, max_length=max_length, device=device)

            # Rebuild the mask the model was trained with (see
            # TextDataset.__getitem__): causal, with padded key columns zeroed.
            seq_len = current_input.size(-1)
            if causal is None or causal.size(0) != seq_len:
                causal = torch.tril(torch.ones(seq_len, seq_len, device=device))
            real_tokens = len(generated_tokens_for_input)
            key_mask = (torch.arange(seq_len, device=device) < real_tokens).float()
            mask = (key_mask * causal).unsqueeze(0)

            current_output = model(current_input.unsqueeze(0), mask=mask)
    return generated_tokens_for_output


# Example usage:
# Assuming tokenizer and model are already initialized
# Pick the device first so this cell does not depend on a `device` left over
# from an earlier cell.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
pad_token_id = tokenizer.pad_token_id
initial_input = torch.tensor([[tokenizer.cls_token_id] + [pad_token_id] * 511], device=device)

# Same mask layout as in training (causal, padded keys blocked): only the
# leading [CLS] is a real token, so every position may attend to column 0 only.
initial_mask = torch.zeros(1, 512, 512, device=device)
initial_mask[:, :, 0] = 1.0

# train_model leaves the model in training mode; switch to eval (no dropout)
# and skip gradient tracking for the seed forward pass.
model.to(device)
model.eval()
with torch.no_grad():
    initial_output = model(initial_input, mask=initial_mask)

# Generate text
print("\n")
for i in range(3):
    print("---------------------------------------------------------------------------------------------------------------------")
    print(f"story {i+1} :")
    generated_sequence = autoregressive_inference_topk(model, initial_output=initial_output, max_length=512, k=50, pad_token_id=pad_token_id, device=device)
    generated_text = tokenizer.decode(generated_sequence, skip_special_tokens=False)
    # Turn the [NL] marker back into line breaks, with or without surrounding spaces
    generated_text = generated_text.replace(" [NL] ", "\n").replace("[NL] ", "\n").replace(" [NL]", "\n").replace("[NL]", "\n")
    print(generated_text)