In [1]:
pip install torch transformers datasets


Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp312-cp312-win_amd64.whl.metadata (13 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py312-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.2-py3-none-any.whl (485 kB)
Downloading multiprocess-0.70.16-py312-none-any.whl (146 kB)
Downloading xxhash-3.5.0-cp312-cp312-win_amd64.whl (30 kB)
Installing collected packages: xxhash, multiprocess, datasets
Successfully installed datasets-3.3.2 multiprocess-0.70.16 xxhash-3.5.0


In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from datasets import load_dataset
from transformers import GPT2TokenizerFast
import math

# -------------------------------
# Define a custom dataset for language modeling
# -------------------------------
class LanguageModelingDataset(Dataset):
    def __init__(self, tokenized_texts, block_size=128):
        self.block_size = block_size
        # Flatten all token ids into a single list
        self.input_ids = []
        for tokens in tokenized_texts["input_ids"]:
            self.input_ids.extend(tokens)
        # Create sequences (chunks) of fixed block_size
        self.examples = [self.input_ids[i: i + block_size] for i in range(0, len(self.input_ids) - block_size, block_size)]
    
    def __len__(self):
        return len(self.examples)
    
    def __getitem__(self, idx):
        # For language modeling, input and target are the same (shifted by one in more advanced setups)
        x = torch.tensor(self.examples[idx], dtype=torch.long)
        return x, x

# -------------------------------
# Define a basic Transformer block
# -------------------------------
class TransformerBlock(nn.Module):
    def __init__(self, embed_dim, heads, dropout, forward_expansion):
        super(TransformerBlock, self).__init__()
        self.attention = nn.MultiheadAttention(embed_dim, heads, dropout=dropout)
        self.norm1 = nn.LayerNorm(embed_dim)
        self.norm2 = nn.LayerNorm(embed_dim)
        self.feed_forward = nn.Sequential(
            nn.Linear(embed_dim, forward_expansion * embed_dim),
            nn.ReLU(),
            nn.Linear(forward_expansion * embed_dim, embed_dim)
        )
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, value, key, query, mask):
        # Multi-head self-attention
        attention, _ = self.attention(query, key, value, attn_mask=mask)
        x = self.dropout(attention) + query
        x = self.norm1(x)
        # Feed-forward network
        forward = self.feed_forward(x)
        out = self.dropout(forward) + x
        out = self.norm2(out)
        return out

# -------------------------------
# Define a simple Transformer-based Language Model
# -------------------------------
class TransformerLM(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_layers, heads, dropout, forward_expansion, max_length):
        super(TransformerLM, self).__init__()
        self.token_embedding = nn.Embedding(vocab_size, embed_dim)
        self.position_embedding = nn.Embedding(max_length, embed_dim)
        self.dropout = nn.Dropout(dropout)
        self.layers = nn.ModuleList([
            TransformerBlock(embed_dim, heads, dropout, forward_expansion)
            for _ in range(num_layers)
        ])
        self.fc_out = nn.Linear(embed_dim, vocab_size)
        self.max_length = max_length
    
    def forward(self, x):
        N, seq_length = x.shape
        positions = torch.arange(0, seq_length).expand(N, seq_length).to(x.device)
        x = self.token_embedding(x) + self.position_embedding(positions)
        x = self.dropout(x)
        # Transformer expects shape: (sequence_length, batch_size, embed_dim)
        x = x.transpose(0, 1)
        for layer in self.layers:
            x = layer(x, x, x, mask=None)
        x = x.transpose(0, 1)  # (batch_size, sequence_length, embed_dim)
        logits = self.fc_out(x)
        return logits

# -------------------------------
# Training function
# -------------------------------
def train():
    # Hyperparameters
    block_size = 128
    batch_size = 16
    embed_dim = 128
    num_layers = 2
    heads = 4
    dropout = 0.1
    forward_expansion = 4
    max_length = block_size
    epochs = 1  # For demonstration; increase as needed
    lr = 3e-4

    # Load WikiText-2 dataset (raw version)
    dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="train")

    # Load GPT-2 tokenizer (for simplicity)
    tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
    
    # Tokenize the dataset (each example is tokenized separately)
    def tokenize_function(examples):
        return tokenizer(examples["text"], truncation=True, max_length=block_size)
    
    tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])
    
    # Create a language modeling dataset
    lm_dataset = LanguageModelingDataset(tokenized_dataset, block_size=block_size)
    dataloader = DataLoader(lm_dataset, batch_size=batch_size, shuffle=True)
    
    # Set device and instantiate the model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = TransformerLM(
        vocab_size=len(tokenizer),
        embed_dim=embed_dim,
        num_layers=num_layers,
        heads=heads,
        dropout=dropout,
        forward_expansion=forward_expansion,
        max_length=max_length
    )
    model.to(device)
    
    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for batch_idx, (inputs, targets) in enumerate(dataloader):
            inputs = inputs.to(device)
            targets = targets.to(device)
            optimizer.zero_grad()
            logits = model(inputs)  # (batch_size, seq_length, vocab_size)
            # Reshape logits and targets for computing loss
            loss = criterion(logits.view(-1, logits.shape[-1]), targets.view(-1))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            if batch_idx % 100 == 0:
                print(f"Epoch {epoch+1}, Step {batch_idx}, Loss: {loss.item():.4f}")
        avg_loss = total_loss / len(dataloader)
        print(f"Epoch {epoch+1} Average Loss: {avg_loss:.4f}")
    
    # Save the trained model state (optional)
    torch.save(model.state_dict(), "transformer_lm.pth")
    print("Model training complete and saved as transformer_lm.pth")

if __name__ == "__main__":
    train()


README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


test-00000-of-00001.parquet:   0%|          | 0.00/733k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/6.36M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/657k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/36718 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

Map:   0%|          | 0/36718 [00:00<?, ? examples/s]

Epoch 1, Step 0, Loss: 10.9646
Epoch 1, Step 100, Loss: 6.6061
Epoch 1, Step 200, Loss: 5.1701
Epoch 1, Step 300, Loss: 4.6207
Epoch 1, Step 400, Loss: 3.8425
Epoch 1, Step 500, Loss: 3.2640
Epoch 1, Step 600, Loss: 2.7487
Epoch 1, Step 700, Loss: 2.6033
Epoch 1, Step 800, Loss: 2.4090
Epoch 1 Average Loss: 4.2281
Model training complete and saved as transformer_lm.pth
