In [1]:
pip install torch torchvision transformers numpy matplotlib

Collecting transformers
  Downloading transformers-4.50.0-py3-none-any.whl.metadata (39 kB)
Collecting huggingface-hub<1.0,>=0.26.0 (from transformers)
  Downloading huggingface_hub-0.29.3-py3-none-any.whl.metadata (13 kB)
Collecting pyyaml>=5.1 (from transformers)
  Downloading PyYAML-6.0.2-cp312-cp312-win_amd64.whl.metadata (2.1 kB)
Collecting requests (from transformers)
  Using cached requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.1-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Downloading safetensors-0.5.3-cp38-abi3-win_amd64.whl.metadata (3.9 kB)
Collecting charset-normalizer<4,>=2 (from requests->transformers)
  Downloading charset_normalizer-3.4.1-cp312-cp312-win_amd64.whl.metadata (36 kB)
Collecting idna<4,>=2.5 (from requests->transformers)
  Downloading idna-3.10-py3-none-any.whl.metadata (10 kB)
Collecting urllib3<3,>=1.21.1 (from requests-

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer
import numpy as np
import matplotlib.pyplot as plt


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Install the `datasets` package, which supplies the `load_dataset` function
# imported in the next cell. As the cell output notes, the kernel may need to be
# restarted before the newly installed package can be imported.
%pip install datasets

Note: you may need to restart the kernel to use updated packages.


In [5]:
from datasets import load_dataset

In [7]:
# Load the "wikitext-2-raw-v1" configuration of the "wikitext" dataset. The
# returned object is indexed by split name further down ("train", "validation",
# "test").
dataset = load_dataset("wikitext", "wikitext-2-raw-v1")

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Generating test split: 100%|██████████| 4358/4358 [00:00<00:00, 477352.37 examples/s]
Generating train split: 100%|██████████| 36718/36718 [00:00<00:00, 1419296.60 examples/s]
Generating validation split: 100%|██████████| 3760/3760 [00:00<00:00, 766604.27 examples/s]


In [8]:
# Pull the three splits out of the loaded dataset, in train / validation / test order.
train_dataset, val_dataset, test_dataset = (
    dataset[split_name] for split_name in ("train", "validation", "test")
)

In [10]:
# Show the training split's summary (feature names and row count).
print(train_dataset)


Dataset({
    features: ['text'],
    num_rows: 36718
})


In [11]:
# Initialize the tokenizer from the pretrained "bert-base-uncased" checkpoint.
# In this notebook it is only used through `tokenizer.tokenize(text)` to split
# raw text into string tokens.
# NOTE(review): the "sequence length is longer than the specified maximum"
# warning in the output presumably comes from tokenizing whole dataset lines;
# no model is run on those sequences in the cells shown here.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Function to count tokens in a dataset
def count_tokens(dataset):
    """Return the total number of tokens across every example in `dataset`.

    Each example's 'text' field is split with the notebook-level `tokenizer`
    (through its `tokenize` method) and the per-example token counts are
    summed. An empty dataset yields 0.
    """
    return sum(len(tokenizer.tokenize(example['text'])) for example in dataset)

# Tokenize each split once (train, then validation, then test) and keep the totals
train_tokens, val_tokens, test_tokens = map(
    count_tokens, (train_dataset, val_dataset, test_dataset)
)

# Report the totals per split
print(f"Number of tokens in train dataset: {train_tokens}")
print(f"Number of tokens in validation dataset: {val_tokens}")
print(f"Number of tokens in test dataset: {test_tokens}")

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Token indices sequence length is longer than the specified maximum sequence length for this model (645 > 512). Running this sequence through the model will result in indexing errors


Number of tokens in train dataset: 2303695
Number of tokens in validation dataset: 238656
Number of tokens in test dataset: 273178


In [12]:
from collections import Counter
from transformers import AutoTokenizer

# Initialize the tokenizer
# NOTE(review): this repeats the identical
# `AutoTokenizer.from_pretrained("bert-base-uncased")` call made in an earlier
# cell (and the import just above repeats the one at the top of the notebook);
# presumably kept so this cell can be run on its own — confirm before removing.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Function to tokenize text and build vocabulary
def build_vocabulary(dataset, tokenizer, vocab_size=10000, min_freq=5):
    """Build a token -> index mapping from the most frequent tokens in `dataset`.

    Args:
        dataset: Iterable of examples, each exposing its raw text under the
            'text' key.
        tokenizer: Object with a `tokenize(text)` method returning a list of
            string tokens.
        vocab_size: Maximum number of regular tokens to keep; the four special
            tokens are added on top of this.
        min_freq: Minimum number of occurrences a token needs to be eligible.

    Returns:
        dict mapping each token to a contiguous integer index. Indices 0-3 are
        reserved for "<PAD>", "<UNK>", "<BOS>" and "<EOS>"; the remaining
        tokens follow in order of decreasing frequency.
    """
    special_tokens = ["<PAD>", "<UNK>", "<BOS>", "<EOS>"]
    reserved = set(special_tokens)

    # Tokenize the dataset and count token frequencies
    token_counter = Counter()
    for example in dataset:
        token_counter.update(tokenizer.tokenize(example['text']))

    # Rank by frequency before truncating. Slicing the counter in its insertion
    # (first-seen) order would keep early rare tokens and drop frequent tokens
    # that merely appear late in the corpus. most_common() sorts by count,
    # descending, with ties kept in first-seen order.
    # Tokens that collide with a special token are skipped so the reserved
    # indices are never overwritten and the indices stay contiguous.
    frequent_tokens = [
        token
        for token, freq in token_counter.most_common()
        if freq >= min_freq and token not in reserved
    ]

    # Special tokens first, then the top `vocab_size` most frequent tokens
    vocab = special_tokens + frequent_tokens[:vocab_size]

    # Create token to index mapping
    return {token: idx for idx, token in enumerate(vocab)}

# Build vocabulary for the training dataset, using build_vocabulary's defaults
# (vocab_size=10000, min_freq=5).
# Note: despite its name, `vocab` is the token -> index dict returned by
# build_vocabulary, not a list of tokens.
vocab = build_vocabulary(train_dataset, tokenizer)

# Function to convert tokens to indices
def tokens_to_indices(tokens, token_to_idx):
    """Translate a sequence of tokens into their integer indices.

    Tokens missing from `token_to_idx` are mapped to the index stored under
    the "<UNK>" key. Returns a new list with one index per input token.
    """
    indices = []
    for tok in tokens:
        # Fall back to the unknown-token index for out-of-vocabulary tokens.
        indices.append(token_to_idx.get(tok, token_to_idx["<UNK>"]))
    return indices

# Function to create training examples
def create_training_examples(dataset, tokenizer, token_to_idx, seq_len=8):
    """Turn each example into sliding-window (context, next-token) pairs.

    Every example's 'text' is tokenized and mapped to indices with
    `tokens_to_indices`. For each position, the `seq_len` consecutive indices
    form one input and the index immediately after them is its target.
    Windows are built per example, so they never span two examples, and an
    example with `seq_len` or fewer tokens contributes nothing.

    Returns:
        (inputs, targets): `inputs` is a list of index lists of length
        `seq_len`; `targets` is the parallel list of next-token indices.
    """
    inputs, targets = [], []

    for example in dataset:
        ids = tokens_to_indices(tokenizer.tokenize(example['text']), token_to_idx)

        # Slide a window of `seq_len` indices; the index right after it is the label.
        for start in range(len(ids) - seq_len):
            stop = start + seq_len
            inputs.append(ids[start:stop])
            targets.append(ids[stop])

    return inputs, targets

# Create training examples for the training dataset. `vocab` is passed as the
# token -> index mapping and the window length is left at its default (seq_len=8).
train_inputs, train_targets = create_training_examples(train_dataset, tokenizer, vocab)

# Function to split data into batches
def create_batches(inputs, targets, batch_size=64):
    """Group parallel `inputs` / `targets` sequences into consecutive batches.

    Returns a list of `(batch_inputs, batch_targets)` tuples, each holding up
    to `batch_size` items in their original order; the final batch may be
    shorter. The number of batches is driven by `len(inputs)`.
    """
    return [
        (inputs[start:start + batch_size], targets[start:start + batch_size])
        for start in range(0, len(inputs), batch_size)
    ]

# Create batches for the training dataset using create_batches' default
# batch_size of 64. Examples are grouped in their existing order; no shuffling
# happens here.
train_batches = create_batches(train_inputs, train_targets)

Token indices sequence length is longer than the specified maximum sequence length for this model (645 > 512). Running this sequence through the model will result in indexing errors
