In [4]:
from datasets import load_dataset
from transformers import AutoTokenizer

# 1. Load the T5 tokenizer.
# It's recommended to use a tokenizer from a pre-trained model like 't5-base'
tokenizer = AutoTokenizer.from_pretrained("t5-base")

# 2. Load the C4 dataset.
# The `streaming=True` argument is useful for huge datasets like C4 to avoid downloading the whole thing.
c4_dataset = load_dataset("allenai/c4", "en", split="train", streaming=True)

# 3. Define a tokenization function.
# This function will be applied to each batch of data.
def tokenize_function(examples):
    # The T5 model expects a prefix for the task, for example "denoise text: ".
    # This is important for T5's pre-training objective.
    # However, for a simple tokenization, we can just process the "text" field.
    return tokenizer(examples["text"], truncation=True)

# 4. Apply the tokenizer to the dataset using the map function.
# `batched=True` processes the data in batches, which is much faster.
tokenized_c4 = c4_dataset.map(tokenize_function, batched=True)

# You can now iterate through the tokenized dataset.
for example in tokenized_c4:
    print(example["input_ids"])
    break

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[12847, 277, 15068, 4501, 3, 12297, 3399, 16, 5964, 7115, 9, 55, 531, 25, 241, 12, 129, 394, 44, 492, 3326, 15068, 58, 148, 56, 43, 8, 1004, 6, 474, 48, 30, 39, 4793, 230, 5, 2721, 6, 1600, 1630, 727, 1715, 1150, 4501, 15068, 16127, 6, 9137, 2659, 5595, 45, 301, 782, 3624, 14627, 15, 12612, 277, 5, 216, 56, 36, 2119, 3, 9, 19529, 593, 853, 21, 921, 113, 2746, 12, 129, 394, 28, 70, 17712, 1098, 5, 216, 56, 3884, 25, 762, 25, 174, 12, 214, 12, 5978, 16, 3, 9, 3, 23405, 4547, 15068, 2259, 6, 379, 2097, 6, 5459, 6, 13618, 7, 6, 3604, 1801, 11, 27856, 6, 303, 24190, 11, 1472, 251, 5, 37, 583, 12, 36, 16, 8, 853, 19, 25264, 399, 568, 6, 11, 21, 21380, 7, 34, 19, 339, 5, 15746, 26, 16, 8, 583, 56, 36, 893, 3, 9, 3, 17, 18, 9486, 42, 3, 9, 1409, 29, 11, 25, 56, 36, 12246, 5977, 13, 284, 3604, 24, 19, 2657, 5, 1]


In [7]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
import numpy as np

# Set the device for training
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# --- 1. Define a simple text dataset and vocabulary ---
# FIX: Added much longer sentences to the training data.
TEXT_DATA = [
    "hello world and this is a much longer sentence than before",
    "python programming is fun and easy to learn especially for beginners",
    "i love deep learning with convolutional neural networks and attention",
    "deep learning is a powerful tool in artificial intelligence for language modeling tasks"
]

# Create vocabulary
vocab = sorted(list(set(" ".join(TEXT_DATA).split())))
word_to_idx = {word: idx for idx, word in enumerate(vocab)}
idx_to_word = {idx: word for word, idx in word_to_idx.items()}
vocab_size = len(vocab)
print(f"Vocabulary size: {vocab_size}")

class TextDataset(Dataset):
    def __init__(self, text_data, word_to_idx, sequence_length):
        self.sequences = []
        for sentence in text_data:
            words = sentence.split()
            # This condition will now be met for the longer sentences
            if len(words) > sequence_length:
                for i in range(len(words) - sequence_length):
                    input_seq = [word_to_idx[word] for word in words[i:i+sequence_length]]
                    target_word = word_to_idx[words[i+sequence_length]]
                    self.sequences.append((torch.tensor(input_seq), torch.tensor(target_word)))
    
    def __len__(self):
        return len(self.sequences)
    
    def __getitem__(self, idx):
        return self.sequences[idx]

# --- 2. Attention Mechanism for 1D sequences ---
class AttentionMessagePassing(nn.Module):
    def __init__(self, in_features):
        super(AttentionMessagePassing, self).__init__()
        self.in_features = in_features
        self.query_proj = nn.Linear(in_features, in_features // 2)
        self.key_proj = nn.Linear(in_features, in_features // 2)
        self.value_proj = nn.Linear(in_features, in_features)

    def forward(self, x):
        batch_size, seq_len, in_features = x.shape
        query = self.query_proj(x)
        key = self.key_proj(x)
        value = self.value_proj(x)
        key_t = key.permute(0, 2, 1)
        
        attention_scores = torch.bmm(query, key_t)
        attention_scores = attention_scores / np.sqrt(query.size(-1))
        
        mask = torch.triu(torch.ones(seq_len, seq_len), diagonal=1).bool().to(device)
        attention_scores.masked_fill_(mask, -1e9)
        
        attention_weights = F.softmax(attention_scores, dim=-1)
        message_passed_features = torch.bmm(attention_weights, value)
        
        output = message_passed_features + x
        return output

# --- 3. The new TextCNN Architecture with Attention ---
class AttentionTextCNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, sequence_length):
        super(AttentionTextCNN, self).__init__()
        
        self.embedding_dim = embedding_dim
        self.sequence_length = sequence_length
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        self.conv_block = nn.Sequential(
            nn.Conv1d(in_channels=embedding_dim, out_channels=128, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=2, stride=2),
            nn.Conv1d(in_channels=128, out_channels=256, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=2, stride=2),
        )
        
        self.final_seq_len = self._compute_final_seq_len()
        
        self.attention_block = AttentionMessagePassing(in_features=256)
        
        self.fc_block = nn.Sequential(
            nn.Flatten(),
            nn.Linear(256 * self.final_seq_len, 128),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(128, vocab_size)
        )

    def _compute_final_seq_len(self):
        l_in = self.sequence_length
        l_out_conv1 = (l_in + 2*1 - 3) + 1
        l_out_pool1 = torch.floor(torch.tensor((l_out_conv1 - 2) / 2)) + 1
        l_out_conv2 = (l_out_pool1 + 2*1 - 3) + 1
        l_out_pool2 = torch.floor(torch.tensor((l_out_conv2 - 2) / 2)) + 1
        
        return int(l_out_pool2.item())

    def forward(self, x):
        embeddings = self.embedding(x).permute(0, 2, 1)
        conv_features = self.conv_block(embeddings)
        conv_features_t = conv_features.permute(0, 2, 1)
        attended_features = self.attention_block(conv_features_t)
        logits = self.fc_block(attended_features)
        return logits

def train_model():
    # Hyperparameters
    learning_rate = 0.001
    batch_size = 2
    num_epochs = 20
    # FIX: Increased sequence length to a value that won't result in an empty tensor.
    sequence_length = 8
    embedding_dim = 128
    
    # Data loading
    train_dataset = TextDataset(TEXT_DATA, word_to_idx, sequence_length)
    
    # FIX: Added a check to prevent the error
    if len(train_dataset) == 0:
        print("Error: Dataset is empty.")
        print(f"Please increase the length of sentences in TEXT_DATA or decrease the `sequence_length` (currently {sequence_length}).")
        return

    train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
    
    model = AttentionTextCNN(vocab_size, embedding_dim, sequence_length).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    
    print("Starting training...")
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        for i, (inputs, targets) in enumerate(train_loader):
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            if (i+1) % 1 == 0:
                print(f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{len(train_loader)}], Loss: {running_loss/(i+1):.4f}')
    
    print("Training finished.")

if __name__ == '__main__':
    train_model()

Using device: cpu
Vocabulary size: 37
Starting training...
Epoch [1/20], Step [1/7], Loss: 3.7109
Epoch [1/20], Step [2/7], Loss: 3.6706
Epoch [1/20], Step [3/7], Loss: 3.6618
Epoch [1/20], Step [4/7], Loss: 3.6554
Epoch [1/20], Step [5/7], Loss: 3.6641
Epoch [1/20], Step [6/7], Loss: 3.6367
Epoch [1/20], Step [7/7], Loss: 3.6520
Epoch [2/20], Step [1/7], Loss: 3.2856
Epoch [2/20], Step [2/7], Loss: 3.2701
Epoch [2/20], Step [3/7], Loss: 3.2480
Epoch [2/20], Step [4/7], Loss: 3.3315
Epoch [2/20], Step [5/7], Loss: 3.2014
Epoch [2/20], Step [6/7], Loss: 3.2693
Epoch [2/20], Step [7/7], Loss: 3.2356
Epoch [3/20], Step [1/7], Loss: 3.5368
Epoch [3/20], Step [2/7], Loss: 2.8795
Epoch [3/20], Step [3/7], Loss: 2.9374
Epoch [3/20], Step [4/7], Loss: 2.7491
Epoch [3/20], Step [5/7], Loss: 2.8209
Epoch [3/20], Step [6/7], Loss: 2.7460


  l_out_pool2 = torch.floor(torch.tensor((l_out_conv2 - 2) / 2)) + 1


Epoch [3/20], Step [7/7], Loss: 2.7118
Epoch [4/20], Step [1/7], Loss: 2.7884
Epoch [4/20], Step [2/7], Loss: 2.9727
Epoch [4/20], Step [3/7], Loss: 2.7151
Epoch [4/20], Step [4/7], Loss: 2.7521
Epoch [4/20], Step [5/7], Loss: 2.9391
Epoch [4/20], Step [6/7], Loss: 2.8336
Epoch [4/20], Step [7/7], Loss: 2.9083
Epoch [5/20], Step [1/7], Loss: 2.7260
Epoch [5/20], Step [2/7], Loss: 2.7982
Epoch [5/20], Step [3/7], Loss: 2.9113
Epoch [5/20], Step [4/7], Loss: 2.3345
Epoch [5/20], Step [5/7], Loss: 2.1588
Epoch [5/20], Step [6/7], Loss: 2.1593
Epoch [5/20], Step [7/7], Loss: 2.2315
Epoch [6/20], Step [1/7], Loss: 1.4436
Epoch [6/20], Step [2/7], Loss: 1.2955
Epoch [6/20], Step [3/7], Loss: 1.6238
Epoch [6/20], Step [4/7], Loss: 2.1200
Epoch [6/20], Step [5/7], Loss: 1.8380
Epoch [6/20], Step [6/7], Loss: 1.7514
Epoch [6/20], Step [7/7], Loss: 1.6431
Epoch [7/20], Step [1/7], Loss: 1.3238
Epoch [7/20], Step [2/7], Loss: 1.2145
Epoch [7/20], Step [3/7], Loss: 1.7978
Epoch [7/20], Step [4/7],

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from datasets import load_dataset
from transformers import BertTokenizer
import numpy as np

# Set the device for training
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# --- 1. Attention Mechanism for 1D sequences ---
class AttentionMessagePassing(nn.Module):
    def __init__(self, in_features):
        super(AttentionMessagePassing, self).__init__()
        self.in_features = in_features
        self.query_proj = nn.Linear(in_features, in_features // 2)
        self.key_proj = nn.Linear(in_features, in_features // 2)
        self.value_proj = nn.Linear(in_features, in_features)

    def forward(self, x):
        batch_size, seq_len, in_features = x.shape
        query = self.query_proj(x)
        key = self.key_proj(x)
        value = self.value_proj(x)
        key_t = key.permute(0, 2, 1)
        
        attention_scores = torch.bmm(query, key_t)
        attention_scores = attention_scores / np.sqrt(query.size(-1))
        
        # We need a causality mask for language modeling
        mask = torch.triu(torch.ones(seq_len, seq_len), diagonal=1).bool().to(device)
        attention_scores.masked_fill_(mask, -1e9)
        
        attention_weights = F.softmax(attention_scores, dim=-1)
        message_passed_features = torch.bmm(attention_weights, value)
        
        output = message_passed_features + x
        return output

# --- 2. The new TextCNN Architecture with Attention ---
class AttentionTextCNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, sequence_length):
        super(AttentionTextCNN, self).__init__()
        
        self.embedding_dim = embedding_dim
        self.sequence_length = sequence_length
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        self.conv_block = nn.Sequential(
            # Conv1d expects input shape (batch, channels, sequence_length)
            nn.Conv1d(in_channels=embedding_dim, out_channels=128, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=2, stride=2),
            nn.Conv1d(in_channels=128, out_channels=256, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=2, stride=2),
        )
        
        self.final_seq_len = self._compute_final_seq_len()
        
        self.attention_block = AttentionMessagePassing(in_features=256)
        
        # FIX: Removed nn.Flatten() and adjusted Linear layer to work on a per-token basis.
        # This is the correct way to build a sequence-to-sequence model for language modeling.
        self.fc_block = nn.Sequential(
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(128, vocab_size)
        )

    def _compute_final_seq_len(self):
        l_in = self.sequence_length
        l_out_conv1 = (l_in + 2*1 - 3) + 1
        l_out_pool1 = torch.floor(torch.tensor((l_out_conv1 - 2) / 2)) + 1
        l_out_conv2 = (l_out_pool1 + 2*1 - 3) + 1
        l_out_pool2 = torch.floor(torch.tensor((l_out_conv2 - 2) / 2)) + 1
        
        return int(l_out_pool2.item())

    def forward(self, x):
        embeddings = self.embedding(x).permute(0, 2, 1)
        conv_features = self.conv_block(embeddings)
        conv_features_t = conv_features.permute(0, 2, 1)
        attended_features = self.attention_block(conv_features_t)
        # FIX: The fc_block now processes the attended features without flattening.
        logits = self.fc_block(attended_features)
        return logits

# --- 3. Main training function updated for C4 dataset ---
def train_model():
    # Hyperparameters
    learning_rate = 0.001
    batch_size = 8
    num_epochs = 25
    sequence_length = 128
    embedding_dim = 128
    
    # ----------------------------------------
    # 1. Load C4 dataset (streaming) and Bert tokenizer
    # ----------------------------------------
    print("Loading a small streaming portion of the C4 dataset...")
    dataset = load_dataset("allenai/c4", "en", split="train", streaming=True)
    
    # Use a simpler, pre-trained tokenizer: BertTokenizer
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    vocab_size = len(tokenizer)
    
    # We will use a smaller sample for a quicker demonstration.
    dataset_sample = dataset

    # ----------------------------------------
    # 2. Pre-process the dataset with the tokenizer
    # ----------------------------------------
    def preprocess_function(examples):
        inputs = examples["text"]
        tokenized_input = tokenizer(
            inputs,
            max_length=sequence_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        
        # Shift the labels for next-token prediction
        input_ids = tokenized_input['input_ids'].squeeze(0)
        labels = torch.cat((input_ids[1:], torch.tensor([tokenizer.pad_token_id])))
        
        return {"input_ids": input_ids, "labels": labels}
    
    # Apply the preprocessing to the streaming dataset
    processed_dataset = dataset_sample.map(preprocess_function, batched=False)
    
    # We use the processed streaming dataset directly with the DataLoader
    train_loader = DataLoader(processed_dataset, batch_size=batch_size)
    
    # ----------------------------------------
    # 3. Model, Loss, and Optimizer
    # ----------------------------------------
    model = AttentionTextCNN(vocab_size, embedding_dim, sequence_length).to(device)
    # Ignore the pad token in the loss
    criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    
    print("Starting training...")
    
    # ----------------------------------------
    # 4. Training Loop
    # ----------------------------------------
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        # FIX: The total number of batches is unknown, so we can't use len(train_loader).
        # We will track the step count manually.
        for i, batch in enumerate(train_loader):
            # Batches from DataLoader are already tensors
            inputs = batch["input_ids"].to(device)
            labels = batch["labels"].to(device)
            
            # Forward pass
            outputs = model(inputs)
            
            # The model's output sequence length might be different from the input
            # due to pooling. We need to truncate the labels to match.
            outputs_seq_len = outputs.size(1)
            labels = labels[:, :outputs_seq_len]

            # Reshape for loss calculation: (B*L, V) vs (B*L)
            outputs_flat = outputs.view(-1, outputs.size(-1))
            # FIX: Use .reshape() instead of .view() to handle non-contiguous tensors
            labels_flat = labels.reshape(-1)
            
            loss = criterion(outputs_flat, labels_flat)
            
            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
            
            if (i + 1) % 50 == 0:
                print(f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}], Loss: {loss.item():.4f}')
        
        # FIX: The total number of batches is unknown, so average loss is not meaningful.
        print(f'Epoch [{epoch+1}/{num_epochs}], Total Loss: {total_loss:.4f}')
    
    print("Training finished.")

if __name__ == '__main__':
    train_model()


  from .autonotebook import tqdm as notebook_tqdm


Using device: cpu
Loading a small streaming portion of the C4 dataset...


  l_out_pool2 = torch.floor(torch.tensor((l_out_conv2 - 2) / 2)) + 1


Starting training...
Epoch [1/25], Step [50], Loss: 8.1715
Epoch [1/25], Step [100], Loss: 7.9306


KeyboardInterrupt: 

In [6]:
def generate_text(model, tokenizer, prompt, max_length=50):
    """
    Generates text from the model given a starting prompt.
    """
    print("Generating text...")
    
    try:
        input_ids = tokenizer.encode(prompt, return_tensors='pt').to(device)

        # Loop to generate new tokens
        for _ in range(max_length):
            # Pad the input_ids to the model's fixed sequence length
            # This is necessary because the CNN architecture is not
            # designed for variable-length inputs.
            current_len = input_ids.size(1)
            padding_needed = model.sequence_length - current_len
            if padding_needed > 0:
                padded_input_ids = F.pad(input_ids, (0, padding_needed), 'constant', tokenizer.pad_token_id)
            else:
                padded_input_ids = input_ids[:, -model.sequence_length:]

            # Get the model's output on the padded sequence
            outputs = model(padded_input_ids)

            # Get the predictions for the last non-padded token
            predictions = outputs[:, -1, :]
            predicted_id = torch.argmax(predictions, dim=-1)
            
            # Add the new token to the original sequence
            input_ids = torch.cat([input_ids, predicted_id.unsqueeze(1)], dim=-1)

        # Decode the token IDs back to text.
        generated_text = tokenizer.decode(input_ids[0], skip_special_tokens=True)
        return generated_text
    
    except Exception as e:
        print(f"An error occurred during text generation: {e}")
        return "Text generation failed."

sequence_length = 128
embedding_dim = 128

# 1. Load Bert tokenizer
print("Loading Bert tokenizer...")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
vocab_size = len(tokenizer)

# 2. Initialize the model (without training)
print("Initializing the model for demonstration purposes...")
model = AttentionTextCNN(vocab_size, embedding_dim, sequence_length).to(device)

# 3. Define the starting prompt
start_prompt = "hello"

# 4. Generate text with the (simulated) trained model
generated_text = generate_text(model, tokenizer, start_prompt)

print("\n--- Generated Text ---")
print(generated_text)
print("----------------------")

print("\nScript execution finished.")

Loading Bert tokenizer...
Initializing the model for demonstration purposes...
Generating text...


  l_out_pool2 = torch.floor(torch.tensor((l_out_conv2 - 2) / 2)) + 1



--- Generated Text ---
hello [unused947] composuredad ranged pleaded amplifiers napier shanghai halves napier amplifiers modify napier specimens unity [unused947] securities canberrawarkax correspondence bucharest ableguide mccormickquet fantasiaᆨ blonde aromatic composure bob weak lifetime remembrance [unused947] faculties bucharest abledad modify canberraoge subsidy mead architecture modify strange [unused701] able
----------------------

Script execution finished.
