In [1]:
#!pip install transformers -U

In [2]:
import sys
import os
from datasets import load_dataset
import torch
from torch.utils.data import DataLoader
import random
import torch.nn.functional as F
import importlib
import numpy as np

In [3]:
# Add the correct path to the local transformers directory
local_path = os.path.abspath('../src/')
print("Adding path:", local_path)  # Verify the path to be added
sys.path.insert(0, local_path)

Adding path: /Users/johnschroter/IdeaProjects/Sigma-GPT/src


Creating local Path to files

Confirming local copies are being used

In [4]:
# Import your modified GPT2 classes
from transformers.models.gpt2.tokenization_gpt2 import *
from transformers.models.gpt2.modeling_gpt2 import *

# Verify that the modules are being loaded from the correct path
import transformers.models.gpt2.tokenization_gpt2
import transformers.models.gpt2.modeling_gpt2

print(transformers.models.gpt2.tokenization_gpt2.__file__)  # Should point to your local file
print(transformers.models.gpt2.modeling_gpt2.__file__)  # Should point to your local file

/Users/johnschroter/IdeaProjects/Sigma-GPT/src/transformers/models/gpt2/tokenization_gpt2.py
/Users/johnschroter/IdeaProjects/Sigma-GPT/src/transformers/models/gpt2/modeling_gpt2.py


Randomly initilizing sigma-gpt

In [5]:
# Initialize the tokenizer (pre-trained vocab is fine for tokenizer)
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token
# Initialize the configuration with random parameters
config = GPT2Config()

# Initialize the model with the custom configuration
model = CustomGPT2LMHeadModel(config)

# Initialize weights randomly
model.init_weights()

load in dataset

In [6]:
# Load Wikitext-2 dataset
dataset = load_dataset("wikitext", "wikitext-2-raw-v1")

# Preprocess the dataset
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=256)

tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["text"])
train_dataset = tokenized_datasets["train"]
eval_dataset = tokenized_datasets["validation"]

Map:   0%|          | 0/4358 [00:00<?, ? examples/s]

Map:   0%|          | 0/36718 [00:00<?, ? examples/s]

Map:   0%|          | 0/3760 [00:00<?, ? examples/s]

In [7]:
class AdaptiveShuffle:
    def __init__(self, initial_shuffle_percentage=0.0, max_adjustment_per_epoch=0.05, performance_threshold=0.01):
        self.shuffle_percentage = initial_shuffle_percentage
        self.max_adjustment_per_epoch = max_adjustment_per_epoch
        self.performance_threshold = performance_threshold
        self.previous_loss = None

    def adjust_shuffle_percentage(self, current_loss):
        if self.previous_loss is not None:
            improvement = (self.previous_loss - current_loss) / self.previous_loss
            if improvement > self.performance_threshold:
                self.shuffle_percentage = min(self.shuffle_percentage + self.max_adjustment_per_epoch, 1.0)
            elif improvement < -self.performance_threshold:
                self.shuffle_percentage = max(self.shuffle_percentage - self.max_adjustment_per_epoch, 0.0)
        self.previous_loss = current_loss

    def get_current_shuffle_percentage(self):
        return self.shuffle_percentage



In [8]:
from torch.utils.data import Dataset

class ShuffledDataset(Dataset):
    def __init__(self, input_ids, position_ids, next_position_ids, attention_mask):
        self.input_ids = input_ids
        self.position_ids = position_ids
        self.next_position_ids = next_position_ids
        self.attention_mask = attention_mask

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            'input_ids': torch.tensor(self.input_ids[idx], dtype=torch.long),
            'position_ids': torch.tensor(self.position_ids[idx], dtype=torch.long),
            'next_position_ids': torch.tensor(self.next_position_ids[idx], dtype=torch.long),
            'attention_mask': torch.tensor(self.attention_mask[idx], dtype=torch.long)
        }

In [9]:
# Function to shuffle a percentage of tokens within each sequence
def shuffle_with_positional_ids(dataset, shuffle_percentage):
    shuffled_input_ids_list = []
    shuffled_pos_ids_list = []
    next_pos_ids_list = []
    attention_mask_list = []

    for example in dataset:
        input_ids = example['input_ids']
        attention_mask = example['attention_mask']

        # Calculate the number of tokens to shuffle
        seq_length = len(input_ids)
        num_shuffled_tokens = int(seq_length * shuffle_percentage)

        # Get indices to shuffle
        indices = list(range(seq_length))
        indices_to_shuffle = np.random.choice(indices, num_shuffled_tokens, replace=False)

        # Create a permutation for the selected indices
        permutation = np.random.permutation(num_shuffled_tokens)

        # Create shuffled input_ids, pos_ids, and attention_mask
        shuffled_input_ids = input_ids.copy()
        pos_ids = list(range(seq_length))
        shuffled_pos_ids = pos_ids.copy()
        shuffled_attention_mask = attention_mask.copy()

        for i, idx in enumerate(indices_to_shuffle):
            shuffled_input_ids[idx] = input_ids[indices_to_shuffle[permutation[i]]]
            shuffled_pos_ids[idx] = pos_ids[indices_to_shuffle[permutation[i]]]
            shuffled_attention_mask[idx] = attention_mask[indices_to_shuffle[permutation[i]]]

        # Create the next shuffled pos ids
        next_pos_ids = shuffled_pos_ids[1:] + [shuffled_pos_ids[0]]

        # Append to lists
        shuffled_input_ids_list.append(shuffled_input_ids)
        shuffled_pos_ids_list.append(shuffled_pos_ids)
        next_pos_ids_list.append(next_pos_ids)
        attention_mask_list.append(shuffled_attention_mask)

    return ShuffledDataset(
        shuffled_input_ids_list,
        shuffled_pos_ids_list,
        next_pos_ids_list,
        attention_mask_list
    )

In [40]:
def train_model(model, tokenizer, adaptive_shuffle, train_dataset, eval_dataset, num_epochs=10, batch_size=4):
    optimizer = torch.optim.Adam(model.parameters(), lr=3e-3)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.95)
    loss_fn = torch.nn.CrossEntropyLoss()

    for epoch in range(num_epochs):
        shuffle_percentage = adaptive_shuffle.get_current_shuffle_percentage()
        print(f"Epoch {epoch + 1}: Shuffle Percentage={shuffle_percentage}")

        # Shuffle the sequences based on the current shuffle percentage
        shuffled_train_dataset = shuffle_with_positional_ids(train_dataset, shuffle_percentage)

        train_loader = DataLoader(shuffled_train_dataset, batch_size=batch_size, shuffle=True)

        model.train()
        total_loss = 0
        for batch_idx, batch in enumerate(train_loader):
            optimizer.zero_grad()
            # Get input and target sequences
            input_ids = batch['input_ids'].to(model.device)
            position_ids = batch['position_ids'].to(model.device)
            next_position_ids = batch['next_position_ids'].to(model.device)
            attention_mask = batch['attention_mask'].to(model.device)
            labels = input_ids.clone()
            # Forward pass
            outputs = model(input_ids=input_ids, position_ids=position_ids, next_position_ids=next_position_ids, attention_mask=attention_mask)
            logits = outputs.logits
            # Compute loss
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = input_ids[..., 1:].contiguous()
            loss = loss_fn(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
            loss.backward()
            optimizer.step()
            scheduler.step()
            total_loss += loss.item()

            if batch_idx % (1 * 1) == 0:
                print(f"Epoch {epoch + 1}, Batch {batch_idx}, Loss: {loss.item()}")

        average_loss = total_loss / len(train_loader)
        adaptive_shuffle.adjust_shuffle_percentage(average_loss)
        print(f"Epoch {epoch + 1}: Average Loss={average_loss}")

        # Optional: Evaluate on the validation set here

    print("Training completed")
    
adaptive_shuffle = AdaptiveShuffle()
x, y = train_model(model, tokenizer, adaptive_shuffle, train_dataset, eval_dataset)

Epoch 1: Shuffle Percentage=0.0
Epoch 1, Batch 0, Loss: 1.4676845073699951
Epoch 1, Batch 1, Loss: 20.451215744018555
Epoch 1, Batch 2, Loss: 11.188095092773438
Epoch 1, Batch 3, Loss: 4.988286018371582
Epoch 1, Batch 4, Loss: 5.656720161437988
Epoch 1, Batch 5, Loss: 2.509121894836426
Epoch 1, Batch 6, Loss: 5.409886837005615
Epoch 1, Batch 7, Loss: 4.307000637054443
Epoch 1, Batch 8, Loss: 2.30422043800354
Epoch 1, Batch 9, Loss: 1.158675193786621
Epoch 1, Batch 10, Loss: 3.6322109699249268
Epoch 1, Batch 11, Loss: 6.366108417510986
Epoch 1, Batch 12, Loss: 3.4161782264709473
Epoch 1, Batch 13, Loss: 4.814005374908447
Epoch 1, Batch 14, Loss: 7.37620210647583
Epoch 1, Batch 15, Loss: 3.746324062347412
Epoch 1, Batch 16, Loss: 2.1351242065429688
Epoch 1, Batch 17, Loss: 5.646624565124512
Epoch 1, Batch 18, Loss: 2.111929416656494
Epoch 1, Batch 19, Loss: 1.6194090843200684
Epoch 1, Batch 20, Loss: 1.8807986974716187
Epoch 1, Batch 21, Loss: 5.878927707672119
Epoch 1, Batch 22, Loss: 1

KeyboardInterrupt: 

In [23]:
# Shuffle the sequences based on the current shuffle percentage
shuffled_train_dataset = shuffle_with_positional_ids(train_dataset, 0)

# Create DataLoader
train_loader = DataLoader(shuffled_train_dataset, batch_size=4, shuffle=True)

In [24]:
for batch_idx, batch in enumerate(train_loader):
    x = batch
    break

In [25]:
x

{'input_ids': tensor([[ 1550, 27512,   705,  ..., 50256, 50256, 50256],
         [50256, 50256, 50256,  ..., 50256, 50256, 50256],
         [  796,   796,   609,  ..., 50256, 50256, 50256],
         [  796,   796, 39978,  ..., 50256, 50256, 50256]]),
 'position_ids': tensor([[  0,   1,   2,  ..., 253, 254, 255],
         [  0,   1,   2,  ..., 253, 254, 255],
         [  0,   1,   2,  ..., 253, 254, 255],
         [  0,   1,   2,  ..., 253, 254, 255]]),
 'next_position_ids': tensor([[  1,   2,   3,  ..., 254, 255,   0],
         [  1,   2,   3,  ..., 254, 255,   0],
         [  1,   2,   3,  ..., 254, 255,   0],
         [  1,   2,   3,  ..., 254, 255,   0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]])}

In [39]:
shuffled_train_dataset[12]

{'input_ids': tensor([50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50