In [1]:
#!pip install transformers -U

In [1]:
import sys
import os
from datasets import load_dataset
import torch
from torch.utils.data import DataLoader
import random
import torch.nn.functional as F
import importlib
import numpy as np

In [2]:
# Add the correct path to the local transformers directory
local_path = os.path.abspath('../src/')
print("Adding path:", local_path)  # Verify the path to be added
sys.path.insert(0, local_path)

Adding path: /Users/johnschroter/IdeaProjects/Sigma-GPT/src


Creating local Path to files

Confirming local copies are being used

In [3]:
# Import your modified GPT2 classes
from transformers.models.gpt2.tokenization_gpt2 import *
from transformers.models.gpt2.modeling_gpt2 import *

# Verify that the modules are being loaded from the correct path
import transformers.models.gpt2.tokenization_gpt2
import transformers.models.gpt2.modeling_gpt2

print(transformers.models.gpt2.tokenization_gpt2.__file__)  # Should point to your local file
print(transformers.models.gpt2.modeling_gpt2.__file__)  # Should point to your local file

/Users/johnschroter/IdeaProjects/Sigma-GPT/src/transformers/models/gpt2/tokenization_gpt2.py
/Users/johnschroter/IdeaProjects/Sigma-GPT/src/transformers/models/gpt2/modeling_gpt2.py


Randomly initilizing sigma-gpt

In [4]:
# Initialize the tokenizer (pre-trained vocab is fine for tokenizer)
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token
# Initialize the configuration with random parameters
config = GPT2Config()

# Initialize the model with the custom configuration
#model = CustomGPT2LMHeadModel(config)
model = CustomGPT2LMHeadModel.from_pretrained('gpt2')

# Initialize weights randomly
#model.init_weights()

Some weights of CustomGPT2LMHeadModel were not initialized from the model checkpoint at gpt2 and are newly initialized: ['lm_head.weight', 'wte.LayerNorm.bias', 'wte.LayerNorm.weight', 'wte.next_position_embeddings.weight', 'wte.position_embeddings.weight', 'wte.word_embeddings.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


load in dataset

In [5]:
# Load Wikitext-2 dataset
dataset = load_dataset("wikitext", "wikitext-2-raw-v1")

# Preprocess the dataset
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=32)

tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["text"])
train_dataset = tokenized_datasets["train"]
eval_dataset = tokenized_datasets["validation"]

In [5]:
# Load Penn Treebank dataset
dataset = load_dataset("ptb_text_only")

# Preprocess the dataset
def tokenize_function(examples):
    return tokenizer(examples["sentence"], padding="max_length", truncation=True, max_length=32)

tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["sentence"])
train_dataset = tokenized_datasets["train"]
eval_dataset = tokenized_datasets["validation"]



You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [6]:
class AdaptiveShuffle:
    def __init__(self, initial_shuffle_percentage=0.0, max_adjustment_per_epoch=0.05, performance_threshold=0.25):
        self.shuffle_percentage = initial_shuffle_percentage
        self.max_adjustment_per_epoch = max_adjustment_per_epoch
        self.performance_threshold = performance_threshold
        self.previous_loss = None

    def adjust_shuffle_percentage(self, current_loss):
        if self.previous_loss is not None:
            improvement = (self.previous_loss - current_loss) / self.previous_loss
            if improvement > self.performance_threshold:
                self.shuffle_percentage = min(self.shuffle_percentage + self.max_adjustment_per_epoch, 1.0)
            elif improvement < -self.performance_threshold:
                self.shuffle_percentage = max(self.shuffle_percentage - self.max_adjustment_per_epoch, 0.0)
        self.previous_loss = current_loss

    def get_current_shuffle_percentage(self):
        return self.shuffle_percentage



In [7]:
from torch.utils.data import Dataset

class ShuffledDataset(Dataset):
    def __init__(self, input_ids, position_ids, next_position_ids, attention_mask):
        self.input_ids = input_ids
        self.position_ids = position_ids
        self.next_position_ids = next_position_ids
        self.attention_mask = attention_mask

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            'input_ids': torch.tensor(self.input_ids[idx], dtype=torch.long),
            'position_ids': torch.tensor(self.position_ids[idx], dtype=torch.long),
            'next_position_ids': torch.tensor(self.next_position_ids[idx], dtype=torch.long),
            'attention_mask': torch.tensor(self.attention_mask[idx], dtype=torch.long)
        }

In [8]:
# Function to shuffle a percentage of tokens within each sequence
def shuffle_with_positional_ids(dataset, shuffle_percentage):
    shuffled_input_ids_list = []
    shuffled_pos_ids_list = []
    next_pos_ids_list = []
    attention_mask_list = []

    for example in dataset:
        input_ids = example['input_ids']
        attention_mask = example['attention_mask']

        # Calculate the number of tokens to shuffle
        seq_length = len(input_ids)
        num_shuffled_tokens = int(seq_length * shuffle_percentage)

        # Get indices to shuffle
        indices = list(range(seq_length))
        indices_to_shuffle = np.random.choice(indices, num_shuffled_tokens, replace=False)

        # Create a permutation for the selected indices
        permutation = np.random.permutation(num_shuffled_tokens)

        # Create shuffled input_ids, pos_ids, and attention_mask
        shuffled_input_ids = input_ids.copy()
        pos_ids = list(range(seq_length))
        shuffled_pos_ids = pos_ids.copy()
        shuffled_attention_mask = attention_mask.copy()

        for i, idx in enumerate(indices_to_shuffle):
            shuffled_input_ids[idx] = input_ids[indices_to_shuffle[permutation[i]]]
            shuffled_pos_ids[idx] = pos_ids[indices_to_shuffle[permutation[i]]]
            shuffled_attention_mask[idx] = attention_mask[indices_to_shuffle[permutation[i]]]

        # Create the next shuffled pos ids
        next_pos_ids = shuffled_pos_ids[1:] + [shuffled_pos_ids[0]]

        # Append to lists
        shuffled_input_ids_list.append(shuffled_input_ids)
        shuffled_pos_ids_list.append(shuffled_pos_ids)
        next_pos_ids_list.append(next_pos_ids)
        attention_mask_list.append(shuffled_attention_mask)

    return ShuffledDataset(
        shuffled_input_ids_list,
        shuffled_pos_ids_list,
        next_pos_ids_list,
        attention_mask_list
    )

In [9]:
from torch.utils.tensorboard import SummaryWriter
import time
import os
import shutil

def train_model(model, tokenizer, adaptive_shuffle, train_dataset, eval_dataset, num_epochs=10, batch_size=64, log_dir='./logs'):
    
    optimizer = torch.optim.Adam(model.parameters(), lr=3e-3)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.95)
    loss_fn = torch.nn.CrossEntropyLoss()
    writer = SummaryWriter(log_dir=log_dir)

    def compute_accuracy(logits, labels):
        preds = torch.argmax(logits, dim=-1)
        correct = (preds == labels).float()
        return correct.sum() / correct.numel()

    for epoch in range(num_epochs):
        shuffle_percentage = adaptive_shuffle.get_current_shuffle_percentage()
        print(f"Epoch {epoch + 1}: Shuffle Percentage={shuffle_percentage}")

        # Shuffle the sequences based on the current shuffle percentage
        shuffled_train_dataset = shuffle_with_positional_ids(train_dataset, shuffle_percentage)
        train_loader = DataLoader(shuffled_train_dataset, batch_size=batch_size, shuffle=True)

        model.train()
        total_loss = 0
        total_correct = 0
        total_samples = 0
        start_time = time.time()

        for batch_idx, batch in enumerate(train_loader):
            optimizer.zero_grad()
            # Get input and target sequences
            input_ids = batch['input_ids'].to(model.device)
            position_ids = batch['position_ids'].to(model.device)
            next_position_ids = batch['next_position_ids'].to(model.device)
            attention_mask = batch['attention_mask'].to(model.device)
            labels = input_ids.clone()
            # Forward pass
            outputs = model(input_ids=input_ids, position_ids=position_ids, next_position_ids=next_position_ids, attention_mask=attention_mask)
            logits = outputs.logits
            # Compute loss
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = input_ids[..., 1:].contiguous()
            loss = loss_fn(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            # Compute accuracy
            total_correct += compute_accuracy(shift_logits, shift_labels).item() * shift_labels.numel()
            total_samples += shift_labels.numel()
            writer.add_scalar('Loss/Train', loss, batch_idx)

            if batch_idx % (1 * 1) == 0:
                print(f"Epoch {epoch + 1}, Batch {batch_idx}, Loss: {loss.item()}")
                
        scheduler.step()
        average_loss = total_loss / len(train_loader)
        train_accuracy = total_correct / total_samples
        writer.add_scalar('Loss/Train', average_loss, epoch)
        writer.add_scalar('Accuracy/Train', train_accuracy, epoch)
        adaptive_shuffle.adjust_shuffle_percentage(average_loss)
        print(f"Epoch {epoch + 1}: Average Loss={average_loss}, Train Accuracy={train_accuracy}")

        # Evaluation part
        model.eval()
        eval_loss = 0
        eval_correct = 0
        eval_samples = 0
        shuffled_eval_dataset = shuffle_with_positional_ids(eval_dataset, shuffle_percentage)
        eval_loader = DataLoader(shuffled_eval_dataset, batch_size=batch_size, shuffle=False)
        with torch.no_grad():
            for batch in eval_loader:
                input_ids = batch['input_ids'].to(model.device)
                position_ids = batch['position_ids'].to(model.device)
                next_position_ids = batch['next_position_ids'].to(model.device)
                attention_mask = batch['attention_mask'].to(model.device)
                labels = input_ids.clone()
                outputs = model(input_ids=input_ids, position_ids=position_ids, next_position_ids=next_position_ids, attention_mask=attention_mask)
                logits = outputs.logits
                shift_logits = logits[..., :-1, :].contiguous()
                shift_labels = input_ids[..., 1:].contiguous()
                loss = loss_fn(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
                eval_loss += loss.item()
                eval_correct += compute_accuracy(shift_logits, shift_labels).item() * shift_labels.numel()
                eval_samples += shift_labels.numel()

        average_eval_loss = eval_loss / len(eval_loader)
        eval_accuracy = eval_correct / eval_samples
        writer.add_scalar('Loss/Eval', average_eval_loss, epoch)
        writer.add_scalar('Accuracy/Eval', eval_accuracy, epoch)
        print(f"Epoch {epoch + 1}: Evaluation Loss={average_eval_loss}, Eval Accuracy={eval_accuracy}")

        epoch_time = time.time() - start_time
        writer.add_scalar('Time/Epoch', epoch_time, epoch)
        print(f"Epoch {epoch + 1}: Time Taken={epoch_time}s")

    print("Training completed")
    writer.close()

adaptive_shuffle = AdaptiveShuffle()
train_model(model, tokenizer, adaptive_shuffle, train_dataset, eval_dataset)

Epoch 1: Shuffle Percentage=0.0
Epoch 1, Batch 0, Loss: 14.431927680969238
Epoch 1, Batch 1, Loss: 11.867571830749512
Epoch 1, Batch 2, Loss: 10.281754493713379
Epoch 1, Batch 3, Loss: 7.948977947235107
Epoch 1, Batch 4, Loss: 6.577035427093506
Epoch 1, Batch 5, Loss: 6.500138759613037
Epoch 1, Batch 6, Loss: 5.947820663452148
Epoch 1, Batch 7, Loss: 6.110809803009033
Epoch 1, Batch 8, Loss: 6.134476184844971
Epoch 1, Batch 9, Loss: 5.6920390129089355
Epoch 1, Batch 10, Loss: 5.871313095092773
Epoch 1, Batch 11, Loss: 5.872975826263428
Epoch 1, Batch 12, Loss: 5.799072265625
Epoch 1, Batch 13, Loss: 5.499769687652588
Epoch 1, Batch 14, Loss: 5.837723731994629
Epoch 1, Batch 15, Loss: 5.473331928253174
Epoch 1, Batch 16, Loss: 5.159011363983154
Epoch 1, Batch 17, Loss: 5.7526984214782715
Epoch 1, Batch 18, Loss: 5.36810827255249
Epoch 1, Batch 19, Loss: 4.941473960876465
Epoch 1, Batch 20, Loss: 4.997711658477783
Epoch 1, Batch 21, Loss: 5.103751182556152
Epoch 1, Batch 22, Loss: 5.1885


KeyboardInterrupt



In [22]:
# Shuffle the sequences based on the current shuffle percentage
shuffled_train_dataset = shuffle_with_positional_ids(train_dataset, 0)

# Create DataLoader
train_loader = DataLoader(shuffled_train_dataset, batch_size=32, shuffle=True)

In [23]:
count = 0
for batch_idx, batch in enumerate(train_loader):
    count = count + 1


In [24]:
count

1315

In [15]:
shuffled_train_dataset[14]

{'input_ids': tensor([ 1169,  1279,  2954,    29, 12262,   531, 45450,   373,   973,   287,
           845, 12949,  6867,   287,  1642,  3348,   329,   262, 16628,   287,
           262,  1903, 11445,    82,   290,  6928,   351,   257,  1180,  2099,
           286,  1279,  2954,    29,   287,   399, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256]),
 'position_ids': tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
         18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
         36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53,
         54, 55, 56, 57, 58, 59, 60, 61, 62, 63]),
 'next_position_ids': tensor([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
         19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 3

In [None]:
model.att