In [9]:
#!pip install transformers -U

In [10]:
import sys
import os

# Put the repository's src/ directory first on the import path so that
# `import transformers` resolves to the local checkout.
local_path = os.path.abspath(os.path.join(os.pardir, 'src'))
print("Adding path:", local_path)  # echo the resolved directory for a visual check
sys.path[:0] = [local_path]

Adding path: /Users/johnschroter/IdeaProjects/Sigma-GPT/src


In [11]:
# Import your modified GPT2 classes
from transformers.models.gpt2.tokenization_gpt2 import *
from transformers.models.gpt2.modeling_gpt2 import *

# Sanity check: print where each GPT-2 module was loaded from.
import transformers.models.gpt2.modeling_gpt2
import transformers.models.gpt2.tokenization_gpt2

for loaded_module in (
    transformers.models.gpt2.tokenization_gpt2,
    transformers.models.gpt2.modeling_gpt2,
):
    print(loaded_module.__file__)  # Should point to your local file

/Users/johnschroter/IdeaProjects/Sigma-GPT/src/transformers/models/gpt2/tokenization_gpt2.py
/Users/johnschroter/IdeaProjects/Sigma-GPT/src/transformers/models/gpt2/modeling_gpt2.py


In [12]:
# Load the custom LM-head model, then the stock configuration and tokenizer,
# all from the same 'gpt2' checkpoint name.
model = CustomGPT2LMHeadModel.from_pretrained("gpt2")
config = GPT2Config.from_pretrained("gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

Some weights of CustomGPT2LMHeadModel were not initialized from the model checkpoint at gpt2 and are newly initialized: ['lm_head.weight', 'wte.LayerNorm.bias', 'wte.LayerNorm.weight', 'wte.next_position_embeddings.weight', 'wte.position_embeddings.weight', 'wte.word_embeddings.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
# Explicit import: this cell previously relied on `torch` leaking in through
# the wildcard import of modeling_gpt2.
import torch

# Example input
input_text = "The quick brown fox jumps"
input_ids = tokenizer.encode(input_text, return_tensors='pt')

# Random visiting order over the token slots
# (use torch.arange(input_ids.size(1)) instead to keep the natural order)
shuffle_indices = torch.randperm(input_ids.size(1))
shuffled_input_ids = input_ids[:, shuffle_indices]

# Original position of the token sitting in each shuffled slot
pos_ids = torch.arange(input_ids.size(1)).unsqueeze(0)
shuffled_pos_ids = pos_ids[:, shuffle_indices]
# Original position of the token in the *following* slot; the last slot wraps
# around to the first one.
next_pos_ids = torch.roll(shuffle_indices, shifts=-1, dims=0).unsqueeze(0)

# Forward pass to get the output logits. This is inference only, so switch to
# eval mode and skip autograd bookkeeping.
model.eval()
with torch.no_grad():
    outputs = model(
        input_ids=shuffled_input_ids,
        position_ids=shuffled_pos_ids,
        next_position_ids=next_pos_ids,
    )

# Get the logits and convert to token IDs
logits = outputs.logits
predicted_token_ids = torch.argmax(logits, dim=-1)

# Undo the shuffle before decoding.
# The training objective in this notebook pairs the logit at slot i with the
# token at slot i + 1, i.e. the token whose original position is
# next_pos_ids[0, i]. Ordering by the *target* position (not the input
# position) puts each prediction where the token it predicts belongs.
unshuffle_indices = torch.argsort(next_pos_ids[0])
unshuffled_predicted_token_ids = predicted_token_ids[:, unshuffle_indices]

# Decode the token IDs to get the output words
predicted_text = tokenizer.decode(unshuffled_predicted_token_ids[0], skip_special_tokens=True)

print("Input Text: ", input_text)
print("Predicted Response: ", predicted_text)

Input Text:  The quick brown fox jumps
Predicted Response:   Rogers Rogersdoorsdoorscache


In [14]:
import torch
from transformers import GPT2Tokenizer, GPT2Config, AdamW, get_scheduler
from torch.nn.utils.rnn import pad_sequence
from torch.nn import CrossEntropyLoss
from typing import Optional, Tuple, Union

# Tokenizer and configuration both come from the stock 'gpt2' checkpoint
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
config = GPT2Config.from_pretrained('gpt2', output_attentions=True)

# For each special token the tokenizer lacks, register a fallback string.
# Checked in the order pad -> BOS -> EOS, one add_special_tokens call each.
special_token_fallbacks = (
    ('pad_token', '[PAD]'),
    ('bos_token', '<|startoftext|>'),
    ('eos_token', '<|endoftext|>'),
)
for token_attr, fallback in special_token_fallbacks:
    if getattr(tokenizer, token_attr) is None:
        tokenizer.add_special_tokens({token_attr: fallback})

# Load the custom model, then size its embedding table to the tokenizer
# (which may have grown if special tokens were added above)
model = CustomGPT2LMHeadModel.from_pretrained('gpt2')
model.resize_token_embeddings(len(tokenizer))

# Placeholder corpus, one sentence per line; replace this with your actual data.
# splitlines() turns the block into a plain list of strings.
training_data = """\
The quick brown fox jumps over the lazy dog.
In the heart of the bustling city, a lone musician played a haunting melody.
Artificial intelligence is transforming industries across the globe.
The sun set over the horizon, painting the sky in shades of pink and orange.
A mysterious figure emerged from the shadows, cloaked in darkness.
With every passing year, technology advances at an unprecedented rate.
The ancient ruins stood as a testament to a long-forgotten civilization.
She whispered secrets to the wind, hoping they would reach the stars.
The scientist carefully documented the results of the groundbreaking experiment.
In the stillness of the night, the only sound was the distant howl of a wolf.
The library was a haven for those who sought knowledge and solace.
He crafted intricate sculptures from blocks of ice, each one a fleeting masterpiece.
Beneath the waves, a vibrant coral reef teemed with marine life.
Her laughter was like music, bringing joy to everyone around her.
The adventurer braved the treacherous mountains in search of hidden treasures.
In the garden, flowers of every color bloomed in a riotous display.
The clock struck midnight, marking the beginning of a new year.
He opened the ancient book, its pages filled with arcane symbols.
The spaceship soared through the cosmos, exploring uncharted territories.
She penned letters to her future self, filled with hopes and dreams.
The chef prepared a feast, each dish more delectable than the last.
In the forest, the trees whispered ancient secrets to those who would listen.
The artist painted a masterpiece, capturing the essence of a fleeting moment.
The storm raged on, lightning illuminating the darkened sky.
The detective pieced together the clues, unraveling the mystery bit by bit.
""".splitlines()

Some weights of CustomGPT2LMHeadModel were not initialized from the model checkpoint at gpt2 and are newly initialized: ['lm_head.weight', 'wte.LayerNorm.bias', 'wte.LayerNorm.weight', 'wte.next_position_embeddings.weight', 'wte.position_embeddings.weight', 'wte.word_embeddings.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
import random

# Hyperparameters
epochs = 1000
batch_size = 25  # Adjust batch size as needed
learning_rate = 5e-4

# Optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=learning_rate)
# The training loop takes one optimizer step per batch, and a trailing partial
# batch still counts as a step, so round the batch count up. Plain floor
# division would undercount whenever len(training_data) is not a multiple of
# batch_size, and the linear schedule would reach lr == 0 before training ends.
steps_per_epoch = (len(training_data) + batch_size - 1) // batch_size
scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=epochs * steps_per_epoch,
)

# Loss function; targets equal to the pad id contribute nothing to the loss
loss_fn = CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

# Convert training data to input IDs: one 1-D LongTensor per sentence, wrapped
# in the tokenizer's BOS/EOS strings
training_input_ids = [
    torch.tensor(
        tokenizer.encode(f"{tokenizer.bos_token} {text} {tokenizer.eos_token}"),
        dtype=torch.long,
    )
    for text in training_data
]

def collate_fn(batch):
    """Stack variable-length token tensors into one padded batch.

    Args:
        batch: sequence of 1-D token-id tensors.

    Returns:
        A ``(padded_ids, mask)`` pair. ``padded_ids`` is batch-first and
        right-padded with ``tokenizer.pad_token_id`` to the longest sequence;
        ``mask`` is a long tensor holding 1 wherever ``padded_ids`` differs
        from the pad id and 0 elsewhere.
    """
    pad_id = tokenizer.pad_token_id
    padded_ids = pad_sequence(batch, batch_first=True, padding_value=pad_id)
    mask = padded_ids.ne(pad_id).long()
    return padded_ids, mask

def shuffle_data(data):
    """Return a new list with the items of ``data`` in random order.

    The input is left untouched: a list of indices is shuffled with the
    module-level ``random`` generator and the items are then gathered in
    that order.
    """
    order = [*range(len(data))]
    random.shuffle(order)
    return list(map(data.__getitem__, order))

# Training loop: every batch is presented in a fresh random token order, and
# the model is told (via next_position_ids) which original position it must
# predict next.
for epoch in range(epochs):
    model.train()
    total_loss = 0  # running sum of batch losses for this epoch (not reported below)
    training_input_ids = shuffle_data(training_input_ids)

    for batch_idx in range(0, len(training_input_ids), batch_size):
        batch = training_input_ids[batch_idx:batch_idx + batch_size]
        batch_input_ids, attention_mask = collate_fn(batch)

        # One random column permutation shared by every sequence in the batch
        # (use torch.arange(batch_input_ids.size(1)) to keep the natural order)
        shuffle_indices = torch.randperm(batch_input_ids.size(1))
        shuffled_input_ids = batch_input_ids[:, shuffle_indices]
        # The mask was built on the unshuffled batch, so it must be permuted
        # with the same indices; otherwise padding that moved into the middle
        # of the sequence is attended to and real tokens are masked out.
        shuffled_attention_mask = attention_mask[:, shuffle_indices]

        # Original position of the token sitting in each shuffled slot
        pos_ids = torch.arange(batch_input_ids.size(1), dtype=torch.long, device=batch_input_ids.device).unsqueeze(0).repeat(batch_input_ids.size(0), 1)
        shuffled_pos_ids = pos_ids[:, shuffle_indices]

        # Original position of the token in the following slot; the last slot
        # wraps around to the first one.
        next_pos_ids = torch.roll(shuffled_pos_ids, shifts=-1, dims=1)

        # Forward pass
        optimizer.zero_grad()
        outputs = model(
            input_ids=shuffled_input_ids,
            position_ids=shuffled_pos_ids,
            next_position_ids=next_pos_ids,
            attention_mask=shuffled_attention_mask,
        )
        logits = outputs.logits

        # Compute loss: the logit at slot i is scored against the token at
        # slot i + 1, so the wrapped-around last slot has no target. Pad
        # targets are skipped through loss_fn's ignore_index.
        shift_logits = logits[..., :-1, :].contiguous()
        shift_labels = shuffled_input_ids[..., 1:].contiguous()
        loss = loss_fn(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))

        loss.backward()
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()

        # batch_idx is always a multiple of batch_size, so the former modulo
        # guard never filtered anything; log every batch directly.
        print(f"Epoch {epoch + 1}, Batch {batch_idx}, Loss: {loss.item()}")

print("Training complete")


Epoch 1, Batch 0, Loss: 14.63951587677002
Epoch 2, Batch 0, Loss: 10.971122741699219
Epoch 3, Batch 0, Loss: 10.78005599975586
Epoch 4, Batch 0, Loss: 10.588807106018066
Epoch 5, Batch 0, Loss: 10.477395057678223
Epoch 6, Batch 0, Loss: 10.28282356262207
Epoch 7, Batch 0, Loss: 10.018815040588379
Epoch 8, Batch 0, Loss: 9.698076248168945
Epoch 9, Batch 0, Loss: 9.38814640045166
Epoch 10, Batch 0, Loss: 8.769542694091797
Epoch 11, Batch 0, Loss: 7.976982116699219
Epoch 12, Batch 0, Loss: 7.476379871368408
Epoch 13, Batch 0, Loss: 6.541189193725586
Epoch 14, Batch 0, Loss: 6.284066200256348
Epoch 15, Batch 0, Loss: 5.706801414489746
Epoch 16, Batch 0, Loss: 5.5318708419799805
Epoch 17, Batch 0, Loss: 5.230037212371826
Epoch 18, Batch 0, Loss: 5.061039924621582
Epoch 19, Batch 0, Loss: 4.769473075866699
Epoch 20, Batch 0, Loss: 4.663898944854736
Epoch 21, Batch 0, Loss: 4.680403232574463
Epoch 22, Batch 0, Loss: 4.576603412628174
Epoch 23, Batch 0, Loss: 4.5464982986450195
Epoch 24, Batch

In [None]:
# Preview only the first few encoded sequences instead of dumping every tensor
training_input_ids[:3]

In [None]:
# Persist the model and the tokenizer, each into its own directory
for artifact, target_dir in (
    (model, '../notebooks/saved_models/'),
    (tokenizer, '../notebooks/saved_tokenizer/'),
):
    artifact.save_pretrained(target_dir)
