We plan to use MLPs here to replace the positional encoding functions. Let's start with the simplest case: a single linear layer.

In [92]:
import torch
import torch.nn as nn
from transformers import GPT2Model, GPT2Tokenizer

# Define model path
# Use the Hugging Face model id rather than an absolute, machine-specific cache
# path: `from_pretrained` resolves it through the local hub cache (downloading on
# first use), so the notebook runs on any machine. A path to a local snapshot
# directory is also accepted here if you need to pin an exact copy.
MODEL_PATH = "gpt2-medium"

# Load GPT2 model and tokenizer
gpt2_model = GPT2Model.from_pretrained(MODEL_PATH)
tokenizer = GPT2Tokenizer.from_pretrained(MODEL_PATH)

# Extract positional embeddings from GPT-2 model
# Take a detached copy so that later edits to `gpt2_model.wpe` cannot change
# this reference table.
gpt2_positional_embeddings = gpt2_model.wpe.weight.detach().clone()

# Define a positional encoder using a linear layer
class PositionalEncoder(nn.Module):
    """Add learned positional embeddings to token embeddings.

    The module is an identity-initialised linear layer followed by the addition
    of a trainable per-position table, so at initialisation
    ``forward(x) == x + table[:seq_length]``.

    Args:
        embedding_dimension: Size of the last (feature) axis of the inputs.
        max_positions: Longest sequence the encoder accepts; only the first
            ``max_positions`` rows of the table are kept.
        positional_embeddings: Optional ``(num_positions, embedding_dimension)``
            tensor used to initialise the table. When omitted, the notebook-level
            ``gpt2_positional_embeddings`` is used.

    Raises:
        ValueError: If the table is not 2-D, its width differs from
            ``embedding_dimension``, or it has fewer than ``max_positions`` rows.
    """

    def __init__(self, embedding_dimension, max_positions=1024, positional_embeddings=None):
        super().__init__()
        if positional_embeddings is None:
            # Backward-compatible default: the table extracted from GPT-2 earlier on.
            positional_embeddings = gpt2_positional_embeddings

        # Copy so that training this module never writes into the source tensor.
        table = positional_embeddings[:max_positions].detach().clone()
        if table.dim() != 2 or table.size(1) != embedding_dimension:
            raise ValueError(
                f"positional embeddings must have shape (num_positions, {embedding_dimension}), "
                f"got {tuple(positional_embeddings.shape)}"
            )
        if table.size(0) < max_positions:
            raise ValueError(
                f"max_positions={max_positions} exceeds the {table.size(0)} available positions"
            )

        self.linear = nn.Linear(embedding_dimension, embedding_dimension, bias=True)
        with torch.no_grad():
            # Identity weight and zero bias: the layer starts out as a no-op and
            # keeps the (out_features,) bias shape that nn.Linear expects.
            self.linear.weight.copy_(torch.eye(embedding_dimension))
            self.linear.bias.zero_()
        # The per-position offsets live in their own parameter instead of being
        # stored as a 2-D tensor in the linear layer's bias.
        self.position_table = nn.Parameter(table)
        self.max_positions = max_positions

    def forward(self, embeddings: torch.Tensor) -> torch.Tensor:
        """Return ``embeddings`` with positional information added.

        Args:
            embeddings: Tensor of shape ``(seq_length, embedding_dimension)`` or
                ``(..., seq_length, embedding_dimension)``.

        Returns:
            Tensor with the same shape as ``embeddings``.

        Raises:
            ValueError: If the input has fewer than two axes or the sequence is
                longer than ``max_positions``.
        """
        if embeddings.dim() < 2:
            raise ValueError(
                "embeddings must have shape (..., seq_length, embedding_dimension), "
                f"got {tuple(embeddings.shape)}"
            )
        seq_length = embeddings.size(-2)
        if seq_length > self.max_positions:
            raise ValueError(
                f"sequence length {seq_length} exceeds max_positions={self.max_positions}"
            )
        # Only the rows that are actually used are touched; no padding to
        # max_positions is needed. The table broadcasts over leading batch axes.
        return self.linear(embeddings) + self.position_table[:seq_length]

# Prepare input data
input_text = "Hello world!"
tokenized_inputs = tokenizer(input_text, return_tensors="pt")
input_ids = tokenized_inputs["input_ids"]

# Get token embeddings from GPT-2 model
# squeeze(0) removes only the leading batch axis; a bare squeeze() would also
# drop the sequence axis for a one-token input and break the encoder below.
token_embeddings = gpt2_model.wte(input_ids).squeeze(0)

# Compute combined token and positional embeddings directly from GPT-2
position_ids = torch.arange(0, input_ids.size(1), dtype=torch.long, device=input_ids.device).unsqueeze(0).expand_as(input_ids)
direct_gpt2_positional_embeddings = gpt2_model.wpe(position_ids).squeeze(0)
combined_embeddings_direct = token_embeddings + direct_gpt2_positional_embeddings

print("Direct combined token and positional embeddings from GPT-2:")
print(combined_embeddings_direct)

# Using custom positional encoder
embedding_dimension = gpt2_positional_embeddings.size(1)
custom_encoder = PositionalEncoder(embedding_dimension)
custom_encoded_positions = custom_encoder(token_embeddings)

# Check the whole tensor, not just the few values visible in the truncated printout.
torch.testing.assert_close(custom_encoded_positions, combined_embeddings_direct)

print("\nPositional Encoding from custom PositionalEncoder (added to token embeddings):")
print(custom_encoded_positions)

Direct combined token and positional embeddings from GPT-2:
tensor([[-0.1352, -0.1314, -0.0387,  ..., -0.0573, -0.0422,  0.0286],
        [ 0.0692,  0.0637,  0.0399,  ...,  0.1743, -0.0916, -0.0058],
        [-0.0133, -0.0195,  0.0106,  ..., -0.0330, -0.1536,  0.0116]],
       grad_fn=<AddBackward0>)

Positional Encoding from custom PositionalEncoder (added to token embeddings):
tensor([[-0.1352, -0.1314, -0.0387,  ..., -0.0573, -0.0422,  0.0286],
        [ 0.0692,  0.0637,  0.0399,  ...,  0.1743, -0.0916, -0.0058],
        [-0.0133, -0.0195,  0.0106,  ..., -0.0330, -0.1536,  0.0116]],
       grad_fn=<SliceBackward0>)


In [93]:
# Blank out GPT-2's own positional table so that positions are not added a
# second time on top of the already position-encoded inputs. The weight's
# `.data` is rebound to a fresh zero tensor (same shape, dtype and device).
gpt2_model.wpe.weight.data = gpt2_model.wpe.weight.data.new_zeros(gpt2_model.wpe.weight.data.shape)

# Feed the pre-combined embeddings (with a batch axis restored) instead of token ids
outputs = gpt2_model(inputs_embeds=torch.unsqueeze(custom_encoded_positions, 0))
final_output = outputs.last_hidden_state.squeeze()
print("\nGPT-2 output with custom positional embeddings:", final_output, sep="\n")


GPT-2 output with custom positional embeddings:
tensor([[ 0.6711,  0.6076, -0.1074,  ..., -0.5113, -0.3370,  0.2346],
        [ 0.7570,  0.0672,  0.4193,  ...,  0.0518, -0.2396,  0.2325],
        [ 0.2106, -0.2740,  0.0330,  ..., -0.0427,  0.0995, -0.0989]],
       grad_fn=<SqueezeBackward0>)


In [94]:
# Load GPT2 model and tokenizer
# A fresh copy is loaded because the previous cell zeroed the positional
# embeddings of the model it used.
gpt2_model = GPT2Model.from_pretrained(MODEL_PATH)
tokenizer = GPT2Tokenizer.from_pretrained(MODEL_PATH)

# Prepare input data
input_text = "Hello world!"
tokenized_inputs = tokenizer(input_text, return_tensors="pt")
input_ids = tokenized_inputs["input_ids"]

# Get output from original GPT-2 model
outputs = gpt2_model(input_ids)
original_gpt2_output = outputs.last_hidden_state.squeeze()

print("Original GPT-2 output:")
print(original_gpt2_output)

# Compare against `final_output` from the previous cell over every element;
# the printed tensors are truncated, so eyeballing them is not a real check.
torch.testing.assert_close(final_output, original_gpt2_output, rtol=1e-4, atol=1e-5)


Original GPT-2 output:
tensor([[ 0.6711,  0.6076, -0.1074,  ..., -0.5113, -0.3370,  0.2346],
        [ 0.7570,  0.0672,  0.4193,  ...,  0.0518, -0.2396,  0.2325],
        [ 0.2106, -0.2740,  0.0330,  ..., -0.0427,  0.0995, -0.0989]],
       grad_fn=<SqueezeBackward0>)


How about the softmax function?