In [1]:
from transformers import GPT2Model
from peft import LoraConfig, get_peft_model
import torch.nn as nn
import torch

# Load the pretrained "gpt2" checkpoint as a GPT2Model; the module tree
# printed in the next cell shows the transformer stack with no LM head on top.
model = GPT2Model.from_pretrained("gpt2")

In [2]:
# Bare last-line expression: displays the module tree of the loaded model.
model

GPT2Model(
  (wte): Embedding(50257, 768)
  (wpe): Embedding(1024, 768)
  (drop): Dropout(p=0.1, inplace=False)
  (h): ModuleList(
    (0-11): 12 x GPT2Block(
      (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (attn): GPT2Attention(
        (c_attn): Conv1D()
        (c_proj): Conv1D()
        (attn_dropout): Dropout(p=0.1, inplace=False)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
      (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (mlp): GPT2MLP(
        (c_fc): Conv1D()
        (c_proj): Conv1D()
        (act): NewGELUActivation()
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)

In [3]:
# Sum the element counts of every parameter that still requires gradients.
# Despite the name, this is the *trainable* parameter count, not the count of
# all parameters (frozen ones are filtered out).
total_params = sum(
    weight.numel()
    for weight in model.parameters()
    if weight.requires_grad
)
total_params

124439808

In [None]:
class TiedTextHead(nn.Module):
    """Shared-MLP head that emits two vocab-sized blocks of logits.

    The two inputs are concatenated along the last axis, passed through a
    two-layer ReLU MLP, and projected to ``2 * vocab_size`` logits. Per the
    author's notes, the first half of the logits is for next-token prediction
    (x_{t+1}) and the second half is for previous-token prediction
    (x_{t+k-1}).

    Args:
        input_dim: Width of the concatenated input, i.e. the last dimension
            of ``f`` plus the last dimension of ``b``.
        hidden_size: Width of both hidden layers of the shared MLP.
        vocab_size: Vocabulary size; the output carries twice this many logits.
        tied_weights: Accepted but currently unused -- no weight tying is
            performed by this class.
    """

    def __init__(self, input_dim, hidden_size, vocab_size, tied_weights=None):
        super().__init__()
        # Layers are instantiated in the order they are applied.
        mlp_layers = [
            nn.Linear(input_dim, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
        ]
        self.shared_mlp = nn.Sequential(*mlp_layers)

        # One projection covers both prediction targets, hence 2 * vocab_size.
        n_logits = 2 * vocab_size
        self.output_layer = nn.Linear(hidden_size, n_logits)

    def forward(self, f, b):
        """Concatenate ``f`` and ``b`` on the last axis and return the logits."""
        fused = torch.cat((f, b), dim=-1)
        return self.output_layer(self.shared_mlp(fused))

# One LoRA recipe, reused for both adapters below.
# `target_modules` is intentionally left unset; presumably PEFT then falls
# back to its per-architecture defaults -- verify which layers get wrapped.
# NOTE(review): the GPT-2 module tree printed above exposes `c_attn`/`c_proj`
# in attention, not `q_proj`/`v_proj`, so that earlier target list would not
# match this model.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# Register two independently named adapters on the same base model:
# first the forward encoder, then the backward encoder.
for encoder_name in ("forward_encoder", "backward_encoder"):
    model.add_adapter(lora_config, adapter_name=encoder_name)

# Add the text head for next- and previous-token predictions.
# `input_dim` is twice the model width because the head concatenates its two
# inputs along the last axis before the shared MLP.
text_head = TiedTextHead(
    input_dim=model.config.hidden_size * 2,
    hidden_size=512,  # TODO: allow this to be configurable
    # Read the vocabulary size from the loaded config rather than hard-coding
    # 50257, so the head stays in sync with whichever checkpoint is loaded
    # (same approach as `hidden_size` above).
    vocab_size=model.config.vocab_size,
    # Weight tying with the input embeddings is not wired up: TiedTextHead
    # accepts `tied_weights` but does not use it.
)




In [5]:
# Re-count the parameters of `model` that still require gradients, now that
# the adapters have been attached. Only `model` is inspected, so the separate
# `text_head` module's parameters are not part of this number.
# NOTE(review): the recorded value is far below the earlier count, so the
# base weights appear to be frozen after `add_adapter` -- confirm.
total_params = sum(
    weight.numel()
    for weight in model.parameters()
    if weight.requires_grad
)
total_params

589824