In [7]:
from transformers import GPT2TokenizerFast, GPT2Model, GPT2Config
import torch

In [8]:
# Build a small, randomly initialised GPT-2 to experiment with tensor shapes.
# The depth keyword accepted by GPT2Config is `n_layer` (singular). A misspelt
# `n_layers` is not an error: it is kept as an extra, unused attribute and the
# model is silently built with the default number of blocks instead.
config = GPT2Config(n_layer=2, vocab_size=1)
model = GPT2Model(config)
model

GPT2Model(
  (wte): Embedding(1, 768)
  (wpe): Embedding(1024, 768)
  (drop): Dropout(p=0.1, inplace=False)
  (h): ModuleList(
    (0): GPT2Block(
      (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (attn): GPT2Attention(
        (c_attn): Conv1D()
        (c_proj): Conv1D()
        (attn_dropout): Dropout(p=0.1, inplace=False)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
      (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (mlp): GPT2MLP(
        (c_fc): Conv1D()
        (c_proj): Conv1D()
        (act): NewGELUActivation()
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (1): GPT2Block(
      (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (attn): GPT2Attention(
        (c_attn): Conv1D()
        (c_proj): Conv1D()
        (attn_dropout): Dropout(p=0.1, inplace=False)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
      (ln_2): LayerNorm((768,), eps=1e-05, elementwise_af

In [10]:
# Random embeddings laid out as (batch, seq_len, hidden), handed to the model
# through `inputs_embeds` instead of token ids; only the output shape is shown.
inputs_embeds = torch.rand(1, 10, 768)
outputs = model(inputs_embeds=inputs_embeds)
outputs.last_hidden_state.shape

torch.Size([1, 10, 768])

In [13]:
from dataclasses import dataclass
from typing import Any, Optional, Sequence, Union, Dict
from transformers import DataCollatorForSeq2Seq, DataCollatorWithPadding

from torch.utils.data import Dataset, IterableDataset
from transformers.utils import PaddingStrategy
from datasets import load_dataset


In [14]:
# Load the "test" split of heegyu/vae_eval (requested with use_auth_token=True)
# and display its first record.
ds = load_dataset(
    path="heegyu/vae_eval",
    split="test",
    use_auth_token=True,
)
ds[0]

Using custom data configuration heegyu--vae_eval-5adb9ae751b4aeab
Reusing dataset parquet (/home/ubuntu/.cache/huggingface/datasets/heegyu___parquet/heegyu--vae_eval-5adb9ae751b4aeab/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


{'sentence': "그는 ▲인적자본 개발의 기회 불평등 ▲중상류층의 불공정한 기회 가로채기 등을 청년 불평등 원인으로 꼽았다.김 기획총장은 '중상류층의 경우 노동시장에서 큰 가치로 인정되는 능력을 개발할 수 있는 기회가 많다."}

In [None]:
from src.task.optimus_v2 import OptimusTask

# Restore the Optimus task from a saved checkpoint, then encode one sentence
# as a quick check that `encode` works.
checkpoint_path = "outputs/2022-10-09/03-28-27/checkpoint/optimus-v2-mini-ae-v1.ckpt"
task = OptimusTask.load_from_checkpoint(checkpoint_path)
task.encode("안녕하세요?")

In [21]:
class LatentGPTDataset(Dataset):
    """Map-style dataset that augments every row with an encoder latent.

    At construction time the ``column`` field of ``dataset`` is pushed through
    ``encoder.encode`` in batches, and the result is stored (as nested Python
    lists, via ``.tolist()``) in a new ``"latents"`` column.

    Args:
        dataset: Object providing ``map(fn, batched=True, batch_size=...)``,
            ``__getitem__`` and ``__len__``. In this notebook it is the object
            returned by ``load_dataset``.
        encoder: Object whose ``encode(batch_of_texts)`` returns a value that
            supports ``.tolist()``.
        column: Name of the text column to encode.
        batch_size: Number of rows handed to ``encoder.encode`` per ``map``
            call. Defaults to 32, the value that was previously hard-coded.
    """

    def __init__(
        self,
        dataset,
        encoder,
        column: str = "sentence",
        batch_size: int = 32,
    ) -> None:
        super().__init__()
        self.encoder = encoder
        self.column = column
        # Latents are computed once, up front; later lookups only read them.
        self.dataset = dataset.map(self._encode, batched=True, batch_size=batch_size)

    def _encode(self, x):
        """Add a ``"latents"`` entry to the batch ``x`` and return it."""
        s = x[self.column]
        x["latents"] = self.encoder.encode(s).tolist()
        return x

    def __len__(self) -> int:
        """Return the number of rows in the wrapped dataset.

        Without this, ``len(dataset)`` and a default ``DataLoader`` (whose
        samplers call ``len`` on the dataset) raise ``TypeError``.
        """
        return len(self.dataset)

    def __getitem__(self, index: Any) -> Dict:
        """Return the row(s) at ``index``, including the ``"latents"`` column."""
        return self.dataset[index]


# Wrap the loaded split so each row also carries the latent that `task`
# produces for its "sentence" text.
dataset = LatentGPTDataset(dataset=ds, encoder=task, column="sentence")

100%|██████████| 16/16 [00:05<00:00,  3.15ba/s]


In [27]:
# Slice out the first two rows and turn their stored latent lists into a
# single tensor; the cell displays its shape.
first_rows = dataset[0:2]
latents = torch.tensor(first_rows["latents"])
latents.shape

torch.Size([2, 512])