In [1]:
from pathlib import Path
import json
import sys
import torch
import torch.nn as nn
import evaluate
import transformers
import numpy as np
import pandas as pd
import pickle
import sys

sys.path.append("../")
import encoders

from tqdm.notebook import tqdm
from datasets import Dataset
from transformers import (
    AutoConfig,
    DataCollatorWithPadding,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    AutoTokenizer,
)

In [2]:
username = ""  # your username

# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only point these paths at dataset files you created or otherwise trust.

# ---- 9-class dataset -------------------------------------------------------
with open(f"/home/{username}/ttmp/PBSCR/baselines/9_class_dataset.pkl", "rb") as f:
    splits9 = pickle.load(f)

# The pickle must contain exactly nine entries, in this order.
(
    x_train9, y_train9,
    x_valid9, y_valid9,
    x_test9, y_test9,
    m_train9, m_valid9, m_test9,
) = splits9

# Distinct training labels, collected before the labels are stacked into arrays.
composers9 = np.unique(y_train9)
# Only the train/test labels are stacked; the validation labels stay as loaded.
y_train9 = np.stack(y_train9)
y_test9 = np.stack(y_test9)

# ---- 100-class dataset -----------------------------------------------------
with open(f"/home/{username}/ttmp/PBSCR/baselines/100_class_dataset.pkl", "rb") as f:
    splits100 = pickle.load(f)

# Same nine-entry layout as the 9-class pickle.
(
    x_train100, y_train100,
    x_valid100, y_valid100,
    x_test100, y_test100,
    m_train100, m_valid100, m_test100,
) = splits100

composers100 = np.unique(y_train100)
y_train100 = np.stack(y_train100)
y_test100 = np.stack(y_test100)

In [3]:
def LM_extract(x, model, tokenizer, device, max_length=64, batch_size=32):
    """Extract one feature vector per example from a language model.

    Each example is encoded with ``encoders.dense_encoder`` (``block_size=[1, 8]``),
    tokenized, truncated to its first ``max_length`` token ids, and run through
    ``model`` in batches. The hidden state at the last retained position of
    ``last_hidden_state`` is used as the example's feature vector.

    Args:
        x: Iterable of examples accepted by ``encoders.dense_encoder``.
        model: Callable taking a batch of token ids and returning an output
            that can be indexed with ``"last_hidden_state"``.
        tokenizer: Callable that maps a list of encoded examples to a mapping
            containing ``"input_ids"``.
        device: Torch device the batches are moved to before the forward pass.
        max_length: Number of leading token ids kept per example.
        batch_size: Number of examples per forward pass.

    Returns:
        A NumPy array with one row per example.

    Note:
        No padding is applied, so every example must tokenize to at least
        ``max_length`` ids; ragged rows make the tensor construction fail.
    """
    encoded = [encoders.dense_encoder(example, block_size=[1, 8]) for example in x]

    # Build the ids directly as int64 instead of round-tripping through the
    # legacy float ``torch.Tensor`` constructor.
    input_ids = torch.tensor(
        [ids[:max_length] for ids in tokenizer(encoded)["input_ids"]],
        dtype=torch.long,
    )

    # Only the current batch lives on the device; results are moved back to the
    # CPU immediately so device memory use does not grow with the dataset size.
    extracted = []
    with torch.no_grad():
        for start in tqdm(range(0, input_ids.shape[0], batch_size)):
            batch = input_ids[start : start + batch_size].to(device)
            hidden = model(batch)["last_hidden_state"]
            extracted.append(hidden[:, -1].cpu())

    return torch.cat(extracted, dim=0).numpy()

## GPT-2 Extraction


In [None]:
# The pretrained GPT-2 directory holds both the model files and the tokenizer.
data_path = Path(f"/home/{username}/ttmp/PBSCR/gpt2/pretrained_model")
tokenizer_path = data_path / "tokenizer.json"
pretrained_output_model_path = data_path

# Load the tokenizer (from the directory containing tokenizer.json) and the config.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path.parent)
config = AutoConfig.from_pretrained(pretrained_output_model_path)
# The sequence-classification head is sized to the vocabulary here.
config.num_labels = len(tokenizer.vocab)
model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_output_model_path, config=config
)

tokenizer.pad_token = "<pad>"
tokenizer.model_max_length = config.n_positions
# Record the pad id on both config objects. Setting `model.pad_token_id` only
# creates an unused attribute on the module; the model reads its pad id from
# `model.config`, which may be a copy of `config` depending on the installed
# transformers version.
config.pad_token_id = tokenizer.pad_token_id
model.config.pad_token_id = tokenizer.pad_token_id

# Use the GPU when available (no multi-GPU parallelism).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Assigning the result avoids dumping the full model repr as cell output;
# eval() is explicit here even though from_pretrained already returns the
# model in evaluation mode.
model = model.to(device).eval()

In [5]:
# Keep every top-level child module except the final one (presumably the
# classification head) and chain the remainder into a single module.
top_level_modules = list(model.children())
new_model = nn.Sequential(*top_level_modules[:-1])

In [6]:
# Output directory for the extracted feature matrices.
data_path = Path(f"/home/{username}/ttmp/PBSCR/baselines")
fewshot_vecs = data_path / "fewshot_vecs"
fewshot_vecs.mkdir(parents=True, exist_ok=True)

# Extract GPT-2 features for the train/test splits of both datasets.
gpt2_train_vecs9 = LM_extract(x_train9, new_model, tokenizer, device)
gpt2_test_vecs9 = LM_extract(x_test9, new_model, tokenizer, device)
gpt2_train_vecs100 = LM_extract(x_train100, new_model, tokenizer, device)
gpt2_test_vecs100 = LM_extract(x_test100, new_model, tokenizer, device)

# Save into the directory created above. The previous cwd-relative
# "fewshot_vecs/..." paths only resolved to it when the notebook happened to be
# run from the baselines directory.
np.save(fewshot_vecs / "gpt2_train9.npy", gpt2_train_vecs9)
np.save(fewshot_vecs / "gpt2_test9.npy", gpt2_test_vecs9)
np.save(fewshot_vecs / "gpt2_train100.npy", gpt2_train_vecs100)
np.save(fewshot_vecs / "gpt2_test100.npy", gpt2_test_vecs100)

  0%|          | 0/875 [00:00<?, ?it/s]

  0%|          | 0/188 [00:00<?, ?it/s]

  0%|          | 0/2188 [00:00<?, ?it/s]

  0%|          | 0/469 [00:00<?, ?it/s]

## RoBERTa Extraction


In [None]:
# The pretrained RoBERTa directory holds both the model files and the tokenizer.
data_path = Path(f"/home/{username}/ttmp/PBSCR/roberta/pretrained_model")
tokenizer_path = data_path / "tokenizer.json"
pretrained_output_model_path = data_path

# Load the tokenizer (from the directory containing tokenizer.json) and the config.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path.parent)
config = AutoConfig.from_pretrained(pretrained_output_model_path)
# The sequence-classification head is sized to the vocabulary here.
config.num_labels = len(tokenizer.vocab)
model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_output_model_path, config=config
)

tokenizer.pad_token = "<pad>"
# Unlike the GPT-2 cell, tokenizer.model_max_length is left at its loaded value
# (the GPT-2 cell derives it from config.n_positions).
# Record the pad id on both config objects. Setting `model.pad_token_id` only
# creates an unused attribute on the module; the model reads its pad id from
# `model.config`, which may be a copy of `config` depending on the installed
# transformers version.
config.pad_token_id = tokenizer.pad_token_id
model.config.pad_token_id = tokenizer.pad_token_id

# Use the GPU when available (no multi-GPU parallelism).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Assigning the result avoids dumping the full model repr as cell output;
# eval() is explicit here even though from_pretrained already returns the
# model in evaluation mode.
model = model.to(device).eval()

In [8]:
# Keep every top-level child module except the final one (presumably the
# classification head) and chain the remainder into a single module.
top_level_modules = list(model.children())
new_model = nn.Sequential(*top_level_modules[:-1])

In [9]:
# Extract RoBERTa features for the train/test splits of both datasets.
roberta_train_vecs9 = LM_extract(x_train9, new_model, tokenizer, device)
roberta_test_vecs9 = LM_extract(x_test9, new_model, tokenizer, device)
roberta_train_vecs100 = LM_extract(x_train100, new_model, tokenizer, device)
roberta_test_vecs100 = LM_extract(x_test100, new_model, tokenizer, device)

# Save into the `fewshot_vecs` directory created in the GPT-2 extraction cell
# rather than a cwd-relative "fewshot_vecs/" folder, which only exists when the
# notebook is run from the baselines directory.
np.save(fewshot_vecs / "roberta_train9.npy", roberta_train_vecs9)
np.save(fewshot_vecs / "roberta_test9.npy", roberta_test_vecs9)
np.save(fewshot_vecs / "roberta_train100.npy", roberta_train_vecs100)
np.save(fewshot_vecs / "roberta_test100.npy", roberta_test_vecs100)

  0%|          | 0/875 [00:00<?, ?it/s]

  0%|          | 0/188 [00:00<?, ?it/s]

  0%|          | 0/2188 [00:00<?, ?it/s]

  0%|          | 0/469 [00:00<?, ?it/s]

In [10]:
# Sanity check: one row per training example, one column per hidden-state dimension.
print(roberta_train_vecs100.shape)

(70000, 768)


In [11]:
# Random-feature baseline: Gaussian vectors with the same width as the LM
# features (768 columns, as printed for the RoBERTa training matrix).
d = 768

# Seeded generator so the random baseline is identical across notebook runs.
rng = np.random.default_rng(0)

# Save into the `fewshot_vecs` directory created in the GPT-2 extraction cell
# rather than a cwd-relative "fewshot_vecs/" folder.
np.save(
    fewshot_vecs / "random_train9.npy",
    rng.normal(size=(len(x_train9), d)).astype(np.float32),
)
np.save(
    fewshot_vecs / "random_test9.npy",
    rng.normal(size=(len(x_test9), d)).astype(np.float32),
)

np.save(
    fewshot_vecs / "random_train100.npy",
    rng.normal(size=(len(x_train100), d)).astype(np.float32),
)
np.save(
    fewshot_vecs / "random_test100.npy",
    rng.normal(size=(len(x_test100), d)).astype(np.float32),
)