In [1]:
from datasets import DatasetDict

# Load the previously saved DatasetDict from the current working directory (".").
# NOTE(review): the relative path only resolves if the notebook is started from the
# directory the dataset was saved to — confirm, or point this at an explicit data dir.
data = DatasetDict.load_from_disk(".")

  from .autonotebook import tqdm as notebook_tqdm


# Model selection

In [2]:
from transformers import AutoModel, AutoTokenizer
import torch

# Checkpoint identifier handed to both AutoTokenizer.from_pretrained and
# AutoModel.from_pretrained later in the notebook, so tokenizer and model stay matched.
model_ckpt = "distilbert-base-uncased"

## Tokenizing

In [3]:
# Tokenizer that belongs to the checkpoint selected in model_ckpt
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)


def tokenize(batch):
    """Tokenize the "text" column of a batch with the notebook-level tokenizer.

    padding=True pads every sequence to the longest one in the batch, and
    truncation=True cuts sequences that exceed the maximum length the model
    accepts, so all rows of one batch come back with the same length.
    """
    texts = batch["text"]
    return tokenizer(texts, padding=True, truncation=True)

In [4]:
# Run tokenize over every split of the DatasetDict. With batched=True and
# batch_size=None each split is handed to tokenize as one single batch, so the
# padding inside tokenize is computed over the whole split.
# data_encoded is again a dict of splits, mirroring the splits of data.
data_encoded = data.map(
    tokenize,
    batched=True,
    batch_size=None,
)

                                                                 

## Creating a feature extractor

In [5]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Load the pretrained model for the chosen checkpoint, then move its weights to the device
model = AutoModel.from_pretrained(model_ckpt)
model = model.to(device)

In [6]:
def extract_hidden_states(batch):
    """Compute the first-token ([CLS]) hidden state for every example in a batch.

    Only the entries of ``batch`` whose key appears in
    ``tokenizer.model_input_names`` are forwarded to the model; each of them is
    moved to the selected device first, so the values must support ``.to(device)``.

    Returns a dict with one key, "hidden_state", holding a NumPy array that
    contains the last hidden state at sequence position 0 for each example.
    """
    # Keep only what the model accepts, and move it to the selected device
    inputs = {}
    for name, tensor in batch.items():
        if name in tokenizer.model_input_names:
            inputs[name] = tensor.to(device)

    # Pure inference: no gradients are needed
    with torch.no_grad():
        outputs = model(**inputs)
        last_hidden_state = outputs.last_hidden_state

    # Position 0 along the sequence axis is the [CLS] token; bring it back to the CPU as NumPy
    cls_vectors = last_hidden_state[:, 0]
    return {"hidden_state": cls_vectors.cpu().numpy()}

In [8]:
# Switch the listed columns to torch tensors: extract_hidden_states calls
# .to(device) on its inputs, which requires tensors rather than Python lists.
data_encoded.set_format(
    "torch",
    columns=["input_ids", "attention_mask", "label"],
)