In [1]:
import torch
import datasets
from transformers import AutoTokenizer, AutoModel

In [2]:
# Load the preprocessed DatasetDict from disk and display its splits/features.
data = datasets.load_from_disk('../data/preprocessed')
data

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 340675
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 272541
    })
})

In [3]:
def make_embeddings(batch):
    """Embed a batch of texts with the notebook-level tokenizer and model.

    Relies on the module-level names ``tokenizer``, ``model`` and ``device``
    being defined before this function is called.

    Args:
        batch: Mapping with a ``'text'`` entry holding the texts to embed
            (as supplied by ``Dataset.map(..., batched=True)``).

    Returns:
        A dict with a single key ``'embeddings'`` whose value is the model's
        ``pooler_output`` for the batch, converted to a NumPy array on the CPU.
    """
    # Tokenize with padding, truncating every text to at most 256 tokens.
    encoded = tokenizer(
        batch['text'],
        max_length=256,
        truncation=True,
        padding=True,
        return_tensors="pt",
    )

    # Move each tokenizer output tensor to the device the model lives on.
    model_inputs = {}
    for name, tensor in encoded.items():
        model_inputs[name] = tensor.to(device)

    # Inference only: no gradients are needed for feature extraction.
    with torch.no_grad():
        pooled = model(**model_inputs).pooler_output

    return {'embeddings': pooled.detach().cpu().numpy()}

# Use the GPU when one is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Tokenizer and encoder come from the same checkpoint; the encoder is moved
# to the selected device so it matches the inputs built in make_embeddings.
tokenizer = AutoTokenizer.from_pretrained("prajjwal1/bert-tiny")
model = AutoModel.from_pretrained("prajjwal1/bert-tiny")
model.to(device)

# Add an 'embeddings' column to every split, processing 128 texts per call.
data = data.map(
    make_embeddings,
    batched=True,
    batch_size=128,
    keep_in_memory=True,
)

Map:   0%|          | 0/340675 [00:00<?, ? examples/s]

Map:   0%|          | 0/272541 [00:00<?, ? examples/s]

In [6]:
# Check the embedding width by reading only the first test row. Indexing the
# row before the column avoids loading the entire 'embeddings' column just to
# inspect a single vector.
len(data['test'][0]['embeddings'])

128

In [7]:
# Display the DatasetDict again to confirm both splits now carry 'embeddings'.
data

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'embeddings'],
        num_rows: 340675
    })
    test: Dataset({
        features: ['text', 'labels', 'embeddings'],
        num_rows: 272541
    })
})

In [8]:
# Persist only the embeddings and labels; the raw 'text' column is dropped
# from the saved copy.
embeddings_and_labels = data.select_columns(['embeddings', 'labels'])
embeddings_and_labels.save_to_disk('../data/preprocessed_transformer_embeddings')

Saving the dataset (0/1 shards):   0%|          | 0/340675 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/272541 [00:00<?, ? examples/s]