In [13]:
import torch
from transformers import BertTokenizer, BertModel, DistilBertTokenizer, DistilBertModel
from datasets import load_dataset
from tqdm import tqdm

In [29]:
# Load pre-trained model tokenizer (vocabulary)
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

In [28]:
# Load pre-trained model (weights)
model = DistilBertModel.from_pretrained('distilbert-base-uncased',
                                  output_hidden_states = True, # Whether the model returns all hidden-states.
                                  )

# Put the model in "evaluation" mode, meaning feed-forward operation.
model.eval()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [30]:
def get_word_embedding(text, sentence_length=32, pad_token="[PAD]"):
    # Pad or truncate the text to the specified sentence length
    encoding = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=sentence_length,
        padding="max_length",
        truncation=True,
        return_tensors="pt"
    )

    # Extract the input tensors
    input_ids = encoding["input_ids"]
    attention_mask = encoding["attention_mask"]

    # Run the text through BERT and collect all of the hidden states produced
    # from all layers
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)        
        hidden_states = outputs[1]

    # Concatenate the tensors for all layers. We use `stack` here to
    # create a new dimension in the tensor.
    token_embeddings = torch.stack(hidden_states, dim=0)

    # Remove dimension 1, the "batches".
    token_embeddings = torch.squeeze(token_embeddings, dim=1)

    # Swap dimensions 0 and 1.
    token_embeddings = token_embeddings.permute(1, 0, 2)

    # Stores the token vectors, with shape [sentence_length x 768]
    token_vecs_sum = torch.zeros([token_embeddings.size()[0], 768])

    # `token_embeddings` is a [sentence_length x 6 x 768] tensor.

    # For each token in the sentence...
    for index, token in enumerate(token_embeddings):

        # `token` is a [6 x 768] tensor

        # Sum the vectors from the last four layers.
        sum_vec = torch.sum(token[-4:], dim=0)

        # Use `sum_vec` to represent `token`.
        token_vecs_sum[index] = sum_vec

    # Remove the first and the last token
    # token_vecs_sum = token_vecs_sum[1:-1]

    return token_vecs_sum

text = "[MASK] After stealing money from the bank vault, the bank robber was seen fishing on the Mississippi river bank."
word_vector = get_word_embedding(text, sentence_length=32, pad_token="[PAD]")
word_vector.shape

odict_keys(['last_hidden_state', 'pooler_output', 'hidden_states'])


TypeError: stack(): argument 'tensors' (position 1) must be tuple of Tensors, not Tensor

In [90]:
# Load the "wnut_17" dataset
wnut = load_dataset("wnut_17")

# Batch size for parallel processing
batch_size = 32

# Calculate the number of batches
num_batches = int(np.ceil(len(wnut["train"]) / batch_size))

embedded_sentences = []
pbar = tqdm(total=len(wnut["train"]))

# Iterate over the batches
for batch_idx in range(num_batches):
    start_idx = batch_idx * batch_size
    end_idx = min((batch_idx + 1) * batch_size, len(wnut["train"]))

    # Prepare the batch of sentences
    batch_sentences = wnut["train"]["tokens"][start_idx:end_idx]
    batch_sentences = [" ".join(tokens) for tokens in batch_sentences]

    # Embed the batch of sentences
    batch_embeddings = get_word_embedding(batch_sentences, sentence_length=32, pad_token="[PAD]")

    # Append the batch embeddings to the overall list
    embedded_sentences.extend(batch_embeddings)

    pbar.update(end_idx - start_idx)

pbar.close()

print(len(embedded_sentences)) # 3424

print(len(wnut["train"])) # 3394

Found cached dataset wnut_17 (/home/malthe/.cache/huggingface/datasets/wnut_17/wnut_17/1.0.0/077c7f08b8dbc800692e8c9186cdf3606d5849ab0e7be662e6135bb10eba54f9)


  0%|          | 0/3 [00:00<?, ?it/s]

100%|██████████| 3394/3394 [00:20<00:00, 166.37it/s]


In [91]:
len(embedded_sentences)

3424

In [82]:
# Get length of wnut train
len(wnut["train"])

3394

In [111]:
# Load the "wnut_17" dataset
wnut = load_dataset("wnut_17")

# Batch size for parallel processing
batch_size = 64

# Calculate the number of batches
num_batches = int(np.ceil(len(wnut["train"]) / batch_size))

embedded_sentences = []
pbar = tqdm(total=len(wnut["train"]))

# Iterate over the batches
for batch_idx in range(num_batches):
    start_idx = batch_idx * batch_size
    end_idx = min(batch_idx * batch_size + batch_size, len(wnut["train"]))

    # Prepare the batch of sentences
    batch_sentences = wnut["train"]["tokens"][start_idx:end_idx]

    # Embed each sentence in the batch
    for tokens in batch_sentences:
        sentence = " ".join(tokens)
        word_vector = get_word_embedding(sentence, sentence_length=32, pad_token="[PAD]")
        embedded_sentences.append(word_vector)

    pbar.update(end_idx - start_idx)

# Check if there are remaining sentences
if num_batches * batch_size < len(wnut["train"]):
    remaining_sentences = wnut["train"]["tokens"][num_batches * batch_size:]

    # Embed each remaining sentence
    for tokens in remaining_sentences:
        sentence = " ".join(tokens)
        word_vector = get_word_embedding(sentence, sentence_length=32, pad_token="[PAD]")
        embedded_sentences.append(word_vector)

pbar.close()

print(len(embedded_sentences))
print(len(wnut["train"]))


Found cached dataset wnut_17 (/home/malthe/.cache/huggingface/datasets/wnut_17/wnut_17/1.0.0/077c7f08b8dbc800692e8c9186cdf3606d5849ab0e7be662e6135bb10eba54f9)


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3394 [00:10<?, ?it/s]
100%|██████████| 3394/3394 [06:11<00:00,  9.13it/s]

3394
3394





In [1]:
from tqdm import tqdm

embedded_sentences = []
pbar = tqdm(total=len(wnut["train"]))

# Iterate over the sentences
for sent in wnut["train"]:
    tokens = sent["tokens"]
    sentence = " ".join(tokens)
    word_vector = get_word_embedding(sentence, sentence_length=32, pad_token="[PAD]")
    embedded_sentences.append(word_vector)
    
    pbar.update(1)

pbar.close()


NameError: name 'wnut' is not defined

In [114]:
# Turn embedded_sentences into a tensor
embedded_sentences_tensor = torch.stack(embedded_sentences)
embedded_sentences_tensor.size()
# Save
torch.save(embedded_sentences_tensor, "models/word_embeddings/wnut17_embedded.pt")

In [6]:
# Load embedded_sentences_tensor
embedded_sentences_tensor = torch.load("models/word_embeddings/wnut17_embedded.pt")

In [7]:
embedded_sentences_tensor.shape

torch.Size([3394, 32, 768])