In [1]:
import pandas as pd
import numpy as np
import torch

from transformers import BertTokenizer, BertForMaskedLM, pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# device = torch.device("mps")
device = torch.device("cpu")
# device = "cuda"

In [3]:
model_name = "bert-base-uncased"

bert_tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForMaskedLM.from_pretrained(model_name)
model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_a

We know that our Bert is pretrained. Let's mask toxic words ourselves using a dictionary, and give the masked inputs to the model!

In [4]:
def load_toxic_words(path: str) -> set:
    file = open(path)

    toxic_words = set(file.read().strip().split("\n"))

    file.close()

    return toxic_words

def mask_toxic_words(sentence: str, toxic_set: set):
    mask_token = "[MASK]"

    split = sentence.strip().split()
    masked = []
    for word in split:
        if word not in toxic_set:
            masked.append(word)
        else:
            masked.append(mask_token)
    
    return " ".join(masked)

In [5]:
toxic_words_path = "../data/raw/toxic_words.txt"

toxic_words_set = load_toxic_words(toxic_words_path)

In [6]:
test_sentence = "hello, you fuck ! Can't you just read that fucking sign?"

generator = pipeline("fill-mask", model=model, tokenizer=bert_tokenizer)

input_text = mask_toxic_words(test_sentence, toxic_words_set)
generated_text = generator(input_text)


In [7]:
generated_text

[[{'score': 0.19414223730564117,
   'token': 10041,
   'token_str': 'idiot',
   'sequence': "[CLS] hello, you idiot! can't you just read that [MASK] sign? [SEP]"},
  {'score': 0.10547658056020737,
   'token': 2048,
   'token_str': 'two',
   'sequence': "[CLS] hello, you two! can't you just read that [MASK] sign? [SEP]"},
  {'score': 0.03941063582897186,
   'token': 4364,
   'token_str': 'guys',
   'sequence': "[CLS] hello, you guys! can't you just read that [MASK] sign? [SEP]"},
  {'score': 0.0375501811504364,
   'token': 7966,
   'token_str': 'fool',
   'sequence': "[CLS] hello, you fool! can't you just read that [MASK] sign? [SEP]"},
  {'score': 0.03160572424530983,
   'token': 7743,
   'token_str': 'bitch',
   'sequence': "[CLS] hello, you bitch! can't you just read that [MASK] sign? [SEP]"}],
 [{'score': 0.14270180463790894,
   'token': 4365,
   'token_str': 'damn',
   'sequence': "[CLS] hello, you [MASK]! can't you just read that damn sign? [SEP]"},
  {'score': 0.13051149249076843