https://gist.github.com/yuchenlin/a2f42d3c4378ed7b83de65c7a2222eb2

In [1]:

import torch
from transformers import BertTokenizer, BertModel, BertForMaskedLM
import logging
import numpy as np
import pandas as pd
import re
logging.basicConfig(level=logging.INFO)# OPTIONAL

In [2]:
# Load the pretrained 'bert-base-uncased' tokenizer and masked-language-model.
# Both names are module-level globals read by predict_masked_sent.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
# Put the model in evaluation mode. As the last expression of the cell, the
# returned module is what gets echoed in the cell output.
model.eval()
# model.to('cuda')  # if you have gpu

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

In [3]:
def predict_masked_sent(text, top_k):
    """Return the ``top_k`` highest-scoring tokens for the first ``[MASK]`` in ``text``.

    The sentence is wrapped in ``[CLS]`` / ``[SEP]``, tokenized with the
    module-level ``tokenizer`` and scored with the module-level ``model``.

    Args:
        text: Sentence containing a literal ``[MASK]`` placeholder.
        top_k: Number of candidate tokens to return.

    Returns:
        A list of ``top_k`` token strings, highest probability first.

    Raises:
        ValueError: If the tokenized sentence contains no ``[MASK]`` token.
    """
    # Tokenize input
    tokenized_text = tokenizer.tokenize("[CLS] %s [SEP]" % text)
    if "[MASK]" not in tokenized_text:
        # Same exception type list.index() would raise, with a usable message.
        raise ValueError("text must contain a [MASK] token: %r" % (text,))
    masked_index = tokenized_text.index("[MASK]")
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    tokens_tensor = torch.tensor([indexed_tokens])
    # tokens_tensor = tokens_tensor.to('cuda')    # if you have gpu

    # Predict all tokens; no gradients are needed for inference.
    with torch.no_grad():
        outputs = model(tokens_tensor)
        predictions = outputs[0]

    # Scores for the single sentence in the batch at the masked position.
    probs = torch.nn.functional.softmax(predictions[0, masked_index], dim=-1)
    _, top_k_indices = torch.topk(probs, top_k, sorted=True)

    # Convert all ids in one call, as plain ints rather than 0-d tensors.
    return tokenizer.convert_ids_to_tokens(top_k_indices.tolist())

In [5]:
# Example query: ask for the 10 top-ranked fillers of the masked position.
adjectiveList = predict_masked_sent("female refugees are [MASK].", 10)
# Bare expression so the notebook displays the resulting list.
adjectiveList

['welcome',
 'welcomed',
 'allowed',
 'excluded',
 'included',
 'uncommon',
 'admitted',
 'accepted',
 'encouraged',
 'rare']

In [5]:
def cleanList(list):
    """Return the given text with every single-quote character removed.

    Despite the parameter name, the argument must be a string (for example
    the textual form of a list); square brackets are left untouched.

    Args:
        list: String to clean. The name shadows the builtin but is kept so
            existing keyword callers continue to work.

    Returns:
        A copy of the input string without any ``'`` characters.
    """
    return re.sub('\'', '', list)

In [6]:
def createTemplate(fileTemplate, newTemplate, personList, numAtt):
    """Predict attribute words for every template sentence and save them as CSV.

    For each row of ``fileTemplate`` the ``<person>`` placeholder is replaced
    by each entry of ``personList`` and ``<attribute>`` by ``[MASK]``; the
    predictions returned by ``predict_masked_sent`` for all persons are merged
    into one duplicate-free list in first-seen order.

    Args:
        fileTemplate: DataFrame with a ``template`` column of sentences.
        newTemplate: Destination path of the ``;``-separated CSV file.
        personList: Strings substituted for ``<person>``.
        numAtt: Number of predictions requested per filled-in sentence.

    Returns:
        The DataFrame that was written, with columns ``template``,
        ``target_place`` and ``attributes`` (comma-joined predictions).
    """
    mask = "[MASK]"
    dataList = []
    for _, row in fileTemplate.iterrows():
        sentence = row.loc['template']
        # 1 when "<person>" is found after "<attribute>" in the template, else 0.
        target_place = 1 if sentence.find("<person>") > sentence.find("<attribute>") else 0

        adjectiveList = []
        seen = set()
        for pers in personList:
            # Plain str.replace: the person string is literal text, not a regex
            # replacement template, so backslashes in it are kept verbatim.
            _sentence = sentence.replace("<person>", pers).replace("<attribute>", mask)
            # Keep first-seen order so the merged list is deterministic.
            for adjective in predict_masked_sent(_sentence, numAtt):
                if adjective not in seen:
                    seen.add(adjective)
                    adjectiveList.append(adjective)

        # Joined after the person loop so an empty personList yields "" instead
        # of an unbound (or stale) value.
        dataList.append([sentence, target_place, ','.join(adjectiveList)])

    # Build and write the table once, after all rows have been processed.
    tweet_df = pd.DataFrame(dataList, columns=["template", "target_place", "attributes"])
    tweet_df.to_csv(newTemplate, sep=";")
    return tweet_df

In [7]:
# Generate the predicted-attribute table for the refugee templates.
templates = pd.read_csv("templates/template_refugees.csv", sep=";")
topic = "refugees"
personList = ["male", "female"]
numAtt = 20  # predictions requested per filled-in sentence
# Output path is derived from the topic name.
newTemplate = f"word_prediction/{topic}.csv"
temp = createTemplate(templates, newTemplate, personList, numAtt)

['unknown', 'excluded', 'present', 'tolerated', 'common', 'accepted', 'welcome', 'rare', 'prohibited', 'welcomed', 'eligible', 'available', 'included', 'illegal', 'permitted', 'admitted', 'free', 'allowed', 'encouraged', 'uncommon']
['unknown', 'excluded', 'present', 'tolerated', 'common', 'accepted', 'welcome', 'rare', 'prohibited', 'welcomed', 'eligible', 'available', 'included', 'illegal', 'permitted', 'admitted', 'free', 'allowed', 'encouraged', 'uncommon', 'forbidden', 'expected']
['reduced', 'increased', 'mixed', 'large', 'moderate', 'highest', 'better', 'comparable', 'low', 'lower', 'good', 'high', 'equal', 'greater', 'different', 'improved', 'similar', 'poor', 'higher', 'strong']
['reduced', 'increased', 'mixed', 'large', 'moderate', 'highest', 'better', 'comparable', 'low', 'lower', 'good', 'high', 'equal', 'greater', 'different', 'improved', 'similar', 'poor', 'higher', 'strong', 'significant', 'elevated']
['bound', 'excluded', 'listed', 'guaranteed', 'accepted', 'affected', 