https://gist.github.com/yuchenlin/a2f42d3c4378ed7b83de65c7a2222eb2

In [79]:

import torch
from transformers import BertTokenizer, BertModel, BertForMaskedLM
import logging
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re
from afinn import Afinn
logging.basicConfig(level=logging.INFO)# OPTIONAL

In [80]:
# Module-level BERT objects; predict_masked_sent reads both of these globals.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
# Put the model in evaluation mode (disables the Dropout layers for inference).
# eval() returns the module itself, which is why this cell echoes the layer tree.
model.eval()
# model.to('cuda')  # if you have gpu

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

In [81]:
def predict_masked_sent(text, top_k):
    """Return the ``top_k`` most probable fillers for the ``[MASK]`` in ``text``.

    The sentence is wrapped in ``[CLS] ... [SEP]``, tokenized with the
    module-level ``tokenizer`` and scored with the module-level ``model``.

    Args:
        text: Sentence containing a ``[MASK]`` placeholder.
        top_k: Number of candidate tokens to return.

    Returns:
        A list of ``top_k`` token strings, ordered from most to least probable.

    Raises:
        ValueError: If the tokenized sentence contains no ``"[MASK]"`` token
            (raised by the ``.index`` lookup).
    """
    wrapped = "[CLS] %s [SEP]" % text
    pieces = tokenizer.tokenize(wrapped)
    # Only the first "[MASK]" occurrence is scored.
    mask_pos = pieces.index("[MASK]")
    input_ids = torch.tensor([tokenizer.convert_tokens_to_ids(pieces)])
    # input_ids = input_ids.to('cuda')    # if you have gpu

    # No gradients are needed for inference.
    with torch.no_grad():
        scores = model(input_ids)[0]

    # Normalise the scores at the masked position over the last dimension,
    # then keep the k largest in descending order.
    mask_probs = torch.nn.functional.softmax(scores[0, mask_pos], dim=-1)
    _, best_ids = torch.topk(mask_probs, top_k, sorted=True)

    return [
        tokenizer.convert_ids_to_tokens([token_id])[0]
        for token_id in best_ids
    ]

In [82]:
# adjectiveList = predict_masked_sent("Male asylum seekers are [MASK].", 30)
# adjectiveList

In [83]:
def cleanList(list):
    """Return ``list`` with every single-quote character removed.

    Despite the parameter name, the value is handed directly to ``re.sub``,
    so it has to be a string.

    Args:
        list: The text to clean.

    Returns:
        The same text without any ``'`` characters.
    """
    # Only single quotes are stripped; square brackets are left in place.
    return re.sub('\'', '', list)

In [84]:
def createTemplate(fileTemplate, newTemplate, personList, numAtt):
    """Collect predicted attributes for every template and write them to a CSV.

    For each row of ``fileTemplate`` the ``<person>`` placeholder is replaced
    by each entry of ``personList`` in turn and ``<attribute>`` by ``[MASK]``;
    the sentence is passed to ``predict_masked_sent`` and the predictions are
    merged into one duplicate-free list per template.

    Args:
        fileTemplate: DataFrame with a ``template`` column.
        newTemplate: Destination path for the ``;``-separated CSV.
        personList: Strings substituted for ``<person>``.
        numAtt: Number of predictions requested per sentence.

    Returns:
        The DataFrame that was written, with the columns ``template``,
        ``target_place`` and ``attributes`` (comma-joined predictions).
    """
    mask = "[MASK]"
    dataList = []
    for _, row in fileTemplate.iterrows():
        sentence = row.loc['template']
        # 1 when "<person>" is found after "<attribute>" in the template, else 0.
        target_place = 1 if sentence.find("<person>") > sentence.find("<attribute>") else 0

        adjectiveList = []
        seen = set()
        for pers in personList:
            # Plain string replacement: the placeholders are literals, and this
            # keeps backslashes in `pers` from being read as regex escapes.
            _sentence = sentence.replace("<person>", pers).replace("<attribute>", mask)
            print(f"sentence{_sentence}")
            # Append unseen predictions in ranking order so the result is
            # deterministic (a set difference would scramble the order).
            for adjective in predict_masked_sent(_sentence, numAtt):
                if adjective not in seen:
                    seen.add(adjective)
                    adjectiveList.append(adjective)
            print(f"sentence{adjectiveList}")

        # Join after the person loop so an empty personList yields an empty
        # string rather than an undefined or stale value.
        dataList.append([sentence, target_place, ','.join(adjectiveList)])

    # Build and write the table once, after all templates have been processed.
    tweet_df = pd.DataFrame(dataList, columns=["template", "target_place", "attributes"])
    tweet_df.to_csv(newTemplate, sep=";")
    return tweet_df

In [85]:
def getAdj(fileTemplate, personList, numAtt):
    """Plot, per template, the analysed predictions for two person terms.

    For each row of ``fileTemplate`` the ``<person>`` placeholder is replaced
    by each entry of ``personList`` and ``<attribute>`` by ``[MASK]``. The
    predictions for the first entry and for the last non-first entry are run
    through ``sent_analysis`` and handed to ``print_graph``.

    Args:
        fileTemplate: DataFrame with a ``template`` column.
        personList: Strings substituted for ``<person>``.
        numAtt: Number of predictions requested per sentence.
    """
    mask_token = "[MASK]"
    for _, template_row in fileTemplate.iterrows():
        template = template_row.loc['template']
        first_preds, other_preds = [], []
        for position, person in enumerate(personList):
            with_person = re.sub(r'<person>', person, template)
            masked = re.sub(r'<attribute>', mask_token, with_person)
            preds = predict_masked_sent(masked, numAtt)
            if position == 0:
                first_preds = preds
            else:
                # Every later entry overwrites the previous one, so with more
                # than two persons only the first and the last are compared.
                other_preds = preds
        print_graph(template, sent_analysis(first_preds), sent_analysis(other_preds))
        
            
           
                

In [86]:
def sent_analysis(list):
    """Count how many words fall on each AFINN score from -5 to 5.

    Args:
        list: Iterable of words; each one is scored with ``Afinn().score``.

    Returns:
        A list of 11 integers. Element ``i`` is the number of words whose
        score equals ``i - 5``, so index 0 is score -5 and index 10 is score 5.
    """
    afn = Afinn()
    scores = [afn.score(word) for word in list]
    values = [scores.count(valence) for valence in range(-5, 6)]
    print(f"values {values}")
    # Return the histogram so callers (e.g. getAdj) receive data, not None.
    return values
        
    

In [87]:
def print_graph(sentence, list1, list2, labels=("target 1", "target 2")):
    """Draw a grouped bar chart comparing two series of counts.

    Args:
        sentence: Text used as the chart title.
        list1: Counts for the first series (red bars). ``None`` is treated
            as an empty series.
        list2: Counts for the second series (green bars). ``None`` is treated
            as an empty series.
        labels: Legend labels for the two series.

    Returns:
        None. The figure is shown with ``plt.show()``. When both series are
        empty nothing is drawn and a warning is logged instead.
    """
    # Tolerate None so a caller without counts does not crash the plot.
    counts1 = [] if list1 is None else list(list1)
    counts2 = [] if list2 is None else list(list2)

    n_bins = max(len(counts1), len(counts2))
    if n_bins == 0:
        logging.warning("print_graph: nothing to plot for %r", sentence)
        return

    # Pad the shorter series with zeros so every bin has a bar in both groups.
    counts1 += [0] * (n_bins - len(counts1))
    counts2 += [0] * (n_bins - len(counts2))

    # An 11-bin series is labelled with the scores -5..5 (the bins counted by
    # sent_analysis); any other length falls back to the bin index.
    if n_bins == 11:
        tick_labels = [str(score) for score in range(-5, 6)]
    else:
        tick_labels = [str(i) for i in range(n_bins)]

    barWidth = 0.4
    positions = np.arange(n_bins)

    fig, ax = plt.subplots(figsize=(12, 8))
    # Shift each group by half a bar width so each pair is centred on its tick.
    ax.bar(positions - barWidth / 2, counts1, color='r', width=barWidth,
           edgecolor='grey', label=labels[0])
    ax.bar(positions + barWidth / 2, counts2, color='g', width=barWidth,
           edgecolor='grey', label=labels[1])

    ax.set_title(sentence)
    ax.set_xlabel('Score', fontweight='bold', fontsize=15)
    ax.set_ylabel('Number of words', fontweight='bold', fontsize=15)
    ax.set_xticks(positions)
    ax.set_xticklabels(tick_labels)

    ax.legend()
    plt.show()


In [88]:
# Driver cell: load the sentence templates and plot the predictions for each one.
# The templates file is ';'-separated; downstream code reads its 'template' column.
templates = pd.read_csv("templates/template_word_prediction.csv", sep=";")
# `topic` is only used to build the output path below.
topic = "refugees"
# Terms substituted for the <person> placeholder; getAdj compares the first against the last.
personList = ["male", "female"]
# Number of top predictions requested per masked sentence.
numAtt = 50
# Output CSV path for createTemplate; unused while that call stays commented out.
# NOTE(review): the f-prefix is redundant here, the literal has no placeholders.
newTemplate = f"word_prediction/"+ topic + ".csv"
#temp = createTemplate(templates, newTemplate, personList, numAtt)
getAdj(templates, personList, numAtt)

sent male refugees are [MASK].
adj ['welcome', 'welcomed', 'allowed', 'accepted', 'admitted', 'uncommon', 'rare', 'excluded', 'encouraged', 'prohibited', 'included', 'unknown', 'eligible', 'common', 'present', 'illegal', 'permitted', 'tolerated', 'free', 'available', 'expected', 'not', 'discouraged', 'reported', 'invited', 'forbidden', 'mixed', 'numerous', 'exempt', 'few', 'banned', 'bilingual', 'scarce', 'found', 'recruited', 'expelled', 'female', 'rejected', 'outnumbered', 'displaced', 'resident', 'male', 'persecuted', 'listed', 'barred', 'employed', 'preferred', 'deported', 'introduced', 'mostly']
values [0, 0, 1, 8, 3, 33, 2, 3, 0, 0, 0]
sent female refugees are [MASK].
adj ['welcome', 'welcomed', 'allowed', 'excluded', 'included', 'uncommon', 'admitted', 'accepted', 'encouraged', 'rare', 'prohibited', 'common', 'permitted', 'tolerated', 'illegal', 'present', 'eligible', 'expected', 'available', 'forbidden', 'reported', 'free', 'also', 'unknown', 'discouraged', 'deported', 'not', '