In [26]:
import json
import torch
from transformers import BertTokenizer
from transformers import BertForNextSentencePrediction,BertForMaskedLM
from torch.nn.functional import softmax
import re
from tqdm import tqdm
import os

In [2]:
# Gold dev file; the same file is later handed to evaluation.py via --gold-file.
devDataFile = "dev.json"


## Intersentence analysis

In [3]:
def loadDevFile(devDataFile,testType):
    """Read the dev JSON file and index one test type's records by record id.

    Args:
        devDataFile: path of the JSON file to read (opened as UTF-8).
        testType: key under ``data`` selecting which list of records to load.

    Returns:
        dict mapping each record ``id`` to a dict with ``context``,
        ``bias_type`` and ``sentences``; every sentence keeps only its
        ``sentence``, ``gold_label`` and ``id`` fields.
    """
    with open(devDataFile, encoding='utf-8') as handle:
        raw = json.load(handle)

    sentence_fields = ('sentence', 'gold_label', 'id')
    processed = {}
    for record in raw['data'][testType]:
        processed[record['id']] = {
            'context': record['context'],
            'bias_type': record['bias_type'],
            # copy only the fields used downstream, in a fixed key order
            'sentences': [
                {field: row[field] for field in sentence_fields}
                for row in record['sentences']
            ],
        }
    return processed

In [4]:
# Index the intersentence records of the dev file by record id.
devProcessed = loadDevFile(devDataFile, testType='intersentence')

In [5]:
# The tokenizer and the next-sentence-prediction model come from the same checkpoint.
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
# eval() deactivates the dropout modules, which is required for reproducible
# results during evaluation.
model = BertForNextSentencePrediction.from_pretrained(model_name).eval()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForNextSentencePrediction: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForNextSentencePrediction from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForNextSentencePrediction from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
def obtainProb(tokenizer,model,seq_A,seq_B):
    """Return the model's probability that ``seq_B`` continues ``seq_A``.

    Args:
        tokenizer: object exposing ``encode_plus(text, text_pair=..., return_tensors=...)``.
        model: callable taking the encoded inputs as keyword arguments; the
            first element of its output holds the sequence-relationship logits.
        seq_A: first sequence (the context).
        seq_B: candidate follow-up sequence.

    Returns:
        float: softmax probability at index 0 ("B is a continuation of A");
        index 1 would be "B is a random sequence".
    """
    with torch.no_grad():
        # Both sequences are encoded as ONE input by passing seq_B as the
        # text pair; token_type_ids then tell the model which tokens belong
        # to which sequence.
        pair_inputs = tokenizer.encode_plus(seq_A, text_pair=seq_B, return_tensors='pt')
        nsp_logits = model(**pair_inputs)[0]
        # logits -> probabilities over {continuation, random}
        continuation_prob = softmax(nsp_logits, dim=1)[0][0]
        return continuation_prob.item()

Get the model probability for each of the sentences

In [7]:
# Score every candidate sentence against its record's context with the NSP
# model. The probability is stored on the sentence dict itself under
# 'model_prob', so devProcessed is updated in place.
for row in tqdm(devProcessed.values()):
    for inner_row in row['sentences']:
        inner_row['model_prob'] = obtainProb(
            tokenizer, model, row['context'], inner_row['sentence']
        )

100%|██████████| 2123/2123 [03:27<00:00, 10.23it/s]


For the analysis, we select the cases where the stereotype and anti-stereotype sentences have a large difference in model probability.

In [8]:
def createSmallerRecord(inner_row):
    """Return a new dict holding only the sentence text, gold label and model probability."""
    kept_fields = ('sentence', 'gold_label', 'model_prob')
    return {field: inner_row[field] for field in kept_fields}

In [9]:
# Helper for offline analysis: returns the records whose stereotype and
# anti-stereotype sentences receive very different model probabilities.
def CreateRecordsAnalysis(devProcessed,threshold=0.75,numDigits=3):
    """Select records with a large stereotype vs. anti-stereotype score gap.

    Args:
        devProcessed: dict of records; each record has ``context``,
            ``bias_type`` and ``sentences``, and every stereotype /
            anti-stereotype sentence carries a ``model_prob``.
        threshold: a record is kept when the absolute difference between the
            two rounded scores is strictly greater than this value.
        numDigits: number of decimal digits the scores are rounded to before
            they are compared.

    Returns:
        list of dicts with ``context``, ``bias_type`` and ``sentences``, where
        ``sentences`` holds the reduced stereotype / anti-stereotype entries.
        Records lacking either of the two labels are skipped, because there is
        no pair to compare.
    """
    compared_labels = ('stereotype', 'anti-stereotype')
    interesting_records = []
    for row in devProcessed.values():
        scores = {}
        newSentences = []
        for inner_row in row['sentences']:
            gold_label = inner_row['gold_label']
            if gold_label not in compared_labels:
                continue
            scores[gold_label] = round(inner_row['model_prob'], numDigits)
            newSentences.append(createSmallerRecord(inner_row))
        # Without both labels a missing score would silently count as 0 and
        # could be mistaken for a large gap.
        if len(scores) < len(compared_labels):
            continue
        if abs(scores['stereotype'] - scores['anti-stereotype']) > threshold:
            interesting_records.append({
                'context': row['context'],
                'bias_type': row['bias_type'],
                'sentences': newSentences,
            })
    return interesting_records

In [10]:
# Writes the records to a file, one JSON object per line.
def writeData(outfile,interesting_records):
    """Serialize each record as JSON on its own line of ``outfile``.

    Args:
        outfile: destination path; opened for writing as UTF-8, so an existing
            file is overwritten.
        interesting_records: iterable of JSON-serializable records.

    Returns:
        None. A confirmation message is printed once the file is written.
    """
    with open(outfile,'w',encoding='utf-8') as sink:
        for record in interesting_records:
            sink.write(json.dumps(record) + '\n')
    print("Wrote interesting records to ",outfile)

In [11]:
# Pick the intersentence records with a score gap above 0.75 for visual
# analysis and save them.
interesting_records = CreateRecordsAnalysis(devProcessed, threshold=0.75)
print("Number of interesting records ", len(interesting_records))
writeData('analysisIntersentenceInteresting.json', interesting_records)

Number of interesting records  350
Wrote interesting records to  analysisIntersentenceInteresting.json


In [12]:
# Write the interesting intersentence records to disk, one JSON object per line.
writeData('analysisIntersentenceInteresting.json', interesting_records)

In [13]:
# The StereoSet eval code expects one entry per sentence, shaped like
# {'id': '107427644575c4712bf105f14475af0e', 'score': 0.058055270463228226}.
def CreateScoresListForPred(devProcessed):
    """Flatten the scored records into a list of ``{'id', 'score'}`` dicts.

    Args:
        devProcessed: dict of records whose ``sentences`` each carry an ``id``
            and a ``model_prob``.

    Returns:
        list with one dict per sentence, in record then sentence order; the
        total count is also printed.
    """
    SentenceScores = [
        {'id': sentence['id'], 'score': sentence['model_prob']}
        for record in devProcessed.values()
        for sentence in record['sentences']
    ]
    print("Total Number of sentences added ", len(SentenceScores))
    return SentenceScores


In [14]:
# Flatten the intersentence scores into the per-sentence list that is later
# stored under predData['intersentence'].
InterSetenceList = CreateScoresListForPred(devProcessed)

Total Number of sentences added  6369


## Intrasentence

In [15]:
# Rebinds devProcessed to the intrasentence records; the intersentence scores
# were already collected into InterSetenceList.
devProcessed = loadDevFile(devDataFile, testType='intrasentence')

In [16]:
# Load the masked-LM head from the same checkpoint as the tokenizer
# (model_name) so the vocabulary ids stay aligned; eval() disables dropout.
masked_LM_model = BertForMaskedLM.from_pretrained(model_name).eval()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForMaskedLM were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['cls.predictions.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
# Prepare raw text for the masked LM: wrap it in [CLS] ... [SEP] and turn the
# keyword BLANK into the [MASK] token before tokenizing.
def tokenizeText(text):
    """Return the tokenizer's tokens for ``text`` wrapped in [CLS]/[SEP].

    Every standalone word ``BLANK`` is replaced by ``[MASK]`` first. Uses the
    notebook-level ``tokenizer``.
    """
    wrapped = '[CLS]' + text + '[SEP]'
    with_mask = re.sub(r"\bBLANK\b", '[MASK]', wrapped)
    return tokenizer.tokenize(with_mask)

In [18]:
# Builds all the tensors the masked LM needs for one text.
def ProcessMaskedInput(text):
    """Tokenize ``text`` and build the model input tensors.

    Returns:
        tuple ``(tokens_tensor, segments_tensors, masked_index)``:
        the vocabulary ids with a leading batch dimension of one, an
        all-zero segment-id tensor of the same length, and the position of
        the first ``[MASK]`` token (0 when the text contains no ``[MASK]``).
    """
    pieces = tokenizeText(text)
    # the mask position selects which output row is read later
    masked_index = pieces.index('[MASK]') if '[MASK]' in pieces else 0
    # tokens -> vocabulary ids
    vocab_ids = tokenizer.convert_tokens_to_ids(pieces)
    # single-sentence input: every token gets segment id 0
    zero_segments = [0] * len(pieces)
    return torch.tensor([vocab_ids]), torch.tensor([zero_segments]), masked_index

In [19]:
# To read a word's predicted probability we need its id in the vocabulary.
def GetTokenIndex(word):
    """Return the vocabulary id the notebook-level ``tokenizer`` assigns to ``word``."""
    return tokenizer.convert_tokens_to_ids([word])[0]

In [20]:
# Runs the masked LM once on the BLANK context and stores, on each candidate,
# the probability of the token that candidate puts at the masked position.
def FindScoresForCand(text,cand_text_l):
    """Score candidate sentences for one masked context, in place.

    Args:
        text: context sentence containing the keyword ``BLANK``.
        cand_text_l: list of candidate dicts with a ``sentence`` key; each one
            gets a ``model_prob`` entry added.

    Returns:
        None; the candidates are updated in place.

    Note:
        Only the single token found at the masked position of the tokenized
        candidate is scored.
    """
    tokens_tensor, segments_tensors, masked_index = ProcessMaskedInput(text)
    with torch.no_grad():
        # The segment ids must go in by keyword: the second positional
        # parameter of the transformers forward() is attention_mask, so a
        # positional all-zero tensor would be read as "mask out every token".
        outputs = masked_LM_model(tokens_tensor, token_type_ids=segments_tensors)
        # Per the transformers docs, the first output element holds the
        # prediction scores over the vocabulary; take the row for the first
        # (only) batch item at the masked position.
        predictions_for_mask = outputs[0][0, masked_index]
        probs = softmax(predictions_for_mask, dim=0)
    for row in cand_text_l:
        cand_tokenized_text = tokenizeText(row['sentence'])
        word = cand_tokenized_text[masked_index]
        row['model_prob'] = probs[GetTokenIndex(word)].item()  # in-place update
    return

In [21]:
# call the model and get the probability
for rid,row in tqdm(devProcessed.items()):
    seq_A= row['context']
    candidates = row['sentences']
    FindScoresForCand(seq_A,candidates)

100%|██████████| 2106/2106 [01:39<00:00, 21.24it/s]


In [22]:
# this is to find interesting records for analysis
interesting_records = CreateRecordsAnalysis(devProcessed,2.0e-02,6)
print("Number of interesting records ",len(interesting_records))
writeData('analysisIntrasentenceInteresting.json',interesting_records)

Number of interesting records  325
Wrote interesting records to  analysisIntrasentenceInteresting.json


In [23]:
# Flatten the intrasentence scores into the per-sentence list that is later
# stored under predData['intrasentence'].
IntraSetenceList = CreateScoresListForPred(devProcessed)

Total Number of sentences added  6318


##  Prepare output 

In [24]:
# Bundle both score lists under the keys of the two test types.
predData = {
    'intersentence': InterSetenceList,
    'intrasentence': IntraSetenceList,
}

In [27]:
# write it to a file
if not os.path.exists('gilopez_Predictions'):
    os.makedirs('gilopez_Predictions')
with open('gilopez_Predictions/predictedResults.json', 'w') as fp:
    json.dump(predData, fp)

Run the evaluation on the sample predictions provided by the StereoSet authors

In [31]:
# Run evaluation.py in the notebook namespace (-i) on the predictions in SamplePredictions/, with dev.json as the gold file.
%run -i evaluation.py --gold-file dev.json --predictions-dir SamplePredictions/


Evaluating SamplePredictions/predictions_bert-base-cased_BertNextSentence_BertLM.json...
intrasentence
	gender
		Count: 765.0
		LM Score: 82.50328729241772
		SS Score: 61.48204661682922
		ICAT Score: 63.55715547775384
	profession
		Count: 2430.0
		LM Score: 82.31092099986019
		SS Score: 60.8476591974996
		ICAT Score: 64.45330461508425
	race
		Count: 2886.0
		LM Score: 83.82409779040428
		SS Score: 56.29627559199869
		ICAT Score: 73.26850537162359
	religion
		Count: 237.0
		LM Score: 82.16091954022988
		SS Score: 56.27586206896552
		ICAT Score: 71.84830757035274
	overall
		Count: 2106.0
		LM Score: 83.01912382272438
		SS Score: 58.68030062800166
		ICAT Score: 68.60650476963355
intersentence
	gender
		Count: 726.0
		LM Score: 90.84746774964165
		SS Score: 62.026618711401326
		ICAT Score: 68.99571063921627
	profession
		Count: 2481.0
		LM Score: 85.87218285497853
		SS Score: 62.32294703538678
		ICAT Score: 64.70821563227955
	race
		Count: 2928.0
		LM Score: 89.67450156426334
		SS Score: 

Run it on the file we just created 

In [30]:
# Run evaluation.py in the notebook namespace (-i) on the predictions written to gilopez_Predictions/, with dev.json as the gold file.
%run -i evaluation.py --gold-file dev.json --predictions-dir gilopez_Predictions/


Evaluating gilopez_Predictions/predictedResults.json...
intrasentence
	gender
		Count: 765.0
		LM Score: 85.26874960788005
		SS Score: 65.81614359875229
		ICAT Score: 58.29629384219437
	profession
		Count: 2430.0
		LM Score: 84.27860823500706
		SS Score: 59.80115458275169
		ICAT Score: 67.75805488839758
	race
		Count: 2886.0
		LM Score: 87.00319891306857
		SS Score: 55.54717017291779
		ICAT Score: 77.35076791388843
	religion
		Count: 237.0
		LM Score: 87.12643678160919
		SS Score: 58.94252873563219
		ICAT Score: 71.54382349055356
	overall
		Count: 2106.0
		LM Score: 85.75367359929504
		SS Score: 58.591415011417865
		ICAT Score: 71.01876562639082
intersentence
	gender
		Count: 726.0
		LM Score: 89.45130151651892
		SS Score: 57.637827148696715
		ICAT Score: 75.78702993233645
	profession
		Count: 2481.0
		LM Score: 84.68382403929309
		SS Score: 62.345428969882065
		ICAT Score: 63.7746613477914
	race
		Count: 2928.0
		LM Score: 87.77420257603487
		SS Score: 59.69733115785828
		ICAT Score: