# Initialisation

In [None]:
#!pip install torch transformers sentencepiece

## Imports

In [None]:
import torch
from transformers import BertTokenizer, BertModel, BertForMaskedLM, AutoModelForCausalLM , AutoTokenizer, pipeline, RobertaTokenizer, RobertaForMaskedLM, AlbertTokenizer, AlbertModel, AlbertForMaskedLM
import logging
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re
from afinn import Afinn
from tqdm import tqdm
import unidecode
from time import sleep
logging.basicConfig(level=logging.INFO)  # OPTIONAL: configures the root logger at INFO level

## MAC Settings

In [None]:
print(f"PyTorch version: {torch.__version__}")

# Select the compute device, preferring Apple MPS, then CUDA, then CPU.
# `device` is always a torch.device so every branch yields the same type.
# The getattr guard keeps this cell working on torch builds that do not
# expose `torch.backends.mps` at all.
if getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

## Definitions

In [None]:
# Category labels.
IDENTITIES = 'identities'
ORIENTATION = 'orientation'
OTHER = 'other'
PRONOUNS = 'pronouns'

# Mask tokens. MASKBERT_ is the regex-escaped form of MASKBERT, for use as a
# `re` pattern; it is a raw string so the backslashes are not treated as
# (invalid) string escape sequences.
MASKBERT_ = r'\[MASK\]'
MASKBERT = '[MASK]'
MASKROBERT = '<mask>'

# Placeholders that appear inside the template sentences and are substituted
# when the templates are instantiated.
TARGET = '<target>'
NOM = '<nom>'
ACC = '<acc>'
BE = '<be>'

# Group labels.
QUEER = 'queer'
NONQUEER = 'non-queer'

# TEMPLATES
TEMPLATE_NOZZA = '../src/templates/template_nozza.csv'
TEMPLATE_TOXIC1 = '../src/templates/template_toxing1.csv'
TEMPLATE_TOXIC2 = '../src/templates/template_toxing2.csv'

# IDENTITIES CSV
IDENTITIES_CSV = '../src/queer_identities/identities.csv'
PRONOUNS_CSV = '../src/queer_identities/pronouns.csv'

# MODELS (Hugging Face model identifiers passed to `from_pretrained`)
BERT_BASE = 'bert-base-uncased'
BERT_LARGE = 'bert-large-uncased'
ROBERTA_BASE = 'roberta-base'
ROBERTA_LARGE = 'roberta-large'
GPT2 = 'gpt2'
ALBERT = 'albert-base-v2'

## Utils

In [None]:
def get_tokenizer(model_name):
    """Load the pretrained model and tokenizer for ``model_name``.

    Despite its name, this returns BOTH objects, model first.

    Args:
        model_name: One of BERT_BASE, BERT_LARGE, ALBERT, ROBERTA_BASE,
            ROBERTA_LARGE or GPT2.

    Returns:
        A ``(model, tokenizer)`` tuple. BERT, ALBERT and RoBERTa names yield
        masked-LM heads; GPT2 yields a causal-LM head.

    Raises:
        ValueError: If ``model_name`` is not one of the supported names
            (previously this surfaced as an UnboundLocalError on return).
    """
    if model_name in (BERT_BASE, BERT_LARGE):
        model = BertForMaskedLM.from_pretrained(model_name)
        tokenizer = BertTokenizer.from_pretrained(model_name)
    elif model_name == ALBERT:
        tokenizer = AlbertTokenizer.from_pretrained(ALBERT)
        model = AlbertForMaskedLM.from_pretrained(ALBERT)
    elif model_name in (ROBERTA_BASE, ROBERTA_LARGE):
        model = RobertaForMaskedLM.from_pretrained(model_name)
        tokenizer = RobertaTokenizer.from_pretrained(model_name)
    elif model_name == GPT2:
        model = AutoModelForCausalLM.from_pretrained(model_name)
        tokenizer = AutoTokenizer.from_pretrained(model_name)
    else:
        raise ValueError(f"Unsupported model name: {model_name!r}")
    return model, tokenizer

# Queer Bench class

In [None]:
class QueerBench():
    """Fill template sentences with identities/pronouns and model predictions.

    On construction the class reads three ';'-separated CSV files (the
    template file, IDENTITIES_CSV and PRONOUNS_CSV), loads the requested model
    and tokenizer, and immediately builds the result table into ``self.data``.

    Columns read from the CSVs: ``template`` (template file), ``identity`` and
    ``type`` (identities file), ``nom``, ``be`` and ``type`` (pronouns file).

    Attributes:
        numAtt: Number of top predictions kept for every sentence.
        data: ``[]`` until a builder runs, afterwards a pandas DataFrame.
        template_path: Path of the template CSV; also selects the builder.
        model_name: Model identifier passed to ``get_tokenizer``.
        model, tokenizer: Objects returned by ``get_tokenizer``.
    """

    # Column layouts of the result tables. Every row appended by a builder
    # must list its values in exactly this order.
    _NOZZA_COLUMNS = ["template", "identity", "type", "attributes", "sentences"]
    _TOXIC1_COLUMNS = [
        "template",
        "identity",
        "type_identity",
        "attributes",
        "sentences",
        "nom_identity",   # nominative pronoun used for the <nom> placeholder
        "type_pronouns",  # `type` of that pronoun row
    ]

    def __init__(self, template_path, model_name, numAtt):
        """Load the CSVs and the model, then build ``self.data``.

        Args:
            template_path: Path to the ';'-separated template CSV.
            model_name: Model identifier understood by ``get_tokenizer``.
            numAtt: How many top predictions to keep per sentence.
        """
        self.numAtt = numAtt
        self.data = []
        self.template_path = template_path
        self.template_file = pd.read_csv(template_path, sep=";")
        self.template_identities = pd.read_csv(IDENTITIES_CSV, sep=';')
        self.template_pronouns = pd.read_csv(PRONOUNS_CSV, sep=';')
        self.model_name = model_name
        self.model, self.tokenizer = get_tokenizer(model_name)
        self.template_builder()

    def template_builder(self):
        """Run the builder that matches ``self.template_path``.

        Only TEMPLATE_NOZZA and TEMPLATE_TOXIC1 have a builder. For any other
        path ``self.data`` is left untouched and a warning is logged instead
        of failing silently.
        """
        if self.template_path == TEMPLATE_NOZZA:
            self.template_nozza()
        elif self.template_path == TEMPLATE_TOXIC1:
            self.template_toxic1()
        else:
            logging.warning(
                "No template builder for %r; no data was generated.",
                self.template_path,
            )

    def _complete(self, masked_sentence, be_form):
        """Predict attributes for one sentence and build the filled sentences.

        Args:
            masked_sentence: Sentence that still contains the mask placeholder.
            be_form: Text substituted for the BE placeholder.

        Returns:
            ``(attributes, sentences)``: the predicted tokens and, for each of
            them, the sentence with the mask and BE placeholders filled in.
        """
        # NOTE(review): the prediction is made while the BE placeholder is
        # still literally present in the text, exactly as before; only the
        # output sentences get it replaced. Confirm this is intended.
        attributes = self.predict_masked_sent(masked_sentence)
        # Literal str.replace (not re.sub) so that a predicted token containing
        # a backslash is inserted verbatim instead of being parsed as a regex
        # replacement template.
        sentences = [
            masked_sentence.replace(MASKBERT, attribute).replace(BE, be_form)
            for attribute in attributes
        ]
        return attributes, sentences

    def template_nozza(self):
        """Build the table for TEMPLATE_NOZZA and store it in ``self.data``.

        Every template is instantiated once per identity (TARGET becomes
        "The <identity> person", BE becomes "is") and once per pronoun (TARGET
        becomes the pronoun's ``nom``, BE its ``be``). The resulting DataFrame
        has the columns in ``_NOZZA_COLUMNS`` and is shown with ``display``.
        """
        rows = []
        for _, template_row in tqdm(self.template_file.iterrows(), total=self.template_file.shape[0], desc='Creating template', unit='sentences'):
            template = template_row.loc['template']

            for _, identity in self.template_identities.iterrows():
                subject = f"The {identity.loc['identity']} person"
                attributes, sentences = self._complete(
                    template.replace(TARGET, subject), 'is'
                )
                # Values follow _NOZZA_COLUMNS (type comes BEFORE attributes).
                rows.append([
                    template,
                    identity.loc["identity"],
                    identity.loc["type"],
                    attributes,
                    sentences,
                ])

            for _, pronoun in self.template_pronouns.iterrows():
                attributes, sentences = self._complete(
                    template.replace(TARGET, pronoun.loc["nom"]),
                    pronoun.loc["be"],
                )
                rows.append([
                    template,
                    pronoun.loc["nom"],
                    pronoun.loc["type"],
                    attributes,
                    sentences,
                ])

        self.data = pd.DataFrame(rows, columns=self._NOZZA_COLUMNS)
        # `display` is not defined in this file; presumably the IPython/Jupyter
        # builtin, since this code is a notebook cell.
        display(self.data)

    def template_toxic1(self):
        """Build the table for TEMPLATE_TOXIC1 and store it in ``self.data``.

        Every template is instantiated for each (identity, pronoun) pair:
        TARGET becomes "The <identity> person", NOM the pronoun's ``nom`` and
        BE the pronoun's ``be``. It is then instantiated once per pronoun with
        TARGET replaced by the pronoun's ``nom``. The resulting DataFrame has
        the columns in ``_TOXIC1_COLUMNS`` and is shown with ``display``.
        """
        rows = []
        for _, template_row in tqdm(self.template_file.iterrows(), total=self.template_file.shape[0], desc='Creating template', unit='sentences'):
            template = template_row.loc['template']

            for _, identity in self.template_identities.iterrows():
                subject = f"The {identity.loc['identity']} person"
                with_subject = template.replace(TARGET, subject)
                for _, pronoun in self.template_pronouns.iterrows():
                    attributes, sentences = self._complete(
                        with_subject.replace(NOM, pronoun.loc['nom']),
                        pronoun.loc['be'],
                    )
                    # Values follow _TOXIC1_COLUMNS.
                    rows.append([
                        template,
                        identity.loc["identity"],
                        identity.loc["type"],
                        attributes,
                        sentences,
                        pronoun.loc["nom"],
                        pronoun.loc["type"],
                    ])

            for _, pronoun in self.template_pronouns.iterrows():
                # NOTE(review): the NOM placeholder is not substituted for
                # these rows (unchanged from before); confirm that templates
                # used here are expected to keep it.
                attributes, sentences = self._complete(
                    template.replace(TARGET, pronoun.loc["nom"]),
                    pronoun.loc["be"],
                )
                rows.append([
                    template,
                    pronoun.loc["nom"],
                    pronoun.loc["type"],
                    attributes,
                    sentences,
                    pronoun.loc["nom"],
                    pronoun.loc["type"],
                ])

        self.data = pd.DataFrame(rows, columns=self._TOXIC1_COLUMNS)
        # `display` is not defined in this file; presumably the IPython/Jupyter
        # builtin, since this code is a notebook cell.
        display(self.data)

    def _top_mask_tokens(self, text, mask_token, word_marker=None):
        """Return the ``numAtt`` most probable tokens for the first mask.

        Args:
            text: Input text, already wrapped in the model's start/end tokens.
            mask_token: Mask token string to look up in the tokenized text.
            word_marker: Optional marker removed from every predicted token.

        Raises:
            ValueError: If ``mask_token`` does not occur in the tokenized text.
        """
        tokenized_text = self.tokenizer.tokenize(text)
        masked_index = tokenized_text.index(mask_token)
        indexed_tokens = self.tokenizer.convert_tokens_to_ids(tokenized_text)
        tokens_tensor = torch.tensor([indexed_tokens])
        with torch.no_grad():
            predictions = self.model(tokens_tensor)[0]

        probs = torch.nn.functional.softmax(predictions[0, masked_index], dim=-1)
        _, top_k_indices = torch.topk(probs, self.numAtt, sorted=True)

        tokens = self.tokenizer.convert_ids_to_tokens(top_k_indices.tolist())
        if word_marker:
            tokens = [token.replace(word_marker, '') for token in tokens]
        return tokens

    def predict_masked_sent(self, text):
        """Return the ``numAtt`` top predicted tokens for ``text``.

        For BERT, ALBERT and RoBERTa the prediction is taken at the first mask
        position; for GPT2 it is the next token after the whole input.

        Args:
            text: Sentence containing the ``[MASK]`` placeholder.

        Returns:
            A list of ``numAtt`` token strings, most probable first.

        Raises:
            ValueError: If ``self.model_name`` is not supported, or if the
                mask token is missing from the tokenized text.
        """
        if self.model_name in (BERT_BASE, BERT_LARGE):
            return self._top_mask_tokens("[CLS] %s [SEP]" % text, MASKBERT)

        if self.model_name == ALBERT:
            # ALBERT uses a SentencePiece vocabulary whose pieces carry a
            # leading U+2581 word-boundary marker; strip it, the same way the
            # RoBERTa branch strips its own marker.
            return self._top_mask_tokens(
                "[CLS] %s [SEP]" % text, MASKBERT, word_marker="\u2581"
            )

        if self.model_name in (ROBERTA_BASE, ROBERTA_LARGE):
            roberta_text = "<s> %s </s>" % text.replace(MASKBERT, MASKROBERT)
            return self._top_mask_tokens(roberta_text, MASKROBERT, word_marker='Ġ')

        if self.model_name == GPT2:
            # NOTE(review): `text` is passed unchanged, so any placeholder it
            # contains is still part of the prompt - confirm this is intended.
            inputs = self.tokenizer.encode(text, return_tensors="pt")
            with torch.no_grad():
                predictions = self.model(inputs)[0]
            # Scores over the vocabulary at the last input position.
            next_token_scores = predictions[0, -1, :]
            top_ids = torch.topk(next_token_scores, self.numAtt).indices.tolist()
            return [self.tokenizer.decode([token_id]).strip() for token_id in top_ids]

        raise ValueError(f"Unsupported model name: {self.model_name!r}")

In [None]:
# Build the benchmark for one template / model combination.
#   template_path: TEMPLATE_NOZZA, TEMPLATE_TOXIC1, TEMPLATE_TOXIC2
#   model_name:    BERT_BASE, BERT_LARGE, ROBERTA_BASE, ROBERTA_LARGE, GPT2, ALBERT
#   numAtt:        number of top predictions kept per sentence
BenchNozza = QueerBench(
    template_path=TEMPLATE_TOXIC1,
    model_name=BERT_BASE,
    numAtt=5,
)