In [1]:
import gzip
import regex as re
import nltk
import torch
from torch.utils.data import Dataset
from transformers import *
import os.path
from tqdm import tqdm

In [27]:
# --- Run configuration -------------------------------------------------------
# Model family to tokenize for. prepare_transformer() handles:
# 'bert', 'roberta', 'albert', 'dbert', 'xlnet', 'electra', 'gpt', 'gpt2', 'xl'.
this_model_type = 'albert'

# Sentences with more whitespace-separated words than this are skipped.
this_block_size = 128

# One word-list file per attribute group (one word per line).
attributes = [
    './attribute_target_words/attributes/religion_demonyms.txt',
]

# Word list for the target (stereotype) words; set to '' to run without one,
# in which case every sentence without an attribute word counts as neutral.
stereotypes_file = './attribute_target_words/targets/polarized_and_class_words.txt'

# Prefix of the output file name written at the end of the notebook.
out_combo = 'religion_polarized_class'

In [28]:
# Decompress the News Commentary corpus once; later runs reuse the extracted file.
# The archive 'news-commentary-v15.en.gz' must already be present in the working
# directory (download it from the website, or use curl).
if not os.path.exists('news_commentary_v15.en'):
    # Extract to a temporary name and rename at the end, so an interrupted run
    # never leaves a truncated file that the exists() check above would accept.
    tmp_path='news_commentary_v15.en.part'
    # `with` closes both handles even if decompression fails part-way.
    with gzip.open('news-commentary-v15.en.gz') as f_in, open(tmp_path, 'wb') as f_out:
        f_out.writelines(f_in)
    os.replace(tmp_path, 'news_commentary_v15.en')

In [29]:
# Read the extracted corpus: one sentence/paragraph per line.
with open('news_commentary_v15.en', 'r', encoding='utf-8') as f:
    lines=f.readlines()

data=[l.strip() for l in lines]

# Word lists are matched against lower-cased tokens later on, so their entries
# are expected to be lower-case. Blank lines are dropped: an empty string in a
# word set could otherwise match the empty tokens that stripping whitespace
# matches of `pat` produces.
if stereotypes_file:
    with open(stereotypes_file, encoding='utf-8') as f:
        stereotypes=[word.strip() for word in f]
    stereotypes=[word for word in stereotypes if word]
    stereotype_set=set(stereotypes)

# Pre-tokenization pattern: English contractions, runs of letters, runs of
# digits, runs of other non-space symbols (each optionally preceded by one
# space), and whitespace runs.
pat=re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")

# attributes_l[i] is the word set of the i-th attribute file;
# all_attributes_set is the union over all files.
attributes_l=[]
all_attributes_set=set()
for attribute in attributes:
    with open(attribute, encoding='utf-8') as f:
        words={word.strip() for word in f}
    words.discard('')
    attributes_l.append(words)
    all_attributes_set |= words

In [30]:
def prepare_transformer(model_type):
    """Load the pretrained model and tokenizer for a model family.

    Args:
        model_type: one of 'bert', 'roberta', 'albert', 'dbert', 'xlnet',
            'electra', 'gpt', 'gpt2', 'xl'.

    Returns:
        A ``(model, tokenizer)`` tuple, both created with ``from_pretrained``
        from the same checkpoint name.

    Raises:
        ValueError: if ``model_type`` is not one of the supported names.
    """
    # Extra keyword arguments for the model; only 'bert' sets any.
    model_kwargs={}
    if model_type=='bert':
        pretrained_weights='bert-base-uncased'
        model_cls, tokenizer_cls=BertModel, BertTokenizer
        model_kwargs={'output_hidden_states': True}
    elif model_type=='roberta':
        pretrained_weights='roberta-base'
        model_cls, tokenizer_cls=RobertaModel, RobertaTokenizer
    elif model_type=='albert':
        pretrained_weights='albert-base-v2'
        model_cls, tokenizer_cls=AlbertModel, AlbertTokenizer
    elif model_type=='dbert':
        pretrained_weights='distilbert-base-uncased'
        model_cls, tokenizer_cls=DistilBertModel, DistilBertTokenizer
    elif model_type=='xlnet':
        pretrained_weights='xlnet-base-cased'
        model_cls, tokenizer_cls=XLNetModel, XLNetTokenizer
    elif model_type=='electra':
        pretrained_weights='google/electra-small-discriminator'
        model_cls, tokenizer_cls=ElectraModel, ElectraTokenizer
    elif model_type=='gpt':
        pretrained_weights='openai-gpt'
        model_cls, tokenizer_cls=OpenAIGPTModel, OpenAIGPTTokenizer
    elif model_type=='gpt2':
        pretrained_weights='gpt2'
        model_cls, tokenizer_cls=GPT2Model, GPT2Tokenizer
    elif model_type=='xl':
        pretrained_weights='transfo-xl-wt103'
        model_cls, tokenizer_cls=TransfoXLModel, TransfoXLTokenizer
    else:
        # Previously an unknown name fell through to `return` and failed with
        # an opaque UnboundLocalError.
        raise ValueError(
            f"unknown model_type {model_type!r}; expected one of "
            "'bert', 'roberta', 'albert', 'dbert', 'xlnet', 'electra', 'gpt', 'gpt2', 'xl'"
        )
    model=model_cls.from_pretrained(pretrained_weights, **model_kwargs)
    tokenizer=tokenizer_cls.from_pretrained(pretrained_weights)
    return model, tokenizer

def encode_to_is(tokenizer, the_data, add_special_tokens):
    """Encode a collection of strings into token ids with ``tokenizer.encode``.

    Args:
        tokenizer: object exposing ``encode(text, add_special_tokens=...)``.
        the_data: a list of strings, or a dict mapping strings to strings.
        add_special_tokens: forwarded unchanged to every ``encode`` call.

    Returns:
        For a list: a list with one tuple of ids per input string.
        For a dict: a dict whose keys are the encoded keys (as tuples, so they
        are hashable) and whose values are the encoded values (as returned by
        ``encode``).

    Raises:
        TypeError: if ``the_data`` is neither a list nor a dict.
    """
    if isinstance(the_data, list):
        return [
            tuple(tokenizer.encode(sentence, add_special_tokens=add_special_tokens))
            for sentence in the_data
        ]
    if isinstance(the_data, dict):
        return {
            tuple(tokenizer.encode(key, add_special_tokens=add_special_tokens)):
                tokenizer.encode(value, add_special_tokens=add_special_tokens)
            for key, value in the_data.items()
        }
    # Previously any other input type fell through and failed with UnboundLocalError.
    raise TypeError(f'the_data must be a list or a dict, got {type(the_data).__name__}')

In [31]:
# Instantiate the pretrained model and tokenizer selected by `this_model_type`.
# The cells visible in this notebook only use `tokenizer` afterwards.
model, tokenizer=prepare_transformer(this_model_type)

loading configuration file https://huggingface.co/albert-base-v2/resolve/main/config.json from cache at C:\Users\31631/.cache\huggingface\transformers\e48be00f755a5f765e36a32885e8d6a573081df3321c9e19428d12abadf7dba2.b8f28145885741cf994c0e8a97b724f6c974460c297002145e48e511d2496e88
Model config AlbertConfig {
  "architectures": [
    "AlbertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0,
  "bos_token_id": 2,
  "classifier_dropout_prob": 0.1,
  "down_scale_factor": 1,
  "embedding_size": 128,
  "eos_token_id": 3,
  "gap_size": 0,
  "hidden_act": "gelu_new",
  "hidden_dropout_prob": 0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "inner_group_num": 1,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "albert",
  "net_structure_type": 0,
  "num_attention_heads": 12,
  "num_hidden_groups": 1,
  "num_hidden_layers": 12,
  "num_memory_blocks": 0,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_

In [32]:
# Per attribute group: the encoded sentences and, for each one, the token
# positions of the attribute word inside it.
attribute_examples = [[] for _group in attributes_l]
attribute_labels = [[] for _group in attributes_l]

# Encoded neutral sentences; label positions are only tracked when a
# stereotype word list is configured.
neutral_examples = []
if stereotypes_file:
    # Token ids of every stereotype word, encoded without special tokens.
    # NOTE(review): not referenced by the cells visible here - confirm it is still needed.
    tok_stereotypes = encode_to_is(tokenizer, stereotypes, add_special_tokens=False)
    neutral_labels = []

In [33]:
def _locate_label(tokenizer, line_ids, word):
    """Return the positions in `line_ids` occupied by the encoding of `word`.

    Several encodings of `word` are tried in order, and the first one that
    occurs as a contiguous n-gram of `line_ids` wins:
      1. `word` encoded with special tokens, first and last id dropped;
      2. `word` encoded without special tokens;
      3. 'a <word> a' encoded with the tokenizer defaults, first and last id dropped;
      4. only the first id of '<word>2' encoded with the tokenizer defaults.

    NOTE(review): if the tokenizer adds special tokens by default, candidate 3
    presumably still contains the ids of the surrounding 'a' and candidate 4 is
    presumably the leading special token, i.e. position 0 of the sentence -
    confirm this is the intended fallback.

    Raises:
        ValueError: if even the last candidate does not occur in `line_ids`.
    """
    def candidates():
        # Generated lazily so later encodings are only computed when needed.
        yield tuple(tokenizer.encode(word, add_special_tokens=True))[1:-1]
        yield tuple(tokenizer.encode(word, add_special_tokens=False))
        yield tuple(tokenizer.encode(f'a {word} a'))[1:-1]
        yield tuple([tokenizer.encode(f'{word}2')[0]])

    for label in candidates():
        line_ngram=list(nltk.ngrams(line_ids, len(label)))
        if label in line_ngram:
            break
    # list.index raises ValueError when the final candidate is absent as well.
    start=line_ngram.index(label)
    return [start+offset for offset in range(len(label))]


# For every attribute group, the union of the words of all *other* groups.
# It depends only on attributes_l, so it is built once instead of once per line.
attribute_other_l=[
    set().union(*(other for j, other in enumerate(attributes_l) if j!=i))
    for i in range(len(attributes_l))
]

for line in tqdm(data):
    line=line.strip()
    if len(line)<1:
        continue
    # Skip single-word lines and lines longer than the configured block size
    # (measured in whitespace-separated words).
    length=len(line.split())
    if length>this_block_size or length<=1:
        continue
    tokens_orig=[token.strip() for token in re.findall(pat, line)]
    tokens_lower=[token.lower() for token in tokens_orig]
    token_set=set(tokens_lower)

    neutral_flag=True
    for i, (attribute_set, other_set) in enumerate(zip(attributes_l, attribute_other_l)):
        matched=attribute_set & token_set
        if not matched:
            continue
        # The line mentions an attribute word, so it can no longer be neutral.
        neutral_flag=False
        # Keep it as an example for group i only if no other group's words occur.
        if not other_set & token_set:
            line_ids=tokenizer.encode(line, add_special_tokens=True)
            # Label the attribute word that occurs first in the sentence.
            # (Taking whichever word the set happened to iterate last made the
            # choice depend on string-hash randomisation between runs.)
            word_pos=min(tokens_lower.index(word) for word in matched)
            positions=_locate_label(tokenizer, line_ids, tokens_orig[word_pos])
            attribute_examples[i].append(line_ids)
            attribute_labels[i].append(positions)
        # Only the first matching attribute group is considered for a line.
        break

    if neutral_flag:
        if stereotypes_file:
            # No attribute word in the line: it is a neutral example if it
            # contains at least one stereotype word. One example is emitted per
            # distinct stereotype word, in order of first occurrence.
            matched=stereotype_set & token_set
            if matched:
                line_ids=tokenizer.encode(line, add_special_tokens=True)
                for word_pos in sorted(tokens_lower.index(word) for word in matched):
                    positions=_locate_label(tokenizer, line_ids, tokens_orig[word_pos])
                    neutral_examples.append(line_ids)
                    neutral_labels.append(positions)
        else:
            # Without a stereotype list, every line free of attribute words is neutral.
            neutral_examples.append(tokenizer.encode(line, add_special_tokens=True))

# Report how many examples ended up in each bucket.
print('neutral:', len(neutral_examples))
for group_no, size in enumerate(map(len, attribute_examples)):
    print(f'attributes{group_no}:', size)

# Bundle everything that gets serialized; neutral label positions exist only
# when a stereotype word list was configured.
data_output = dict(
    attributes_examples=attribute_examples,
    attributes_labels=attribute_labels,
    neutral_examples=neutral_examples,
)
if stereotypes_file:
    data_output.update(neutral_labels=neutral_labels)

100%|██████████| 608912/608912 [01:24<00:00, 7209.72it/s] 

neutral: 34439
attributes0: 6443





In [35]:
# Serialize the collected examples to ./data/<model_type>/<out_combo>_data.bin.
out_path='./data/'+this_model_type+'/'+out_combo+'_data.bin'
# The per-model directory may not exist yet on a fresh checkout; create it so
# the save does not fail after the long preprocessing loop.
os.makedirs(os.path.dirname(out_path), exist_ok=True)
torch.save(data_output, out_path)