In [1]:
import gzip
import regex as re
import nltk
import torch
from torch.utils.data import Dataset
from transformers import *
import os.path
from tqdm import tqdm
import pickle
from collections import defaultdict



In [2]:
# --- Run configuration -------------------------------------------------------
# Key selecting the pretrained model/tokenizer pair loaded by prepare_transformer.
this_model_type='bert' #one of: 'bert', 'roberta', 'albert', 'dbert', 'xlnet', 'electra', 'gpt', 'gpt2', 'xl'
# Sentences with more whitespace-separated words than this are skipped.
this_block_size=128
# Word-list files, one word per line; each file becomes one attribute set.
attributes=['gender.txt']
# Word-list file of stereotype words, one per line.
stereotypes_file='gender_stereotype.txt'  #an empty string '' skips loading the stereotype list
# Not referenced by any other cell shown in this notebook.
out_combo='gender_stereotype'

In [3]:
# Decompress the News Commentary corpus once; later runs reuse the extracted file.
# The archive 'news-commentary-v15.en.gz' must already be present
# (download from website; or use curl).
if not os.path.exists('news_commentary_v15.en'):
    # Extract under a temporary name and rename at the end, so an interrupted or
    # failed extraction cannot leave a truncated 'news_commentary_v15.en' that the
    # existence check above would accept on the next run.
    tmp_path='news_commentary_v15.en.part'
    # The with-statement closes both handles even if decompression raises.
    with gzip.open('news-commentary-v15.en.gz') as f_in, open(tmp_path, 'wb') as f_out:
        f_out.writelines(f_in)
    os.replace(tmp_path, 'news_commentary_v15.en')

In [4]:
def _read_word_list(path):
    """Return the whitespace-stripped, non-empty lines of `path` in file order.

    The file is read as UTF-8 (the same encoding used for the corpus) and is
    closed before returning. Blank lines are dropped so that the empty string
    never ends up in a word set.
    """
    with open(path, 'r', encoding='utf-8') as word_file:
        stripped=(raw_line.strip() for raw_line in word_file)
        return [word for word in stripped if word]


# One stripped sentence per corpus line.
with open('news_commentary_v15.en', 'r', encoding='utf-8') as f:
    data=[l.strip() for l in f]

# Both names are always defined, so cells that read them do not fail with a
# NameError when stereotypes_file is set to ''.
stereotypes=_read_word_list(stereotypes_file) if stereotypes_file else []
stereotype_set=set(stereotypes)

# Token pattern: common English contraction suffixes, runs of letters, runs of
# digits, runs of other non-space symbols (each optionally preceded by one space),
# and runs of whitespace.
pat=re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")

# attributes_l[i] is the word set read from attributes[i];
# all_attributes_set is the union of all of them.
attributes_l=[set(_read_word_list(attribute)) for attribute in attributes]
all_attributes_set=set().union(*attributes_l)

In [5]:
def prepare_transformer(model_type):
    """Load a pretrained model and its matching tokenizer.

    Args:
        model_type: one of 'bert', 'roberta', 'albert', 'dbert', 'xlnet',
            'electra', 'gpt', 'gpt2' or 'xl'.

    Returns:
        A ``(model, tokenizer)`` tuple, both created with ``from_pretrained``
        from the checkpoint name associated with ``model_type``.

    Raises:
        ValueError: if ``model_type`` is not one of the supported keys.

    Note:
        Only the 'bert' branch passes ``output_hidden_states=True``; the other
        branches load the model with its default configuration.
    """
    if model_type=='bert':
        pretrained_weights='bert-base-uncased'
        model=BertModel.from_pretrained(pretrained_weights, output_hidden_states=True)
        tokenizer=BertTokenizer.from_pretrained(pretrained_weights)
    elif model_type=='roberta':
        pretrained_weights='roberta-base'
        model=RobertaModel.from_pretrained(pretrained_weights)
        tokenizer=RobertaTokenizer.from_pretrained(pretrained_weights)
    elif model_type=='albert':
        pretrained_weights='albert-base-v2'
        model=AlbertModel.from_pretrained(pretrained_weights)
        tokenizer=AlbertTokenizer.from_pretrained(pretrained_weights)
    elif model_type=='dbert':
        pretrained_weights='distilbert-base-uncased'
        model=DistilBertModel.from_pretrained(pretrained_weights)
        tokenizer=DistilBertTokenizer.from_pretrained(pretrained_weights)
    elif model_type=='xlnet':
        pretrained_weights='xlnet-base-cased'
        model=XLNetModel.from_pretrained(pretrained_weights)
        tokenizer=XLNetTokenizer.from_pretrained(pretrained_weights)
    elif model_type=='electra':
        pretrained_weights='google/electra-small-discriminator'
        model=ElectraModel.from_pretrained(pretrained_weights)
        tokenizer=ElectraTokenizer.from_pretrained(pretrained_weights)
    elif model_type=='gpt':
        pretrained_weights='openai-gpt'
        model=OpenAIGPTModel.from_pretrained(pretrained_weights)
        tokenizer=OpenAIGPTTokenizer.from_pretrained(pretrained_weights)
    elif model_type=='gpt2':
        pretrained_weights='gpt2'
        model=GPT2Model.from_pretrained(pretrained_weights)
        tokenizer=GPT2Tokenizer.from_pretrained(pretrained_weights)
    elif model_type=='xl':
        pretrained_weights='transfo-xl-wt103'
        model=TransfoXLModel.from_pretrained(pretrained_weights)
        tokenizer=TransfoXLTokenizer.from_pretrained(pretrained_weights)
    else:
        # Without this branch an unknown key fell through to the return statement
        # and failed with an UnboundLocalError that did not name the bad value.
        raise ValueError(
            f"unsupported model_type {model_type!r}; expected one of "
            "'bert', 'roberta', 'albert', 'dbert', 'xlnet', 'electra', 'gpt', 'gpt2', 'xl'"
        )
    return model, tokenizer

def encode_to_is(tokenizer, the_data, add_special_tokens):
    """Encode a collection of strings into token ids with ``tokenizer.encode``.

    Args:
        tokenizer: object exposing ``encode(text, add_special_tokens=...)``.
        the_data: either a list of strings or a dict mapping strings to strings
            (subclasses of ``list`` / ``dict`` are accepted as well).
        add_special_tokens: forwarded unchanged to every ``encode`` call.

    Returns:
        For a list: a list with one tuple of ids per input string.
        For a dict: a dict whose keys are tuples of ids (so they stay hashable)
        and whose values are whatever ``encode`` returns for the original value.

    Raises:
        TypeError: if ``the_data`` is neither a list nor a dict.
    """
    if isinstance(the_data, list):
        return [
            tuple(tokenizer.encode(sentence, add_special_tokens=add_special_tokens))
            for sentence in the_data
        ]
    if isinstance(the_data, dict):
        return {
            tuple(tokenizer.encode(key, add_special_tokens=add_special_tokens)):
                tokenizer.encode(value, add_special_tokens=add_special_tokens)
            for key, value in the_data.items()
        }
    # Previously any other input reached the return statement with an unbound local.
    raise TypeError(f'the_data must be a list or a dict, got {type(the_data).__name__}')

In [6]:
# Load the pretrained model and tokenizer selected by this_model_type.
# Among the cells shown in this notebook only `tokenizer` is used afterwards.
model, tokenizer=prepare_transformer(this_model_type)

loading configuration file config.json from cache at C:\Users\466476mp/.cache\huggingface\hub\models--bert-base-uncased\snapshots\0a6aa9128b6194f4f3c4db429b6cb4891cdb421b\config.json
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.26.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file pytorch_model.bin from cache at C:\Users\466476mp/.cache\huggingface\hub\models--bert-base-uncased\snapshots\0a6aa9128b6194f4f3c4db429b6cb4891c

In [7]:
def _locate_label(tokenizer, word, line_ids):
    """Find the token-id span of `word` inside the encoded sentence `line_ids`.

    Four encodings of `word` are tried in order and the first one that occurs
    as a contiguous n-gram of `line_ids` wins:
      1. encode with special tokens, dropping the first and last id;
      2. encode without special tokens;
      3. encode ``'a {word} a'``, dropping the first and last id;
      4. the first id of encoding ``'{word}2'``.
    The fourth candidate is used unconditionally when the first three fail.

    Returns:
        ``(label, start)``: the matching tuple of ids and the index of its
        first occurrence among the n-grams of ``line_ids``.

    Raises:
        ValueError: if even the last candidate does not occur in ``line_ids``.
    """
    candidate_encoders=(
        lambda: tuple(tokenizer.encode(word, add_special_tokens=True))[1:-1],
        lambda: tuple(tokenizer.encode(word, add_special_tokens=False)),
        lambda: tuple(tokenizer.encode(f'a {word} a'))[1:-1],
        lambda: tuple([tokenizer.encode(f'{word}2')[0]]),
    )
    for encode_candidate in candidate_encoders:
        label=encode_candidate()
        line_ngram=list(nltk.ngrams(line_ids, len(label)))
        if label in line_ngram:
            break
    return label, line_ngram.index(label)


if stereotypes_file:
    tok_stereotypes=encode_to_is(tokenizer, stereotypes, add_special_tokens=False)
    active_stereotype_set=stereotype_set
else:
    # No stereotype list configured: nothing can match, and stereotype_set is
    # deliberately not touched because it may not exist in that case.
    active_stereotype_set=set()

neutral_examples=[]
neutral_labels=[]
stereotype_attr_labels=[]
attribute_examples=[[] for _ in attributes_l]
attribute_labels=[[] for _ in attributes_l]

# attribute_other_l[i] is the union of every attribute set except attributes_l[i].
# It does not depend on the sentence, so it is built once instead of per line.
attribute_other_l=[
    set().union(*(other for j, other in enumerate(attributes_l) if j!=i))
    for i in range(len(attributes_l))
]

for line in tqdm(data):
    line=line.strip()
    length=len(line.split())
    # Skip empty or one-word lines and lines longer than the block size.
    if length>this_block_size or length<=1:
        continue
    tokens_orig=[token.strip() for token in re.findall(pat, line)]
    tokens_lower=[token.lower() for token in tokens_orig]
    token_set=set(tokens_lower)

    # Reset per sentence so that a line which matches several attribute sets (and
    # is therefore not labelled) can never reuse the previous sentence's label.
    attr_label=None
    for i, (attribute_set, other_set) in enumerate(zip(attributes_l, attribute_other_l)):
        matched_words=attribute_set & token_set
        # Keep the sentence for attribute i only when it contains words of
        # attribute i and no word of any other attribute.
        if not matched_words or other_set & token_set:
            continue
        line_ids=tokenizer.encode(line, add_special_tokens=True)
        # Set iteration order of strings is not stable across interpreter runs, so
        # pick the attribute word that appears first in the sentence instead of
        # whichever word the set happens to yield last.
        word_idx=min(tokens_lower.index(word) for word in matched_words)
        attr_label, start=_locate_label(tokenizer, tokens_orig[word_idx], line_ids)
        attribute_examples[i].append(line_ids)
        attribute_labels[i].append([start+j for j in range(len(attr_label))])

    if attr_label is None:
        continue
    stereotype_hits=active_stereotype_set & token_set
    if not stereotype_hits:
        continue
    # Sentences holding both an attribute word and a stereotype word are stored
    # once per stereotype word, encoded without special tokens.
    neutral_ids=tokenizer.encode(line, add_special_tokens=False)
    # Sentence order (rather than set order) keeps the output list order, and hence
    # any positional split made from it, reproducible between runs.
    for word in sorted(stereotype_hits, key=tokens_lower.index):
        stereotype_attr_labels.append(attr_label)
        label, _=_locate_label(tokenizer, tokens_orig[tokens_lower.index(word)], neutral_ids)
        neutral_examples.append(neutral_ids)
        neutral_labels.append(label)


print('neutral:', len(neutral_examples))
for i, examples in enumerate(attribute_examples):
    print(f'attributes{i}:', len(examples))

100%|██████████| 608912/608912 [01:04<00:00, 9420.91it/s] 

neutral: 9974
attributes0: 56800





In [8]:
def _decode_once(token_ids):
    """Turn token ids into text; values that are already text are returned as is."""
    if isinstance(token_ids, str):
        return token_ids
    return tokenizer.decode(token_ids)


# Convert the collected id sequences back to strings. The str pass-through makes
# this cell safe to re-run: the original in-place loop handed already-decoded
# strings back to tokenizer.decode on a second execution.
neutral_examples=[_decode_once(ids) for ids in neutral_examples]
neutral_labels=[_decode_once(ids) for ids in neutral_labels]
stereotype_attr_labels=[_decode_once(ids) for ids in stereotype_attr_labels]

In [9]:
# Mapping looked up with the decoded attribute word of each example; the value
# found there is stored under the 'g' key of the output records.
# NOTE: pickle.load can execute arbitrary code, so only load a file you trust.
with open("gender_with_labels.pickle", "rb") as file:  # closes the handle after loading
    gender_dict=pickle.load(file)

In [10]:
# Assemble one record per collected sentence:
#   'g'    -> gender_dict entry for the attribute word found in the sentence
#   's'    -> the stereotype word
#   'text' -> the sentence itself
# A defaultdict created without a factory behaves like a plain dict on lookups.
gender_data=[]
for attr_word, stereotype_word, sentence in zip(stereotype_attr_labels, neutral_labels, neutral_examples):
    record=defaultdict()
    record.update(g=gender_dict[attr_word], s=stereotype_word, text=sentence)
    gender_data.append(record)

In [24]:
# Positional 90/10 split: the first 90% of the records (rounded) form the training
# set and the remainder the dev set. The records are not shuffled beforehand.
split_at=round(len(gender_data)*0.9)
gender_data_train, gender_data_dev=gender_data[:split_at], gender_data[split_at:]

In [27]:
# Persist both splits as pickle files in the working directory.
for out_path, split_records in (
    ('gender_data_for_inlp_train.pickle', gender_data_train),
    ('gender_data_for_inlp_dev.pickle', gender_data_dev),
):
    with open(out_path, 'wb') as f:
        pickle.dump(split_records, f)