## 3.3 Transformers

In a third experiment, we modified the input. The model receives two sentences: the first is the original sentence from the training corpus, and the second is a masked version of it. To build the masked version, we check which words of the original sentence appear in Recasens' bias lexicon (Recasens et al., 2013) and replace each of them with the placeholder word `PBias`.


We then fine-tuned DistilRoBERTa three more times: first omitting epistemological bias during training, then using only epistemological bias, and finally using only framing bias.

In [None]:
import pandas as pd # data processing
from sklearn.model_selection import train_test_split
from transformers import (AutoModelForSequenceClassification,
                          AutoTokenizer,
                          TrainingArguments,
                          Trainer,
                          AutoConfig,
                          DataCollatorWithPadding)
import re #regex
import torch
import torch.nn as nn
import os
from datasets import Dataset, DatasetDict,load_metric
tokenizer = AutoTokenizer.from_pretrained("distilroberta-base")
import spacy
nlp_en = spacy.load('en_core_web_lg')
import numpy as np
## Only if you run it in Colab
#from google.colab import drive
#drive.mount('/content/drive')

In [None]:
def get_wikibias():
    '''
    Read the WikiBias corpus and return its train, test and validation partitions.

    The file ``data/wikibias_re.csv`` must provide the columns ``sentence``,
    ``type`` and ``partition``. ``type`` is collapsed into a binary ``label``
    (1 when the string contains a ``'1'``, 0 otherwise) and the rows are routed
    by ``partition``: 0 -> train, 1 -> test, 2 -> validation.

    Returns:
        tuple: ``(train, test, val)`` DataFrames holding the columns
        ``sentence`` and ``label``. Each frame is an independent copy, so
        callers can add columns to it without pandas raising a
        SettingWithCopyWarning.
    '''
    corpus = pd.read_csv('data/wikibias_re.csv')
    #In order to have some balance in the data, we can remove one sentence from the pairs.
    #corpus = corpus[corpus.id.str.contains(r'\d_0')]

    #----------- Only train with framing bias.
    #corpus = corpus[~corpus.type.str.contains(r'0\|1\|0')]
    #corpus = corpus[~corpus.type.str.contains(r'0\|0\|1')]

    #----------- To Train without epistemological biases.
    # NOTE(review): this filter is identical to the "framing only" one above;
    # presumably only the r'0\|1\|0' line is wanted here - confirm before use.
    #corpus = corpus[~corpus.type.str.contains(r'0\|1\|0')]
    #corpus = corpus[~corpus.type.str.contains(r'0\|0\|1')]

    #----------- Training of epistemological biases only
    #corpus_epis = corpus[corpus.type.str.contains(r'0\|1\|0')]
    #corpus_neu = corpus[corpus.type.str.contains(r'0\|0\|0')].sample(len(corpus_epis))
    #corpus = pd.concat([corpus_epis, corpus_neu], axis=0)

    #We change to binary classes: any '1' flag in `type` marks the sentence as biased.
    corpus['label'] = corpus['type'].str.contains('1', regex=False).astype(int)
    #We are left with the data that we need
    corpus = corpus[["sentence", "partition", "label"]]
    #Split
    train = corpus[corpus.partition == 0]
    test = corpus[corpus.partition == 1]
    val = corpus[corpus.partition == 2]
    #----------- If we remove sentences with some kind of bias, replace the split part.
    #train, test = train_test_split(corpus, train_size=.8, random_state=0)
    #train, val = train_test_split(train, train_size=.9, random_state=0)

    # Return explicit copies: the partitions are slices of `corpus`, and the
    # caller later assigns a new column to each of them.
    columns = ["sentence", "label"]
    return train[columns].copy(), test[columns].copy(), val[columns].copy()

# Load the three WikiBias partitions and preview the first training rows.
train, test, val= get_wikibias()
train.head(10)

Unnamed: 0,sentence,label
0,""" We "" died of natural causes at age eight in ...",0
1,`` We '' died of natural causes at age eight i...,0
2,Evolution is the source of the vast diversity ...,1
3,Evolution may be the source of the vast divers...,0
4,Credit information such as a persons previous ...,0
5,Credit information such as a persons previous ...,0
6,"Phil Wilayto , former coordinator of A Job is ...",1
7,"Phil Wilayto , former coordinator of A Job is ...",0
8,Most Mellen books are in English ; many are al...,1
9,Most Mellen books are in English ; some are in...,0


### SG2

In [None]:
# SG2: external evaluation corpus, reduced to the same sentence / 0-1 label layout.
en_corpus_SG2 = pd.read_csv('data/final_labels_SG2.csv', delimiter=';')
# 1 when the annotation string contains 'Biased' (case-sensitive), 0 otherwise.
is_biased = en_corpus_SG2['label_bias'].apply(lambda x: 1 if 'Biased' in x else 0)
en_corpus_SG2 = (
    en_corpus_SG2.assign(label=is_biased)
    .loc[:, ['text', 'label']]
    .rename(columns={'text': 'sentence'})
)
# Only the second half of the seeded 50/50 split is kept as the test set.
_, test_SG2 = train_test_split(en_corpus_SG2, train_size=.5, random_state=0)
test_SG2.head(10)

Unnamed: 0,sentence,label
253,American Outdoor Brands Corp AOBC.O said on Th...,0
541,Black Lives Matter Philadelphia organizer prop...,0
302,And if we look at a subset of guns which the l...,1
3575,While Pence and the president have tried to pa...,1
1698,Many people in politics and media are still no...,1
3048,Their work came on the heels of an IPCC report...,0
3653,"Yet bizarrely, the entirety of the Establishme...",1
841,"Despite a clear margin of victory for Biden, T...",0
3530,When greed and power are exercised by giant mu...,1
473,"Because he just can't help himself, on the Sun...",0


### CheckThat!

In [None]:
# CheckThat! dev split: 'SUBJ' becomes 1, every other label becomes 0.
test_Check = pd.read_csv('data/dev_en.tsv', sep='\t')
subjective = (test_Check['label'] == 'SUBJ').astype(int)
test_Check = test_Check.assign(label=subjective)[['sentence', 'label']]
test_Check.head(10)

Unnamed: 0,sentence,label
0,Who will redistribute the hoarded wealth that ...,1
1,What we don’t need is the indiscriminate influ...,1
2,The Social Distance Between Us shows every sig...,0
3,"History shows that McCarthy and McConnell, lik...",0
4,So while it’s not hard to reach a banal point ...,1
5,We can never forget that when the pressure was...,0
6,"Over the past few decades, very few real fight...",1
7,"I just find it a ridiculous argument,” said St...",0
8,"Meanwhile, the radical political project of ma...",1
9,Their posh communities don’t have to deal with...,1


## Pair

We mark every word that may introduce a bias by replacing it with the placeholder word `PBias`.

In [None]:
def txt_to_set(url):
    '''
    It reads all files in a folder and returns a set of their single-word lines.

    Files whose name contains "README" or ".pdf" are skipped, and so are
    sub-directories. Every remaining file is read line by line; a line is kept
    (stripped of surrounding whitespace) when it is non-empty and contains no
    space.

    Args:
        url (str): folder path, with or without a trailing separator

    Returns:
        set: the collected words
    '''
    lex = set()
    for file in os.listdir(url):
        # Test both markers separately: `('README' and ".pdf") not in file`
        # collapses to `".pdf" not in file` and lets README files through.
        if 'README' in file or '.pdf' in file:
            continue
        path = os.path.join(url, file)
        if not os.path.isfile(path):
            continue
        with open(path, "r") as text_file:
            for line in text_file:
                aux = line.strip()
                if aux and ' ' not in aux:
                    lex.add(aux)
    return lex

# Build the bias lexicon once from the files under lexicon/; do_mask reads it as a global.
lex_en = txt_to_set("lexicon/")

def do_mask(data):
    '''
    Build the masked version of one sentence.

    The sentence is parsed with the spaCy pipeline `nlp_en`. Each token's
    lemma is lower-cased and stripped of punctuation and digits; tokens whose
    cleaned lemma is empty are dropped, tokens whose cleaned lemma is in
    `lex_en` become "PBias", and all other tokens keep their original text.

    Args:
        data (str): the sentence to mask

    Returns:
        str: the kept tokens, each preceded by a single space (so a non-empty
        result starts with a space)
    '''
    strip_chars = r'[-_{}(),;:"#\/.¡!¿?·\[\]\'`‘’%0123456789…—\n]'
    pieces = []
    for token in nlp_en(data):
        # Removes the signs, converts to lower case, and keeps the lemma.
        cleaned = re.sub(strip_chars, '', token.lemma_.lower())
        if cleaned == '':
            continue
        pieces.append(' PBias' if cleaned in lex_en else ' ' + token.text)
    return ''.join(pieces)

# Add the masked sentence (Text1) to every partition, in the same order as before.
for partition in (train, val, test, test_SG2, test_Check):
    partition['Text1'] = partition.sentence.apply(do_mask)

In [None]:
# Preview the first training rows, now including the masked Text1 column.
train.head(5)

Unnamed: 0,sentence,label,Text1
2,Evolution is the source of the vast diversity ...,1,Evolution is the PBias of the PBias diversity...
8,Most Mellen books are in English ; many are al...,1,PBias Mellen PBias are PBias English many are...
16,"In 1953 , the Nehru government bowed to intens...",1,PBias the Nehru PBias PBias PBias PBias PBias...
20,The culture war ( or culture wars ) in America...,1,The PBias PBias or PBias PBias PBias PBias us...
38,After seeing the horse run on television the b...,1,After PBias the horse PBias PBias television ...


## Transformer

In [None]:
def tokenize_function(examples):
    """Tokenize each original sentence together with its masked version as a pair.

    Args:
        examples: batch with the columns "Texts" (original sentences) and
            "Text1" (masked sentences).

    Returns:
        The output of the module-level `tokenizer`, called with truncation
        enabled.
    """
    originals = examples["Texts"]
    masked = examples["Text1"]
    return tokenizer(originals, masked, truncation=True)

# One Dataset per partition, with the column names used by tokenize_function
# ("Texts") and by the loss / metrics ("labels").
datasets = DatasetDict({
    split: Dataset.from_pandas(frame.rename(columns={'sentence': 'Texts', 'label': 'labels'}))
    for split, frame in (
        ("train", train),
        ("dev", val),
        ("test", test),
        ("test_SG2", test_SG2),
        ("test_Check", test_Check),
    )
})

tokenized_datasets = datasets.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Expose only the model inputs and the labels, as torch tensors, in every split.
for split_dataset in tokenized_datasets.values():
    split_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

### Weighted Cross Entropy Loss

In [None]:
# Weight of a class = 1 - its share of the training rows, ordered by label value.
label_counts = train["label"].value_counts().sort_index()
class_weights = (1 - label_counts / len(train)).values
class_weights

array([0.50913838, 0.49086162])

In [None]:
# Convert the weights to a float32 tensor on the GPU when one is available,
# falling back to the CPU otherwise (a hard-coded "cuda" fails on CPU-only
# machines). torch.as_tensor also accepts an existing tensor, so re-running
# this cell does not fail.
class_weights = torch.as_tensor(class_weights, dtype=torch.float32).to(
    "cuda" if torch.cuda.is_available() else "cpu"
)#.to("mps")
print(class_weights)

class WeightedLossTrainer(Trainer):
    """Trainer that replaces the default loss with class-weighted cross entropy.

    The per-class weights come from the module-level `class_weights` tensor.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Run the model on `inputs` and return the weighted cross-entropy loss.

        Args:
            model: the model being trained or evaluated.
            inputs: batch mapping; its "labels" entry is used as the target.
            return_outputs: when true, return ``(loss, outputs)`` instead of
                only the loss.
            num_items_in_batch: accepted so the method can also be called by
                Trainer versions that pass this argument; it is not used here.

        Returns:
            The loss tensor, or ``(loss, outputs)`` when `return_outputs` is true.
        """
        outputs = model(**inputs)
        logits = outputs.get("logits")
        labels = inputs.get("labels")

        # Keep the weights on the same device as the logits so the loss also
        # works when the model does not run on the device the weights were
        # created on.
        loss_func = nn.CrossEntropyLoss(weight=class_weights.to(logits.device))
        loss = loss_func(logits, labels)

        return (loss, outputs) if return_outputs else loss

tensor([0.5091, 0.4909], device='cuda:0')


### Train with WIKIBIAS

In [None]:
def compute_metrics(eval_pred):
    """Score predictions with macro precision, recall and F1, plus accuracy.

    Args:
        eval_pred: pair ``(logits, labels)``; the predicted class is the
            argmax of the logits over the last axis.

    Returns:
        dict: the keys "precision", "recall", "f1" and "accuracy".

    NOTE(review): `load_metric` appears to be deprecated in recent `datasets`
    releases - confirm against the installed version.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    scores = {}
    # Precision, recall and F1 are macro-averaged over the classes.
    for name in ("precision", "recall", "f1"):
        metric = load_metric(name, trust_remote_code=True)
        result = metric.compute(predictions=predictions, references=labels, average="macro")
        scores[name] = result[name]

    accuracy_metric = load_metric("accuracy", trust_remote_code=True)
    scores["accuracy"] = accuracy_metric.compute(predictions=predictions, references=labels)["accuracy"]
    return scores

In [None]:
# Load and compile our model
# Use the GPU when one is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# DistilRoBERTa with a two-label classification head.
config = AutoConfig.from_pretrained("distilroberta-base", num_labels=2)
model = AutoModelForSequenceClassification.from_pretrained(
    "distilroberta-base",
    config=config,
)
model = model.to(device)

# Evaluation and checkpointing both happen once per epoch, and
# load_best_model_at_end is switched on.
training_args = TrainingArguments(
    output_dir="test_trainer",
    evaluation_strategy='epoch',
    learning_rate=3e-5,
    load_best_model_at_end=True,
    save_strategy="epoch",
    #weight_decay=0.01,
    num_train_epochs=3,
    seed=43,
)

# Train on the WikiBias train split and evaluate on its dev split with the
# class-weighted loss.
trainer = WeightedLossTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["dev"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
trainer.train()


## On our data

In [None]:
# Run the fine-tuned model on the WikiBias test split and display its metrics.
preds_output_test = trainer.predict(tokenized_datasets["test"])
preds_output_test.metrics

### SG2

In [None]:
# Run the fine-tuned model on the SG2 test split and display its metrics.
preds_output_SG2 = trainer.predict(tokenized_datasets["test_SG2"])
preds_output_SG2.metrics

### CheckThat!

In [None]:
# Run the fine-tuned model on the CheckThat! dev data and display its metrics.
preds_output_Check = trainer.predict(tokenized_datasets["test_Check"])
preds_output_Check.metrics