In [1]:
from datasets import Dataset
import json
from pathlib import Path
import random

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# 1-based index of the topic group to work on. Later cells read split[group-1],
# load evidence from ../retrieval/{group}/ and write checkpoints to ./model/{group}.
group = 2

In [3]:
# Parse the split definition file; later cells index the result as
# split[group-1] to obtain the topics belonging to the selected group.
with Path('../retriever/dataset/split.json').open('r') as file:
    split = json.load(file)

In [4]:
# Maps each label directory name to its integer class id. 'false' and 'on_fire'
# both map to 0, so the six label names collapse into five classes (0-4).
ref = {'false': 0, 'on_fire': 0, 'mostly_false': 1, 'half_true': 2, 'mostly_true': 3, 'true': 4}

In [5]:
SAMPLE_SEED = 42         # seeds both the evidence sub-sampling and the dataset shuffle
TRAIN_KEEP_PROB = 0.5    # fraction of training evidence files kept
OVERSAMPLED_LABEL = 4    # class id ('true') whose examples are added twice


def load_json(path):
    """Read and parse one JSON file, decoding it as UTF-8."""
    with open(path, 'r', encoding='utf-8') as file:
        return json.load(file)


def build_examples(part, keep_prob=1.0, rng=None, oversample_label=OVERSAMPLED_LABEL):
    """Collect claim/evidence classification examples for one dataset part.

    For every topic in ``split[group-1]`` and every label name in ``ref``,
    each item's claim (from ``info.json``) is paired with each of its
    retrieved evidence files to form one example.

    Args:
        part: Directory name of the dataset part, e.g. ``'train'`` or ``'test'``.
        keep_prob: Probability of keeping each evidence file; ``1.0`` keeps all.
        rng: Object with a ``random()`` method used for sub-sampling. Falls
            back to the global ``random`` module when omitted.
        oversample_label: Class id whose examples are appended twice;
            ``None`` disables the duplication.

    Returns:
        A list of ``{'text': str, 'label': int}`` dicts.
    """
    rng = rng or random
    examples = []
    for topic in split[group-1]:
        for label in ref.keys():
            label_dir = Path(f'../retrieval/{group}/{part}/{topic}/{label}')
            # Items are addressed by number: assumes the entries under each
            # label directory are named 1..N, as the directory count implies.
            num = len(list(label_dir.glob('*')))
            for idx in range(1, num + 1):
                info = load_json(f'../dataset/{part}/{topic}/{label}/{idx}/info.json')
                claim = info['claim']
                # Sorted so that, with a seeded rng, the same files are
                # selected on every run regardless of filesystem order.
                for evidence_path in sorted((label_dir / str(idx)).glob('*')):
                    if keep_prob < 1.0 and rng.random() >= keep_prob:
                        continue
                    evidence = load_json(evidence_path)
                    text = ''.join(str(e) + ' ' for e in evidence)
                    datapoint = {'text': f'Claim: {claim}\nReference: {text}', 'label': ref[label]}
                    examples.append(datapoint)
                    if oversample_label is not None and datapoint['label'] == oversample_label:
                        examples.append(datapoint)
    return examples


# Training set: keep roughly half of the evidence files, chosen reproducibly.
train_data = build_examples('train', keep_prob=TRAIN_KEEP_PROB, rng=random.Random(SAMPLE_SEED))
print(len(train_data))
# Dataset.shuffle returns a new dataset, so the result has to be kept.
train_dataset = Dataset.from_list(train_data).shuffle(seed=SAMPLE_SEED)

# Evaluation set: every evidence file from the 'test' part.
# NOTE(review): label-4 rows are duplicated here as well (as in the original
# code), which counts that class twice in the evaluation metrics -- confirm
# this is intended before comparing scores with other work.
eval_data = build_examples('test')
eval_dataset = Dataset.from_list(eval_data)

5912


In [6]:
from transformers import AutoTokenizer

# Tokenizer for the same "xlm-roberta-base" checkpoint that is fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

In [7]:
def preprocess_function(examples):
    """Tokenize the "text" field of a batch of examples with truncation enabled.

    No padding is requested here; the tokenizer output is returned unchanged.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

# Tokenize both datasets in batched mode, train first and then eval.
tokenized_train, tokenized_eval = (
    dataset.map(preprocess_function, batched=True)
    for dataset in (train_dataset, eval_dataset)
)

Map: 100%|██████████| 5912/5912 [00:01<00:00, 5153.15 examples/s]
Map: 100%|██████████| 3950/3950 [00:00<00:00, 8173.76 examples/s]


In [8]:
from transformers import DataCollatorWithPadding

# preprocess_function tokenizes without padding, so padding is left to this
# collator, which is handed to the Trainer below.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

import evaluate

# Accuracy metric object; compute_metrics calls accuracy.compute(...) on it.
accuracy = evaluate.load("accuracy")

import numpy as np
from sklearn.metrics import f1_score

def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 for one evaluation pass.

    Args:
        eval_pred: A pair that unpacks into (predictions, labels); the class
            for each row is taken as the argmax of predictions along axis 1.

    Returns:
        The result of ``accuracy.compute`` with an added "macro_f1" entry.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=1)

    results = accuracy.compute(predictions=predicted, references=gold)
    results["macro_f1"] = f1_score(y_true=gold, y_pred=predicted, average='macro')
    return results

In [9]:
# Class names in class-id order; both lookup tables are derived from this one
# list so they cannot disagree with each other.
id2label = dict(enumerate(['false', 'mostly_false', 'half_true', 'mostly_true', 'true']))
label2id = {name: class_id for class_id, name in id2label.items()}

In [10]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Load the pretrained checkpoint with a sequence-classification head. The
# number of output classes is taken from id2label so the head size and the
# label maps cannot drift apart if the label set is edited.
model = AutoModelForSequenceClassification.from_pretrained(
    "xlm-roberta-base", num_labels=len(id2label), id2label=id2label, label2id=label2id
)

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# Fine-tuning configuration; checkpoints are written under ./model/<group>.
training_args = TrainingArguments(
    output_dir=f"./model/{group}",
    # optimisation
    num_train_epochs=3,
    learning_rate=1e-5,
    weight_decay=0.01,
    # batching
    per_device_train_batch_size=24,
    per_device_eval_batch_size=24,
    # evaluate and save once per epoch, then reload the best checkpoint
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

# NOTE(review): tokenized_eval is built from the 'test' directories, so with
# load_best_model_at_end the checkpoint appears to be selected on the test
# split -- confirm whether a separate validation split should be used.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    compute_metrics=compute_metrics,
)

trainer.train()

You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy,Macro F1
1,No log,1.503745,0.353924,0.154914
2,No log,1.469843,0.373165,0.267005
3,1.516200,1.488583,0.371899,0.281827


TrainOutput(global_step=741, training_loss=1.455687611691865, metrics={'train_runtime': 479.0578, 'train_samples_per_second': 37.023, 'train_steps_per_second': 1.547, 'total_flos': 4666663374741504.0, 'train_loss': 1.455687611691865, 'epoch': 3.0})

In [12]:
# from transformers import pipeline
# text = "Claim: The Affordable Care Act is national law.\nReference: The Affordable Care Act was signed by president Obama in 2010."
# classifier = pipeline("sentiment-analysis", model="./model/1/checkpoint-806/")
# classifier(text)