In [1]:
import numpy as np
import pandas as pd
import evaluate
import torch
import json
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, f1_score
from torch.nn import CrossEntropyLoss

from preprocessing import climate_fever_to_claim_evidence_pairs

In [2]:
# This should always output true now, but worth checking
print(f"is GPU available: {torch.cuda.is_available()}")

is GPU available: True


In [3]:
# The climate-fever dataset is in a format which the transformers Trainer does not understand
# It must be preprocessed using the functions in preprocessing.py

df = pd.read_json("/home/lukeg/Documents/VS_code/fine_tuning/lxg406/unsubstantiated_claims_classification/data/climate_fever/climate-fever-dataset-r1.jsonl", lines=True)
preprocessed_df = climate_fever_to_claim_evidence_pairs(df)


In [4]:
# Map evidence_labels to integers so that the Trainer will know what the labels mean
label_dict = {
    "REFUTES": 0,
    "NOT_ENOUGH_INFO": 1,
    "SUPPORTS": 2
}

preprocessed_df["labels"] = preprocessed_df["evidence_label"].map(label_dict)
preprocessed_df

Unnamed: 0,claim_id,claim,evidence_id,evidence_label,evidence,entropy,labels
0,0,Global warming is driving polar bears toward e...,Extinction risk from global warming:170,NOT_ENOUGH_INFO,"""Recent Research Shows Human Activity Driving ...",0.693147,1
1,0,Global warming is driving polar bears toward e...,Global warming:14,SUPPORTS,Environmental impacts include the extinction o...,0.000000,2
2,0,Global warming is driving polar bears toward e...,Global warming:178,NOT_ENOUGH_INFO,Rising temperatures push bees to their physiol...,0.693147,1
3,0,Global warming is driving polar bears toward e...,Habitat destruction:61,SUPPORTS,"Rising global temperatures, caused by the gree...",0.000000,2
4,0,Global warming is driving polar bears toward e...,Polar bear:1328,NOT_ENOUGH_INFO,"""Bear hunting caught in global warming debate"".",0.693147,1
...,...,...,...,...,...,...,...
7670,3134,"Over the last decade, heatwaves are five times...",Bushfires in Australia:126,SUPPORTS,Australia's climate has warmed by more than on...,0.000000,2
7671,3134,"Over the last decade, heatwaves are five times...",Effects of global warming:86,NOT_ENOUGH_INFO,"In the last 30–40 years, heat waves with high ...",0.693147,1
7672,3134,"Over the last decade, heatwaves are five times...",Global warming:155,NOT_ENOUGH_INFO,Many regions have probably already seen increa...,0.693147,1
7673,3134,"Over the last decade, heatwaves are five times...",Global warming:156,NOT_ENOUGH_INFO,"Since the 1950s, droughts and heat waves have ...",0.693147,1


In [5]:
dataset = Dataset.from_pandas(preprocessed_df)
dataset

Dataset({
    features: ['claim_id', 'claim', 'evidence_id', 'evidence_label', 'evidence', 'entropy', 'labels'],
    num_rows: 7675
})

In [6]:
# Shuffle the dataset! This randomly rearranges the dataset, which is good especially with this one since the same claim appears five times in a row
# The seed parameter means we can access the exact same shuffle again if we need to
dataset = dataset.shuffle(seed=12)

In [7]:
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilroberta-base")



In [8]:
# This splits our dataset so that we use 90% of it for training, and 10% for testing
split_dataset = dataset.train_test_split(test_size=0.1)

In [9]:
def custom_tokenize(examples):
    # The code block below this one can be used to find what the max_length should be set to.
    # Otherwise you have too much padding
    # Consider this properly later
    tokenized_output = tokenizer(
        text=[f"Claim: {claim} Evidence: {evidence}" for claim, evidence in zip(examples["claim"], examples["evidence"])],
        max_length=512, 
        padding="max_length", 
        truncation=True)

    return tokenized_output

tokenized_training_dataset = split_dataset["train"].map(custom_tokenize, batched=True)
tokenized_testing_dataset = split_dataset["test"].map(custom_tokenize, batched=True)

Map:   0%|          | 0/6907 [00:00<?, ? examples/s]

Map:   0%|          | 0/768 [00:00<?, ? examples/s]

In [10]:

# Just for viewing purposes. Input_ids are the tokens, and attention_masks are whether they represent actual words or not.
# The max_length is set to 512 so every entry has been padded to be this long, which may be unnecessary
print(tokenized_training_dataset[6]["claim"])
print(tokenized_training_dataset[6]["evidence"])
print(tokenized_training_dataset[6]["labels"])
print(tokenized_training_dataset[6]["input_ids"])
print(tokenized_training_dataset[6]["attention_mask"])

print(tokenized_testing_dataset[2]["claim"])
print(tokenized_testing_dataset[2]["evidence"])
print(tokenized_training_dataset[2]["labels"])
print(tokenized_testing_dataset[2]["input_ids"])
print(tokenized_testing_dataset[2]["attention_mask"])

While there has been a mean rise of a little more than 3mm per year worldwide since the 1990s, in the last decade, the NOAA Virginia Key tide gauge just south of Miami Beach has measured a 9mm rise annually.”
This network was used, in combination with satellite altimeter data, to establish that global mean sea-level rose 19.5 cm (7.7 in) between 1870 and 2004 at an average rate of about 1.44 mm/yr (1.7 mm/yr during the 20th century).
1
[0, 45699, 35, 616, 89, 34, 57, 10, 1266, 1430, 9, 10, 410, 55, 87, 155, 5471, 228, 76, 3612, 187, 5, 4525, 29, 6, 11, 5, 94, 2202, 6, 5, 28260, 2240, 4300, 13260, 12567, 95, 2077, 9, 2561, 2467, 34, 9550, 10, 361, 5471, 1430, 6333, 4, 17, 46, 27956, 35, 152, 1546, 21, 341, 6, 11, 4069, 19, 7595, 11838, 28266, 414, 6, 7, 5242, 14, 720, 1266, 3342, 12, 4483, 1458, 753, 4, 245, 50141, 13753, 36, 406, 4, 406, 50141, 179, 43, 227, 41102, 8, 4482, 23, 41, 674, 731, 9, 59, 112, 4, 3305, 50141, 5471, 73, 4503, 36, 134, 4, 406, 50141, 5471, 73, 4503, 148, 5, 291

In [11]:
# Remove ignore_mismatched_sizes when needed - this replaces the head of the pretrained model (because if using
# climateBERT/environmental-claims, it has already been fine tuned and has 2 labels
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilroberta-base",
    num_labels=3,
    # ignore_mismatched_sizes=True
).to("cuda")

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilbert/distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
print(next(model.parameters()).device)

cuda:0


In [13]:
model.gradient_checkpointing_enable()

In [14]:
def calculate_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    
    accuracy = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average="weighted")

    return {
        "accuracy": accuracy,
        "f1_score": f1,
    }

In [15]:
# This is where we set the hyperparameters
training_args = TrainingArguments(
    output_dir="./results/distilroberta-base/climate_fever/seed12",
    per_device_train_batch_size=8,
    num_train_epochs=3,
    evaluation_strategy="steps",
    eval_steps=600,
    logging_strategy="steps",
    logging_steps=200,
    save_strategy="no",
    save_steps=500,
    fp16=True,                          # Use 16-bit floating point instead of 32 - makes computation faster
    warmup_ratio=0.05,                    # Allows the model to adapt a little
    # gradient_accumulation_steps=2       # Might help with OOM errors, if we have them
    learning_rate=2e-5,
    push_to_hub=False,
)

In [16]:
trainer  = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_training_dataset,
    tokenizer=tokenizer,
    eval_dataset=tokenized_testing_dataset,
    compute_metrics=calculate_metrics,
)

In [17]:
trainer.train()

  0%|          | 0/2592 [00:00<?, ?it/s]

{'loss': 0.9495, 'grad_norm': 3.561037302017212, 'learning_rate': 1.9439480097481723e-05, 'epoch': 0.23}
{'loss': 0.8513, 'grad_norm': 14.175113677978516, 'learning_rate': 1.7814784727863527e-05, 'epoch': 0.46}
{'loss': 0.81, 'grad_norm': 19.386032104492188, 'learning_rate': 1.619821283509342e-05, 'epoch': 0.69}


  0%|          | 0/96 [00:00<?, ?it/s]

{'eval_loss': 0.7902180552482605, 'eval_accuracy': 0.64453125, 'eval_f1_score': 0.5930218605837773, 'eval_runtime': 9.3147, 'eval_samples_per_second': 82.45, 'eval_steps_per_second': 10.306, 'epoch': 0.69}
{'loss': 0.7507, 'grad_norm': 17.57859230041504, 'learning_rate': 1.4581640942323316e-05, 'epoch': 0.93}
{'loss': 0.6726, 'grad_norm': 13.372920036315918, 'learning_rate': 1.295694557270512e-05, 'epoch': 1.16}
{'loss': 0.635, 'grad_norm': 22.25615119934082, 'learning_rate': 1.1332250203086923e-05, 'epoch': 1.39}


  0%|          | 0/96 [00:00<?, ?it/s]

{'eval_loss': 0.7770288586616516, 'eval_accuracy': 0.67578125, 'eval_f1_score': 0.6746538035175039, 'eval_runtime': 9.5018, 'eval_samples_per_second': 80.827, 'eval_steps_per_second': 10.103, 'epoch': 1.39}
{'loss': 0.6383, 'grad_norm': 25.428762435913086, 'learning_rate': 9.707554833468725e-06, 'epoch': 1.62}
{'loss': 0.6432, 'grad_norm': 13.639143943786621, 'learning_rate': 8.082859463850529e-06, 'epoch': 1.85}
{'loss': 0.5674, 'grad_norm': 17.662887573242188, 'learning_rate': 6.458164094232332e-06, 'epoch': 2.08}


  0%|          | 0/96 [00:00<?, ?it/s]

{'eval_loss': 0.7503314018249512, 'eval_accuracy': 0.6848958333333334, 'eval_f1_score': 0.6879256727309818, 'eval_runtime': 9.3665, 'eval_samples_per_second': 81.994, 'eval_steps_per_second': 10.249, 'epoch': 2.08}
{'loss': 0.5097, 'grad_norm': 51.94365692138672, 'learning_rate': 4.833468724614135e-06, 'epoch': 2.31}
{'loss': 0.5102, 'grad_norm': 26.693740844726562, 'learning_rate': 3.2087733549959386e-06, 'epoch': 2.55}
{'loss': 0.4908, 'grad_norm': 3.2292091846466064, 'learning_rate': 1.5840779853777418e-06, 'epoch': 2.78}


  0%|          | 0/96 [00:00<?, ?it/s]

{'eval_loss': 0.7299189567565918, 'eval_accuracy': 0.7083333333333334, 'eval_f1_score': 0.7030141727490039, 'eval_runtime': 9.4279, 'eval_samples_per_second': 81.46, 'eval_steps_per_second': 10.183, 'epoch': 2.78}
{'train_runtime': 1118.6146, 'train_samples_per_second': 18.524, 'train_steps_per_second': 2.317, 'train_loss': 0.6568070959161829, 'epoch': 3.0}


TrainOutput(global_step=2592, training_loss=0.6568070959161829, metrics={'train_runtime': 1118.6146, 'train_samples_per_second': 18.524, 'train_steps_per_second': 2.317, 'total_flos': 2744905918178304.0, 'train_loss': 0.6568070959161829, 'epoch': 3.0})

In [18]:
# Metrics are not included in the save model so we need to save them separately
metrics = trainer.evaluate()
with open("./results/distilroberta-base/climate_fever/seed12/eval_metrics.json", "w") as output_file:
    json.dump(metrics, output_file)

  0%|          | 0/96 [00:00<?, ?it/s]