In [1]:
import json
import os

import evaluate
import numpy as np
import pandas as pd
import torch
from datasets import Dataset, DatasetDict
from sklearn.metrics import accuracy_score, f1_score
from torch.nn import CrossEntropyLoss
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

from preprocessing import climate_fever_to_claim_evidence_pairs

In [2]:
# Sanity check before any training: this is expected to report True, but confirm that PyTorch can see a CUDA device
cuda_ready = torch.cuda.is_available()
print(f"GPU available: {cuda_ready}")

GPU available: True


In [3]:
# The climate-fever dataset is in a format which the transformers Trainer does not understand.
# It must be preprocessed using the functions in preprocessing.py.
#
# The file location defaults to the original author's checkout; set the CLIMATE_FEVER_PATH
# environment variable to run this notebook against a copy stored somewhere else.
climate_fever_path = os.environ.get(
    "CLIMATE_FEVER_PATH",
    "/home/lukeg/Documents/VS_code/fine_tuning/lxg406/unsubstantiated_claims_classification/data/climate_fever/climate-fever-dataset-r1.jsonl",
)

# The file is JSON Lines (one JSON record per line), hence lines=True
df = pd.read_json(climate_fever_path, lines=True)
preprocessed_df = climate_fever_to_claim_evidence_pairs(df)


In [4]:
# Map evidence_labels to integer class ids so that the Trainer receives numeric labels
label_dict = {
    "REFUTES": 0,
    "NOT_ENOUGH_INFO": 1,
    "SUPPORTS": 2
}

preprocessed_df["labels"] = preprocessed_df["evidence_label"].map(label_dict)

# Series.map() silently yields NaN for any value that is missing from label_dict.
# Fail loudly here rather than letting NaN labels reach the loss function.
unmapped_labels = preprocessed_df.loc[preprocessed_df["labels"].isna(), "evidence_label"].unique()
if len(unmapped_labels) > 0:
    raise ValueError(f"evidence_label values missing from label_dict: {sorted(map(str, unmapped_labels))}")

preprocessed_df

Unnamed: 0,claim_id,claim,evidence_id,evidence_label,evidence,entropy,labels
0,0,Global warming is driving polar bears toward e...,Extinction risk from global warming:170,NOT_ENOUGH_INFO,"""Recent Research Shows Human Activity Driving ...",0.693147,1
1,0,Global warming is driving polar bears toward e...,Global warming:14,SUPPORTS,Environmental impacts include the extinction o...,0.000000,2
2,0,Global warming is driving polar bears toward e...,Global warming:178,NOT_ENOUGH_INFO,Rising temperatures push bees to their physiol...,0.693147,1
3,0,Global warming is driving polar bears toward e...,Habitat destruction:61,SUPPORTS,"Rising global temperatures, caused by the gree...",0.000000,2
4,0,Global warming is driving polar bears toward e...,Polar bear:1328,NOT_ENOUGH_INFO,"""Bear hunting caught in global warming debate"".",0.693147,1
...,...,...,...,...,...,...,...
7670,3134,"Over the last decade, heatwaves are five times...",Bushfires in Australia:126,SUPPORTS,Australia's climate has warmed by more than on...,0.000000,2
7671,3134,"Over the last decade, heatwaves are five times...",Effects of global warming:86,NOT_ENOUGH_INFO,"In the last 30–40 years, heat waves with high ...",0.693147,1
7672,3134,"Over the last decade, heatwaves are five times...",Global warming:155,NOT_ENOUGH_INFO,Many regions have probably already seen increa...,0.693147,1
7673,3134,"Over the last decade, heatwaves are five times...",Global warming:156,NOT_ENOUGH_INFO,"Since the 1950s, droughts and heat waves have ...",0.693147,1


In [5]:
# Wrap the labelled DataFrame in a Hugging Face Dataset, which is the type shuffled, split and tokenized in the cells below
dataset = Dataset.from_pandas(preprocessed_df)
dataset

Dataset({
    features: ['claim_id', 'claim', 'evidence_id', 'evidence_label', 'evidence', 'entropy', 'labels'],
    num_rows: 7675
})

In [6]:
# Shuffle the rows. This matters for this dataset because the same claim appears five times in a row.
# The fixed seed means the exact same shuffle can be reproduced later.
# Note: `dataset` is reassigned, so re-running this cell on its own shuffles the already-shuffled rows again.
dataset = dataset.shuffle(seed=12)

In [7]:
# Load the tokenizer published with the same checkpoint name that the model cell loads further down
tokenizer = AutoTokenizer.from_pretrained("climatebert/distilroberta-base-climate-f")



In [8]:
# Hold out 10% of the claim/evidence pairs for testing and train on the remaining 90%.
# train_test_split shuffles again by default, so it needs its own seed; without one the
# train/test membership (and therefore every reported metric) changes on each kernel restart.
# NOTE(review): rows are split individually, so evidence pairs belonging to the same claim can
# land in both train and test - consider a split grouped on claim_id if that leakage matters.
split_dataset = dataset.train_test_split(test_size=0.1, seed=12)

In [9]:
def custom_tokenize(examples):
    """Tokenize a batch of claim/evidence pairs for the classifier.

    Each pair is rendered as the single string "Claim: <claim> Evidence: <evidence>"
    and passed to the notebook-level `tokenizer`.

    Args:
        examples: A batch as supplied by `Dataset.map(..., batched=True)`; must provide
            parallel sequences under the "claim" and "evidence" keys.

    Returns:
        Whatever the tokenizer returns for the batch of rendered strings.
    """
    paired_texts = []
    for claim_text, evidence_text in zip(examples["claim"], examples["evidence"]):
        paired_texts.append(f"Claim: {claim_text} Evidence: {evidence_text}")

    # Every sequence is padded (or truncated) to exactly 512 tokens.
    # TODO: pick max_length from the real token-length distribution - 512 means a lot of padding.
    return tokenizer(
        text=paired_texts,
        max_length=512,
        padding="max_length",
        truncation=True,
    )

# Tokenize both splits in batches: the training split first, then the testing split
tokenized_training_dataset, tokenized_testing_dataset = (
    split_dataset[split_name].map(custom_tokenize, batched=True)
    for split_name in ("train", "test")
)

Map:   0%|          | 0/6907 [00:00<?, ? examples/s]

Map:   0%|          | 0/768 [00:00<?, ? examples/s]

In [10]:

# Just for viewing purposes: show one training example and one testing example.
# input_ids are the token ids; attention_mask marks which positions are real tokens rather than padding.
# max_length is set to 512, so every entry has been padded to that length, which may be unnecessary.
preview_fields = ("claim", "evidence", "labels", "input_ids", "attention_mask")

for preview_split, row_index in ((tokenized_training_dataset, 6), (tokenized_testing_dataset, 2)):
    # Every field is read from the same split and the same row, so the printed label always
    # belongs to the printed claim and evidence
    preview_row = preview_split[row_index]
    for field_name in preview_fields:
        print(preview_row[field_name])

“The global reef crisis does not necessarily mean extinction for coral species.
The Cretaceous–Paleogene (K–Pg) extinction event, also known as the Cretaceous–Tertiary (K–T) extinction, was a sudden mass extinction of three-quarters of the plant and animal species on Earth, approximately 66 million years ago.
1
[0, 45699, 35, 1437, 50305, 133, 720, 28350, 1437, 50499, 473, 45, 4784, 1266, 23989, 13, 23491, 4707, 4, 27956, 35, 20, 230, 4903, 40576, 50300, 510, 1627, 44288, 36, 530, 50300, 47396, 43, 23989, 515, 6, 67, 684, 25, 5, 230, 4903, 40576, 50300, 565, 2399, 17174, 36, 530, 50300, 565, 43, 23989, 6, 21, 10, 7207, 2862, 23989, 9, 130, 12, 21899, 9, 5, 2195, 8, 3477, 4707, 15, 3875, 6, 2219, 5138, 50141, 4416, 107, 536, 4, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [11]:
# num_labels=3 requests a 3-way classification head (REFUTES / NOT_ENOUGH_INFO / SUPPORTS).
# ignore_mismatched_sizes=True is only needed when starting from a checkpoint that has already been
# fine-tuned with a different number of labels (e.g. climateBERT/environmental-claims, which has 2 labels):
# it lets the existing head be replaced. Leave it commented out for this base checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(
    "climatebert/distilroberta-base-climate-f",
    num_labels=3,
    # ignore_mismatched_sizes=True
).to("cuda" if torch.cuda.is_available() else "cpu")  # fall back to CPU instead of crashing when no GPU is visible

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at climatebert/distilroberta-base-climate-f and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# Confirm where the model lives by printing the device of its first parameter
print(next(model.parameters()).device)

cuda:0


In [13]:
# Turn on gradient checkpointing before training.
# Per the transformers documentation this lowers peak memory use at the cost of a slower backward pass.
model.gradient_checkpointing_enable()

In [14]:
def calculate_metrics(eval_pred):
    """Compute accuracy and weighted F1 for one evaluation pass.

    Args:
        eval_pred: A pair `(logits, labels)`; the predicted class for each row is the
            index of the largest logit along axis 1.

    Returns:
        A dict with the keys "accuracy" and "f1_score".
    """
    raw_scores, gold_labels = eval_pred
    predicted_labels = np.argmax(raw_scores, axis=1)

    return {
        "accuracy": accuracy_score(gold_labels, predicted_labels),
        "f1_score": f1_score(gold_labels, predicted_labels, average="weighted"),
    }

In [None]:
# This is where we set the hyperparameters for the fine-tuning run
training_args = TrainingArguments(
    output_dir="./results/climateBERT-base/climate_fever/seed12",
    per_device_train_batch_size=8,
    num_train_epochs=3,
    # NOTE(review): newer transformers releases rename this argument to eval_strategy - confirm against the installed version
    evaluation_strategy="steps",
    eval_steps=600,
    logging_strategy="steps",
    logging_steps=200,
    # No checkpoints are written during training. save_steps is only consulted when
    # save_strategy="steps", so it is deliberately not set here.
    save_strategy="no",
    fp16=torch.cuda.is_available(),      # Use 16-bit floating point instead of 32 when a GPU is present - makes computation faster
    warmup_ratio=0.05,                   # Warm the learning rate up over the first 5% of steps - allows the model to adapt a little
    # gradient_accumulation_steps=2,     # Might help with OOM errors, if we have them
    learning_rate=2e-5,
    push_to_hub=False,
)

In [16]:
# Bundle the model, its hyperparameters, both tokenized splits, the tokenizer and the metric function into a Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_training_dataset,
    eval_dataset=tokenized_testing_dataset,
    tokenizer=tokenizer,
    compute_metrics=calculate_metrics,
)

In [17]:
# Run fine-tuning. With the arguments above, training loss is logged every 200 steps and the test split is evaluated every 600 steps
trainer.train()

  0%|          | 0/2592 [00:00<?, ?it/s]

{'loss': 0.9186, 'grad_norm': 5.303847312927246, 'learning_rate': 1.9439480097481723e-05, 'epoch': 0.23}
{'loss': 0.8546, 'grad_norm': 6.42067813873291, 'learning_rate': 1.7814784727863527e-05, 'epoch': 0.46}
{'loss': 0.7843, 'grad_norm': 11.562832832336426, 'learning_rate': 1.619008935824533e-05, 'epoch': 0.69}


  0%|          | 0/96 [00:00<?, ?it/s]

{'eval_loss': 0.8134475350379944, 'eval_accuracy': 0.6302083333333334, 'eval_f1_score': 0.5972580598031295, 'eval_runtime': 9.5873, 'eval_samples_per_second': 80.106, 'eval_steps_per_second': 10.013, 'epoch': 0.69}
{'loss': 0.7752, 'grad_norm': 7.913565635681152, 'learning_rate': 1.4573517465475225e-05, 'epoch': 0.93}
{'loss': 0.6969, 'grad_norm': 10.367363929748535, 'learning_rate': 1.2948822095857027e-05, 'epoch': 1.16}
{'loss': 0.6016, 'grad_norm': 10.25446605682373, 'learning_rate': 1.132412672623883e-05, 'epoch': 1.39}


  0%|          | 0/96 [00:00<?, ?it/s]

{'eval_loss': 0.7236738204956055, 'eval_accuracy': 0.6979166666666666, 'eval_f1_score': 0.6920587485886577, 'eval_runtime': 9.5913, 'eval_samples_per_second': 80.073, 'eval_steps_per_second': 10.009, 'epoch': 1.39}
{'loss': 0.6342, 'grad_norm': 18.000932693481445, 'learning_rate': 9.707554833468725e-06, 'epoch': 1.62}
{'loss': 0.614, 'grad_norm': 16.814802169799805, 'learning_rate': 8.082859463850529e-06, 'epoch': 1.85}
{'loss': 0.5569, 'grad_norm': 8.792628288269043, 'learning_rate': 6.458164094232332e-06, 'epoch': 2.08}


  0%|          | 0/96 [00:00<?, ?it/s]

{'eval_loss': 0.7637662887573242, 'eval_accuracy': 0.68359375, 'eval_f1_score': 0.6919419212686071, 'eval_runtime': 9.6335, 'eval_samples_per_second': 79.722, 'eval_steps_per_second': 9.965, 'epoch': 2.08}
{'loss': 0.488, 'grad_norm': 10.382074356079102, 'learning_rate': 4.833468724614135e-06, 'epoch': 2.31}
{'loss': 0.4654, 'grad_norm': 23.23374366760254, 'learning_rate': 3.2087733549959386e-06, 'epoch': 2.55}
{'loss': 0.495, 'grad_norm': 9.496102333068848, 'learning_rate': 1.5840779853777418e-06, 'epoch': 2.78}


  0%|          | 0/96 [00:00<?, ?it/s]

{'eval_loss': 0.7427298426628113, 'eval_accuracy': 0.7096354166666666, 'eval_f1_score': 0.7133817042901426, 'eval_runtime': 7.0685, 'eval_samples_per_second': 108.651, 'eval_steps_per_second': 13.581, 'epoch': 2.78}
{'train_runtime': 1094.5098, 'train_samples_per_second': 18.932, 'train_steps_per_second': 2.368, 'train_loss': 0.644239643473684, 'epoch': 3.0}


TrainOutput(global_step=2592, training_loss=0.644239643473684, metrics={'train_runtime': 1094.5098, 'train_samples_per_second': 18.932, 'train_steps_per_second': 2.368, 'total_flos': 2744905918178304.0, 'train_loss': 0.644239643473684, 'epoch': 3.0})

In [18]:
# trainer.save_model("./results/climateBERT-base/climate_fever/first_run")
# Please remember to delete model.safetensors BEFORE adding to git. Causes issues...
# Also it is probably not worth running this block until the model is worth keeping

In [19]:
# Metrics are not included in the saved model, so run a final evaluation here and keep the result to save separately
metrics = trainer.evaluate()

  0%|          | 0/96 [00:00<?, ?it/s]

In [20]:
# Write the final evaluation metrics to a JSON file inside this run's results directory.
# Checkpoint saving is disabled for this run, so the directory is not guaranteed to exist yet:
# create it first so that open() cannot fail with FileNotFoundError.
metrics_path = "./results/climateBERT-base/climate_fever/seed12/eval_metrics.json"
os.makedirs(os.path.dirname(metrics_path), exist_ok=True)

with open(metrics_path, "w") as output_file:
    json.dump(metrics, output_file)