In [None]:
import numpy as np
import pandas as pd
import evaluate
import torch
import json
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, f1_score
from torch.nn import CrossEntropyLoss

from preprocessing import climate_fever_to_claim_evidence_pairs


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.3.0 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/home/lukeg/ClimatEnv/lib/python3.12/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/home/lukeg/ClimatEnv/lib/python3.12/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/home/lukeg/ClimatEnv/lib/python3.12/site-packages/ipykernel/kernelapp.py", line 739, in start
    self.io_loop.start()
  File "/home/lukeg/Cl

In [None]:
# Confirm that PyTorch can see a CUDA device before any model is loaded.
# This should always report True on the configured machine, but it is worth checking.
print("is GPU available: {}".format(torch.cuda.is_available()))

is GPU available: True


In [None]:
# The raw climate-fever records are not in a format the transformers Trainer understands,
# so they are first passed through the helper imported from preprocessing.py.
sample_path = "data/climate_fever/climate-fever-dataset-sample.jsonl"
df = pd.read_json(sample_path, lines=True)
preprocessed_df = climate_fever_to_claim_evidence_pairs(df)


Dataset({
    features: ['claim_id', 'claim', 'claim_label', 'evidences'],
    num_rows: 100
})

In [None]:
# Map evidence_labels to integers so that the Trainer will know what the labels mean
label_dict = {
    "REFUTES": 0,
    "NOT_ENOUGH_INFO": 1,
    "SUPPORTS": 2
}

# Series.map silently produces NaN for any value missing from label_dict, which would turn
# the column into floats and only surface later as a confusing error during training.
# Fail early, naming the offending values, and store the result as plain integers.
mapped_labels = preprocessed_df["evidence_label"].map(label_dict)
unmapped_mask = mapped_labels.isna()
if unmapped_mask.any():
    unknown_labels = sorted(
        preprocessed_df.loc[unmapped_mask, "evidence_label"].astype(str).unique()
    )
    raise ValueError(f"Unmapped evidence_label values: {unknown_labels}")

preprocessed_df["labels"] = mapped_labels.astype("int64")
preprocessed_df

DF
    claim_id                                              claim claim_label  \
0         0  Global warming is driving polar bears toward e...    SUPPORTS   
1         5  The sun has gone into ‘lockdown’ which could c...    SUPPORTS   
2         6        The polar bear population has been growing.     REFUTES   
3         9  Ironic' study finds more CO2 has slightly cool...     REFUTES   
4        10  Human additions of CO2 are in the margin of er...     REFUTES   

                                           evidences  
0  [{'evidence_id': 'Extinction risk from global ...  
1  [{'evidence_id': 'Famine:386', 'evidence_label...  
2  [{'evidence_id': 'Polar bear:1332', 'evidence_...  
3  [{'evidence_id': 'Atmosphere of Mars:131', 'ev...  
4  [{'evidence_id': 'Carbon dioxide in Earth's at...  

EXPLODED
    claim_id                                              claim claim_label  \
0         0  Global warming is driving polar bears toward e...    SUPPORTS   
1         0  Global warming is

AttributeError: 'Dataset' object has no attribute 'columns'

In [None]:
# Convert the pandas frame into a Hugging Face Dataset.
# preserve_index=False keeps a non-default pandas index (e.g. one left behind by exploding
# rows during preprocessing — TODO confirm in preprocessing.py) from being stored as an
# extra "__index_level_0__" column in the dataset.
dataset = Dataset.from_pandas(preprocessed_df, preserve_index=False)
dataset

In [None]:
# Load the tokenizer that belongs to the ClimateBERT checkpoint.
tokenizer_checkpoint = "climatebert/distilroberta-base-climate-f"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)



In [None]:
# This splits our dataset so that we use 90% of it for training, and 10% for testing.
# A fixed seed keeps the split identical across kernel restarts, so metrics from
# different runs are measured on the same held-out examples.
SPLIT_SEED = 42
split_dataset = dataset.train_test_split(test_size=0.1, seed=SPLIT_SEED)

In [None]:
def custom_tokenize(examples):
    """Tokenize a batch of claim/evidence pairs.

    Each pair is merged into a single string of the form
    "Claim: <claim> Evidence: <evidence>" and handed to the module-level
    `tokenizer`, truncating and padding every sequence to 512 tokens.

    TODO: 512 probably means far too much padding; the max-token-length cell
    further down can be used to choose a tighter max_length.

    Args:
        examples: batch mapping with parallel "claim" and "evidence" sequences.

    Returns:
        Whatever the tokenizer returns for the batch.
    """
    paired_texts = []
    for claim_text, evidence_text in zip(examples["claim"], examples["evidence"]):
        paired_texts.append(f"Claim: {claim_text} Evidence: {evidence_text}")

    return tokenizer(
        text=paired_texts,
        max_length=512,
        padding="max_length",
        truncation=True,
    )

# Tokenize both halves of the split in batches (train first, then test).
tokenized_training_dataset, tokenized_testing_dataset = (
    split_dataset[split_name].map(custom_tokenize, batched=True)
    for split_name in ("train", "test")
)

Map:   0%|          | 0/90 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

more than 100 per cent of the warming over the past century is due to human actions
SUPPORTS
SUPPORTS
[0, 4321, 87, 727, 228, 715, 9, 5, 8232, 81, 5, 375, 3220, 16, 528, 7, 1050, 2163, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Human-produced carbon might be one of the factors [of climate change], but there’s simply no evidence that it is a significant one.
REFUTES
SUPPORTS
[0, 33837, 12, 25617, 4363, 429, 28, 65, 9, 5, 1437, 50292, 646, 1116, 2147, 464, 7479, 53, 89, 50267, 29, 1622, 117, 1

In [None]:

# Just for viewing purposes. input_ids are the tokens, and attention_mask marks whether each
# position is a real token (1) or padding (0).
# The max_length is set to 512 so every entry has been padded to be this long, which may be unnecessary.
#
# Every field of a preview is read from the same row of the same split, so the printed label
# always belongs to the claim/evidence shown with it.
preview_fields = ("claim", "evidence", "labels", "input_ids", "attention_mask")

for preview_row in (tokenized_training_dataset[6], tokenized_testing_dataset[2]):
    for field in preview_fields:
        print(preview_row[field])

In [None]:
# Code to find the max token length in the dataset
# Please reset max_length and padding parameters in above code cell before running this

# The result gets its own name: assigning it to `max` would shadow the builtin for every
# cell that runs afterwards in this kernel.
lengths = [len(tokens) for tokens in tokenized_training_dataset["input_ids"]]
max_token_length = max(lengths, default=0)  # 0 when the dataset is empty
max_token_length

100

In [None]:
# Enable ignore_mismatched_sizes when switching to a checkpoint that has already been
# fine-tuned with a different number of labels (e.g. climateBERT/environmental-claims,
# which has 2): it lets the pretrained classification head be replaced.
#
# Fall back to the CPU when no CUDA device is visible so this cell does not crash on
# machines without a GPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModelForSequenceClassification.from_pretrained(
    "climatebert/distilroberta-base-climate-f",
    num_labels=3,
    # ignore_mismatched_sizes=True
).to(device)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at climatebert/distilroberta-base-climate-f and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Show which device the model's parameters live on.
first_parameter = next(iter(model.parameters()))
print(first_parameter.device)

cuda:0


In [None]:
def calculate_metrics(eval_pred):
    """Compute evaluation metrics from a (logits, labels) pair.

    Args:
        eval_pred: two-item sequence ``(logits, labels)``. ``logits`` is indexed as
            (n_examples, n_classes); ``labels`` holds one integer class id per example.

    Returns:
        dict with the keys "accuracy", "f1_score" (weighted average over classes)
        and "cross_entropy_loss" (a Python float).
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)

    accuracy = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average="weighted")

    # CrossEntropyLoss is a module: it has to be instantiated first and then called on the
    # tensors. Passing the tensors to the constructor does not compute a loss.
    loss_fn = CrossEntropyLoss()
    cross_entropy_loss = loss_fn(
        torch.tensor(logits, dtype=torch.float32),
        torch.tensor(labels, dtype=torch.long),
    ).item()

    return {
        "accuracy": accuracy,
        "f1_score": f1,
        "cross_entropy_loss": cross_entropy_loss,
    }

In [None]:
# This is where we set the hyperparameters
# NOTE(review): newer transformers releases rename `evaluation_strategy` to `eval_strategy`;
# confirm which spelling the installed version expects.
training_args = TrainingArguments(
    output_dir="./results/climateBERT-base/climate_fever_sample/check",
    per_device_train_batch_size=4,
    num_train_epochs=5,
    evaluation_strategy="steps",
    eval_steps=50,
    logging_strategy="steps",
    logging_steps=10,
    save_strategy="no",
    # save_steps=50,
    # Use 16-bit floating point instead of 32 - makes computation faster.
    # Only enabled when a CUDA device is present, so the cell also runs on CPU-only machines.
    fp16=torch.cuda.is_available(),
    warmup_ratio=0.1,                    # Allows the model to adapt a little

    learning_rate=2e-5,
    push_to_hub=False,
)

In [None]:
# Gather everything the Trainer needs: the model, the hyperparameters, both tokenized
# splits, the tokenizer and the metric function.
trainer_components = dict(
    model=model,
    args=training_args,
    train_dataset=tokenized_training_dataset,
    eval_dataset=tokenized_testing_dataset,
    tokenizer=tokenizer,
    compute_metrics=calculate_metrics,
)
trainer = Trainer(**trainer_components)

In [None]:
# Start fine-tuning with the Trainer built in the previous cell.
trainer.train()

  0%|          | 0/69 [00:00<?, ?it/s]

ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`labels` in this case) have excessive nesting (inputs type `list` where type `int` is expected).

In [None]:
# Write the fine-tuned model into the run's results directory.
# Please remember to delete model.safetensors BEFORE adding to git. Causes issues...
saved_model_dir = "./results/climateBERT-base/climate_fever_sample/check"
trainer.save_model(saved_model_dir)

In [None]:
# Run a final evaluation pass and store the resulting metric values.
# The values come from trainer.evaluate(): handing the calculate_metrics function itself
# to json.dump raises TypeError, because a function object is not JSON serializable.
eval_metrics = trainer.evaluate()
with open("./results/climateBERT-base/climate_fever_sample/check/eval_metrics.json", "w") as output_file:
    json.dump(eval_metrics, output_file)