In [1]:
import json
from pathlib import Path

import evaluate
import numpy as np
import pandas as pd
import torch
from datasets import Dataset, DatasetDict
from sklearn.metrics import accuracy_score, f1_score
from torch.nn import CrossEntropyLoss
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

from preprocessing import climate_fever_to_claim_evidence_pairs

In [2]:
# Sanity check that PyTorch can see a CUDA device before any model is loaded.
# This should always print True now, but it is worth checking.
print(f"GPU available: {torch.cuda.is_available()}")

GPU available: True


In [3]:
# The raw climate-fever records are not in a format the transformers Trainer understands,
# so they are loaded as JSON Lines and then converted by the helper imported from preprocessing.py.
# NOTE(review): the path below is absolute and machine-specific - consider a configurable data directory.
df = pd.read_json(
    path_or_buf="/home/lukeg/Documents/VS_code/fine_tuning/lxg406/unsubstantiated_claims_classification/data/climate_fever/climate-fever-dataset-r1.jsonl",
    lines=True,
)
preprocessed_df = climate_fever_to_claim_evidence_pairs(df)


In [4]:
# Integer id for each evidence label, so that the Trainer receives numeric class targets.
label_dict = dict(REFUTES=0, NOT_ENOUGH_INFO=1, SUPPORTS=2)

# Add a numeric "labels" column derived from the textual "evidence_label" column.
evidence_labels = preprocessed_df["evidence_label"]
preprocessed_df["labels"] = evidence_labels.map(label_dict)
preprocessed_df

Unnamed: 0,claim_id,claim,evidence_id,evidence_label,evidence,entropy,labels
0,0,Global warming is driving polar bears toward e...,Extinction risk from global warming:170,NOT_ENOUGH_INFO,"""Recent Research Shows Human Activity Driving ...",0.693147,1
1,0,Global warming is driving polar bears toward e...,Global warming:14,SUPPORTS,Environmental impacts include the extinction o...,0.000000,2
2,0,Global warming is driving polar bears toward e...,Global warming:178,NOT_ENOUGH_INFO,Rising temperatures push bees to their physiol...,0.693147,1
3,0,Global warming is driving polar bears toward e...,Habitat destruction:61,SUPPORTS,"Rising global temperatures, caused by the gree...",0.000000,2
4,0,Global warming is driving polar bears toward e...,Polar bear:1328,NOT_ENOUGH_INFO,"""Bear hunting caught in global warming debate"".",0.693147,1
...,...,...,...,...,...,...,...
7670,3134,"Over the last decade, heatwaves are five times...",Bushfires in Australia:126,SUPPORTS,Australia's climate has warmed by more than on...,0.000000,2
7671,3134,"Over the last decade, heatwaves are five times...",Effects of global warming:86,NOT_ENOUGH_INFO,"In the last 30–40 years, heat waves with high ...",0.693147,1
7672,3134,"Over the last decade, heatwaves are five times...",Global warming:155,NOT_ENOUGH_INFO,Many regions have probably already seen increa...,0.693147,1
7673,3134,"Over the last decade, heatwaves are five times...",Global warming:156,NOT_ENOUGH_INFO,"Since the 1950s, droughts and heat waves have ...",0.693147,1


In [5]:
# Wrap the preprocessed DataFrame in a Hugging Face Dataset and display its summary.
dataset = Dataset.from_pandas(df=preprocessed_df)
dataset

Dataset({
    features: ['claim_id', 'claim', 'evidence_id', 'evidence_label', 'evidence', 'entropy', 'labels'],
    num_rows: 7675
})

In [6]:
# Shuffle the dataset! Rows for the same claim sit next to each other (the same claim appears
# five times in a row), so the order is randomised before splitting.
# Passing a fixed seed means the exact same shuffle can be reproduced later if needed.
seed = 13
dataset = dataset.shuffle(seed=seed)

In [7]:
# Load the tokenizer for the climatebert/distilroberta-base-climate-f checkpoint.
# The model loaded later in this notebook uses the same checkpoint name; keep the two in sync.
tokenizer = AutoTokenizer.from_pretrained("climatebert/distilroberta-base-climate-f")



In [8]:
# This splits our dataset so that we use 90% of it for training, and 10% for testing.
# The split is seeded with the same `seed` used for the shuffle: without it, a different
# 10% would be held out on every run, so evaluation metrics saved under the same seed
# would not be reproducible or comparable with each other.
split_dataset = dataset.train_test_split(test_size=0.1, seed=seed)

In [9]:
def custom_tokenize(examples):
    """Tokenize a batch of claim/evidence rows as single combined strings.

    Each row becomes the text "Claim: <claim> Evidence: <evidence>", which is then
    passed to the notebook-level `tokenizer`.

    Args:
        examples: a batch with parallel "claim" and "evidence" sequences.

    Returns:
        Whatever `tokenizer` returns for the batch of combined strings.
    """
    prompts = []
    for claim, evidence in zip(examples["claim"], examples["evidence"]):
        prompts.append(f"Claim: {claim} Evidence: {evidence}")

    # Every example is truncated/padded to 512 tokens. That is probably more padding than
    # necessary; a suitable max_length still needs to be worked out properly.
    return tokenizer(
        text=prompts,
        max_length=512,
        padding="max_length",
        truncation=True,
    )

# Tokenize both splits in batches with the same function, train first and then test.
tokenized_training_dataset, tokenized_testing_dataset = (
    split_dataset[split_name].map(custom_tokenize, batched=True)
    for split_name in ("train", "test")
)

Map:   0%|          | 0/6907 [00:00<?, ? examples/s]

Map:   0%|          | 0/768 [00:00<?, ? examples/s]

In [10]:

# Just for viewing purposes: print one example from each split.
# input_ids are the tokens, and attention_mask marks whether each position is an actual token or padding.
# max_length is set to 512, so every entry has been padded to that length, which may be unnecessary.
# Each row is fetched once and every column is read from that same row, so the printed label
# always belongs to the claim/evidence printed with it (the test example previously showed a
# label taken from the training split).
columns_to_show = ("claim", "evidence", "labels", "input_ids", "attention_mask")
for split, row_index in ((tokenized_training_dataset, 6), (tokenized_testing_dataset, 2)):
    example = split[row_index]
    for column in columns_to_show:
        print(example[column])

Not only is there no scientific evidence that CO2 is a pollutant, higher CO2 concentrations actually help ecosystems support more plant and animal life.
Plants can grow as much as 50 percent faster in concentrations of 1,000 ppm CO 2 when compared with ambient conditions, though this assumes no change in climate and no limitation on other nutrients.
2
[0, 45699, 35, 1491, 129, 16, 89, 117, 6441, 1283, 14, 1437, 50265, 16, 10, 44960, 927, 6, 723, 1437, 50265, 1437, 50366, 888, 244, 1437, 50384, 323, 55, 2195, 8, 3477, 301, 4, 27956, 35, 34289, 64, 1733, 25, 203, 25, 654, 135, 3845, 11, 1437, 50366, 9, 112, 6, 151, 42805, 6247, 132, 77, 1437, 50282, 19, 25771, 1437, 50272, 6, 600, 42, 20573, 117, 464, 11, 2147, 8, 117, 22830, 15, 97, 20012, 4, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [11]:
# Load the pretrained checkpoint with a 3-class classification head.
# ignore_mismatched_sizes is left commented out: enable it when the checkpoint's existing head
# has to be replaced, e.g. climateBERT/environmental-claims, which has already been fine-tuned
# and has 2 labels.
model = AutoModelForSequenceClassification.from_pretrained(
    "climatebert/distilroberta-base-climate-f",
    num_labels=3,
    # ignore_mismatched_sizes=True,
)
model = model.to("cuda")

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at climatebert/distilroberta-base-climate-f and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# Confirm where the model lives by printing the device of its first parameter.
print(next(model.parameters()).device)

cuda:0


In [13]:
# Turn on gradient checkpointing for the model before training.
# NOTE(review): presumably enabled to lower GPU memory use at the cost of extra compute - confirm it is still needed.
model.gradient_checkpointing_enable()

In [14]:
def calculate_metrics(eval_pred):
    """Compute accuracy and weighted F1 for one evaluation pass.

    Args:
        eval_pred: a pair `(logits, labels)`; the predicted class for each row is the
            index of its largest logit along axis 1.

    Returns:
        dict with the keys "accuracy" and "f1_score".
    """
    scores, references = eval_pred
    predictions = np.argmax(scores, axis=1)

    return {
        "accuracy": accuracy_score(references, predictions),
        "f1_score": f1_score(references, predictions, average="weighted"),
    }

In [15]:
# This is where we set the hyperparameters, grouped by what they control.
training_args = TrainingArguments(
    # Where run artefacts go
    output_dir="./results/climateBERT-base/climate_fever/",
    push_to_hub=False,
    # Optimisation
    num_train_epochs=3,
    per_device_train_batch_size=8,
    learning_rate=3e-5,
    warmup_ratio=0.05,  # Allows the model to adapt a little
    fp16=True,  # Use 16-bit floating point instead of 32 - makes computation faster
    # gradient_accumulation_steps=2,  # Might help with OOM errors, if we have them
    # Evaluation and logging cadence
    evaluation_strategy="steps",
    eval_steps=600,
    logging_strategy="steps",
    logging_steps=200,
    # Checkpointing
    save_strategy="no",
    save_steps=500,  # NOTE(review): presumably has no effect while save_strategy="no" - confirm
)

In [16]:
# Bundle the model, training configuration, both tokenized splits, the tokenizer and the
# metric function into a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_training_dataset,
    eval_dataset=tokenized_testing_dataset,
    tokenizer=tokenizer,
    compute_metrics=calculate_metrics,
)

In [17]:
# Run fine-tuning with the Trainer built above; the returned TrainOutput is shown as the cell result.
trainer.train()

  0%|          | 0/2592 [00:00<?, ?it/s]

{'loss': 0.9284, 'grad_norm': 2.9882009029388428, 'learning_rate': 2.9159220146222584e-05, 'epoch': 0.23}
{'loss': 0.8418, 'grad_norm': 5.4454450607299805, 'learning_rate': 2.672217709179529e-05, 'epoch': 0.46}
{'loss': 0.8028, 'grad_norm': 8.029106140136719, 'learning_rate': 2.4309504467912268e-05, 'epoch': 0.69}


  0%|          | 0/96 [00:00<?, ?it/s]

{'eval_loss': 0.7629488110542297, 'eval_accuracy': 0.6588541666666666, 'eval_f1_score': 0.640909529347922, 'eval_runtime': 7.3742, 'eval_samples_per_second': 104.147, 'eval_steps_per_second': 13.018, 'epoch': 0.69}
{'loss': 0.7453, 'grad_norm': 6.652595043182373, 'learning_rate': 2.1884646628757107e-05, 'epoch': 0.93}
{'loss': 0.668, 'grad_norm': 9.165498733520508, 'learning_rate': 1.9447603574329814e-05, 'epoch': 1.16}
{'loss': 0.6163, 'grad_norm': 18.55978775024414, 'learning_rate': 1.7010560519902518e-05, 'epoch': 1.39}


  0%|          | 0/96 [00:00<?, ?it/s]

{'eval_loss': 0.7624629139900208, 'eval_accuracy': 0.6927083333333334, 'eval_f1_score': 0.7016458093860384, 'eval_runtime': 7.3332, 'eval_samples_per_second': 104.729, 'eval_steps_per_second': 13.091, 'epoch': 1.39}
{'loss': 0.6164, 'grad_norm': 27.17877769470215, 'learning_rate': 1.4573517465475223e-05, 'epoch': 1.62}
{'loss': 0.6074, 'grad_norm': 10.759166717529297, 'learning_rate': 1.2136474411047929e-05, 'epoch': 1.85}
{'loss': 0.5217, 'grad_norm': 10.641915321350098, 'learning_rate': 9.699431356620634e-06, 'epoch': 2.08}


  0%|          | 0/96 [00:00<?, ?it/s]

{'eval_loss': 0.8058881759643555, 'eval_accuracy': 0.69140625, 'eval_f1_score': 0.7017369575085959, 'eval_runtime': 7.3391, 'eval_samples_per_second': 104.644, 'eval_steps_per_second': 13.081, 'epoch': 2.08}
{'loss': 0.427, 'grad_norm': 10.370409965515137, 'learning_rate': 7.262388302193339e-06, 'epoch': 2.31}
{'loss': 0.4363, 'grad_norm': 23.66501808166504, 'learning_rate': 4.8253452477660444e-06, 'epoch': 2.55}
{'loss': 0.4371, 'grad_norm': 5.572638034820557, 'learning_rate': 2.388302193338749e-06, 'epoch': 2.78}


  0%|          | 0/96 [00:00<?, ?it/s]

{'eval_loss': 0.7972612380981445, 'eval_accuracy': 0.7161458333333334, 'eval_f1_score': 0.7247325244644247, 'eval_runtime': 7.3735, 'eval_samples_per_second': 104.157, 'eval_steps_per_second': 13.02, 'epoch': 2.78}
{'train_runtime': 867.2076, 'train_samples_per_second': 23.894, 'train_steps_per_second': 2.989, 'train_loss': 0.6224810400126893, 'epoch': 3.0}


TrainOutput(global_step=2592, training_loss=0.6224810400126893, metrics={'train_runtime': 867.2076, 'train_samples_per_second': 23.894, 'train_steps_per_second': 2.989, 'total_flos': 2744905918178304.0, 'train_loss': 0.6224810400126893, 'epoch': 3.0})

In [18]:
# trainer.save_model("./results/climateBERT-base/climate_fever/first_run")
# Please remember to delete model.safetensors BEFORE adding to git. Causes issues...
# Also it is probably not worth running this block until the model is worth keeping

In [19]:
# Metrics are not included in the saved model, so we need to save them separately.
# Run a final evaluation pass and keep the resulting metrics so they can be written to disk.
metrics = trainer.evaluate()

  0%|          | 0/96 [00:00<?, ?it/s]

In [20]:
# Write the final evaluation metrics to a JSON file named after the shuffle seed.
# The parent directory is created first (no-op if it already exists) so the write does not
# fail with FileNotFoundError when the results folder has not been created yet.
metrics_path = Path(f"./results/climateBERT-base/climate_fever/eval_metrics_seed{seed}_3.json")
metrics_path.parent.mkdir(parents=True, exist_ok=True)
with metrics_path.open("w", encoding="utf-8") as output_file:
    json.dump(metrics, output_file)