In [1]:
import json
import os

import evaluate
import numpy as np
import pandas as pd
import torch
from datasets import Dataset, DatasetDict
from sklearn.metrics import accuracy_score, f1_score
from torch.nn import CrossEntropyLoss
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

from preprocessing import climate_fever_to_claim_evidence_pairs

In [2]:
# Sanity check that PyTorch can see a CUDA device before doing any work.
# This should always output True now, but it is worth checking.
print(f"GPU available: {torch.cuda.is_available()}")

GPU available: True


In [3]:
# The climate-fever dataset is in a format which the transformers Trainer does not understand
# It must be preprocessed using the functions in preprocessing.py
#
# The dataset location can be overridden with the CLIMATE_FEVER_PATH environment variable so the
# notebook is not tied to one machine's directory layout; the original location is the default.
CLIMATE_FEVER_PATH = os.environ.get(
    "CLIMATE_FEVER_PATH",
    "/home/lukeg/Documents/VS_code/fine_tuning/lxg406/unsubstantiated_claims_classification/data/climate_fever/climate-fever-dataset-r1.jsonl",
)

# The file is JSON Lines (one JSON record per line), hence lines=True.
df = pd.read_json(CLIMATE_FEVER_PATH, lines=True)
preprocessed_df = climate_fever_to_claim_evidence_pairs(df)


In [4]:
# Map evidence_labels to integers so that the Trainer will know what the labels mean
label_dict = {
    "REFUTES": 0,
    "NOT_ENOUGH_INFO": 1,
    "SUPPORTS": 2
}

# Series.map turns every value that is missing from label_dict into NaN, which would silently
# corrupt the training targets. Fail loudly here instead of during (or after) training.
unknown_labels = set(preprocessed_df["evidence_label"].unique()) - set(label_dict)
if unknown_labels:
    raise ValueError(f"Unmapped evidence labels: {sorted(map(str, unknown_labels))}")

preprocessed_df["labels"] = preprocessed_df["evidence_label"].map(label_dict)
preprocessed_df

Unnamed: 0,claim_id,claim,evidence_id,evidence_label,evidence,entropy,labels
0,0,Global warming is driving polar bears toward e...,Extinction risk from global warming:170,NOT_ENOUGH_INFO,"""Recent Research Shows Human Activity Driving ...",0.693147,1
1,0,Global warming is driving polar bears toward e...,Global warming:14,SUPPORTS,Environmental impacts include the extinction o...,0.000000,2
2,0,Global warming is driving polar bears toward e...,Global warming:178,NOT_ENOUGH_INFO,Rising temperatures push bees to their physiol...,0.693147,1
3,0,Global warming is driving polar bears toward e...,Habitat destruction:61,SUPPORTS,"Rising global temperatures, caused by the gree...",0.000000,2
4,0,Global warming is driving polar bears toward e...,Polar bear:1328,NOT_ENOUGH_INFO,"""Bear hunting caught in global warming debate"".",0.693147,1
...,...,...,...,...,...,...,...
7670,3134,"Over the last decade, heatwaves are five times...",Bushfires in Australia:126,SUPPORTS,Australia's climate has warmed by more than on...,0.000000,2
7671,3134,"Over the last decade, heatwaves are five times...",Effects of global warming:86,NOT_ENOUGH_INFO,"In the last 30–40 years, heat waves with high ...",0.693147,1
7672,3134,"Over the last decade, heatwaves are five times...",Global warming:155,NOT_ENOUGH_INFO,Many regions have probably already seen increa...,0.693147,1
7673,3134,"Over the last decade, heatwaves are five times...",Global warming:156,NOT_ENOUGH_INFO,"Since the 1950s, droughts and heat waves have ...",0.693147,1


In [5]:
# Convert the pandas DataFrame into a Hugging Face Dataset.
# The bare `dataset` on the last line displays its summary (features and row count).
dataset = Dataset.from_pandas(preprocessed_df)
dataset

Dataset({
    features: ['claim_id', 'claim', 'evidence_id', 'evidence_label', 'evidence', 'entropy', 'labels'],
    num_rows: 7675
})

In [6]:
# Shuffle the dataset! This randomly rearranges the rows, which is good especially with this one
# since the same claim appears five times in a row.
# The seed parameter means we can reproduce the exact same shuffle again if we need to.
# NOTE(review): this cell reassigns `dataset`, so re-running it on its own shuffles the
# already-shuffled data; re-run from the cell that builds `dataset` to get the seeded order back.
seed=11
dataset = dataset.shuffle(seed=seed)

In [7]:
# Load the tokenizer published with the lumilogic/climateBUG-LM checkpoint.
tokenizer = AutoTokenizer.from_pretrained("lumilogic/climateBUG-LM")



In [8]:
# This splits our dataset so that we use 90% of it for training, and 10% for testing.
# The seed is passed so the split (like the shuffle) is identical on every run; without it each
# run evaluates on a different 10%, so metrics saved under the same seed are not comparable.
split_dataset = dataset.train_test_split(test_size=0.1, seed=seed)

In [9]:
def custom_tokenize(examples):
    """Tokenize a batch of claim/evidence pairs for the classifier.

    Each claim is paired positionally with its evidence and rendered as a single
    "Claim: ... Evidence: ..." string, which is then passed to the module-level
    `tokenizer`.

    Args:
        examples: A batch exposing "claim" and "evidence" columns.

    Returns:
        Whatever the tokenizer returns for the batch of rendered strings.
    """
    prompts = [
        f"Claim: {claim} Evidence: {evidence}"
        for claim, evidence in zip(examples["claim"], examples["evidence"])
    ]

    # Every example is truncated/padded to exactly 512 tokens. The preview cell below this one
    # can be used to judge what max_length should really be - otherwise there is too much
    # padding. Consider this properly later.
    return tokenizer(
        text=prompts,
        max_length=512,
        padding="max_length",
        truncation=True,
    )

# Apply custom_tokenize to both splits in batched mode; the result of each map call is bound to
# its own name (training first, then testing).
tokenized_training_dataset, tokenized_testing_dataset = (
    split_dataset[split_name].map(custom_tokenize, batched=True)
    for split_name in ("train", "test")
)

Map:   0%|          | 0/6907 [00:00<?, ? examples/s]

Map:   0%|          | 0/768 [00:00<?, ? examples/s]

In [10]:

# Just for viewing purposes. input_ids are the tokens, and attention_mask says whether each
# position represents an actual token or not.
# The max_length is set to 512 so every entry has been padded to be this long, which may be unnecessary.
#
# One training example and one testing example are shown. Every field is read from the same
# row, so the label printed for the testing example really belongs to that testing example
# (previously it was read from the training split by mistake).
preview_fields = ("claim", "evidence", "labels", "input_ids", "attention_mask")
for preview_split, preview_index in (
    (tokenized_training_dataset, 6),
    (tokenized_testing_dataset, 2),
):
    preview_example = preview_split[preview_index]
    for field in preview_fields:
        print(preview_example[field])

In 1905, PDO switched to a warm phase.
During a "warm", or "positive", phase, the west Pacific becomes cooler and part of the eastern ocean warms; during a "cool" or "negative" phase, the opposite pattern occurs.
1
[0, 45699, 35, 96, 40849, 6, 11707, 673, 12012, 7, 10, 3279, 4359, 4, 27956, 35, 1590, 10, 22, 29530, 1297, 50, 22, 22173, 1297, 4359, 6, 5, 3072, 3073, 3374, 12924, 8, 233, 9, 5, 4580, 6444, 997, 4339, 131, 148, 10, 22, 24336, 113, 50, 22, 33407, 113, 4359, 6, 5, 5483, 6184, 11493, 4, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [11]:
# ignore_mismatched_sizes=True makes from_pretrained drop the checkpoint's existing 2-label
# classification head and create a fresh 3-label one (the same would be needed for
# climateBERT/environmental-claims, which has already been fine-tuned with 2 labels).
# Remove ignore_mismatched_sizes when the checkpoint's head already matches num_labels.
#
# The new head is randomly initialised, so seed torch first to start from the same weights on
# every run with the same seed.
torch.manual_seed(seed)

# Use the GPU when there is one, but do not crash on a CPU-only machine.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModelForSequenceClassification.from_pretrained(
    "lumilogic/climateBUG-LM",
    num_labels=3,
    ignore_mismatched_sizes=True
).to(device)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at lumilogic/climateBUG-LM and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([2, 768]) in the checkpoint and torch.Size([3, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([2]) in the checkpoint and torch.Size([3]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# Confirm where the model's weights live by printing the device of its first parameter.
print(next(model.parameters()).device)

cuda:0


In [13]:
# Turn on gradient checkpointing for the model before training.
# NOTE(review): presumably enabled to reduce GPU memory use at the cost of extra compute -
# confirm it is still needed at this batch size.
model.gradient_checkpointing_enable()

In [14]:
def calculate_metrics(eval_pred):
    """Compute accuracy and weighted F1 for the Trainer's evaluation loop.

    Args:
        eval_pred: A pair that unpacks into (logits, labels).

    Returns:
        A dict with "accuracy" and "f1_score" (F1 averaged with average="weighted").
    """
    scores, references = eval_pred
    # The predicted class of each example is the index of its largest logit along axis 1.
    predicted_classes = np.argmax(scores, axis=1)

    return {
        "accuracy": accuracy_score(references, predicted_classes),
        "f1_score": f1_score(references, predicted_classes, average="weighted"),
    }

In [15]:
# This is where we set the hyperparameters
training_args = TrainingArguments(
    output_dir="./results/climateBUG-LM",
    per_device_train_batch_size=8,
    num_train_epochs=3,
    evaluation_strategy="steps",
    eval_steps=600,
    logging_strategy="steps",
    logging_steps=200,
    save_strategy="no",
    save_steps=500,
    fp16=True,                          # Use 16-bit floating point instead of 32 - makes computation faster
    warmup_ratio=0.05,                    # Allows the model to adapt a little
    # gradient_accumulation_steps=2       # Might help with OOM errors, if we have them
    learning_rate=3e-5,
    push_to_hub=False,
)

In [16]:
# Wire everything into a Trainer: what to train and how, the data for each phase, and the
# function used to score evaluation runs.
trainer = Trainer(
    # What to train, and with which hyperparameters
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    # Data for the training and evaluation phases
    train_dataset=tokenized_training_dataset,
    eval_dataset=tokenized_testing_dataset,
    # Accuracy / weighted F1 reported at every evaluation
    compute_metrics=calculate_metrics,
)

In [17]:
# Run the fine-tuning loop. Training loss and evaluation metrics are logged below at the
# logging_steps / eval_steps intervals configured in training_args.
trainer.train()

  0%|          | 0/2592 [00:00<?, ?it/s]

{'loss': 0.886, 'grad_norm': 4.904855251312256, 'learning_rate': 2.9159220146222584e-05, 'epoch': 0.23}
{'loss': 0.8571, 'grad_norm': 13.988655090332031, 'learning_rate': 2.6734362307067426e-05, 'epoch': 0.46}
{'loss': 0.8334, 'grad_norm': 4.499449729919434, 'learning_rate': 2.429731925264013e-05, 'epoch': 0.69}


  0%|          | 0/96 [00:00<?, ?it/s]

{'eval_loss': 0.7635836005210876, 'eval_accuracy': 0.6875, 'eval_f1_score': 0.5969186999535802, 'eval_runtime': 7.0193, 'eval_samples_per_second': 109.413, 'eval_steps_per_second': 13.677, 'epoch': 0.69}
{'loss': 0.7815, 'grad_norm': 12.039865493774414, 'learning_rate': 2.1860276198212834e-05, 'epoch': 0.93}
{'loss': 0.7028, 'grad_norm': 9.814151763916016, 'learning_rate': 1.943541835905768e-05, 'epoch': 1.16}
{'loss': 0.6705, 'grad_norm': 19.311227798461914, 'learning_rate': 1.6998375304630383e-05, 'epoch': 1.39}


  0%|          | 0/96 [00:00<?, ?it/s]

{'eval_loss': 0.7079771161079407, 'eval_accuracy': 0.6953125, 'eval_f1_score': 0.6904790374525714, 'eval_runtime': 7.0574, 'eval_samples_per_second': 108.822, 'eval_steps_per_second': 13.603, 'epoch': 1.39}
{'loss': 0.6377, 'grad_norm': 8.336625099182129, 'learning_rate': 1.4561332250203087e-05, 'epoch': 1.62}
{'loss': 0.6049, 'grad_norm': 19.59144401550293, 'learning_rate': 1.2124289195775792e-05, 'epoch': 1.85}
{'loss': 0.5716, 'grad_norm': 21.828004837036133, 'learning_rate': 9.687246141348498e-06, 'epoch': 2.08}


  0%|          | 0/96 [00:00<?, ?it/s]

{'eval_loss': 0.8048895001411438, 'eval_accuracy': 0.6783854166666666, 'eval_f1_score': 0.6823267483358283, 'eval_runtime': 6.8799, 'eval_samples_per_second': 111.629, 'eval_steps_per_second': 13.954, 'epoch': 2.08}
{'loss': 0.4966, 'grad_norm': 17.55531883239746, 'learning_rate': 7.2502030869212026e-06, 'epoch': 2.31}
{'loss': 0.4874, 'grad_norm': 31.529340744018555, 'learning_rate': 4.813160032493908e-06, 'epoch': 2.55}
{'loss': 0.511, 'grad_norm': 8.42222785949707, 'learning_rate': 2.3761169780666128e-06, 'epoch': 2.78}


  0%|          | 0/96 [00:00<?, ?it/s]

{'eval_loss': 0.8075377345085144, 'eval_accuracy': 0.6809895833333334, 'eval_f1_score': 0.6825652975377808, 'eval_runtime': 7.1657, 'eval_samples_per_second': 107.177, 'eval_steps_per_second': 13.397, 'epoch': 2.78}
{'train_runtime': 914.6052, 'train_samples_per_second': 22.656, 'train_steps_per_second': 2.834, 'train_loss': 0.6557252259902012, 'epoch': 3.0}


TrainOutput(global_step=2592, training_loss=0.6557252259902012, metrics={'train_runtime': 914.6052, 'train_samples_per_second': 22.656, 'train_steps_per_second': 2.834, 'total_flos': 2744905918178304.0, 'train_loss': 0.6557252259902012, 'epoch': 3.0})

In [18]:
# trainer.save_model("./results/climateBERT-base/climate_fever/first_run")
# Please remember to delete model.safetensors BEFORE adding the results to git - committing it causes issues.
# Also, it is probably not worth running this cell until the model is worth keeping.

In [19]:
# Metrics are not included in the saved model, so we need to save them separately.
# Run a final evaluation and keep the returned metrics so the next cell can write them to disk.
metrics = trainer.evaluate()

  0%|          | 0/96 [00:00<?, ?it/s]

In [20]:
# Write the final evaluation metrics to a JSON file named after the seed.
metrics_path = f"./results/climateBUG-LM/eval_metrics_seed{seed}.json"

# open(..., "w") does not create missing parent directories, and no checkpoints are saved during
# training, so make sure the results directory exists before writing.
os.makedirs(os.path.dirname(metrics_path), exist_ok=True)
with open(metrics_path, "w") as output_file:
    json.dump(metrics, output_file)