In [1]:
import numpy as np
import pandas as pd
import evaluate
import torch
import json
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, f1_score
from torch.nn import CrossEntropyLoss

from preprocessing import climateBUG_to_reduced_climateBUG



In [2]:
# This should always output true now, but worth checking
print(f"is GPU available: {torch.cuda.is_available()}")

is GPU available: True


In [4]:
# The climate-fever dataset is in a format which the transformers Trainer does not understand
# It must be preprocessed using the functions in preprocessing.py

df = pd.read_json("/home/lukeg/Documents/VS_code/fine_tuning/lxg406/climate_relatedness_classification/data/climateBUG/climateBUG-training-dataset.json")
preprocessed_training_df = climateBUG_to_reduced_climateBUG(df)

df = pd.read_json("/home/lukeg/Documents/VS_code/fine_tuning/lxg406/climate_relatedness_classification/data/climateBUG/climateBUG-testing-dataset.json")
preprocessed_testing_df = climateBUG_to_reduced_climateBUG(df)


                                                statement  label
0       2015REGISTRATION DOCUMENT ANNUAL FINANCIAL REP...      0
1       Retail     banking     achieved     a     stro...      0
2       Crédit   Agricole   Assurances   captured  fur...      0
3       Our  business  is  also  exposed  to  the  ups...      0
4       Crédit  Agricole  assumed  its   responsibilit...      0
...                                                   ...    ...
121061  -0132 filed with the AMF on 7 March 2017; The ...      0
121062  Page ■Description of the main risks and contin...      1
121063  Page  ■Information on payment terms (L.441-6-1...      0
121064  Page ■Information on the labour and environmen...      0
121065       2004 for an environmental management system.      1

[121066 rows x 2 columns]
                                               statement  label
0      4CRÉDIT AGRICOLE S.A. – 2020-2021 INTEGRATED R...      0
1      p.11Exceptional financial  strength .............      0
2

In [10]:
training_dataset = Dataset.from_pandas(preprocessed_training_df)
testing_dataset = Dataset.from_pandas(preprocessed_testing_df)
training_dataset[0]

{'statement': '2015REGISTRATION DOCUMENT ANNUAL FINANCIAL REPORT2 CRÉDIT AGRICOLE S.A.      2015 REGISTRATION DOCUMENT MESSAGE FROM THE CHAIRMAN AND THE CHIEF EXECUTIVE OFFICER MESSAGE FROM THE CHAIRMANAND THE CHIEF EXECUTIVE OFFICERPHILIPPE BRASSAC, CHIEF EXECUTIVE OFFICER AND DOMINIQUE LEFEBVRE, CHAIRMAN OF CRÉDIT AGRICOLE S.A.A robust Group delivering sustainable performanceLeading  a  bank  with  a  EUR  698  billion   loan   book(1)',
 'label': 0}

In [None]:
training_dataset = training_dataset.shuffle(seed=13)
testing_dataset = testing_dataset.shuffle(seed=13)

In [7]:
tokenizer = AutoTokenizer.from_pretrained("climatebert/distilroberta-base-climate-f")



In [8]:
# This splits our dataset so that we use 90% of it for training, and 10% for testing
split_dataset = dataset.train_test_split(test_size=0.1)

In [9]:
def custom_tokenize(examples):
    # The code block below this one can be used to find what the max_length should be set to.
    # Otherwise you have too much padding
    # Consider this properly later
    tokenized_output = tokenizer(
        text=[f"Claim: {claim} Evidence: {evidence}" for claim, evidence in zip(examples["claim"], examples["evidence"])],
        max_length=512, 
        padding="max_length", 
        truncation=True)

    return tokenized_output

tokenized_training_dataset = split_dataset["train"].map(custom_tokenize, batched=True)
tokenized_testing_dataset = split_dataset["test"].map(custom_tokenize, batched=True)

Map:   0%|          | 0/6907 [00:00<?, ? examples/s]

Map:   0%|          | 0/768 [00:00<?, ? examples/s]

In [10]:

# Just for viewing purposes. Input_ids are the tokens, and attention_masks are whether they represent actual words or not.
# The max_length is set to 512 so every entry has been padded to be this long, which may be unnecessary
print(tokenized_training_dataset[6]["claim"])
print(tokenized_training_dataset[6]["evidence"])
print(tokenized_training_dataset[6]["labels"])
print(tokenized_training_dataset[6]["input_ids"])
print(tokenized_training_dataset[6]["attention_mask"])

print(tokenized_testing_dataset[2]["claim"])
print(tokenized_testing_dataset[2]["evidence"])
print(tokenized_training_dataset[2]["labels"])
print(tokenized_testing_dataset[2]["input_ids"])
print(tokenized_testing_dataset[2]["attention_mask"])

The Business Council, the Minerals Council, the Australian Industry Group, the Australian Chamber of Commerce and Industry, have all called for the [carbon] tax to be repealed.
On 1 July 2012, the Australian Federal government introduced a carbon price of AUD$23 per tonne of emitted CO2-e on selected fossil fuels consumed by major industrial emitters and government bodies such as councils.
1
[0, 45699, 35, 20, 2090, 1080, 6, 5, 20710, 1536, 1080, 6, 5, 2059, 5354, 826, 6, 5, 2059, 7514, 9, 5669, 8, 5354, 6, 33, 70, 373, 13, 5, 646, 23612, 742, 629, 7, 28, 29643, 4, 27956, 35, 374, 112, 550, 1125, 6, 5, 2059, 1853, 168, 2942, 10, 4363, 425, 9, 17551, 1629, 1922, 228, 4866, 858, 9, 37141, 1437, 50265, 12, 242, 15, 3919, 1437, 50348, 1437, 50424, 13056, 30, 538, 2683, 29901, 2696, 8, 168, 3738, 215, 25, 14751, 4, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [11]:
# Remove ignore_mismatched_sizes when needed - this replaces the head of the pretrained model (because if using
# climateBERT/environmental-claims, it has already been fine tuned and has 2 labels
model = AutoModelForSequenceClassification.from_pretrained(
    "climatebert/distilroberta-base-climate-f",
    num_labels=3,
    # ignore_mismatched_sizes=True
).to("cuda")

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at climatebert/distilroberta-base-climate-f and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
print(next(model.parameters()).device)

cuda:0


In [13]:
def calculate_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    
    accuracy = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average="weighted")

    return {
        "accuracy": accuracy,
        "f1_score": f1,
    }

In [14]:
# This is where we set the hyperparameters
training_args = TrainingArguments(
    output_dir="./results/climateBERT-base/climate_fever/first_run",
    per_device_train_batch_size=8,
    num_train_epochs=3,
    evaluation_strategy="steps",
    eval_steps=200,
    logging_strategy="steps",
    logging_steps=50,
    save_strategy="no",
    save_steps=500,
    fp16=True,                          # Use 16-bit floating point instead of 32 - makes computation faster
    warmup_ratio=0.05,                    # Allows the model to adapt a little
    # gradient_accumulation_steps=2       # Might help with OOM errors, if we have them
    learning_rate=3e-5,
    push_to_hub=False,
)

In [15]:
trainer  = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_training_dataset,
    tokenizer=tokenizer,
    eval_dataset=tokenized_testing_dataset,
    compute_metrics=calculate_metrics,
)

In [16]:
trainer.train()

  0%|          | 0/2592 [00:00<?, ?it/s]

{'loss': 1.0774, 'grad_norm': 3.270073890686035, 'learning_rate': 1.153846153846154e-05, 'epoch': 0.06}
{'loss': 0.9022, 'grad_norm': 4.453432083129883, 'learning_rate': 2.2846153846153845e-05, 'epoch': 0.12}
{'loss': 0.8809, 'grad_norm': 8.023266792297363, 'learning_rate': 2.976848090982941e-05, 'epoch': 0.17}
{'loss': 0.8858, 'grad_norm': 15.712736129760742, 'learning_rate': 2.9159220146222584e-05, 'epoch': 0.23}


  0%|          | 0/96 [00:00<?, ?it/s]

{'eval_loss': 0.8040960431098938, 'eval_accuracy': 0.6770833333333334, 'eval_f1_score': 0.5467132505175983, 'eval_runtime': 7.3571, 'eval_samples_per_second': 104.39, 'eval_steps_per_second': 13.049, 'epoch': 0.23}
{'loss': 0.8409, 'grad_norm': 4.847982406616211, 'learning_rate': 2.854995938261576e-05, 'epoch': 0.29}
{'loss': 0.8059, 'grad_norm': 4.376002788543701, 'learning_rate': 2.7940698619008936e-05, 'epoch': 0.35}
{'loss': 0.8375, 'grad_norm': 4.306921482086182, 'learning_rate': 2.7331437855402112e-05, 'epoch': 0.41}
{'loss': 0.8127, 'grad_norm': 4.305049896240234, 'learning_rate': 2.672217709179529e-05, 'epoch': 0.46}


  0%|          | 0/96 [00:00<?, ?it/s]

{'eval_loss': 0.7487987875938416, 'eval_accuracy': 0.6953125, 'eval_f1_score': 0.6366778805719092, 'eval_runtime': 7.8817, 'eval_samples_per_second': 97.441, 'eval_steps_per_second': 12.18, 'epoch': 0.46}
{'loss': 0.7948, 'grad_norm': 5.0012712478637695, 'learning_rate': 2.6112916328188468e-05, 'epoch': 0.52}
{'loss': 0.8812, 'grad_norm': 5.943113803863525, 'learning_rate': 2.5503655564581644e-05, 'epoch': 0.58}
{'loss': 0.7981, 'grad_norm': 4.253654479980469, 'learning_rate': 2.489439480097482e-05, 'epoch': 0.64}
{'loss': 0.7755, 'grad_norm': 6.514457702636719, 'learning_rate': 2.4285134037367995e-05, 'epoch': 0.69}


  0%|          | 0/96 [00:00<?, ?it/s]

{'eval_loss': 0.7090122699737549, 'eval_accuracy': 0.6966145833333334, 'eval_f1_score': 0.6319818756131816, 'eval_runtime': 7.2893, 'eval_samples_per_second': 105.36, 'eval_steps_per_second': 13.17, 'epoch': 0.69}
{'loss': 0.7405, 'grad_norm': 10.129493713378906, 'learning_rate': 2.367587327376117e-05, 'epoch': 0.75}
{'loss': 0.676, 'grad_norm': 7.575800895690918, 'learning_rate': 2.3066612510154347e-05, 'epoch': 0.81}
{'loss': 0.7505, 'grad_norm': 14.144163131713867, 'learning_rate': 2.2457351746547523e-05, 'epoch': 0.87}
{'loss': 0.7456, 'grad_norm': 7.675422191619873, 'learning_rate': 2.18480909829407e-05, 'epoch': 0.93}


  0%|          | 0/96 [00:00<?, ?it/s]

{'eval_loss': 0.6897192001342773, 'eval_accuracy': 0.7135416666666666, 'eval_f1_score': 0.6910212314635839, 'eval_runtime': 7.2771, 'eval_samples_per_second': 105.536, 'eval_steps_per_second': 13.192, 'epoch': 0.93}
{'loss': 0.7241, 'grad_norm': 7.463010787963867, 'learning_rate': 2.125101543460601e-05, 'epoch': 0.98}
{'loss': 0.6887, 'grad_norm': 8.314745903015137, 'learning_rate': 2.064175467099919e-05, 'epoch': 1.04}
{'loss': 0.5919, 'grad_norm': 14.212474822998047, 'learning_rate': 2.0032493907392365e-05, 'epoch': 1.1}
{'loss': 0.6474, 'grad_norm': 13.65180492401123, 'learning_rate': 1.942323314378554e-05, 'epoch': 1.16}


  0%|          | 0/96 [00:00<?, ?it/s]

{'eval_loss': 0.7502684593200684, 'eval_accuracy': 0.6783854166666666, 'eval_f1_score': 0.6885036759233026, 'eval_runtime': 7.2559, 'eval_samples_per_second': 105.844, 'eval_steps_per_second': 13.231, 'epoch': 1.16}
{'loss': 0.6002, 'grad_norm': 10.697453498840332, 'learning_rate': 1.8813972380178717e-05, 'epoch': 1.22}
{'loss': 0.6836, 'grad_norm': 11.087141036987305, 'learning_rate': 1.8204711616571893e-05, 'epoch': 1.27}
{'loss': 0.5624, 'grad_norm': 16.379619598388672, 'learning_rate': 1.759545085296507e-05, 'epoch': 1.33}
{'loss': 0.5464, 'grad_norm': 14.354450225830078, 'learning_rate': 1.6986190089358245e-05, 'epoch': 1.39}


  0%|          | 0/96 [00:00<?, ?it/s]

{'eval_loss': 0.7077605128288269, 'eval_accuracy': 0.7252604166666666, 'eval_f1_score': 0.721658792469134, 'eval_runtime': 7.2488, 'eval_samples_per_second': 105.949, 'eval_steps_per_second': 13.244, 'epoch': 1.39}
{'loss': 0.6753, 'grad_norm': 9.682068824768066, 'learning_rate': 1.637692932575142e-05, 'epoch': 1.45}
{'loss': 0.61, 'grad_norm': 11.304106712341309, 'learning_rate': 1.5767668562144597e-05, 'epoch': 1.5}
{'loss': 0.6662, 'grad_norm': 13.075704574584961, 'learning_rate': 1.5158407798537775e-05, 'epoch': 1.56}
{'loss': 0.5652, 'grad_norm': 8.474553108215332, 'learning_rate': 1.454914703493095e-05, 'epoch': 1.62}


  0%|          | 0/96 [00:00<?, ?it/s]

{'eval_loss': 0.6591325402259827, 'eval_accuracy': 0.7317708333333334, 'eval_f1_score': 0.7337382215928407, 'eval_runtime': 7.2426, 'eval_samples_per_second': 106.04, 'eval_steps_per_second': 13.255, 'epoch': 1.62}
{'loss': 0.5636, 'grad_norm': 12.415093421936035, 'learning_rate': 1.3939886271324126e-05, 'epoch': 1.68}
{'loss': 0.5878, 'grad_norm': 7.511929035186768, 'learning_rate': 1.3330625507717304e-05, 'epoch': 1.74}
{'loss': 0.5889, 'grad_norm': 9.891653060913086, 'learning_rate': 1.272136474411048e-05, 'epoch': 1.79}
{'loss': 0.5701, 'grad_norm': 13.050644874572754, 'learning_rate': 1.2112103980503656e-05, 'epoch': 1.85}


  0%|          | 0/96 [00:00<?, ?it/s]

{'eval_loss': 0.6408607959747314, 'eval_accuracy': 0.7421875, 'eval_f1_score': 0.734067171285866, 'eval_runtime': 7.2789, 'eval_samples_per_second': 105.511, 'eval_steps_per_second': 13.189, 'epoch': 1.85}
{'loss': 0.6615, 'grad_norm': 11.468348503112793, 'learning_rate': 1.1502843216896832e-05, 'epoch': 1.91}
{'loss': 0.549, 'grad_norm': 18.9716796875, 'learning_rate': 1.0893582453290008e-05, 'epoch': 1.97}
{'loss': 0.5096, 'grad_norm': 21.44668197631836, 'learning_rate': 1.0284321689683186e-05, 'epoch': 2.03}
{'loss': 0.4209, 'grad_norm': 18.13361167907715, 'learning_rate': 9.675060926076362e-06, 'epoch': 2.08}


  0%|          | 0/96 [00:00<?, ?it/s]

{'eval_loss': 0.7130348682403564, 'eval_accuracy': 0.7252604166666666, 'eval_f1_score': 0.7314881644047704, 'eval_runtime': 7.2782, 'eval_samples_per_second': 105.521, 'eval_steps_per_second': 13.19, 'epoch': 2.08}
{'loss': 0.4465, 'grad_norm': 11.246590614318848, 'learning_rate': 9.065800162469538e-06, 'epoch': 2.14}
{'loss': 0.4019, 'grad_norm': 10.298105239868164, 'learning_rate': 8.456539398862713e-06, 'epoch': 2.2}
{'loss': 0.4975, 'grad_norm': 21.408395767211914, 'learning_rate': 7.84727863525589e-06, 'epoch': 2.26}
{'loss': 0.4241, 'grad_norm': 11.81933307647705, 'learning_rate': 7.238017871649066e-06, 'epoch': 2.31}


  0%|          | 0/96 [00:00<?, ?it/s]

{'eval_loss': 0.7288733124732971, 'eval_accuracy': 0.73046875, 'eval_f1_score': 0.7357807813571718, 'eval_runtime': 7.7076, 'eval_samples_per_second': 99.642, 'eval_steps_per_second': 12.455, 'epoch': 2.31}
{'loss': 0.4399, 'grad_norm': 13.742197036743164, 'learning_rate': 6.628757108042242e-06, 'epoch': 2.37}
{'loss': 0.4274, 'grad_norm': 9.91850757598877, 'learning_rate': 6.019496344435418e-06, 'epoch': 2.43}
{'loss': 0.4391, 'grad_norm': 11.785798072814941, 'learning_rate': 5.410235580828594e-06, 'epoch': 2.49}
{'loss': 0.4572, 'grad_norm': 6.916691780090332, 'learning_rate': 4.800974817221771e-06, 'epoch': 2.55}


  0%|          | 0/96 [00:00<?, ?it/s]

{'eval_loss': 0.7006747126579285, 'eval_accuracy': 0.7330729166666666, 'eval_f1_score': 0.7338659983299061, 'eval_runtime': 7.3014, 'eval_samples_per_second': 105.185, 'eval_steps_per_second': 13.148, 'epoch': 2.55}
{'loss': 0.4886, 'grad_norm': 15.463045120239258, 'learning_rate': 4.191714053614947e-06, 'epoch': 2.6}
{'loss': 0.4008, 'grad_norm': 17.250072479248047, 'learning_rate': 3.59463850528026e-06, 'epoch': 2.66}
{'loss': 0.5307, 'grad_norm': 17.52010154724121, 'learning_rate': 2.9853777416734364e-06, 'epoch': 2.72}
{'loss': 0.3968, 'grad_norm': 4.136968612670898, 'learning_rate': 2.3761169780666128e-06, 'epoch': 2.78}


  0%|          | 0/96 [00:00<?, ?it/s]

{'eval_loss': 0.7001147270202637, 'eval_accuracy': 0.74609375, 'eval_f1_score': 0.7484941017842456, 'eval_runtime': 7.2675, 'eval_samples_per_second': 105.676, 'eval_steps_per_second': 13.21, 'epoch': 2.78}
{'loss': 0.4868, 'grad_norm': 14.654212951660156, 'learning_rate': 1.7668562144597887e-06, 'epoch': 2.84}
{'loss': 0.4416, 'grad_norm': 12.36707878112793, 'learning_rate': 1.157595450852965e-06, 'epoch': 2.89}
{'loss': 0.3789, 'grad_norm': 18.784025192260742, 'learning_rate': 5.483346872461414e-07, 'epoch': 2.95}
{'train_runtime': 758.2955, 'train_samples_per_second': 27.326, 'train_steps_per_second': 3.418, 'train_loss': 0.6213125343675967, 'epoch': 3.0}


TrainOutput(global_step=2592, training_loss=0.6213125343675967, metrics={'train_runtime': 758.2955, 'train_samples_per_second': 27.326, 'train_steps_per_second': 3.418, 'total_flos': 2744905918178304.0, 'train_loss': 0.6213125343675967, 'epoch': 3.0})

In [17]:
trainer.save_model("./results/climateBERT-base/climate_fever/first_run")
# Please remember to delete model.safetensors BEFORE adding to git. Causes issues...
# Also it is probably not worth running this block until the model is worth keeping

In [None]:
# Metrics are not included in the save model so we need to save them separately
metrics = trainer.evaluate()
with open("./results/climateBERT-base/climate_fever/first_run/eval_metrics.json", "w") as output_file:
    json.dump(metrics, output_file)

  0%|          | 0/96 [00:00<?, ?it/s]

: 