In [None]:
# Code for running out-of-the-box ClimateBERT on the climateBUG testing dataset

In [2]:
import json
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from sklearn.metrics import accuracy_score, f1_score
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

from preprocessing import climateBUG_reduce_rows

In [3]:
# Load the ClimateBERT tokenizer and a sequence-classification model from the same checkpoint.
# NOTE: the `classifier.*` weights are not in this checkpoint and are newly initialised
# (see the warning this cell prints). Seed torch first so the random head is intended to be
# the same on every run -- otherwise the evaluation metrics change between kernel restarts.
CHECKPOINT = "climatebert/distilroberta-base-climate-f"
SEED = 42
torch.manual_seed(SEED)

# Fall back to CPU when no CUDA device is present instead of crashing on `.to("cuda")`.
device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)
model = AutoModelForSequenceClassification.from_pretrained(CHECKPOINT).to(device)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at climatebert/distilroberta-base-climate-f and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
# Read the climateBUG testing JSON into a DataFrame, then pass it through the project
# helper `climateBUG_reduce_rows` with rows=1000.
# NOTE(review): this is an absolute, machine-specific path -- consider a configurable data dir.
testing_json_path = "/home/lukeg/Documents/VS_code/fine_tuning/lxg406/climate_relatedness_classification/data/climateBUG/climateBUG-testing-dataset.json"
df = pd.read_json(testing_json_path)
preprocessed_testing_df = climateBUG_reduce_rows(df, rows=1000)

                                             statement  label
0    4CRÉDIT AGRICOLE S.A. – 2020-2021 INTEGRATED R...      0
1    p.11Exceptional financial  strength .............      0
2    p.18Our strategy: be the global  relationship ...      0
3    p.30Creatingadded valueOur business model serv...      0
4    p.40Committing and upholding  our responsibili...      0
..                                                 ...    ...
995  In  the  future,  Societe  Generale  is  commi...      1
996                                       !      0
997                                  !' &\t)      0
998  ):A@\t6/,"+,2+$)...      1
999  The  Hydrogen  Council  nowbrings  together  m...      1

[1000 rows x 2 columns]
                                             statement  label
0    4CRÉDIT AGRICOLE S.A. – 2020-2021 INTEGRATED R...      0
1    p.11Exceptional financial  strength .............      0
2    p.18Our strategy: be the global  relationship ... 

In [5]:
# Wrap the reduced DataFrame in a Hugging Face `Dataset` so it can be tokenized with `.map`.
testing_dataset = Dataset.from_pandas(df=preprocessed_testing_df)

In [6]:
def custom_tokenize(examples):
    """Tokenize the "statement" entry of a batch with the notebook-level `tokenizer`.

    Args:
        examples: Mapping with a "statement" key holding the text(s) to encode.

    Returns:
        Whatever the tokenizer returns for those texts, requested with truncation
        and padding to a fixed length of 512 tokens.
    """
    encode_options = {
        "max_length": 512,
        "padding": "max_length",
        "truncation": True,
    }
    return tokenizer(text=examples["statement"], **encode_options)

# Apply the tokenizer over the whole dataset in batches.
tokenized_testing_dataset = testing_dataset.map(function=custom_tokenize, batched=True)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [7]:
def calculate_metrics(eval_pred):
    """Compute accuracy and weighted F1 from a (logits, labels) pair.

    Args:
        eval_pred: Two-item iterable that unpacks into the model logits and the
            reference labels.

    Returns:
        dict with keys "accuracy" and "f1_score".
    """
    raw_scores, gold = eval_pred
    # Predicted class = index of the largest logit along axis 1.
    predicted = np.argmax(raw_scores, axis=1)

    return {
        "accuracy": accuracy_score(gold, predicted),
        "f1_score": f1_score(gold, predicted, average="weighted"),
    }

In [8]:
# Evaluation-only configuration: training is switched off and evaluation is switched on.
# NOTE(review): absolute, machine-specific output path -- consider making it configurable.
eval_output_dir = "/home/lukeg/Documents/VS_code/fine_tuning/lxg406/climate_relatedness_classification/vanilla/results/climateBERT/first_run"
training_args = TrainingArguments(
    output_dir=eval_output_dir,
    do_eval=True,
    do_train=False,
)

In [9]:
# Build the Trainer used purely for evaluation: no train_dataset is supplied, only the
# tokenized test split and the metric callback.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=tokenized_testing_dataset,
    tokenizer=tokenizer,
    compute_metrics=calculate_metrics,
)

In [None]:
# Evaluate on the eval_dataset the Trainer was constructed with; the result is kept in
# `metrics` so it can be written to disk in the next cell.
metrics = trainer.evaluate()

  0%|          | 0/125 [00:00<?, ?it/s]

In [None]:
# Persist the evaluation metrics as JSON.
# The parent folder is created first: opening a file for writing raises FileNotFoundError
# when its directory does not exist, which would otherwise fail this cell on a fresh checkout.
# NOTE(review): this relative path differs from the `output_dir` given to TrainingArguments --
# confirm that is intentional.
metrics_path = Path("./results/climateBERT-base/climateBUG/first_run/eval_metrics.json")
metrics_path.parent.mkdir(parents=True, exist_ok=True)
with metrics_path.open("w", encoding="utf-8") as output_file:
    json.dump(metrics, output_file)