In [None]:
import json
from pathlib import Path

import evaluate
import numpy as np
import pandas as pd
import torch
from datasets import Dataset, DatasetDict
from sklearn.metrics import accuracy_score, f1_score
from torch.nn import CrossEntropyLoss
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

from preprocessing import climateBUG_reduce_rows



In [None]:
# Confirm that PyTorch can see a CUDA device before any GPU work is attempted.
# (The original author expects this to report True on their machine.)
cuda_is_available = torch.cuda.is_available()
print(f"is GPU available: {cuda_is_available}")

In [None]:
# The raw climateBUG JSON files are not in a format which the transformers Trainer
# understands, so each split is preprocessed with climateBUG_reduce_rows from
# preprocessing.py (called with rows=10000 for training and rows=1000 for testing).

# Single place to change where the climateBUG data lives on this machine.
CLIMATEBUG_DATA_DIR = Path(
    "/home/lukeg/Documents/VS_code/fine_tuning/lxg406/climate_relatedness_classification/data/climateBUG"
)

raw_training_df = pd.read_json(CLIMATEBUG_DATA_DIR / "climateBUG-training-dataset.json")
preprocessed_training_df = climateBUG_reduce_rows(raw_training_df, rows=10000)

raw_testing_df = pd.read_json(CLIMATEBUG_DATA_DIR / "climateBUG-testing-dataset.json")
preprocessed_testing_df = climateBUG_reduce_rows(raw_testing_df, rows=1000)

# Quick sanity check: display the testing row whose index label is 998.
# (.loc is label-based, so this raises KeyError if that label is not in the index.)
preprocessed_testing_df.loc[998]


In [None]:
# Wrap both preprocessed DataFrames as Hugging Face Dataset objects.
training_dataset, testing_dataset = (
    Dataset.from_pandas(frame)
    for frame in (preprocessed_training_df, preprocessed_testing_df)
)

# Display the first training example as the cell output.
training_dataset[0]

In [None]:
# Shuffle both splits using the same fixed seed.
# NOTE(review): this cell reassigns its own inputs, so re-running it shuffles
# already-shuffled data; run it once per fresh kernel.
SHUFFLE_SEED = 13
training_dataset = training_dataset.shuffle(seed=SHUFFLE_SEED)
testing_dataset = testing_dataset.shuffle(seed=SHUFFLE_SEED)

In [None]:
# Load the tokenizer that belongs to the DistilRoBERTa base checkpoint.
TOKENIZER_CHECKPOINT = "distilbert/distilroberta-base"
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_CHECKPOINT)

In [None]:
def custom_tokenize(examples):
    """Tokenize the ``statement`` text of one example or a batch of examples.

    Sequences longer than 512 tokens are truncated. No padding is applied
    here: the Trainer in this notebook is constructed with the tokenizer, so
    its default data collator (``DataCollatorWithPadding``) pads every batch
    to the longest sequence in that batch. Padding each row to 512 tokens up
    front would make every forward/backward pass process 512 positions
    regardless of how short the statements are.

    NOTE: if the Trainer is ever built without the tokenizer, or with a
    collator that does not pad, pass ``padding="max_length"`` again.

    Args:
        examples: Mapping with a ``"statement"`` entry holding the text(s) to
            encode (a list of texts when called via
            ``Dataset.map(..., batched=True)``).

    Returns:
        The tokenizer output for the statements (the notebook later reads its
        ``input_ids`` and ``attention_mask`` entries).
    """
    return tokenizer(
        text=examples["statement"],
        max_length=512,
        truncation=True,
    )

# Apply custom_tokenize to both splits in batched mode; the tokenizer output
# is added to each dataset alongside the existing columns.
tokenized_training_dataset, tokenized_testing_dataset = (
    split.map(custom_tokenize, batched=True)
    for split in (training_dataset, testing_dataset)
)

In [None]:
# Display the tokenized training dataset as the cell output to inspect the
# result of the tokenization step.
tokenized_training_dataset

In [None]:

# Spot-check one tokenized example per split: training row 0 and testing row 9.
# For each, print the raw statement, its label, the token ids and the attention mask.
INSPECTED_FIELDS = ("statement", "label", "input_ids", "attention_mask")

for split, row_number in ((tokenized_training_dataset, 0), (tokenized_testing_dataset, 9)):
    example = split[row_number]
    for field in INSPECTED_FIELDS:
        print(example[field])

In [None]:
# Load the DistilRoBERTa base checkpoint with a sequence-classification head
# sized for two labels.
# (ignore_mismatched_sizes=True was tried here previously and is left disabled.)
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilroberta-base", num_labels=2
)

# Move the model to the GPU explicitly. NOTE(review): "cuda" is hard-coded, so
# this is expected to fail on a machine without a CUDA device.
model = model.to("cuda")

In [None]:
# Report which device the model lives on by looking at its first parameter.
first_parameter = next(model.parameters())
print(first_parameter.device)

In [None]:
# Turn on gradient checkpointing for the model. Presumably a memory-saving
# measure (activations are recomputed during the backward pass rather than
# stored) at the cost of extra compute — see the transformers documentation.
model.gradient_checkpointing_enable()

In [None]:
def calculate_metrics(eval_pred):
    """Compute accuracy and weighted F1 for a set of predictions.

    Args:
        eval_pred: A pair ``(logits, labels)``. The predicted class for each
            row is the argmax of ``logits`` along axis 1.

    Returns:
        dict with two entries: ``"accuracy"`` and ``"f1_score"`` (F1 computed
        with ``average="weighted"``).
    """
    logits, labels = eval_pred

    # Highest-scoring class per row becomes the prediction.
    predicted_classes = np.argmax(logits, axis=1)

    return {
        "accuracy": accuracy_score(labels, predicted_classes),
        "f1_score": f1_score(labels, predicted_classes, average="weighted"),
    }

In [None]:
# Hyperparameters and run configuration for the Trainer.
training_args = TrainingArguments(
    # --- Where the run lives -------------------------------------------------
    output_dir="./results/distilroberta/climateBUG/first_run",
    push_to_hub=False,
    # --- Optimisation ----------------------------------------------------------
    num_train_epochs=3,
    per_device_train_batch_size=8,
    learning_rate=3e-5,
    warmup_ratio=0.05,  # Allows the model to adapt a little
    fp16=True,  # Use 16-bit floating point instead of 32 - makes computation faster
    # gradient_accumulation_steps=2,  # Might help with OOM errors, if we have them
    # --- Evaluation and logging cadence --------------------------------------
    evaluation_strategy="steps",
    eval_steps=200,
    logging_strategy="steps",
    logging_steps=50,
    # --- Checkpointing ---------------------------------------------------------
    # Checkpoint saving is switched off. NOTE(review): save_steps is presumably
    # ignored while save_strategy="no"; it is kept for when saving is re-enabled.
    save_strategy="no",
    save_steps=500,
)

In [None]:
# Assemble the Trainer from the model, the run configuration, the tokenizer,
# the two tokenized splits and the metric function.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=tokenized_training_dataset,
    eval_dataset=tokenized_testing_dataset,
    compute_metrics=calculate_metrics,
)

In [None]:
# Start fine-tuning. The value returned by train() is displayed as the cell output.
trainer.train()

In [None]:
# Save the fine-tuned model into the output directory configured on
# training_args, so the save location cannot drift from
# TrainingArguments.output_dir when a new run is set up.
# Please remember to delete model.safetensors BEFORE adding to git. Causes issues...
# Also it is probably not worth running this block until the model is worth keeping
trainer.save_model(training_args.output_dir)

In [None]:
# Metrics are not included in the saved model, so we need to save them separately.
metrics = trainer.evaluate()

# Write the metrics into the run's configured output directory. The directory
# is created first because it may not exist yet: checkpoint saving is disabled
# and the model-saving cell is optional, so nothing else is guaranteed to have
# created it before this cell runs.
metrics_dir = Path(training_args.output_dir)
metrics_dir.mkdir(parents=True, exist_ok=True)

with open(metrics_dir / "eval_metrics.json", "w") as output_file:
    json.dump(metrics, output_file)