In [1]:
import pandas as pd
from transformers import (
    RobertaTokenizerFast,
    RobertaForSequenceClassification,
    TrainingArguments,
    Trainer,
    AutoConfig,
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model_id = "roberta-base"

In [3]:
train_dataset = pd.read_csv('../temp/causality_train.csv', index_col=0)

In [4]:
id2label = {0: "Explicitly states: no relation", 1: "Causation", 2: "Correlation", 3: "No mention of a relation"}
label2id = {"Explicitly states: no relation": 0, "Causation": 1, "Correlation": 2, "No mention of a relation": 3}

In [7]:
train_dataset.label = train_dataset.label.map(label2id)

In [8]:
train_dataset

Unnamed: 0,finding,label
0,Results from these confirmatory analyses provi...,0
1,"The task force also ""concluded that violent vi...",0
2,Increases in infectiousness as the fungus grow...,1
3,Host&amp;pathogen ecology drive seasonal dynam...,2
4,We prove that constant-depth quantum circuits ...,3
...,...,...
1335,Choir members said they felt more connected wi...,2
1336,These findings provides a neural basis for und...,2
1337,"""These findings provide a neural basis for und...",1
1338,Clades characterized by major phenotypic innov...,2


In [16]:
# Load dataset
train_dataset = pd.read_csv('../temp/causality_train.csv', index_col=0)
train_dataset.label = pd.Categorical(train_dataset.label)
train_dataset["labels"] = 0
train_dataset.loc[train_dataset["label"] == "Explicitly states: no relation", "labels"] = 0
train_dataset.loc[train_dataset["label"] == "Causation", "labels"] = 1
train_dataset.loc[train_dataset["label"] == "Correlation", "labels"] = 2
train_dataset.loc[train_dataset["label"] == "No mention of a relation", "labels"] = 3
train_dataset = train_dataset.drop(columns=['label'])

test_dataset = pd.read_csv('../temp/causality_test.csv', index_col=0)
test_dataset.label = pd.Categorical(test_dataset.label)
test_dataset["labels"] = 0
test_dataset.loc[test_dataset["label"] == "Explicitly states: no relation", "labels"] = 0
test_dataset.loc[test_dataset["label"] == "Causation", "labels"] = 1
test_dataset.loc[test_dataset["label"] == "Correlation", "labels"] = 2
test_dataset.loc[test_dataset["label"] == "No mention of a relation", "labels"] = 3
test_dataset = test_dataset.drop(columns=['label'])

In [17]:
from datasets import Dataset

train_dataset = Dataset.from_pandas(train_dataset)
test_dataset = Dataset.from_pandas(test_dataset)


In [18]:
# Preprocessing
tokenizer = RobertaTokenizerFast.from_pretrained(model_id)

# This function tokenizes the input text using the RoBERTa tokenizer.
# It applies padding and truncation to ensure that all sequences have the same length (256 tokens).
def tokenize(batch):
    return tokenizer(batch["finding"], padding=True, truncation=True, max_length=1536)

In [19]:
train_dataset = train_dataset.map(tokenize, batched=True, batch_size=len(train_dataset))
test_dataset = test_dataset.map(tokenize, batched=True, batch_size=len(test_dataset))

Map:   0%|          | 0/1340 [00:00<?, ? examples/s]

Map:   0%|          | 0/334 [00:00<?, ? examples/s]

In [20]:
train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
test_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

In [21]:
# # We will need this to directly output the class names when using the pipeline without mapping the labels later.
# # Extract the number of classes and their names
# num_labels = dataset['train'].features['labels'].num_classes
# class_names = dataset["train"].features["labels"].names
# print(f"number of labels: {num_labels}")
# print(f"the labels: {class_names}")

# # Create an id2label mapping
id2label = {0: "Explicitly states: no relation", 1: "Causation", 2: "Correlation", 3: "No mention of a relation"}

# Update the model's configuration with the id2label mapping
config = AutoConfig.from_pretrained(model_id)
config.update({"id2label": id2label})

In [22]:
# Model
model = RobertaForSequenceClassification.from_pretrained(model_id, config=config)

# TrainingArguments
training_args = TrainingArguments(
    output_dir="../out",
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_dir=f"../logs",
    logging_strategy="steps",
    logging_steps=10,
    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_steps=200,
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=train_dataset,
)

AttributeError: module 'numpy' has no attribute 'bool8'

In [42]:
# Fine-tune the model
trainer.train()

Step,Training Loss
10,1.3157
20,1.3815
30,1.3534
40,1.3363
50,1.3274
60,1.3439
70,1.2695
80,1.2733
90,1.258
100,1.2809


Step,Training Loss
10,1.3157
20,1.3815
30,1.3534
40,1.3363
50,1.3274
60,1.3439
70,1.2695
80,1.2733
90,1.258
100,1.2809


TrainOutput(global_step=840, training_loss=0.8301020832288832, metrics={'train_runtime': 15370.7684, 'train_samples_per_second': 0.436, 'train_steps_per_second': 0.055, 'total_flos': 805689296942400.0, 'train_loss': 0.8301020832288832, 'epoch': 5.0})

In [16]:
training_args.device

device(type='cpu')

In [43]:
trainer.save_model("gdrive/My Drive/causality_model")

In [45]:
pred = trainer.predict(test_dataset)

In [78]:
y_pred = pred.predictions.argmax(axis=1)

In [55]:
from sklearn.metrics import f1_score

In [61]:
f1_score(list(test_dataset['labels']), list(y_pred), average='macro')

0.5168584281956374

In [80]:
# Causation
indices = test_dataset['labels'] == 1
pred_causal = y_pred
pred_causal[pred_causal != 1] = 0
f1_score(test_dataset['labels'][indices], pred_causal[indices])

0.7922077922077922

In [88]:
# Correlation
indices = test_dataset['labels'] == 2
pred_causal = pred.predictions.argmax(axis=1)[indices]
pred_causal[pred_causal != 2] = 0
f1_score(test_dataset['labels'][indices], pred_causal, pos_label=2)

0.7488151658767772

In [91]:
# Correlation
# indices = test_dataset['labels'] == 2
# pred_causal = pred.predictions.argmax(axis=1)[indices]
# pred_causal[pred_causal != 2] = 0
f1_score(test_dataset['labels'], pred.predictions.argmax(axis=1), average=None)

array([0.38095238, 0.56481481, 0.61003861, 0.51162791])

In [93]:
model.save_pretrained('causality_model')

In [94]:
model

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         