In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from transformers import (
    RobertaTokenizerFast,
    RobertaModel,
    TrainingArguments,
    Trainer,
    AutoConfig,
)
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model_id = "roberta-base"

In [3]:
id2label_cert = {0: "Certain", 1: "Somewhat certain", 2: "Somewhat uncertain", 3: "Uncertain"}
label2id_cert = {"Certain": 0, "Somewhat certain": 1, "Somewhat uncertain": 2, "Uncertain": 3}

id2label_caus = {0: "Explicitly states: no relation", 1: "Causation", 2: "Correlation", 3: "No mention of a relation"}
label2id_caus = {"Explicitly states: no relation": 0, "Causation": 1, "Correlation": 2, "No mention of a relation": 3}

In [4]:
train_dataset = pd.read_csv('../temp/causality_cert_train.csv', index_col=0)
train_dataset["certainty"] = train_dataset.certainty.map(label2id_cert)
train_dataset["causality"] = train_dataset.causality.map(label2id_caus)

test_dataset = pd.read_csv('../temp/causality_cert_test.csv', index_col=0)
test_dataset["certainty"] = test_dataset.certainty.map(label2id_cert)
test_dataset["causality"] = test_dataset.causality.map(label2id_caus)

In [5]:
from datasets import Dataset

# train_dataset = Dataset.from_pandas(train_dataset)
# test_dataset = Dataset.from_pandas(test_dataset)

In [34]:
train_dataset

Unnamed: 0,finding,causality,certainty
0,Results from these confirmatory analyses provi...,0,1
1,"The task force also ""concluded that violent vi...",0,2
2,Increases in infectiousness as the fungus grow...,1,2
3,Host&amp;pathogen ecology drive seasonal dynam...,2,0
4,We prove that constant-depth quantum circuits ...,3,0
...,...,...,...
1335,Choir members said they felt more connected wi...,2,0
1336,These findings provides a neural basis for und...,2,2
1337,"""These findings provide a neural basis for und...",1,2
1338,Clades characterized by major phenotypic innov...,2,1


In [6]:
# Preprocessing
tokenizer = RobertaTokenizerFast.from_pretrained(model_id)

# This function tokenizes the input text using the RoBERTa tokenizer.
# It applies padding and truncation to ensure that all sequences have the same length (256 tokens).
# def tokenize(batch):
#     return tokenizer.encode_plus(batch["finding"], padding=True, truncation=True, max_length=1536, return_token_type_ids=True)

In [7]:
class ClassificationData(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.dataframe = dataframe.reset_index(drop=True)
        self.max_len = max_len

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, index):
        if isinstance(index, (list, np.ndarray)):
            # idx is a list of indices -> return a batch dict
            batch = [self._getitem_single(i) for i in index]
            # batch is list of dicts; collate into dict of tensors:
            return {
                key: torch.stack([item[key] for item in batch])
                for key in batch[0]
            }
        else:
            # single idx
            return self._getitem_single(index)

    def _getitem_single(self, index):
        row = self.dataframe.iloc[index]
        text = str(row.finding)
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            padding='max_length',
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'causality': torch.tensor(row["causality"], dtype=torch.float),
            'certainty': torch.tensor(row["certainty"], dtype=torch.float)
        }

In [29]:
train_dataset = train_dataset.reset_index(drop=True)
test_dataset = test_dataset.reset_index(drop=True)

training_set = ClassificationData(train_dataset, tokenizer, 512)
testing_set = ClassificationData(test_dataset, tokenizer, 512)

In [30]:
train_params = {'batch_size': 8,
                'shuffle': False,
                'num_workers': 0
                }

test_params = {'batch_size': 8,
                'shuffle': False,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [31]:
for data in training_loader:
    print(data)

{'ids': tensor([[    0, 41981,    31,  ...,     1,     1,     1],
        [    0,   133,  3685,  ...,     1,     1,     1],
        [    0, 48958,    11,  ...,     1,     1,     1],
        ...,
        [    0,   250,  1844,  ...,     1,     1,     1],
        [    0,  4528,   775,  ...,     1,     1,     1],
        [    0,   133,   892,  ...,     1,     1,     1]]), 'mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'causality': tensor([0., 0., 1., 2., 3., 3., 1., 2.]), 'certainty': tensor([1., 2., 2., 0., 0., 0., 0., 1.])}
{'ids': tensor([[    0,   170,  2006,  ...,     1,     

In [32]:
# train_dataset = train_dataset.map(tokenize, batched=True, batch_size=len(train_dataset))
# test_dataset = test_dataset.map(tokenize, batched=True, batch_size=len(test_dataset))

In [33]:
print(training_set[1324])

{'ids': tensor([    0, 48958,    11, 19166,  1825,    25,     5, 37933, 11461,    15,
        39635,   282,  1295,  8142,   189,  1305,  2608,  9235,     6,    19,
        16277,  2284,  1328,     5,  2608,     4,     2,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,

In [34]:
print(training_set[400])

{'ids': tensor([    0,  2522,   775,  4991,    10, 16570, 38919,  5043, 21005,    31,
            5, 11324,   227,  6878,  2849,  7841,  8244,  1480,   347,  1546,
            8,     5,  2329, 44231, 43141,    11,     5, 35867,     6,    61,
        15885,     5, 15063,     9, 41419,  4590,    19,   239,   305,  3999,
        22436,     4,     2,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,

In [12]:
# train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "certainty", "causality"])
# test_dataset.set_format("torch", columns=["input_ids", "attention_mask", "certainty", "causality"])

In [35]:
class RobertaClass(torch.nn.Module):
    def __init__(self):
        super(RobertaClass, self).__init__()
        self.l1 = RobertaModel.from_pretrained("roberta-base")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(768, 8)
        self.unflatten = torch.nn.Unflatten(1, (2, 4))

    def forward(self, input_ids, attention_mask, token_type_ids):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.ReLU()(pooler)
        pooler = self.dropout(pooler)
        pooler = self.classifier(pooler)
        output = self.unflatten(pooler)
        return output

In [36]:
# Setting up the device for GPU usage

from torch import cuda
device = 'cuda' if cuda.is_available() else torch.device("mps")

In [37]:
model = RobertaClass()
model.to(device)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaClass(
  (l1): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNo

In [38]:
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(params=model.parameters(), lr=2e-5, weight_decay=0.01)

In [39]:
def calcuate_accuracy(preds, targets):
    n_correct = (preds==targets).sum().item()
    return n_correct

In [15]:
len(training_loader.dataset)

1340

In [20]:
torch.device("mps")

device(type='mps')

In [25]:
def train(epoch):
    tr_loss = 0
    n_correct_caus = 0
    n_correct_cert = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    model.train()
    for _,data in tqdm(enumerate(training_loader, 0)):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)

        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        certainty = data['certainty'].to(device, dtype = torch.long)
        causality = data['causality'].to(device, dtype = torch.long)

        outputs = model(ids, mask, token_type_ids)
        print(outputs)
        loss_cert = loss_function(outputs[0], certainty)
        loss_caus = loss_function(outputs[1], causality)
        loss = loss_cert + loss_caus
        tr_loss += loss.item()
        big_val, big_idx = torch.max(outputs.data, dim=1)
        n_correct_caus += calcuate_accuracy(big_idx[0], certainty)
        n_correct_caus += calcuate_accuracy(big_idx[1], causality)

        nb_tr_steps += 1
        nb_tr_examples+=certainty.size(0)

        if _%5000==0:
            loss_step = tr_loss/nb_tr_steps
            accu_step_caus = (n_correct_caus*100)/nb_tr_examples
            accu_step_cert = (n_correct_cert*100)/nb_tr_examples
            print(f"Training Loss per 5000 steps: {loss_step}")
            print(f"Training Accuracy per 5000 steps: causality - {accu_step_caus}, certainty - {accu_step_cert}")

        optimizer.zero_grad()
        loss.backward()
        # # When using GPU
        # optimizer.step()

    print(f'The Total Accuracy for Epoch {epoch}: causality - {(n_correct_caus*100)/nb_tr_examples} certainty - {(n_correct_cert*100)/nb_tr_examples}')
    epoch_loss = tr_loss/nb_tr_steps
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Accuracy Epoch: causality - {(n_correct_caus*100)/nb_tr_examples} certainty - {(n_correct_cert*100)/nb_tr_examples}")

    return

In [28]:
EPOCHS = 5
torch.cuda.empty_cache()
torch.cuda.ipc_collect()
for epoch in range(EPOCHS):
    train(epoch)

AttributeError: 'RobertaClass' object has no attribute 'gradient_checkpointing_enable'

In [9]:
# # Create an id2label mapping
id2label = {0: "Certain", 1: "Somewhat certain", 2: "Somewhat uncertain", 3: "Uncertain"}

# Update the model's configuration with the id2label mapping
config = AutoConfig.from_pretrained(model_id)
config.update({"id2label": id2label})

In [10]:
# Model
model = RobertaForSequenceClassification.from_pretrained(model_id, config=config)

# TrainingArguments
training_args = TrainingArguments(
    output_dir="../out",
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_dir=f"../logs",
    logging_strategy="steps",
    logging_steps=10,
    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_steps=200,
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=train_dataset,
)

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# Fine-tune the model
trainer.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011113861345479058, max=1.0…

Step,Training Loss
10,1.3488
20,1.3749
30,1.351
40,1.3447
50,1.309
60,1.2758
70,1.2647
80,1.2226
90,1.1862
100,1.1234


TrainOutput(global_step=840, training_loss=0.9259893110820225, metrics={'train_runtime': 420.6802, 'train_samples_per_second': 15.927, 'train_steps_per_second': 1.997, 'total_flos': 805689296942400.0, 'train_loss': 0.9259893110820225, 'epoch': 5.0})

Error in callback <bound method _WandbInit._pause_backend of <wandb.sdk.wandb_init._WandbInit object at 0x7f7c41c66c10>> (for post_run_cell), with arguments args (<ExecutionResult object at 7f7c41bfb350, execution_count=11 error_before_exec=None error_in_exec=None info=<ExecutionInfo object at 7f7c41afb450, raw_cell="# Fine-tune the model
trainer.train()" store_history=True silent=False shell_futures=True cell_id=f01ca77e-fb7c-4f42-86ec-74977baf5d67> result=TrainOutput(global_step=840, training_loss=0.9259893110820225, metrics={'train_runtime': 420.6802, 'train_samples_per_second': 15.927, 'train_steps_per_second': 1.997, 'total_flos': 805689296942400.0, 'train_loss': 0.9259893110820225, 'epoch': 5.0})>,),kwargs {}:


TypeError: _WandbInit._pause_backend() takes 1 positional argument but 2 were given

In [12]:
trainer.save_model("../certainty_model")

Error in callback <bound method _WandbInit._resume_backend of <wandb.sdk.wandb_init._WandbInit object at 0x7f7c41c66c10>> (for pre_run_cell), with arguments args (<ExecutionInfo object at 7f7da8494610, raw_cell="trainer.save_model("../certainty_model")" store_history=True silent=False shell_futures=True cell_id=9371c2d0-8be7-4549-a0be-e458767ff950>,),kwargs {}:


TypeError: _WandbInit._resume_backend() takes 1 positional argument but 2 were given

Error in callback <bound method _WandbInit._pause_backend of <wandb.sdk.wandb_init._WandbInit object at 0x7f7c41c66c10>> (for post_run_cell), with arguments args (<ExecutionResult object at 7f7c1812f610, execution_count=12 error_before_exec=None error_in_exec=None info=<ExecutionInfo object at 7f7da8494610, raw_cell="trainer.save_model("../certainty_model")" store_history=True silent=False shell_futures=True cell_id=9371c2d0-8be7-4549-a0be-e458767ff950> result=None>,),kwargs {}:


TypeError: _WandbInit._pause_backend() takes 1 positional argument but 2 were given

In [13]:
pred = trainer.predict(test_dataset)

Error in callback <bound method _WandbInit._resume_backend of <wandb.sdk.wandb_init._WandbInit object at 0x7f7c41c66c10>> (for pre_run_cell), with arguments args (<ExecutionInfo object at 7f7c181bd010, raw_cell="pred = trainer.predict(test_dataset)" store_history=True silent=False shell_futures=True cell_id=c193ebf5-d3b6-4db5-8ea3-13c9f6ac7934>,),kwargs {}:


TypeError: _WandbInit._resume_backend() takes 1 positional argument but 2 were given

Error in callback <bound method _WandbInit._pause_backend of <wandb.sdk.wandb_init._WandbInit object at 0x7f7c41c66c10>> (for post_run_cell), with arguments args (<ExecutionResult object at 7f7c41c65c90, execution_count=13 error_before_exec=None error_in_exec=None info=<ExecutionInfo object at 7f7c181bd010, raw_cell="pred = trainer.predict(test_dataset)" store_history=True silent=False shell_futures=True cell_id=c193ebf5-d3b6-4db5-8ea3-13c9f6ac7934> result=None>,),kwargs {}:


TypeError: _WandbInit._pause_backend() takes 1 positional argument but 2 were given

In [14]:
y_pred = pred.predictions.argmax(axis=1)
from sklearn.metrics import f1_score
f1_score(list(test_dataset['labels']), list(y_pred), average='macro')

Error in callback <bound method _WandbInit._resume_backend of <wandb.sdk.wandb_init._WandbInit object at 0x7f7c41c66c10>> (for pre_run_cell), with arguments args (<ExecutionInfo object at 7f7c1812dc90, raw_cell="y_pred = pred.predictions.argmax(axis=1)
from skle.." store_history=True silent=False shell_futures=True cell_id=171c5bff-127d-4db3-816d-46172c682ec2>,),kwargs {}:


TypeError: _WandbInit._resume_backend() takes 1 positional argument but 2 were given

0.40918129489558064

Error in callback <bound method _WandbInit._pause_backend of <wandb.sdk.wandb_init._WandbInit object at 0x7f7c41c66c10>> (for post_run_cell), with arguments args (<ExecutionResult object at 7f7c18163950, execution_count=14 error_before_exec=None error_in_exec=None info=<ExecutionInfo object at 7f7c1812dc90, raw_cell="y_pred = pred.predictions.argmax(axis=1)
from skle.." store_history=True silent=False shell_futures=True cell_id=171c5bff-127d-4db3-816d-46172c682ec2> result=0.40918129489558064>,),kwargs {}:


TypeError: _WandbInit._pause_backend() takes 1 positional argument but 2 were given

In [15]:
f1_score(test_dataset['labels'], pred.predictions.argmax(axis=1), average=None)

Error in callback <bound method _WandbInit._resume_backend of <wandb.sdk.wandb_init._WandbInit object at 0x7f7c41c66c10>> (for pre_run_cell), with arguments args (<ExecutionInfo object at 7f7c1812ddd0, raw_cell="f1_score(test_dataset['labels'], pred.predictions..." store_history=True silent=False shell_futures=True cell_id=869fdb61-39f1-417d-a401-9f4f23a13d61>,),kwargs {}:


TypeError: _WandbInit._resume_backend() takes 1 positional argument but 2 were given

array([0.68707483, 0.47272727, 0.47692308, 0.        ])

Error in callback <bound method _WandbInit._pause_backend of <wandb.sdk.wandb_init._WandbInit object at 0x7f7c41c66c10>> (for post_run_cell), with arguments args (<ExecutionResult object at 7f7c181b02d0, execution_count=15 error_before_exec=None error_in_exec=None info=<ExecutionInfo object at 7f7c1812ddd0, raw_cell="f1_score(test_dataset['labels'], pred.predictions..." store_history=True silent=False shell_futures=True cell_id=869fdb61-39f1-417d-a401-9f4f23a13d61> result=array([0.68707483, 0.47272727, 0.47692308, 0.        ])>,),kwargs {}:


TypeError: _WandbInit._pause_backend() takes 1 positional argument but 2 were given

In [16]:
model.save_pretrained('certainty_model')

Error in callback <bound method _WandbInit._resume_backend of <wandb.sdk.wandb_init._WandbInit object at 0x7f7c41c66c10>> (for pre_run_cell), with arguments args (<ExecutionInfo object at 7f7c18190910, raw_cell="model.save_pretrained('certainty_model')" store_history=True silent=False shell_futures=True cell_id=5b9fce05-38bd-4216-bb5e-08fe43dba2e6>,),kwargs {}:


TypeError: _WandbInit._resume_backend() takes 1 positional argument but 2 were given

Error in callback <bound method _WandbInit._pause_backend of <wandb.sdk.wandb_init._WandbInit object at 0x7f7c41c66c10>> (for post_run_cell), with arguments args (<ExecutionResult object at 7f7c18136650, execution_count=16 error_before_exec=None error_in_exec=None info=<ExecutionInfo object at 7f7c18190910, raw_cell="model.save_pretrained('certainty_model')" store_history=True silent=False shell_futures=True cell_id=5b9fce05-38bd-4216-bb5e-08fe43dba2e6> result=None>,),kwargs {}:


TypeError: _WandbInit._pause_backend() takes 1 positional argument but 2 were given