In [1]:
%%capture
! pip install datasets fast-fit transformers accelerate
! pip install evaluate
! pip install langdetect

In [2]:
# Mount Google Drive so that files under drive/MyDrive are reachable from the
# Colab runtime.
# NOTE(review): this cell is Colab-only and runs before USE_COLAB is defined,
# so it is not skipped when the notebook is executed locally.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
# Switch between the Colab layout (project imported from the mounted Drive)
# and a local checkout (project imported from `essentials`).
USE_COLAB = True

In [2]:
import os
import evaluate
import numpy as np
import pandas as pd
import torch

from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
    pipeline,
)
from datasets import Dataset

if USE_COLAB:
    from drive.MyDrive.Github.NLPSharedTask.essentials.config import ABSTRACTS
    from drive.MyDrive.Github.NLPSharedTask.essentials.data_functions import read_data
else:
    from essentials.config import ABSTRACTS
    from essentials.data_functions import read_data

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Select device
# NOTE(review): `device` is not referenced again in the visible notebook.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Number of target classes; the classification head emits ids 0..NUM_LABELS-1.
NUM_LABELS = 17

# Define model
# The label maps have to cover exactly the ids the head can emit. The previous
# maps used ids 1..17, which left id 0 without a name, so looking up a
# predicted class in `config.id2label` could fail.
# The `label` column is passed to the model as the class index unchanged, so
# the mapping is the identity; label names are strings, as the config expects.
# NOTE(review): if class ids should be shown under different names (e.g. an
# offset numbering), change only the string side of both maps.
model = AutoModelForSequenceClassification.from_pretrained(
    'allenai/scibert_scivocab_uncased',
    num_labels=NUM_LABELS,
    id2label={class_id: str(class_id) for class_id in range(NUM_LABELS)},
    label2id={str(class_id): class_id for class_id in range(NUM_LABELS)},
    return_dict=True)

# Define tokenizer
tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/442M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at allenai/scibert_scivocab_uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


vocab.txt:   0%|          | 0.00/228k [00:00<?, ?B/s]

In [11]:
# Project root: the repository folder inside the mounted Drive on Colab,
# otherwise an empty prefix (paths are then relative to the working directory).
base_dir = 'drive/MyDrive/Github/NLPSharedTask' if USE_COLAB else ''

# LOAD DATA

In [None]:
# Read the dataset from a path relative to the current working directory.
# NOTE(review): the path is not prefixed with `base_dir` — confirm the CSV is
# expected next to the notebook rather than inside the project folder.
df = pd.read_csv('cleaned_data_with_null_with_synth.csv')

# CREATE TRAIN/TEST SPLIT

In [20]:
def tokenize_text(texts):
    """Tokenize `texts` with the notebook-level `tokenizer`.

    Sequences are truncated to at most 256 tokens. No padding argument is
    passed, and `return_tensors=None` is given explicitly, so no tensor
    framework is requested from the tokenizer.
    """
    tokenizer_options = {
        'truncation': True,
        'max_length': 256,
        'return_tensors': None,
    }
    return tokenizer(texts, **tokenizer_options)

In [24]:
def rule_based_train_test_split(
    data: pd.DataFrame,
    label_col: str = 'label',
    test_size: float = 0.3,
    random_state: int | None = None,
    n_abstracts_per_label: int = 1,
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Create a train/test split that reserves abstracts of every label for the test set.

    For each value of `label_col`, `n_abstracts_per_label` rows flagged as
    abstracts (`is_abstract == 1`) are sampled and put aside for the test set.
    The remaining rows are split with a stratified `train_test_split`; both
    test parts are then concatenated and shuffled.

    Labels that have no abstract row at all do not appear in the grouping and
    therefore get no reserved abstract.

    Args:
        data: Frame with the column `label_col` and an `is_abstract` column.
        label_col: Name of the label column used for grouping and stratifying.
        test_size: Share of the rows remaining after the reservation that is
            assigned to the test set.
        random_state: Seed applied to the abstract sampling, the stratified
            split and the final shuffle of the test set.
        n_abstracts_per_label: How many abstracts to reserve per label.

    Returns:
        `(train, test)`. `train` keeps the row index of `data`; `test` is
        shuffled and re-indexed from 0.

    Raises:
        ValueError: Raised by pandas when a label has fewer abstract rows than
            `n_abstracts_per_label`.
    """

    abstracts = data[data.is_abstract == 1]

    # Reserve the requested number of abstracts per label for the test set
    reserved_test = abstracts.groupby(label_col).sample(
        n=n_abstracts_per_label, random_state=random_state
    )

    # Everything that was not reserved is available for the regular split
    remaining = data[~data.index.isin(reserved_test.index)].copy()

    # Stratified split of the remaining rows
    train, split_test = train_test_split(
        remaining,
        test_size=test_size,
        random_state=random_state,
        stratify=remaining[label_col],
    )

    # Merge both test parts; the shuffle is seeded as well so that the whole
    # split is reproducible for a given `random_state`
    test = (
        pd.concat([reserved_test, split_test])
        .sample(frac=1, random_state=random_state)
        .reset_index(drop=True)
    )

    return train, test

In [None]:
# Run the Hugging Face tokenizer over the cleaned text column
clean_texts = df['text_clean'].to_list()
tokenized_output = tokenize_text(clean_texts)

In [None]:
# Columns handed to the model; `is_abstract` is only needed for the split.
MODEL_COLUMNS = ['input_ids', 'attention_mask', 'token_type_ids', 'label']

# One row per text: tokenizer outputs plus the label and the abstract flag.
df_tokenized = pd.DataFrame({
    'input_ids': list(tokenized_output['input_ids']),
    'attention_mask': list(tokenized_output['attention_mask']),
    # Fall back to empty lists when the tokenizer returned no token type ids
    'token_type_ids': list(tokenized_output.get('token_type_ids', [[]]*len(df))),
    'label': df['label'].tolist(),
    'is_abstract': df['is_abstract'].to_list()
})

train_df, test_df = rule_based_train_test_split(df_tokenized, random_state=42)

# `train_df` still carries the row index of `df_tokenized` after the split.
# Without preserve_index=False that index would be stored as an extra
# `__index_level_0__` column in the train dataset only.
train_dataset = Dataset.from_pandas(train_df[MODEL_COLUMNS], preserve_index=False)
test_dataset = Dataset.from_pandas(test_df[MODEL_COLUMNS], preserve_index=False)

"\ntrain_df['labels'] = list(label_binarizer.fit_transform(train_df['label'].apply(lambda x: [x])))\ntest_df['labels'] = list(label_binarizer.fit_transform(test_df['label'].apply(lambda x: [x])))\n\ntrain_dataset = Dataset.from_pandas(train_df[['input_ids', 'attention_mask', 'token_type_ids', 'labels']])\ntest_dataset = Dataset.from_pandas(test_df[['input_ids', 'attention_mask', 'token_type_ids', 'labels']])\n"

# FINE-TUNING

For training, we use the fine-tuning hyperparameters suggested in the SciBERT paper (Beltagy et al., 2019):

In all settings, we apply a dropout of 0.1 and optimize cross entropy loss using Adam (Kingma and Ba, 2015). We finetune for 2 to 5 epochs using a batch size of 32 and a learning rate of 5e-6, 1e-5, 2e-5, or 5e-5 with a slanted triangular schedule (Howard and Ruder, 2018) which is equivalent to the linear warmup followed by linear decay (Devlin et al., 2019). For each dataset and BERT variant, we pick the best learning rate and number of epochs on the development set and report the corresponding test results. We found the setting that works best across most datasets and models is 2 or 4 epochs and a learning rate of 2e-5. While task-dependent, optimal hyperparameters for each task are often the same across BERT variants.

In [None]:
# Multiple class prediction (one prediction)
# Loaded metric objects are cached here so that `evaluate.load` runs once per
# metric instead of on every evaluation pass.
_METRIC_CACHE = {}


def _get_metric(name):
    """Return the `evaluate` metric called `name`, loading it on first use only."""
    if name not in _METRIC_CACHE:
        _METRIC_CACHE[name] = evaluate.load(name)
    return _METRIC_CACHE[name]


def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 for single-label classification.

    Args:
        eval_pred: Pair `(logits, labels)`. The predicted class of each sample
            is the argmax over the last axis of `logits`.

    Returns:
        Dict with the keys "accuracy" and "f1"; F1 is computed with
        `average="weighted"`.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = _get_metric("accuracy")
    f1 = _get_metric("f1")
    return {
        "accuracy": accuracy.compute(predictions=predictions, references=labels)["accuracy"],
        "f1": f1.compute(predictions=predictions, references=labels, average="weighted")["f1"]
    }

'\n# Multiple label prediction (multiple predictions)\nclf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])\n\ndef sigmoid(x):\n   return 1 / (1 + np.exp(-x))\n\ndef compute_metrics(eval_pred):\n   predictions, labels = eval_pred\n   predictions = sigmoid(predictions)\n   predictions = (predictions > 0.5).astype(int).reshape(-1)\n   return clf_metrics.compute(predictions=predictions, references=labels.astype(int).reshape(-1))\n'

In [None]:
# Training arguments
# NOTE(review): no batch size is passed, so the library default applies, while
# the paper quoted above fine-tunes with a batch size of 32 — confirm that this
# is intended.
training_args = TrainingArguments(
    # Checkpoints: written below the project folder, once per epoch
    output_dir=os.path.join(base_dir, 'models/results'),
    save_strategy='epoch',
    # Evaluation: once per epoch as well
    evaluation_strategy='epoch',
    # Optimisation
    learning_rate=2e-5,  # Learning rate the paper reports as working best
    num_train_epochs=2,  # The paper suggests 2 or 4 epochs
    weight_decay=0.01,
    # Schedule: linear warmup followed by linear decay (slanted triangular)
    lr_scheduler_type='linear',
    warmup_steps=500,  # Length of the warmup phase
)

In [None]:
# Data collator
# Pads the examples of each batch to a common length at collation time.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Adam Optimizer
# Learning rate and weight decay are read from `training_args` so that they
# are defined in one place only; with the current settings the values are the
# same as before (lr=2e-5, weight decay 0.01).
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=training_args.learning_rate,
    weight_decay=training_args.weight_decay,
)

# Multiple class Trainer
# The scheduler slot is left as None, so the Trainer builds the learning-rate
# schedule itself from `training_args` (linear schedule with warmup).
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, None)
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [None]:
# Start training
# Release cached GPU memory that PyTorch holds but no longer uses before the
# run starts.
torch.cuda.empty_cache()
trainer.train()

Epoch,Training Loss,Validation Loss


In [None]:
# Evaluation
# Evaluates on the `eval_dataset` passed to the Trainer (here `test_dataset`)
# and prints the resulting metrics dict, including the values returned by
# `compute_metrics`.
results = trainer.evaluate()
print(results)

In [None]:
# Saving the model
# Target folder below the project root; the reload step further down reads
# from the same location.
model_path = os.path.join(base_dir, 'models/scibert_model_base')
trainer.save_model(model_path)

# Saving the tokenizer associated with the model
# Stored in the same folder so model and tokenizer can be reloaded together.
tokenizer.save_pretrained(model_path)

In [None]:
# Folder holding the fine-tuned model and its tokenizer; resolved here once
# instead of being rebuilt for every load call
saved_model_dir = os.path.join(base_dir, 'models/scibert_model_base')

# Restore the fine-tuned weights and the matching tokenizer
model = AutoModelForSequenceClassification.from_pretrained(saved_model_dir)
tokenizer = AutoTokenizer.from_pretrained(saved_model_dir)

# Wrap both in a text-classification pipeline for inference on raw strings
nlp = pipeline("text-classification", model=model, tokenizer=tokenizer)

In [None]:
# Smoke-test the pipeline on a single example text.
# NOTE(review): the text looks already preprocessed (lower-cased, stop words
# removed) — presumably the same cleaning as the `text_clean` column; confirm.
nlp('evolutionary dynamic structural genetic variation lineage hybrid origin not well explored although structural mutation may increase controlled hybrid cross therefore tested whether structural variant accumulate fish recent hybrid origin invasive cottus relative parental specie cottus rhenanus cottus perifretum variation exon gene assessed using comparative genome hybridization array twelve gene showed significantly higher copy number invasive cottus compared parent coincided increased expression three gene related vision detoxification muscle development suggesting possible gene dosage effect copy number increase putative transposon assessed comparative mapping genomic dna read de novo assembly repetitive element contrast exon copy number increase repetitive element common invasive cottus whereas decrease rare among increased repetitive element occurred higher number perifretum compared rhenanus abundant rhenanus implies biased mutational process amplifies genetic material one ancestor ass frequency de novo mutation hybridization screened f offspring parental specie change five candidate locus found no evidence new structural variant indicating rare detected given sampling scheme instead must accumulated generation observed controlled cross')