In [2]:
import evaluate
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    pipeline
)
from datasets import Dataset
from essentials.config import ABSTRACTS
from essentials.data_functions import read_data

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Checkpoint name and class count shared by the model and its tokenizer
MODEL_CHECKPOINT = 'allenai/scibert_scivocab_uncased'
NUM_LABELS = 17

# Use the GPU when torch can see one, otherwise stay on the CPU.
# NOTE(review): `device` is not referenced again in the visible cells.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Pretrained encoder plus a classification head with NUM_LABELS outputs
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_CHECKPOINT,
    num_labels=NUM_LABELS,
    return_dict=True,
)

# Tokenizer loaded from the same checkpoint as the model
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at allenai/scibert_scivocab_uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
def rule_based_train_test_split(data: pd.DataFrame, label_col: str = 'label', test_size: float = 0.2, random_state: int | None = None) -> dict:
    abstract_data = data[data.is_abstract == 1]
    # Randomly sample 2 abstracts per sdg group
    test_a = abstract_data.groupby(label_col).sample(n=1, random_state=random_state)
    # Remove the entries already in the test set from the rest of the data
    data = data[~data.index.isin(test_a.index)].copy()
    # Split the remaining data into train and test
    train, test_b = train_test_split(data, test_size=test_size, random_state=random_state, stratify=data[label_col])
    # Concatenate both test sets and shuffle them again
    test = pd.concat([test_a, test_b]).sample(frac=1).reset_index(drop=True)
    
    return train, test


def tokenize_function(texts):
    """Encode ``texts`` with the notebook-level ``tokenizer``.

    Every sequence is padded to, and truncated at, 512 tokens. With
    ``return_tensors=None`` no tensor type is requested from the tokenizer.

    Args:
        texts: Input forwarded unchanged as the tokenizer's first argument.

    Returns:
        Whatever the tokenizer call returns.
    """
    encoding_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 512,
        "return_tensors": None,
    }
    return tokenizer(texts, **encoding_options)


# Metric objects are loaded once and reused by every evaluation round
_METRIC_CACHE: dict = {}


def _get_metric(name: str):
    """Return the ``evaluate`` metric called ``name``, loading it on first use only."""
    if name not in _METRIC_CACHE:
        _METRIC_CACHE[name] = evaluate.load(name)
    return _METRIC_CACHE[name]


def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 from a ``(logits, labels)`` pair.

    Args:
        eval_pred: Two-item iterable unpacked as ``(logits, labels)``. The
            predicted class is the argmax over the last axis of ``logits``.

    Returns:
        dict with the keys ``"accuracy"`` and ``"f1"`` (F1 uses
        ``average="weighted"``).
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # Reuse cached metric objects instead of calling evaluate.load on every call
    accuracy = _get_metric("accuracy").compute(predictions=predictions, references=labels)
    f1 = _get_metric("f1").compute(predictions=predictions, references=labels, average="weighted")
    return {
        "accuracy": accuracy["accuracy"],
        "f1": f1["f1"],
    }

In [5]:
# Read both source corpora
zofa = read_data(ABSTRACTS)
osdg = read_data('osdg_cleaning/osdg_clean.csv', format='csv')

# Flag every row of the first corpus as an abstract
zofa['is_abstract'] = 1

# Bring both sources to the shared text / label / is_abstract schema
zofa_part = zofa[['ABSTRACT', 'SDG', 'is_abstract']].rename(
    columns={'ABSTRACT': 'text', 'SDG': 'label'}
)
osdg_part = osdg[['text', 'sdg', 'is_abstract']].rename(columns={'sdg': 'label'})

# Stack the two parts (index labels are kept as they are) and discard rows
# whose label is 0
df = pd.concat([zofa_part, osdg_part])
df = df.loc[df['label'] != 0].copy()

In [6]:
# Encode all texts of the combined frame in one tokenizer call
all_texts = df['text'].tolist()
tokenized_output = tokenize_function(all_texts)

In [7]:
# A classification head with N outputs needs integer class ids in 0 .. N-1,
# but the labels in `df` had 0 filtered out and are used as they come from the
# source files. Map them onto contiguous ids and keep both directions so that
# predictions can be translated back to the original labels.
assert df['label'].notna().all(), "rows without a label cannot be used for training"
label2id = {label: idx for idx, label in enumerate(sorted(df['label'].unique()))}
id2label = {idx: label for label, idx in label2id.items()}
assert len(label2id) <= model.config.num_labels, (
    f"found {len(label2id)} distinct labels but the model has "
    f"{model.config.num_labels} outputs"
)

# One row per text: the tokenizer outputs next to the encoded label and the
# abstract flag. `token_type_ids` falls back to empty lists when the tokenizer
# did not return that field.
df_tokenized = pd.DataFrame({
    'input_ids': list(tokenized_output['input_ids']),
    'attention_mask': list(tokenized_output['attention_mask']),
    'token_type_ids': list(tokenized_output.get('token_type_ids', [[]]*len(df))),
    'label': df['label'].map(label2id).to_list(),
    'is_abstract': df['is_abstract'].to_list()
})

In [8]:
# Split the tokenized frame; seed 42 is forwarded to the split helper
split_frames = rule_based_train_test_split(data=df_tokenized, random_state=42)
train_df, test_df = split_frames

In [9]:
# Convert both frames to `datasets.Dataset` objects.
# `train_df` still carries the index labels left over from the split while
# `test_df` was re-indexed; dropping the index keeps both datasets on the same
# schema instead of adding an `__index_level_0__` column to the training set.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

In [10]:
# Hyper-parameters and bookkeeping for the fine-tuning run: evaluation and
# checkpointing both happen once per epoch, logs are written every 10 steps.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_options = dict(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy='epoch',
    save_strategy='epoch',
)
training_args = TrainingArguments(**training_options)

# Wire model, data and metrics together; the held-out test split doubles as
# the evaluation set
trainer_components = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_components)

In [11]:
# Start training
# Runs the fine-tuning loop of the `trainer` built from `training_args`.
# The output recorded with this cell shows that the run was interrupted
# manually (KeyboardInterrupt) and that tokenizers printed its fork warning;
# as that warning says, setting TOKENIZERS_PARALLELISM before this call
# avoids it.
trainer.train()

  0%|          | 0/12825 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


KeyboardInterrupt: 

In [None]:
# Evaluation
# Calls the trainer's evaluation and prints the returned metrics. The trainer
# was created with `eval_dataset=test_dataset` and `compute_metrics`, so
# presumably the result holds accuracy and F1 on that split — confirm.
# This cell has no execution count in the saved notebook, i.e. it was not run.
results = trainer.evaluate()
print(results)

In [None]:
# Saving the model
# Model and tokenizer are written to the same directory so that both can be
# reloaded from one path.
model_path = "./results/scibert_model"
trainer.save_model(model_path)

# Saving the tokenizer associated with the model
tokenizer.save_pretrained(model_path)

In [None]:
# Directory the fine-tuned model and tokenizer are loaded from; defined here so
# the cell also works in a fresh kernel where the save cell was not run
MODEL_DIR = "./results/scibert_model"

# Load the trained model (rebinds the notebook-level `model`)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR)

# Load the tokenizer (rebinds the notebook-level `tokenizer`)
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)

# Create a prediction pipeline
nlp = pipeline("text-classification", model=model, tokenizer=tokenizer)

# Example text
texts = ["This is a test text for classification."]

# Make predictions. Inputs are truncated at 512 tokens, the same limit that
# `tokenize_function` applies to the training texts, so long documents do not
# exceed the model's maximum sequence length.
predictions = nlp(texts, truncation=True, max_length=512)

# Show the result as the cell output
predictions