In [1]:
%%capture
! pip install datasets fast-fit transformers accelerate
! pip install evaluate
! pip install langdetect

In [3]:
# Mount Google Drive only when the Colab runtime is available.
# The notebook also supports local runs (USE_COLAB = False), where `google.colab`
# cannot be imported; in that case there is nothing to mount.
try:
    from google.colab import drive
except ImportError:
    pass
else:
    drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Switch between the Google Colab layout (project folder on the mounted Drive) and a local checkout.
# The flag selects the import path of the `essentials` helpers and the value of `base_dir`.
USE_COLAB = False

In [4]:
import os
import evaluate
import numpy as np
import pandas as pd
import torch

from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
    pipeline,
)
from datasets import Dataset

if USE_COLAB:
    from drive.MyDrive.Github.NLPSharedTask.essentials.config import ABSTRACTS
    from drive.MyDrive.Github.NLPSharedTask.essentials.data_functions import read_data
else:
    from essentials.config import ABSTRACTS
    from essentials.data_functions import read_data

  from .autonotebook import tqdm as notebook_tqdm


In [13]:
# Use the GPU when torch can see one, otherwise fall back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Integer lookup tables: 0..16 -> 1..17 and 1..17 -> 0..16.
# NOTE(review): neither table is handed to the model in this cell, and both map
# ints to ints rather than ids to label names -- confirm how they are meant to be used.
label2id = dict((n, n + 1) for n in range(17))
id2label = dict((n, n - 1) for n in range(1, 18))

# SciBERT encoder with an 18-way sequence-classification head
model = AutoModelForSequenceClassification.from_pretrained(
    'allenai/scibert_scivocab_uncased',
    return_dict=True,
    num_labels=18,
)

# Matching SciBERT tokenizer
tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at allenai/scibert_scivocab_uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Root folder for the data and model paths used in this notebook:
# the project folder on the mounted Drive on Colab, the working directory otherwise.
base_dir = 'drive/MyDrive/Github/NLPSharedTask' if USE_COLAB else ''

# LOAD DATA

In [9]:
# Read the input table from base_dir; later cells use its `text_clean`, `label` and `is_abstract` columns.
df = pd.read_csv(os.path.join(base_dir, 'cleaned_data_with_null_with_weakly_labeled_with_synth.csv'))

In [None]:
def balance_classes(df, target_col, balance_level=0.5, random_state=None):
    """
    Downsample the larger classes of a DataFrame to reduce class imbalance.

    A per-class cap is interpolated between the largest and the smallest class size:
    ``balance_level=0`` keeps every class as it is (cap = largest class), while
    ``balance_level=1`` caps every class at the size of the smallest one. Classes
    that are already at or below the cap are kept unchanged; no rows are duplicated.

    Args:
    df (pd.DataFrame): The DataFrame containing the data.
    target_col (str): The name of the column that contains the target classes.
    balance_level (float, optional): A float between 0 and 1 where 0 means no balancing and 1 means full balancing. Default is 0.5.
    random_state (int, optional): Seed forwarded to ``DataFrame.sample`` for a reproducible draw. Default is None (non-deterministic).

    Returns:
    pd.DataFrame: A DataFrame with adjusted class distribution, grouped by class and re-indexed from 0.

    Raises:
    ValueError: If ``balance_level`` is outside the range [0, 1].
    """
    if not (0 <= balance_level <= 1):
        raise ValueError("Balance level must be between 0 and 1.")

    # Nothing to balance; also avoids computing min/max of an empty count series.
    if df.empty:
        return df.copy()

    # Interpolate the cap from the largest class (level 0) down to the smallest (level 1).
    class_counts = df[target_col].value_counts()
    smallest, largest = class_counts.min(), class_counts.max()
    max_size = int(largest - (largest - smallest) * balance_level)

    # Sample each class without replacement, never asking for more rows than it has.
    sampled_groups = [
        group.sample(min(len(group), max_size), replace=False, random_state=random_state)
        for _, group in df.groupby(target_col)
    ]

    return pd.concat(sampled_groups).reset_index(drop=True)

# CREATE TRAIN/TEST SPLIT

In [10]:
def tokenize_text(texts):
    """Tokenize ``texts`` with the notebook-level ``tokenizer``.

    Truncation is enabled with a maximum length of 256, and ``return_tensors=None``
    is passed so no tensor type is requested from the tokenizer.

    Args:
        texts: The text input handed to the tokenizer (a list of strings in this notebook).

    Returns:
        The object returned by the tokenizer call.
    """
    encode_options = dict(truncation=True, max_length=256, return_tensors=None)
    return tokenizer(texts, **encode_options)

In [11]:
def rule_based_train_test_split(
    data: pd.DataFrame,
    label_col: str = 'label',
    test_size: float = 0.3,
    random_state: int | None = None,
    n_test_abstracts: int = 1,
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Create a stratified train/test split whose test set contains abstracts of every label.

    For each label that has rows with ``is_abstract == 1``, ``n_test_abstracts`` of those
    rows are reserved for the test set. The remaining rows are then split with a
    stratified ``train_test_split`` and the two test parts are combined, so the test
    set holds at least ``n_test_abstracts`` abstracts per such label.

    Args:
        data: Frame with an ``is_abstract`` column and the column named by ``label_col``.
            Rows are matched by index label, so the index is assumed to be unique.
        label_col: Name of the label column used for grouping and stratification.
        test_size: Share of the non-reserved rows that goes to the test set.
        random_state: Seed used for the abstract draw, the stratified split and the
            final shuffle of the test set. ``None`` gives a non-deterministic split.
        n_test_abstracts: Number of abstracts reserved per label (default 1).

    Returns:
        ``(train, test)``. ``train`` keeps the index of ``data``; ``test`` is shuffled
        and re-indexed from 0.

    Raises:
        ValueError: Raised by pandas when a label has fewer than ``n_test_abstracts``
            abstracts, or by scikit-learn when the stratified split is not possible.
    """

    abstract_data = data[data.is_abstract == 1]

    # Reserve n_test_abstracts randomly chosen abstracts per label for the test set
    test_a = abstract_data.groupby(label_col).sample(n=n_test_abstracts, random_state=random_state)

    # Remove the reserved rows from the rest of the data
    remaining = data[~data.index.isin(test_a.index)].copy()

    # Split the remaining data into train and test, preserving the label distribution
    train, test_b = train_test_split(
        remaining,
        test_size=test_size,
        random_state=random_state,
        stratify=remaining[label_col],
    )

    # Concatenate both test parts and shuffle them with the same seed for reproducibility
    test = (
        pd.concat([test_a, test_b])
        .sample(frac=1, random_state=random_state)
        .reset_index(drop=True)
    )

    return train, test

In [14]:
# Apply huggingface tokenizer
# The whole `text_clean` column is passed to `tokenize_text` in a single call.
tokenized_output = tokenize_text(df['text_clean'].to_list())

In [16]:
# One row per document: cleaned text, tokenizer outputs and the supervision columns.
# If the tokenizer returned no `token_type_ids`, an empty list is stored per row instead.
tokenized_columns = {
    'raw_text': df['text_clean'].tolist(),
    'input_ids': list(tokenized_output['input_ids']),
    'attention_mask': list(tokenized_output['attention_mask']),
    'token_type_ids': list(tokenized_output.get('token_type_ids', [[]]*len(df))),
    'label': df['label'].tolist(),
    'is_abstract': df['is_abstract'].to_list()
}
df_tokenized = pd.DataFrame(tokenized_columns)

# Seeded split
train_df, test_df = rule_based_train_test_split(df_tokenized, random_state=42)

# Persist both splits under base_dir.
# NOTE(review): the index is written as well, which shows up as an 'Unnamed: 0'
# column when the file is read back -- confirm this is wanted.
for split_frame, file_name in (
    (train_df, 'train_df_synth_weak_null.csv'),
    (test_df, 'test_df_synth_weak_null.csv'),
):
    split_frame.to_csv(os.path.join(base_dir, file_name))

# Only the tokenizer outputs and the label are handed to the datasets used for training
model_columns = ['input_ids', 'attention_mask', 'token_type_ids', 'label']
train_dataset = Dataset.from_pandas(train_df[model_columns])
test_dataset = Dataset.from_pandas(test_df[model_columns])

# FINE-TUNING

For training, we use the hyperparameter values suggested in the SciBERT paper (Beltagy et al., 2019):

In all settings, we apply a dropout of 0.1 and optimize cross entropy loss using Adam (Kingma and Ba, 2015). We finetune for 2 to 5 epochs using a batch size of 32 and a learning rate of 5e-6, 1e-5, 2e-5, or 5e-5 with a slanted triangular schedule (Howard and Ruder, 2018) which is equivalent to the linear warmup followed by linear decay (Devlin et al., 2019). For each dataset and BERT variant, we pick the best learning rate and number of epochs on the development set and report the corresponding test results. We found the setting that works best across most datasets and models is 2 or 4 epochs and a learning rate of 2e-5. While task-dependent, optimal hyperparameters for each task are often the same across BERT variants.

In [22]:
# Multiple class prediction (one prediction)
# Load the metric implementations once; loading them inside `compute_metrics`
# would repeat the load on every evaluation pass.
_accuracy_metric = evaluate.load("accuracy")
_f1_metric = evaluate.load("f1")


def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class of each row is
            the argmax of ``logits`` over the last axis.

    Returns:
        dict with the keys ``"accuracy"`` and ``"f1"`` (F1 averaged with ``average="weighted"``).
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {
        "accuracy": _accuracy_metric.compute(predictions=predictions, references=labels)["accuracy"],
        "f1": _f1_metric.compute(predictions=predictions, references=labels, average="weighted")["f1"],
    }

In [23]:
# Training arguments
# NOTE(review): no per-device batch size is set, so the library default applies, while the
# quoted paper setting uses a batch size of 32 -- confirm this is intended.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to `eval_strategy` --
# check against the installed version.
training_args = TrainingArguments(
    output_dir=os.path.join(base_dir, 'models/results'),
    # Optimisation settings
    learning_rate=2e-5,  # Best learning rate as suggested in the paper
    num_train_epochs=2,  # As best setting suggested 2 or 4
    weight_decay=0.01,
    # Schedule: linear warmup over the first 500 steps, then linear decay
    lr_scheduler_type='linear',
    warmup_steps=500,
    # Evaluate and checkpoint once per epoch
    evaluation_strategy='epoch',
    save_strategy='epoch',
)

In [24]:
# Collator that pads each batch using the tokenizer
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# AdamW over all model parameters, using the learning rate configured in `training_args`.
# NOTE(review): because this optimizer is passed in explicitly, it is the one used for training;
# confirm that the `weight_decay` set in `training_args` is not expected to take effect instead.
optimizer = torch.optim.AdamW(params=model.parameters(), lr=training_args.learning_rate)

# Multiple class Trainer; the scheduler slot is left as None
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, None),
)

In [25]:
# Start training
# Ask torch to release its cached, currently unused GPU memory before the run starts.
torch.cuda.empty_cache()
# Kept as the last expression of the cell so the returned TrainOutput is displayed.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.7806,0.755494,0.791749,0.792058
2,0.5611,0.6762,0.815167,0.814943


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

TrainOutput(global_step=10652, training_loss=0.8309846721195073, metrics={'train_runtime': 2379.6651, 'train_samples_per_second': 35.805, 'train_steps_per_second': 4.476, 'total_flos': 5129265581828568.0, 'train_loss': 0.8309846721195073, 'epoch': 2.0})

In [26]:
# Saving the model
# The model and the tokenizer are both written to this one folder under base_dir.
model_path = os.path.join(base_dir, 'models/scibert_model_with_null_with_synth_with_weakly_labeled')
trainer.save_model(model_path)

# Saving the tokenizer associated with the model
# Last expression of the cell, so the returned tuple of written file paths is displayed.
tokenizer.save_pretrained(model_path)

('drive/MyDrive/Github/NLPSharedTask/models/scibert_model_with_null_with_synth_with_weakly_labeled/tokenizer_config.json',
 'drive/MyDrive/Github/NLPSharedTask/models/scibert_model_with_null_with_synth_with_weakly_labeled/special_tokens_map.json',
 'drive/MyDrive/Github/NLPSharedTask/models/scibert_model_with_null_with_synth_with_weakly_labeled/vocab.txt',
 'drive/MyDrive/Github/NLPSharedTask/models/scibert_model_with_null_with_synth_with_weakly_labeled/added_tokens.json',
 'drive/MyDrive/Github/NLPSharedTask/models/scibert_model_with_null_with_synth_with_weakly_labeled/tokenizer.json')

In [17]:
# Load the trained model
model_path = os.path.join(base_dir, 'models/scibert_model_with_null_with_synth_with_weakly_labeled')
model = AutoModelForSequenceClassification.from_pretrained(model_path)

# Load the tokenizer that was saved next to the fine-tuned model, so the
# model/tokenizer pair always matches and no download from the hub is needed.
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [25]:
from concurrent.futures import ProcessPoolExecutor

# Text-classification pipeline around the fine-tuned model (PyTorch framework).
# No device argument is passed; `device=0` is only kept as a commented-out hint.
nlp = pipeline(
    task="text-classification",
    model=model,
    tokenizer=tokenizer,
    framework='pt',
)  # device=0

# Load the saved test split and keep only the rows flagged as abstracts
test_df = (
    pd.read_csv(os.path.join(base_dir, 'test_df_synth_weak_null.csv'))
    .loc[lambda frame: frame['is_abstract'] == 1]
    .copy()
)

In [28]:
# Process the data in batches
# One pipeline call over the whole column (mini-batches of 32) instead of one call per row.
# `truncation=True` lets the tokenizer cut over-long texts so they cannot exceed the
# model's maximum input length.
# NOTE(review): assumes `raw_text` has no missing values after the CSV round trip -- confirm.
predictions = nlp(test_df.raw_text.tolist(), batch_size=32, truncation=True)

# Keep the per-row format a single-text call produces: a one-element list holding
# the predicted label and its score.
test_df['prediction'] = [[prediction] for prediction in predictions]

# Optional: Save the predictions
test_df.to_csv('predictions_true_abstracts.csv', index=False)

In [29]:
# Display the abstracts-only test frame together with the new `prediction` column.
test_df

Unnamed: 0.1,Unnamed: 0,raw_text,input_ids,attention_mask,token_type_ids,label,is_abstract,prediction
44,44,article survey major international relation th...,"[102, 2148, 3241, 1626, 2565, 2923, 1983, 2641...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",16,1,"[{'label': 'LABEL_16', 'score': 0.998364746570..."
55,55,balance individual interest common good need r...,"[102, 5189, 1161, 1291, 1495, 1846, 965, 431, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",16,1,"[{'label': 'LABEL_16', 'score': 0.998547017574..."
63,63,property certain quantum magnet described term...,"[102, 3713, 2361, 4632, 10618, 1356, 902, 1756...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,1,"[{'label': 'LABEL_7', 'score': 0.8958142399787..."
88,88,purpose purpose paper provide methodological d...,"[102, 3559, 3559, 1203, 1584, 12500, 12415, 42...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",16,1,"[{'label': 'LABEL_16', 'score': 0.996728777885..."
106,106,abstract provide overview grounded theory appr...,"[102, 4940, 1584, 6502, 19477, 1983, 1139, 496...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",16,1,"[{'label': 'LABEL_16', 'score': 0.995740056037..."
...,...,...,...,...,...,...,...,...
18135,18135,educational administration rich domain scholar...,"[102, 6336, 3762, 4765, 2059, 18806, 2794, 482...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",16,1,"[{'label': 'LABEL_16', 'score': 0.996002256870..."
18147,18147,abstractthis article deal legislative definiti...,"[102, 4940, 8321, 2148, 5752, 24586, 2248, 122...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",16,1,"[{'label': 'LABEL_16', 'score': 0.998618483543..."
18196,18196,study examines whether teacher certification a...,"[102, 527, 15817, 1681, 9740, 20390, 5971, 246...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",16,1,"[{'label': 'LABEL_4', 'score': 0.9923455119132..."
18197,18197,concept globalization gradually permeated crim...,"[102, 2614, 27763, 9049, 6824, 224, 12107, 179...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",16,1,"[{'label': 'LABEL_16', 'score': 0.998563349246..."
