In [1]:
%%capture
! pip install datasets fast-fit transformers accelerate
! pip install evaluate
! pip install langdetect

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
USE_COLAB = True

In [4]:
import os
import evaluate
import numpy as np
import pandas as pd
import torch

from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
    pipeline,
)
from datasets import Dataset

if USE_COLAB:
    from drive.MyDrive.Github.NLPSharedTask.essentials.config import ABSTRACTS
    from drive.MyDrive.Github.NLPSharedTask.essentials.data_functions import read_data
else:
    from essentials.config import ABSTRACTS
    from essentials.data_functions import read_data

In [5]:
# Select device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

label2id={n: n+1 for n in range(0,17)}
id2label={n: n-1 for n in range(1,18)}

model_path = 'allenai/scibert_scivocab_uncased'
# Define model
model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    num_labels=18,
    return_dict=True)

# 'allenai/scibert_scivocab_uncased'
# 'FacebookAI/xlm-roberta-large'
# Define tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/442M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at allenai/scibert_scivocab_uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


vocab.txt:   0%|          | 0.00/228k [00:00<?, ?B/s]

In [6]:
if USE_COLAB:
    base_dir = 'drive/MyDrive/Github/NLPSharedTask'
else:
    base_dir = ''

# LOAD DATA

In [7]:
df = pd.read_csv(os.path.join(base_dir, 'data_with_null_with_synth_gpt-4-turbo.csv'))

In [8]:
df = df.dropna(subset='text_clean')

# CREATE TRAIN/TEST SPLIT

In [9]:
def tokenize_text(texts):
    return tokenizer(texts, truncation=True, max_length=256, padding="max_length", return_tensors="pt")

In [10]:
def rule_based_train_test_split(
    data: pd.DataFrame,
    label_col: str = 'label',
    test_size: float = 0.3,
    random_state: int | None = None
) -> dict:
    """Creates train-test split that makes sure that at least two abstracts for each id are in the test set."""

    abstract_data = data[data.is_abstract == 1]

    # Randomly sample 2 abstracts per sdg group
    test_a = abstract_data.groupby(label_col).sample(n=1, random_state=random_state)

    # Remove the entries already in the test set from the rest of the data
    data = data[~data.index.isin(test_a.index)].copy()

    # Split the remaining data into train and test
    train, test_b = train_test_split(data, test_size=test_size, random_state=random_state, stratify=data[label_col])

    # Concatenate both test sets and shuffle them again
    test = pd.concat([test_a, test_b]).sample(frac=1).reset_index(drop=True)

    return train, test

In [11]:
def split_data(
	data: pd.DataFrame
):
    train = data[~(data.is_abstract == 1)]
    test = data[data.is_abstract == 1]

    return train, test

In [12]:
# Apply huggingface tokenizer
tokenized_output = tokenize_text(df['text_clean'].to_list())

In [None]:
"""
df_tokenized = pd.DataFrame({
    'raw_text': df['text_clean'].tolist(),
    'input_ids': list(tokenized_output['input_ids']),
    'attention_mask': list(tokenized_output['attention_mask']),
    'token_type_ids': list(tokenized_output.get('token_type_ids', [[]]*len(df))),
    'label': df['label'].tolist(),
    'is_abstract': df['is_abstract'].to_list()
})
"""


In [13]:
df_tokenized = pd.DataFrame({
    'raw_text': df['text_clean'].tolist(),
    'input_ids': [i.tolist() for i in tokenized_output['input_ids']],
    'attention_mask': [i.tolist() for i in tokenized_output['attention_mask']],
    'token_type_ids': [i.tolist() for i in tokenized_output.get('token_type_ids', torch.zeros((len(df), 256), dtype=torch.long))],
    'label': df['label'].tolist(),
    'is_abstract': df['is_abstract'].to_list()
})

train_df, test_df = split_data(df_tokenized)

train_df[['raw_text', 'label', 'is_abstract']].to_csv(os.path.join(base_dir, f"train_df_{model_path.split('/')[-1]}.csv"))
test_df[['raw_text', 'label', 'is_abstract']].to_csv(os.path.join(base_dir, f"test_df_{model_path.split('/')[-1]}.csv"))

train_dataset = Dataset.from_pandas(train_df[['input_ids', 'attention_mask', 'token_type_ids', 'label']])
test_dataset = Dataset.from_pandas(test_df[['input_ids', 'attention_mask', 'token_type_ids', 'label']])

# FINE-TUNING

For training, use the suggested values from the paper:

In all settings, we apply a dropout of 0.1 and optimize cross entropy loss using Adam (Kingma and Ba, 2015). We finetune for 2 to 5 epochs using a batch size of 32 and a learning rate of 5e-6, 1e-5, 2e-5, or 5e-5 with a slanted triangular schedule (Howard and Ruder, 2018) which is equivalent to the linear warmup followed by linear decay (Devlin et al., 2019). For each dataset and BERT variant, we pick the best learning rate and number of epochs on the development set and report the corresponding test results. We found the setting that works best across most datasets and models is 2 or 4 epochs and a learning rate of 2e-5. While task-dependent, optimal hyperparameters for each task are often the same across BERT variants.

In [15]:
# Multiple class prediction (one prediction)
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = evaluate.load("accuracy")
    return {
        "accuracy": accuracy.compute(predictions=predictions, references=labels)["accuracy"]
    }

In [18]:
# Training arguments
training_args = TrainingArguments(
    output_dir=os.path.join(base_dir, 'models/results'),
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=3,  # As best setting suggested 2 or 4
    warmup_steps=500,  # Slanted triangular schedule start
    learning_rate=2e-5,  # Best learning rate as suggested in the paper
    weight_decay=0.01,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    lr_scheduler_type='linear',  # Corresponds to linear warmup followed by linear decay
    load_best_model_at_end=True
)

In [19]:
# Data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Adam Optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

# Multiple class Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, None)
)

In [20]:
# Start training
torch.cuda.empty_cache()
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.7034,0.614603,0.840718


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

KeyboardInterrupt: 

In [None]:
# Saving the model
output_path = os.path.join(base_dir, 'models/scibert_gpt-4')
trainer.save_model(output_path)

# Saving the tokenizer associated with the model
tokenizer.save_pretrained(model_path)

('allenai/scibert_scivocab_uncased/tokenizer_config.json',
 'allenai/scibert_scivocab_uncased/special_tokens_map.json',
 'allenai/scibert_scivocab_uncased/vocab.txt',
 'allenai/scibert_scivocab_uncased/added_tokens.json',
 'allenai/scibert_scivocab_uncased/tokenizer.json')

In [None]:
# Load the trained model
model_name = 'mixtral'
model = AutoModelForSequenceClassification.from_pretrained(os.path.join(base_dir, f'models/scibert_{model_name}'))

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')



In [None]:
from concurrent.futures import ProcessPoolExecutor

nlp = pipeline("text-classification", model=model, tokenizer=tokenizer, framework='pt', device=0)  # device=0

# Load data
test_df = pd.read_csv(os.path.join(base_dir, f'test_df_{model_name}.csv'))
test_df = test_df[test_df.is_abstract == 1].copy()

In [None]:
# Process the data in batches
test_df['prediction'] = test_df.raw_text.apply(nlp)

# Optional: Save the predictions
test_df.to_csv(os.path.join(base_dir, f'predictions_{model_name}.csv'), index=False)

In [None]:
test_df

Unnamed: 0.1,Unnamed: 0,raw_text,is_abstract,prediction
5,5,climate austerity timescales target paper prob...,1,"[{'label': 'LABEL_16', 'score': 0.988149762153..."
27,27,study examines whether teacher certification a...,1,"[{'label': 'LABEL_4', 'score': 0.9400513172149..."
29,29,current research multicultural team tends exhi...,1,"[{'label': 'LABEL_16', 'score': 0.989135503768..."
80,80,may amount new phase study democratization ass...,1,"[{'label': 'LABEL_16', 'score': 0.998567223548..."
99,99,implication designating territorial integrity ...,1,"[{'label': 'LABEL_16', 'score': 0.998436987400..."
...,...,...,...,...
18006,18006,paper provides timely comparative analysis rec...,1,"[{'label': 'LABEL_16', 'score': 0.997286081314..."
18012,18012,paper explores constitutional dimension compar...,1,"[{'label': 'LABEL_16', 'score': 0.998491287231..."
18013,18013,freedom expression definitional challenge not ...,1,"[{'label': 'LABEL_16', 'score': 0.99853515625}]"
18019,18019,clear legal definition water entitlement appro...,1,"[{'label': 'LABEL_6', 'score': 0.9917386174201..."
