In [2]:
import os
import re

import evaluate
import nltk
import numpy as np
import pandas as pd
import spacy
import torch
from datasets import Dataset
from langdetect import DetectorFactory, detect
from langdetect.lang_detect_exception import LangDetectException
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    AdamW,
    pipeline,
)

from essentials.config import ABSTRACTS
from essentials.data_functions import read_data

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Select device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Name the checkpoint and the size of the classification head once, so the
# model and the tokenizer cannot drift apart when one of them is changed.
MODEL_CHECKPOINT = 'allenai/scibert_scivocab_uncased'
NUM_LABELS = 17  # presumably one output per SDG class - TODO confirm against the label set

# Define model (the classification head is newly initialised and must be fine-tuned)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_CHECKPOINT, num_labels=NUM_LABELS, return_dict=True)

# Define tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at allenai/scibert_scivocab_uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# LOAD DATA

In [4]:
# Load zofa and osdg data
# ZOFA is read from the location configured as ABSTRACTS in essentials.config;
# the OSDG export is parsed as tab-separated text despite its .csv extension.
zofa = read_data(ABSTRACTS)
osdg = read_data('osdg_data/osdg-community-data-v2024-01-01.csv', format='csv', delimiter='\t')

In [5]:
# Every ZOFA row is an abstract, so flag all of them with is_abstract = 1
zofa = zofa.assign(is_abstract=1)

In [6]:
# Clean OSDG data

# langdetect is randomised by default; pinning the seed makes the language
# filter keep the same rows on every run.
DetectorFactory.seed = 0


def _detect_language(text: str) -> str:
    """Return the langdetect language code for ``text``, or 'unknown' if detection fails."""
    try:
        return detect(text)
    except LangDetectException:
        # Raised when the text offers no usable features; treat it as
        # non-English instead of aborting the whole apply().
        return 'unknown'


osdg['language'] = osdg.text.apply(_detect_language)

# Filter non-english texts out
osdg = osdg[osdg.language == 'en'].copy()

# Naive search for abstracts in the OSDG data: a text counts as an abstract
# when it contains any of the marker phrases (case-insensitive substring match).
wanted_words = ['abstract', 'this paper', 'this study', 'this article']
osdg['is_abstract'] = [
    int(any(word in lowered for word in wanted_words))
    for lowered in osdg.text.str.lower()
]

In [7]:
# Stack ZOFA and OSDG under one shared schema: text, label, is_abstract
df_base = pd.concat([
    zofa.loc[:, ['ABSTRACT', 'SDG', 'is_abstract']].rename({'ABSTRACT': 'text', 'SDG': 'label'}, axis='columns'),
    osdg.loc[:, ['text', 'sdg', 'is_abstract']].rename({'sdg': 'label'}, axis='columns'),
])

In [8]:
# Label 0 is the "None" class, which OSDG does not contain, so drop those rows
df_base = df_base.loc[df_base['label'].ne(0)].copy()

In [9]:
def synthetic_data_reading():

    synthetic_data = []

    dir = os.path.join(os.getcwd(), "synthetic_data", "produced_data", "gen_results")

    for folder in os.listdir(dir):
        for data in os.listdir(os.path.join(dir, folder)):
            if data.endswith(".jsonl"):
                df = pd.read_json(os.path.join(dir, folder, data), lines=True)
                synthetic_data.append(df)

    df_synthetic = pd.concat(synthetic_data)
    return df_synthetic

In [10]:
# Load the synthetic rows, mark them as non-abstracts and append them to the
# base frame to obtain the ZOFA + OSDG + SYNTH DataFrame
df_synth = pd.concat([
    df_base,
    synthetic_data_reading()
    .assign(is_abstract=0)[['text', 'sdg_id', 'is_abstract']]
    .rename(columns={'sdg_id': 'label'}),
])

# DEFINE DATAFRAME TO USE IN SUBSEQUENT STEPS

In [11]:
# Work on a copy of the base frame (ZOFA + OSDG, no synthetic rows) so that
# df_base itself stays untouched; use df_synth here to include synthetic data.
df = df_base.copy()

# PREPROCESSING

In [12]:
# NLTK resources needed for tokenisation, stopword removal and lemmatisation
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))
sentiment_bearing_stopwords = ['not', 'no', 'nor', 'never', 'yes', 'should', 'could', 'would']
# Keep this a set: it is probed once per token during preprocessing, and set
# membership is O(1) whereas a list would be scanned linearly every time.
stop_words_without_sentiment = stop_words - set(sentiment_bearing_stopwords)

# spaCy for NER. Download the model only when it is missing, and do it through
# spaCy's own API so the package is installed for the interpreter running this
# kernel rather than for whatever `python` happens to be first on PATH.
try:
    nlp = spacy.load('en_core_web_sm')
except OSError:
    spacy.cli.download('en_core_web_sm')
    nlp = spacy.load('en_core_web_sm')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/manuelbolz/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/manuelbolz/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/manuelbolz/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Collecting en-core-web-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m29.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.2[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [13]:
# An HTML comment, or "<" / "</" immediately followed by a letter up to the
# next ">". Requiring the letter stops comparison operators in running text
# ("p < 0.05 and n > 30") from being mistaken for a tag.
_HTML_TAG_RE = re.compile(r'<!--.*?-->|</?[A-Za-z][^<>]*>', re.DOTALL)
_URL_RE = re.compile(r'https?://\S+|www\.\S+')


def remove_urls_and_html_tags(text: str) -> str:
    """Strip HTML tags/comments and URLs from ``text``.

    Args:
        text: Raw input string.

    Returns:
        ``text`` with tags and URLs deleted. Surrounding whitespace is left
        as is, so removing a URL between two spaces leaves a double space.
    """
    text_without_html_tags = _HTML_TAG_RE.sub('', text)
    return _URL_RE.sub('', text_without_html_tags)

In [14]:
def named_entity_regocnition(text: str) -> list[str]:
    """Run the module-level ``nlp`` pipeline on ``text`` and return the text of each entity in ``doc.ents``."""
    entity_texts = []
    for entity in nlp(text).ents:
        entity_texts.append(entity.text)
    return entity_texts

In [15]:
def preprocess_text(text: str) -> str:
    """Normalise raw text into a space-separated string of lemmatised tokens.

    Steps, in order: lowercase, strip HTML tags and URLs, delete digit runs,
    tokenise with NLTK, keep only purely alphabetic tokens that are not in
    ``stop_words_without_sentiment``, then lemmatise what is left.
    """
    lowered = text.lower()
    without_markup = remove_urls_and_html_tags(lowered)
    without_digits = re.sub(r'\d+', '', without_markup)

    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for token in word_tokenize(without_digits):
        # Punctuation and mixed alphanumeric tokens are discarded first
        if not token.isalpha():
            continue
        # Stopwords go too, except the sentiment-bearing ones kept on purpose
        if token in stop_words_without_sentiment:
            continue
        lemmas.append(lemmatizer.lemmatize(token))

    return ' '.join(lemmas)

In [16]:
def tokenize_function(texts):
    """Encode ``texts`` with the module-level ``tokenizer``, padded and truncated to 512 tokens."""
    encoded = tokenizer(
        texts,
        padding="max_length",
        truncation=True,
        max_length=512,
        return_tensors=None,  # no framework tensors requested
    )
    return encoded

In [17]:
# Apply custom pre-processing
# Run the custom cleaning on every raw text and keep the result next to it
df['text_clean'] = df['text'].map(preprocess_text)

In [18]:
# Show the frame to compare the raw `text` column with the derived `text_clean`
df

Unnamed: 0,text,label,is_abstract,text_clean
3,Evolutionary dynamics of structural genetic va...,14,1,evolutionary dynamic structural genetic variat...
10,Successfully predicting the future states of s...,15,1,successfully predicting future state system co...
11,Poverty remains one of the most pressing probl...,1,1,poverty remains one pressing problem facing wo...
15,As part of a trans-disciplinary research proje...,6,1,part research project series survey interventi...
17,Supermarket food sales data might serve as a s...,3,1,supermarket food sale data might serve simple ...
...,...,...,...,...
42629,"It also features individual accountability, wh...",4,0,also feature individual accountability mean te...
42630,Since the full capacity is not likely to be ut...,7,0,since full capacity not likely utilized multip...
42631,This article notes the judgment in Sophocleous...,16,1,article note judgment sophocleous v secretary ...
42632,Groundwater quality can also be affected by co...,6,0,groundwater quality also affected contaminatio...


In [19]:
# Encode all cleaned texts in one call to the Hugging Face tokenizer
tokenized_output = tokenize_function(list(df['text_clean']))

# CREATE TRAIN/TEST SPLIT

In [20]:
def rule_based_train_test_split(
    data: pd.DataFrame,
    label_col: str = 'label',
    test_size: float = 0.2,
    random_state: int | None = None,
    n_abstracts_per_label: int = 1,
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Stratified train/test split that first reserves abstracts for the test set.

    For every label that occurs among the rows with ``is_abstract == 1``,
    ``n_abstracts_per_label`` abstracts are drawn into the test set. The
    remaining rows are then split with a stratified ``train_test_split``.

    Args:
        data: Frame with an ``is_abstract`` column and ``label_col``. Reserved
            rows are excluded from the second step by index label, so the
            index should be unique.
        label_col: Column holding the class label.
        test_size: Share of the remaining rows that goes to the test set.
        random_state: Seed used for the abstract sampling, the stratified
            split and the final shuffle of the test set.
        n_abstracts_per_label: Abstracts reserved per label. The default of 1
            keeps the previous behaviour.

    Returns:
        ``(train, test)``. ``train`` keeps the index of ``data``; ``test`` is
        shuffled and re-indexed from 0.

    Raises:
        ValueError: Propagated from pandas when a label has fewer abstracts
            than ``n_abstracts_per_label``.
    """
    abstract_data = data[data.is_abstract == 1]

    # Reserve the requested number of abstracts per label for the test set
    test_a = abstract_data.groupby(label_col).sample(n=n_abstracts_per_label, random_state=random_state)

    # Remove the entries already in the test set from the rest of the data
    data = data[~data.index.isin(test_a.index)].copy()

    # Split the remaining data into train and test
    train, test_b = train_test_split(data, test_size=test_size, random_state=random_state, stratify=data[label_col])

    # Concatenate both test sets and shuffle them; seeding the shuffle keeps
    # the row order reproducible when random_state is given
    test = pd.concat([test_a, test_b]).sample(frac=1, random_state=random_state).reset_index(drop=True)

    return train, test

In [21]:
df_tokenized = pd.DataFrame({
    'input_ids': list(tokenized_output['input_ids']),
    'attention_mask': list(tokenized_output['attention_mask']),
    'token_type_ids': list(tokenized_output.get('token_type_ids', [[]]*len(df))),
    # The classification head expects class ids 0..num_labels-1, while the
    # labels start at 1 once label 0 has been removed. Shift them to be
    # zero-based: model class k therefore corresponds to original label k + 1.
    'label': (df['label'].astype(int) - 1).to_list(),
    'is_abstract': df['is_abstract'].to_list()
})

# Fail early with a readable message instead of an index error inside the loss
assert df_tokenized['label'].between(0, model.config.num_labels - 1).all(), \
    "labels must lie in [0, num_labels - 1]"

train_df, test_df = rule_based_train_test_split(df_tokenized, random_state=42)

# preserve_index=False keeps the pandas index out of the datasets as an extra column
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

# FINE-TUNING

For training, we use the hyperparameters suggested in the SciBERT paper (Beltagy et al., 2019):

In all settings, we apply a dropout of 0.1 and optimize cross entropy loss using Adam (Kingma and Ba, 2015). We finetune for 2 to 5 epochs using a batch size of 32 and a learning rate of 5e-6, 1e-5, 2e-5, or 5e-5 with a slanted triangular schedule (Howard and Ruder, 2018) which is equivalent to the linear warmup followed by linear decay (Devlin et al., 2019). For each dataset and BERT variant, we pick the best learning rate and number of epochs on the development set and report the corresponding test results. We found the setting that works best across most datasets and models is 2 or 4 epochs and a learning rate of 2e-5. While task-dependent, optimal hyperparameters for each task are often the same across BERT variants.

In [22]:
# Loaded metric objects, keyed by metric name
_METRIC_CACHE: dict = {}


def _get_metric(name: str):
    """Load an ``evaluate`` metric on first use and reuse it afterwards."""
    if name not in _METRIC_CACHE:
        _METRIC_CACHE[name] = evaluate.load(name)
    return _METRIC_CACHE[name]


def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class is the
            argmax of ``logits`` over the last axis.

    Returns:
        Dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # The metrics are fetched through the cache so they are not reloaded on
    # every evaluation call.
    accuracy = _get_metric("accuracy")
    f1 = _get_metric("f1")
    return {
        "accuracy": accuracy.compute(predictions=predictions, references=labels)["accuracy"],
        "f1": f1.compute(predictions=predictions, references=labels, average="weighted")["f1"]
    }

In [1]:
# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=4,  # As best setting suggested 2 or 4
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=500,  # Slanted triangular schedule start
    learning_rate=2e-5,  # Best learning rate as suggested in the paper
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    lr_scheduler_type='linear',  # Corresponds to linear warmup followed by linear decay
)

# A hand-built optimizer replaces the one Trainer would create from
# `training_args`, so the learning rate and weight decay configured above only
# take effect if they are passed on explicitly. torch.optim.AdamW is used
# because the AdamW class shipped with transformers is deprecated.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=training_args.learning_rate,
    weight_decay=training_args.weight_decay,
)

# Initialize Trainer. The scheduler slot is None so that Trainer builds the
# linear warmup/decay schedule from `training_args` for this optimizer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, None)
)

NameError: name 'TrainingArguments' is not defined

In [11]:
# Start training
# Runs fine-tuning as configured on `trainer`; with the epoch-based strategies
# set in the training arguments, evaluation and checkpointing happen once per epoch.
trainer.train()

  0%|          | 0/12825 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


KeyboardInterrupt: 

In [None]:
# Evaluation
# Scores the model on the evaluation dataset the trainer was built with and
# prints the resulting metrics.
results = trainer.evaluate()
print(results)

In [None]:
# Saving the model
# Model and tokenizer are written to the same directory so that both can be
# reloaded from a single path.
model_path = "./models/scibert_model_base"
trainer.save_model(model_path)

# Saving the tokenizer associated with the model
tokenizer.save_pretrained(model_path)

In [None]:
# Load the trained model
model = AutoModelForSequenceClassification.from_pretrained("./models/scibert_model_base")

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("./models/scibert_model_base")

# Create a prediction pipeline. It gets its own name because `nlp` already
# holds the spaCy pipeline that named_entity_regocnition() relies on;
# rebinding `nlp` here would break that function for the rest of the session.
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)

# Make predictions
# TODO: replace the `...` placeholder with the text(s) to classify, cleaned
# with preprocess_text() exactly like the training data.
predictions = classifier(...)