In [1]:
%%capture
! pip install datasets fast-fit transformers accelerate
! pip install evaluate
! pip install langdetect

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
USE_COLAB = True

In [4]:
import os
import re
import evaluate
import numpy as np
import pandas as pd
import spacy
import torch

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from langdetect import detect
from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
    pipeline,
)
from datasets import Dataset

if USE_COLAB:
    from drive.MyDrive.Github.NLPSharedTask.essentials.config import ABSTRACTS
    from drive.MyDrive.Github.NLPSharedTask.essentials.data_functions import read_data
else:
    from essentials.config import ABSTRACTS
    from essentials.data_functions import read_data

In [5]:
# Select device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Define model
model = AutoModelForSequenceClassification.from_pretrained(
    'allenai/scibert_scivocab_uncased',
    num_labels=17,
    label2id={n: n+1 for n in range(0,17)},
    id2label={n: n-1 for n in range(1,18)},
    return_dict=True)

# Define tokenizer
tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/442M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at allenai/scibert_scivocab_uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


vocab.txt:   0%|          | 0.00/228k [00:00<?, ?B/s]

In [6]:
if USE_COLAB:
    base_dir = 'drive/MyDrive/Github/NLPSharedTask'
else:
    base_dir = ''

# LOAD DATA

In [7]:
# Load zofa and osdg data
zofa = read_data(ABSTRACTS)
osdg = read_data(os.path.join(base_dir, 'osdg_data/osdg-community-data-v2024-01-01.csv'), format='csv', delimiter='\t')

In [8]:
# Add is_abstract dummy to zofa
zofa['is_abstract'] = 1

In [9]:
# Clean OSDG data
osdg['language'] = osdg.text.apply(detect)

# Filter non-english texts out
osdg = osdg[osdg.language == 'en'].copy()

# Naive search for abstracts in the OSDG data
wanted_words = ['abstract', 'this paper', 'this study', 'this article']
osdg['is_abstract'] = [1 if any(word in text.lower() for word in wanted_words) else 0 for text in osdg.text]

In [10]:
# Combine OSDG and ZOFA
df_base = pd.concat([
    zofa[['ABSTRACT', 'SDG', 'is_abstract']].rename(columns={'ABSTRACT': 'text', 'SDG': 'label'}),
    osdg[['text', 'sdg', 'is_abstract']].rename(columns={'sdg': 'label'})
])

In [11]:
# Remove 0 since None class is not part of OSDG
df_base = df_base[df_base.label != 0].copy()

# Recast labels to range 0:17
df_base['label'] -= 1

In [12]:
def synthetic_data_reading(base_dir=os.getcwd()):

    synthetic_data = []

    dir = os.path.join(base_dir, "synthetic_data", "produced_data", "gen_results")

    for folder in os.listdir(dir):
        for data in os.listdir(os.path.join(dir, folder)):
            if data.endswith(".jsonl"):
                df = pd.read_json(os.path.join(dir, folder, data), lines=True)
                synthetic_data.append(df)

    df_synthetic = pd.concat(synthetic_data)
    return df_synthetic

In [13]:
# Load synthetic data
if USE_COLAB:
    df_synth = synthetic_data_reading(base_dir=base_dir)
else:
    df_synth = synthetic_data_reading()

df_synth['is_abstract'] = 0

# Create ZOFA + OSDG + SYNTH DataFrame
df_synth = pd.concat([
    df_base,
    df_synth[['text', 'sdg_id', 'is_abstract']].rename(columns={'sdg_id': 'label'})
])

# DEFINE DATAFRAME TO USE IN SUBSEQUENT STEPS

In [14]:
df = df_base.copy()

In [15]:
df

Unnamed: 0,text,label,is_abstract
3,Evolutionary dynamics of structural genetic va...,13,1
10,Successfully predicting the future states of s...,14,1
11,Poverty remains one of the most pressing probl...,0,1
15,As part of a trans-disciplinary research proje...,5,1
17,Supermarket food sales data might serve as a s...,2,1
...,...,...,...
42629,"It also features individual accountability, wh...",3,0
42630,Since the full capacity is not likely to be ut...,6,0
42631,This article notes the judgment in Sophocleous...,15,1
42632,Groundwater quality can also be affected by co...,5,0


# PREPORCESSING

In [16]:
#NLTK Resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))
sentiment_bearing_stopwords = ['not', 'no', 'nor', 'never', 'yes', 'should', 'could', 'would']
stop_words_without_sentiment = [word for word in stop_words if word not in sentiment_bearing_stopwords]

# spaCY for NER
! python -m spacy download en_core_web_sm
nlp = spacy.load('en_core_web_sm')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m20.5 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [17]:
def remove_urls_and_html_tags(text: str) -> str:
    html_tags_pattern = r'<.*?>'
    text_without_html_tags = re.sub(html_tags_pattern, '', text)
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', text_without_html_tags)

In [18]:
def named_entity_regocnition(text: str) -> list[str]:
    doc = nlp(text)
    return ["".join(ent.text) for ent in doc.ents]

In [19]:
def preprocess_text(text: str) -> str:

    # Lowercasing
    text = text.lower()

    # Removal of urls and html tags
    text = remove_urls_and_html_tags(text)

    #Removal of Numeric values
    text = re.sub(r'\d+', '', text)

    # Tokenization
    tokens = word_tokenize(text)

    # Remove punctuation and non-alphabetic characters
    tokens = [token for token in tokens if token.isalpha()]

    # Selective removal of stopwords
    tokens = [token for token in tokens if token not in stop_words_without_sentiment]

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    return ' '.join(tokens)

In [20]:
def tokenize_text(texts):
    return tokenizer(texts, truncation=True, max_length=512, return_tensors=None)

In [21]:
# Apply custom pre-processing
df['text_clean'] = df.text.apply(preprocess_text)

In [None]:
# Apply huggingface tokenizer
tokenized_output = tokenize_text(df['text_clean'].to_list())

# CREATE TRAIN/TEST SPLIT

In [24]:
def rule_based_train_test_split(
    data: pd.DataFrame,
    label_col: str = 'label',
    test_size: float = 0.2,
    random_state: int | None = None
) -> dict:
    """Creates train-test split that makes sure that at least two abstracts for each id are in the test set."""

    abstract_data = data[data.is_abstract == 1]

    # Randomly sample 2 abstracts per sdg group
    test_a = abstract_data.groupby(label_col).sample(n=1, random_state=random_state)

    # Remove the entries already in the test set from the rest of the data
    data = data[~data.index.isin(test_a.index)].copy()

    # Split the remaining data into train and test
    train, test_b = train_test_split(data, test_size=test_size, random_state=random_state, stratify=data[label_col])

    # Concatenate both test sets and shuffle them again
    test = pd.concat([test_a, test_b]).sample(frac=1).reset_index(drop=True)

    return train, test

In [None]:
df_tokenized = pd.DataFrame({
    'input_ids': list(tokenized_output['input_ids']),
    'attention_mask': list(tokenized_output['attention_mask']),
    'token_type_ids': list(tokenized_output.get('token_type_ids', [[]]*len(df))),
    'label': df['label'].tolist(),
    'is_abstract': df['is_abstract'].to_list()
})

train_df, test_df = rule_based_train_test_split(df_tokenized, random_state=42)

train_dataset = Dataset.from_pandas(train_df[['input_ids', 'attention_mask', 'token_type_ids', 'label']])
test_dataset = Dataset.from_pandas(test_df[['input_ids', 'attention_mask', 'token_type_ids', 'label']])

"\ntrain_df['labels'] = list(label_binarizer.fit_transform(train_df['label'].apply(lambda x: [x])))\ntest_df['labels'] = list(label_binarizer.fit_transform(test_df['label'].apply(lambda x: [x])))\n\ntrain_dataset = Dataset.from_pandas(train_df[['input_ids', 'attention_mask', 'token_type_ids', 'labels']])\ntest_dataset = Dataset.from_pandas(test_df[['input_ids', 'attention_mask', 'token_type_ids', 'labels']])\n"

# FINE-TUNING

For training, use the suggested values from the paper:

In all settings, we apply a dropout of 0.1 and optimize cross entropy loss using Adam (Kingma and Ba, 2015). We finetune for 2 to 5 epochs using a batch size of 32 and a learning rate of 5e-6, 1e-5, 2e-5, or 5e-5 with a slanted triangular schedule (Howard and Ruder, 2018) which is equivalent to the linear warmup followed by linear decay (Devlin et al., 2019). For each dataset and BERT variant, we pick the best learning rate and number of epochs on the development set and report the corresponding test results. We found the setting that works best across most datasets and models is 2 or 4 epochs and a learning rate of 2e-5. While task-dependent, optimal hyperparameters for each task are often the same across BERT variants.

In [None]:
# Multiple class prediction (one prediction)
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = evaluate.load("accuracy")
    f1 = evaluate.load("f1")
    return {
        "accuracy": accuracy.compute(predictions=predictions, references=labels)["accuracy"],
        "f1": f1.compute(predictions=predictions, references=labels, average="weighted")["f1"]
    }

'\n# Multiple label prediction (multiple predictions)\nclf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])\n\ndef sigmoid(x):\n   return 1 / (1 + np.exp(-x))\n\ndef compute_metrics(eval_pred):\n   predictions, labels = eval_pred\n   predictions = sigmoid(predictions)\n   predictions = (predictions > 0.5).astype(int).reshape(-1)\n   return clf_metrics.compute(predictions=predictions, references=labels.astype(int).reshape(-1))\n'

In [None]:
# Training arguments
training_args = TrainingArguments(
    output_dir=os.path.join(base_dir, 'models/results'),
    num_train_epochs=2,  # As best setting suggested 2 or 4
    warmup_steps=500,  # Slanted triangular schedule start
    learning_rate=2e-5,  # Best learning rate as suggested in the paper
    weight_decay=0.01,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    lr_scheduler_type='linear',  # Corresponds to linear warmup followed by linear decay
)

In [None]:
# Data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Adam Optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

# Multiple class Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, None)
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [None]:
# Start training
torch.cuda.empty_cache()
trainer.train()

Epoch,Training Loss,Validation Loss


In [None]:
# Evaluation
results = trainer.evaluate()
print(results)

In [None]:
# Saving the model
model_path = os.path.join(base_dir, 'models/scibert_model_base')
trainer.save_model(model_path)

# Saving the tokenizer associated with the model
tokenizer.save_pretrained(model_path)

In [None]:
# Load the trained model
model = AutoModelForSequenceClassification.from_pretrained(os.path.join(base_dir, 'models/scibert_model_base'))

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(os.path.join(base_dir, 'models/scibert_model_base'))

# Create a prediction pipeline
nlp = pipeline("text-classification", model=model, tokenizer=tokenizer)

In [None]:
nlp('evolutionary dynamic structural genetic variation lineage hybrid origin not well explored although structural mutation may increase controlled hybrid cross therefore tested whether structural variant accumulate fish recent hybrid origin invasive cottus relative parental specie cottus rhenanus cottus perifretum variation exon gene assessed using comparative genome hybridization array twelve gene showed significantly higher copy number invasive cottus compared parent coincided increased expression three gene related vision detoxification muscle development suggesting possible gene dosage effect copy number increase putative transposon assessed comparative mapping genomic dna read de novo assembly repetitive element contrast exon copy number increase repetitive element common invasive cottus whereas decrease rare among increased repetitive element occurred higher number perifretum compared rhenanus abundant rhenanus implies biased mutational process amplifies genetic material one ancestor ass frequency de novo mutation hybridization screened f offspring parental specie change five candidate locus found no evidence new structural variant indicating rare detected given sampling scheme instead must accumulated generation observed controlled cross')

# FastFit PIPELINE

In [22]:
from datasets import load_dataset
from transformers import RobertaTokenizer
from fastfit import FastFitTrainer, sample_dataset

In [26]:
df['label'] = df.label.astype(str)

train_df, temp_df = rule_based_train_test_split(df, random_state=42, test_size=0.3)

val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Select only the 'text' and 'sdg' columns for the final datasets
train_df = train_df[['text_clean', 'label']]
val_df = val_df[['text_clean', 'label']]
test_df = test_df[['text_clean', 'label']]

# Saving the datasets to CSV files
train_df.to_csv(f'{base_dir}/train_data.csv', index=False)
val_df.to_csv(f'{base_dir}/val_data.csv', index=False)
test_df.to_csv(f'{base_dir}/test_data.csv', index=False)

In [30]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 29922 entries, 18217 to 28974
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   text_clean  29922 non-null  object
 1   label       29922 non-null  object
dtypes: object(2)
memory usage: 701.3+ KB


In [None]:
# FastFit tokenizes the labels, does not like integer labels :(
    # transforming the label to text ones. wasted too much time on this for no reason probably.
import csv

NUMBER_MAPPINGS={
    0:"zero",
    1:"one",
    2:"two",
    3:"three",
    4:"four",
    5:"five",
    6:"six",
    7:"seven",
    8:"eight",
    9:"nine",
    10:"ten",
    11:"eleven",
    12:"twelve",
    13:"thirteen",
    14:"fourteen",
    15:"fifteen",
    16:"sixteen",
    17:"seventeen"
}

def convert_integers_to_strings(file_path, output_path):
    with open(file_path, mode='r', newline='') as file:
        reader = csv.reader(file)
        data = [row for row in reader]

    # Change integers in the second column to strings
    for row in data:
        if row and len(row) > 1 and row[1] != 'sdg':
            try:
                # Convert the second column to string if it's an integer
                row[1] = NUMBER_MAPPINGS[int(row[1])]
            except ValueError:
                # If it's not an integer, do nothing
                pass

    # Write the updated data to a new CSV file
    with open(output_path, mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerows(data)


for path in ['train_data.csv', 'val_data.csv', 'test_data.csv']:
    convert_integers_to_strings(path,path)

In [32]:
# Load your dataset from the CSV files
dataset = load_dataset('csv', data_files={
    'train': f'{base_dir}/train_data.csv',
    'validation': f'{base_dir}/val_data.csv',
    'test': f'{base_dir}/test_data.csv'
})


# Initialize the FastFit trainer with correct column names and paths
trainer = FastFitTrainer(
    model_name_or_path="allenai/scibert_scivocab_cased",
    label_column_name="label",
    text_column_name="text_clean",
    num_train_epochs=40,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    max_text_length=128,
    dataloader_drop_last=False,
    num_repeats=4,
    optim="adafactor",
    clf_loss_factor=0.1,
    fp16=True,
    dataset=dataset,
)

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

ValueError: FP16 Mixed precision training with AMP or APEX (`--fp16`) and FP16 half precision evaluation (`--fp16_full_eval`) can only be used on CUDA or NPU devices or certain XPU devices (with IPEX).

In [None]:
model = trainer.train()

In [None]:
results = trainer.evaluate()
print("Accuracy: {:.1f}%".format(results["eval_accuracy"] * 100))

In [None]:
model.save_pretrained("drive/MyDrive/scibert_all")

# Test Multi Label Prediction (not working right now)

In [None]:
"""
from sklearn.preprocessing import MultiLabelBinarizer

# Example labels (adjust according to your label format)
label_binarizer = MultiLabelBinarizer(classes=range(1, 18))
"""

In [None]:
"""
train_df['labels'] = list(label_binarizer.fit_transform(train_df['label'].apply(lambda x: [x])))
test_df['labels'] = list(label_binarizer.fit_transform(test_df['label'].apply(lambda x: [x])))

train_dataset = Dataset.from_pandas(train_df[['input_ids', 'attention_mask', 'token_type_ids', 'labels']])
test_dataset = Dataset.from_pandas(test_df[['input_ids', 'attention_mask', 'token_type_ids', 'labels']])
"""

In [None]:
"""
train_dataset.set_format("torch")
train_dataset = (train_dataset
          .map(lambda x : {"float_labels": x["labels"].to(torch.float)}, remove_columns=["labels"])
          .rename_column("float_labels", "labels"))

test_dataset.set_format("torch")
test_dataset = (test_dataset
          .map(lambda x : {"float_labels": x["labels"].to(torch.float)}, remove_columns=["labels"])
          .rename_column("float_labels", "labels"))
"""

In [None]:
"""
# Multiple label prediction (multiple predictions)
clf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])

def sigmoid(x):
   return 1 / (1 + np.exp(-x))

def compute_metrics(eval_pred):
   predictions, labels = eval_pred
   predictions = sigmoid(predictions)
   predictions = (predictions > 0.5).astype(int).reshape(-1)
   return clf_metrics.compute(predictions=predictions, references=labels.astype(int).reshape(-1))
"""

In [None]:
"""
from torch.nn import BCEWithLogitsLoss

class MultiClassTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("labels").float()  # Ensure labels are floating-point for BCEWithLogitsLoss
        outputs = model(**inputs)
        logits = outputs.logits  # Assuming model outputs have logits as an attribute

        # Define the loss function
        loss_fct = BCEWithLogitsLoss()
        loss = loss_fct(logits, labels)

        return (loss, outputs) if return_outputs else loss

# Multiple label Trainer
trainer = MultiClassTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, None)
)
"""