In [79]:
%pip install pandas spacy transformers seqeval torch scikit-learn hf_xet
# %python -m spacy download en_core_web_sm

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
55729.52s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


Note: you may need to restart the kernel to use updated packages.


In [80]:
import csv
import pandas as pd
import spacy
import torch
from torch.utils.data import Dataset, DataLoader
import json
from collections import Counter
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
import torch.nn.functional as F
from transformers import BertTokenizerFast, BertModel

nlp = spacy.load("en_core_web_sm")

In [81]:
articles_df = pd.read_csv("globenewswire_articles_finance.csv")
triplets_list = []

with open("finance_articles_triplets.csv", "r", encoding="utf-8") as f:
    reader = csv.reader(f, delimiter=',', quotechar='"')
    header = next(reader)
    
    for row in reader:
        if not row:
            continue
        
        url = row[0]
        triplet_fields = []
        
        for field in row[1:]:
            if field.strip():
                str_tuple = field.strip()
                if str_tuple.startswith('(') and str_tuple.endswith(')'):
                    inner_str = str_tuple[1:-1]
                    elements = [elem.strip() for elem in inner_str.split(',')]
                    triplet_fields.append(tuple(elements))
                else:
                    triplet_fields.append((str_tuple,))

        triplets_list.append({"url": url, "triplets": triplet_fields})

triplets_df = pd.DataFrame(triplets_list)
merged_df = pd.merge(triplets_df, articles_df[["url", "content"]], on="url", how="inner")

In [82]:
def tokenize_text(text):
    return [token.text for token in nlp(text)]

In [83]:
def get_bio_tags(text, spo_list):
    tokens = tokenize_text(text)
    tags = ['O'] * len(tokens)

    for spo in spo_list:
        try:
            subject, predicate, obj = spo
            spans = {
                'SUB': subject.split(),
                'PRED': predicate.split(),
                'OBJ': obj.split()
            }

            for label, span_tokens in spans.items():
                for i in range(len(tokens) - len(span_tokens) + 1):
                    if tokens[i:i+len(span_tokens)] == span_tokens:
                        tags[i] = f'B-{label}'
                        for j in range(1, len(span_tokens)):
                            tags[i + j] = f'I-{label}'
                        break
        except Exception as e:
            continue

    return tokens, tags

In [84]:
dataset = []

for idx, row in articles_df.iterrows():
    url = row['url']
    content = row['content']

    matching_triplets_row = triplets_df[triplets_df['url'] == url]
    if matching_triplets_row.empty:
        continue

    tokens, tags = get_bio_tags(content, triplets_df.iloc[idx, 1])
    dataset.append((tokens, tags))

In [85]:
tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased")

# Unique BIO tags
tag_values = ['O', 'B-SUB', 'I-SUB', 'B-PRED', 'I-PRED', 'B-OBJ', 'I-OBJ']
tag2id = {tag: i for i, tag in enumerate(tag_values)}
id2tag = {i: tag for tag, i in tag2id.items()}

In [86]:
class SPOBioDataset(Dataset):
    def __init__(self, data, tokenizer, tag2id, max_len=512):
        self.data = data
        self.tokenizer = tokenizer
        self.tag2id = tag2id
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        tokens, tags = self.data[idx]

        tokenized_input = self.tokenizer(tokens,
                                        is_split_into_words=True,
                                        padding='max_length',
                                        truncation=True,
                                        max_length=self.max_len,
                                        return_tensors="pt")

        word_ids = tokenized_input.word_ids(batch_index=0)
        label_ids = []

        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            else:
                label_ids.append(tag2id.get(tags[word_idx], tag2id['O']))

        return {
            'input_ids': tokenized_input['input_ids'].squeeze(),
            'attention_mask': tokenized_input['attention_mask'].squeeze(),
            'labels': torch.tensor(label_ids)
        }

In [87]:
train_data, val_data = train_test_split(dataset, test_size=0.1, random_state=42)

train_dataset = SPOBioDataset(train_data, tokenizer, tag2id)
val_dataset = SPOBioDataset(val_data, tokenizer, tag2id)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)

# Training

In [92]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class BERTTagger(nn.Module):
    def __init__(self, tag2id):
        super(BERTTagger, self).__init__()
        self.bert = BertModel.from_pretrained("bert-base-cased")
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(self.bert.config.hidden_size, len(tag2id))

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = self.dropout(outputs.last_hidden_state)
        logits = self.classifier(sequence_output)
        return logits

def compute_class_weights(dataset, tag2id):
    tag_counts = Counter(tag for _, tags in dataset for tag in tags)
    total = sum(tag_counts.values())
    weights = [1.0 - (tag_counts[tag] / total) for tag in tag2id.keys()]
    print(weights)
    weights[0] += 0.1
    print(weights)
    return torch.tensor(weights).to(device)

In [93]:
model = BERTTagger(tag2id).to(device)
optimizer = optim.AdamW(model.parameters(), lr=5e-5)
weights = compute_class_weights(train_data, tag2id)
loss_func = nn.CrossEntropyLoss(ignore_index=-100, weight=weights)

[0.01763469908449644, 0.9976737205463004, 0.9973735554555005, 0.9966231427285007, 0.9992245735154335, 0.9959727850317674, 0.9954975236380009]
[0.11763469908449645, 0.9976737205463004, 0.9973735554555005, 0.9966231427285007, 0.9992245735154335, 0.9959727850317674, 0.9954975236380009]


In [94]:
EPOCHS = 10

for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    
    for batch in train_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        optimizer.zero_grad()
        logits = model(input_ids, attention_mask)

        loss = loss_func(logits.view(-1, len(tag2id)), labels.view(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1} | Loss: {avg_loss:.4f}")


Epoch 1 | Loss: 1.0931
Epoch 2 | Loss: 0.7601
Epoch 3 | Loss: 0.5948
Epoch 4 | Loss: 0.4629
Epoch 5 | Loss: 0.3518
Epoch 6 | Loss: 0.2550
Epoch 7 | Loss: 0.2307
Epoch 8 | Loss: 0.1814
Epoch 9 | Loss: 0.1420
Epoch 10 | Loss: 0.0996


# Inference

In [95]:
@torch.no_grad()
def predict_bio_tags(text, model, tokenizer, tag2id, id2tag, device):
    model.eval()
    tokens = tokenize_text(text)

    tokenized_input = tokenizer(tokens,
                                is_split_into_words=True,
                                return_tensors="pt",
                                truncation=True,
                                padding="max_length",
                                max_length=512)

    input_ids = tokenized_input["input_ids"].to(device)
    attention_mask = tokenized_input["attention_mask"].to(device)

    logits = model(input_ids=input_ids, attention_mask=attention_mask)
    predictions = torch.argmax(logits, dim=2)

    word_ids = tokenized_input.word_ids(batch_index=0)
    predicted_tags = []

    for idx, word_idx in enumerate(word_ids):
        if word_idx is None:
            continue
        tag_id = predictions[0][idx].item()
        tag = id2tag[tag_id]
        predicted_tags.append((tokens[word_idx], tag))

    return predicted_tags

def extract_spans(tagged_tokens):
    spans = {'SUB': [], 'PRED': [], 'OBJ': []}
    current_span = []
    current_label = None

    for token, tag in tagged_tokens:
        if tag == 'O':
            if current_span and current_label:
                spans[current_label].append(" ".join(current_span))
                current_span = []
                current_label = None
        elif tag.startswith('B-'):
            if current_span and current_label:
                spans[current_label].append(" ".join(current_span))
            current_label = tag[2:]
            current_span = [token]
        elif tag.startswith('I-') and current_label == tag[2:]:
            current_span.append(token)
        else:
            if current_span and current_label:
                spans[current_label].append(" ".join(current_span))
            current_span = []
            current_label = None

    # Final check
    if current_span and current_label:
        spans[current_label].append(" ".join(current_span))

    return spans

def form_triplets_from_spans(spans):
    subs = spans['SUB']
    preds = spans['PRED']
    objs = spans['OBJ']

    triplets = []
    for s in subs:
        for p in preds:
            for o in objs:
                triplets.append((s, p, o))

    return triplets
sample_text = articles_df.iloc[0]["content"]

print(sample_text)

tagged = predict_bio_tags(sample_text, model, tokenizer, tag2id, id2tag, device)
spans = extract_spans(tagged)
triplets = form_triplets_from_spans(spans)

print("Predicted Triplets:")
for t in triplets:
    print(t)

STOCKHOLM, SWEDEN – March 12, 2025. Karolinska Development AB (Nasdaq Stockholm: KDEV) today announces that portfolio company PharmNovo received positive feedback regarding its most advanced drug candidate, PN6047, in a pre-IND meeting with the U.S. Food and Drug Administration (FDA). The meeting aimed to provide guidance on the design of the company's planned Phase 2a clinical trial for the treatment of peripheral neuropathy and allodynia. PharmNovo conducted a regulatory pre-IND Type B meeting with the FDA in January 2025. Such meetings are typically held before submitting an Investigational New Drug (IND) application, which is required to conduct clinical studies in the U.S. During the meeting, PharmNovo presented preclinical data, sought advice on its Chemistry, Manufacturing, and Controls (CMC) activities, and received guidance on the design of its planned Phase 2a study for PN6047. FDA did not direct any negative remarks neither concerning the CMC information nor the preclinical 