In [251]:
# %pip install pandas spacy transformers seqeval torch scikit-learn hf_xet
# %python -m spacy download en_core_web_sm

In [252]:
import csv
import pandas as pd
import spacy
import torch
from torch.utils.data import Dataset, DataLoader
from collections import Counter
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
import torch.nn.functional as F
from transformers import BertTokenizerFast, BertModel

nlp = spacy.load("en_core_web_sm")

In [253]:
articles_df = pd.DataFrame()
sectors = ["finance", "healthcare", "tech"]
triplets_list = []
for sector in sectors:
    articles = pd.read_csv(f"globenewswire_articles_{sector}.csv")
    articles_df = pd.concat([articles_df, articles], ignore_index=True)

    with open(f"{sector}_articles_triplets.csv", "r", encoding="utf-8") as f:
        reader = csv.reader(f, delimiter=',', quotechar='"')
        header = next(reader)
        
        for row in reader:
            if not row:
                continue
            
            url = row[0]
            triplet_fields = []
            
            for field in row[1:]:
                if field.strip():
                    str_tuple = field.strip()
                    if str_tuple.startswith('(') and str_tuple.endswith(')'):
                        inner_str = str_tuple[1:-1]
                        elements = [elem.strip() for elem in inner_str.split(',')]
                        triplet_fields.append(tuple(elements))
                    else:
                        triplet_fields.append((str_tuple,))

            triplets_list.append({"url": url, "triplets": triplet_fields})

triplets_df = pd.DataFrame(triplets_list)

In [254]:
def tokenize_text(text):
    return [token.text for token in nlp(text)]

In [255]:
def get_bio_tags(text, spo_list):
    tokens = tokenize_text(text)
    tags = ['O'] * len(tokens)

    for spo in spo_list:
        try:
            subject, predicate, obj = spo
            spans = {
                'SUB': subject.split(),
                'PRED': predicate.split(),
                'OBJ': obj.split()
            }

            for label, span_tokens in spans.items():
                for i in range(len(tokens) - len(span_tokens) + 1):
                    if tokens[i:i+len(span_tokens)] == span_tokens:
                        tags[i] = f'B-{label}'
                        for j in range(1, len(span_tokens)):
                            tags[i + j] = f'I-{label}'
                        break
        except Exception as e:
            continue

    return tokens, tags

In [256]:
dataset = []

for idx, row in articles_df.iterrows():
    url = row['url']
    content = row['content']

    matching_triplets_row = triplets_df[triplets_df['url'] == url]
    if matching_triplets_row.empty:
        continue

    tokens, tags = get_bio_tags(content, triplets_df.iloc[idx, 1])
    dataset.append((tokens, tags))

In [257]:
tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased")

# Unique BIO tags
tag_values = ['O', 'B-SUB', 'I-SUB', 'B-PRED', 'I-PRED', 'B-OBJ', 'I-OBJ']
tag2id = {tag: i for i, tag in enumerate(tag_values)}
id2tag = {i: tag for tag, i in tag2id.items()}

In [258]:
class SPOBioDataset(Dataset):
    def __init__(self, data, tokenizer, tag2id, max_len=512):
        self.data = data
        self.tokenizer = tokenizer
        self.tag2id = tag2id
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        tokens, tags = self.data[idx]

        tokenized_input = self.tokenizer(tokens,
                                        is_split_into_words=True,
                                        padding='max_length',
                                        truncation=True,
                                        max_length=self.max_len,
                                        return_tensors="pt")

        word_ids = tokenized_input.word_ids(batch_index=0)
        label_ids = []

        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            else:
                label_ids.append(tag2id.get(tags[word_idx], tag2id['O']))

        return {
            'input_ids': tokenized_input['input_ids'].squeeze(),
            'attention_mask': tokenized_input['attention_mask'].squeeze(),
            'labels': torch.tensor(label_ids)
        }

In [259]:
train_data, val_data = train_test_split(dataset, test_size=0.1, random_state=42)

train_dataset = SPOBioDataset(train_data, tokenizer, tag2id)
val_dataset = SPOBioDataset(val_data, tokenizer, tag2id)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)

# Training

In [260]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class BERTTagger(nn.Module):
    def __init__(self, tag2id):
        super(BERTTagger, self).__init__()
        self.bert = BertModel.from_pretrained("bert-base-cased")
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(self.bert.config.hidden_size, len(tag2id))

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = self.dropout(outputs.last_hidden_state)
        logits = self.classifier(sequence_output)
        return logits

def compute_class_weights(dataset, tag2id):
    tag_counts = Counter(tag for _, tags in dataset for tag in tags)
    total = sum(tag_counts.values())
    weights = [1.0 - (tag_counts[tag] / total) for tag in tag2id.keys()]
    print(weights)
    # weights[0] += 0.14
    weights[0] += 0.1
    print(weights)
    return torch.tensor(weights).to(device)

In [261]:
model = BERTTagger(tag2id).to(device)
optimizer = optim.AdamW(model.parameters(), lr=5e-5)
weights = compute_class_weights(train_data, tag2id)
loss_func = nn.CrossEntropyLoss(ignore_index=-100, weight=weights)

[0.020662489170075182, 0.9973756321653805, 0.9974108923858073, 0.9967812455924725, 0.9991688662327981, 0.9950937921863352, 0.9935070822671315]
[0.12066248917007519, 0.9973756321653805, 0.9974108923858073, 0.9967812455924725, 0.9991688662327981, 0.9950937921863352, 0.9935070822671315]


In [262]:
EPOCHS = 10

for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    
    for batch in train_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        optimizer.zero_grad()
        logits = model(input_ids, attention_mask)

        loss = loss_func(logits.view(-1, len(tag2id)), labels.view(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1} | Loss: {avg_loss:.4f}")


Epoch 1 | Loss: 0.9002
Epoch 2 | Loss: 0.5746
Epoch 3 | Loss: 0.4213
Epoch 4 | Loss: 0.3052
Epoch 5 | Loss: 0.2129
Epoch 6 | Loss: 0.1574
Epoch 7 | Loss: 0.1156
Epoch 8 | Loss: 0.0843
Epoch 9 | Loss: 0.0741
Epoch 10 | Loss: 0.0615


# Inference

In [263]:
@torch.no_grad()
def predict_bio_tags(text, model, tokenizer, id2tag, device):
    model.eval()
    tokens = tokenize_text(text)

    tokenized_input = tokenizer(tokens,
                                is_split_into_words=True,
                                return_tensors="pt",
                                truncation=True,
                                padding="max_length",
                                max_length=512)

    input_ids = tokenized_input["input_ids"].to(device)
    attention_mask = tokenized_input["attention_mask"].to(device)

    logits = model(input_ids=input_ids, attention_mask=attention_mask)
    predictions = torch.argmax(logits, dim=2)

    word_ids = tokenized_input.word_ids(batch_index=0)
    predicted_tags = []

    for idx, word_idx in enumerate(word_ids):
        if word_idx is None:
            continue
        tag_id = predictions[0][idx].item()
        tag = id2tag[tag_id]
        if idx == 0 or word_ids[idx] != word_ids[idx - 1]:
            predicted_tags.append((tokens[word_idx], tag))

    return predicted_tags

def extract_and_form_triplets(text, tagged_tokens):

    # Step 1: Extract spans from BIO-tagged tokens
    spans = {'SUB': [], 'PRED': [], 'OBJ': []}
    current_span = []
    current_label = None

    for token, tag in tagged_tokens:
        if tag == 'O':
            if current_span and current_label:
                spans[current_label].append(" ".join(current_span))
            current_span = []
            current_label = None
        elif tag.startswith('B-'):
            if current_span and current_label:
                spans[current_label].append(" ".join(current_span))
            current_label = tag[2:]
            current_span = [token]
        elif tag.startswith('I-') and current_label == tag[2:]:
            current_span.append(token)
        else:
            if current_span and current_label:
                spans[current_label].append(" ".join(current_span))
            current_span = []
            current_label = None

    if current_span and current_label:
        spans[current_label].append(" ".join(current_span))

    # Step 2: Filter out short or lowercase-only spans
    # def filter_spans(spans):
    #     def is_valid(span):
    #         return len(span.split()) > 1 or span[0].isupper()

    #     return {
    #         k: [s for s in v if is_valid(s)] for k, v in spans.items()
    #     }

    # filtered_spans = filter_spans(spans)

    filtered_spans = spans

    # Step 3: Match spans within same sentence
    doc = nlp(text)
    triplets = []

    for sent in doc.sents:
        sent_text = sent.text
        subjs = [s for s in filtered_spans["SUB"] if s in sent_text]
        preds = [p for p in filtered_spans["PRED"] if p in sent_text]
        objs = [o for o in filtered_spans["OBJ"] if o in sent_text]

        for s in subjs:
            for p in preds:
                for o in objs:
                    triplets.append((s, p, o))

    return list(set(triplets))

In [264]:
sample_text = articles_df.iloc[0]["content"]

# doc = nlp(sample_text)
# with open("temp1.txt", "w") as f:
#     for sent in doc.sents:
#         sent_text = sent.text
#         print(sent_text)
#         f.write(sent_text)
#         f.write("\n")
# sample_text = "SALT LAKE CITY, March  12, 2025  (GLOBE NEWSWIRE) -- Clene, Inc. (Nasdaq: CLNN) and its subsidiary, Clene Nanomedicine, Inc., a clinical-stage biopharmaceutical company focused on revolutionizing the treatment of neurodegenerative diseases, including amyotrophic lateral sclerosis (ALS) and multiple sclerosis (MS), today announced new evidence from a cross-regimen, post hoc analysis of long-term survival in HEALEY ALS Platform Trial participants. The analyses further substantiate that treatment with CNM-Au8¬Æ 30 mg delivers a significant survival benefit for people living with ALS. New Survival AnalysesThe analyses compared survival in participants who received CNM-Au8 30 mg (Regimen C) to those of Regimen A in the HEALEY ALS Platform Trial. Regimen A provided a large concurrent control group vs. CNM-Au8 treatment using the same randomization criteria established within the HEALEY master protocol. Long-term survival status, determined through public records and site reporting, was evaluated over a follow-up period of up to 48 months. 78% of participants across both groups received standard ALS background therapy (riluzole, edaravone, or both) at baseline. These results are consistent with previous survival benefits observed in the HEALEY ALS Platform Trial‚Äôs 24-week double-blind period, the open-label extension of the Phase 2 RESCUE-ALS trial, and analyses of Expanded Access Programs compared to ALS natural history controls. ‚ÄúWe are highly encouraged by these results, as the significant survival advantage demonstrated by CNM-Au8 not only reinforces its potential to extend life for people living with ALS but also validates our strategic direction as we prepare for the launch of our confirmatory Phase 3 RESTORE-ALS study in mid-2025,‚Äù stated Rob Etherington, President and CEO of Clene. ‚ÄúWe look forward to discussing these findings with the FDA as we advance toward commercialization.‚Äù Merit Cudkowicz, M.D., M.S.c., Principal Investigator and sponsor of the HEALEY ALS Platform Trial, director of the Sean M. Healey & AMG Center for ALS, and executive director of the Mass General Brigham Neuroscience Institute, said, ‚ÄúThe innovative design of the HEALEY ALS Platform Trial has enabled us to extract clear and meaningful survival data that helps make decisions about CNM-Au8 drug development.‚Äù About Regimen ARegimen A was one of the first three regimens investigated in the HEALEY ALS Platform Trial. Eligible participants were randomized in a 3:1 ratio to receive active treatment or matching placebo for a planned duration of 24 weeks. Participants assigned to Regimen A had to receive both quadrivalent and serotype B meningococcal vaccinations at least 14 days prior to the first dose of study drug, and participants were excluded from Regimen A if they had a history of meningococcal disease or prior treatment with a complement inhibitor. Regimen A was stopped prematurely for futility after all participants had been randomized, and approximately 70% had completed the Week 24 visit. Participants were instructed to discontinue study dosing, and a final early termination study visit was conducted. Long-term survival status of Regimen A participants was tracked from public records and site reporting independently of the early termination. There was no difference in long-term survival in participants randomized to Regimen A active compared to Regimen A placebo, supporting the combined analyses of the entire Regimen A population for comparisons of long-term survival to CNM-Au8 30 mg (Regimen C) participants. About CleneClene Inc., (Nasdaq: CLNN) (along with its subsidiaries, ‚ÄúClene‚Äù and its wholly owned subsidiary Clene Nanomedicine, Inc.), is a late clinical-stage biopharmaceutical company focused on improving mitochondrial health and protecting neuronal function to treat neurodegenerative diseases, including amyotrophic lateral sclerosis, Parkinson‚Äôs disease, and multiple sclerosis. CNM-Au8¬Æ is an investigational first-in-class therapy that improves central nervous system cells‚Äô survival and function via a mechanism that targets mitochondrial function and the NAD pathway while reducing oxidative stress. CNM-Au8¬Æ is a federally registered trademark of Clene Nanomedicine, Inc. The company is based in Salt Lake City, Utah, with R&D and manufacturing operations in Maryland. For more information, please visit www.clene.com or follow us on X (formerly Twitter) and LinkedIn. About CNM-Au8¬ÆCNM-Au8 is an oral suspension of gold nanocrystals developed to restore neuronal health and function by increasing energy production and utilization. The catalytically active nanocrystals of CNM-Au8 drive critical cellular energy producing reactions that enable neuroprotection and remyelination by increasing neuronal and glial resilience to disease-relevant stressors. CNM-Au8¬Æ is a federally registered trademark of Clene Nanomedicine, Inc. About RESTORE-ALSRESTORE-ALS is a Phase 3 confirmatory global, multi-center, randomized, double-blind, parallel group, placebo-controlled study to evaluate the efficacy, safety, pharmacodynamics, and pharmacokinetics of CNM-Au8 in participants diagnosed with ALS on stable background therapy. The study is designed to investigate the effects of CNM-Au8 on improved survival (primary endpoint) and delayed time to ALS clinical worsening events (secondary efficacy endpoint).¬†Participants will be randomized in a 2:1 ratio to receive either active treatment with CNM-Au8 30 mg or matched placebo daily during the 108-week double-blind treatment period. The Phase 3 RESTORE-ALS clinical trial, due to launch in mid-2025, is planned to serve as the confirmatory clinical trial required to meet the FDA‚Äôs guidance for an ‚Äúunderway‚Äù clinical trial when a New Drug Application requesting Accelerated Approval is submitted. Forward-Looking StatementsThis press release contains ‚Äúforward-looking statements‚Äù within the meaning of Section 21E of the Securities Exchange Act of 1934, as amended, and Section 27A of the Securities Act of 1933, as amended, which are intended to be covered by the ‚Äúsafe harbor‚Äù provisions created by those laws. Clene‚Äôs forward-looking statements include, but are not limited to, statements regarding our or our management team‚Äôs expectations, hopes, beliefs, intentions or strategies regarding our future operations. In addition, any statements that refer to projections, forecasts or other characterizations of future events or circumstances, including any underlying assumptions, are forward-looking statements. The words ‚Äúanticipate,‚Äù ‚Äúbelieve,‚Äù ‚Äúcontemplate,‚Äù ‚Äúcontinue,‚Äù ‚Äúestimate,‚Äù ‚Äúexpect,‚Äù ‚Äúintends,‚Äù ‚Äúmay,‚Äù ‚Äúmight,‚Äù ‚Äúplan,‚Äù ‚Äúpossible,‚Äù ‚Äúpotential,‚Äù ‚Äúpredict,‚Äù ‚Äúproject,‚Äù ‚Äúshould,‚Äù ‚Äúwill,‚Äù ‚Äúwould,‚Äù and similar expressions may identify forward-looking statements, but the absence of these words does not mean that a statement is not forward-looking. These forward-looking statements represent our views as of the date of this press release and involve a number of judgments, risks and uncertainties. We anticipate that subsequent events and developments will cause our views to change. We undertake no obligation to update forward-looking statements to reflect events or circumstances after the date they were made, whether as a result of new information, future events or otherwise, except as may be required under applicable securities laws. Accordingly, forward-looking statements should not be relied upon as representing our views as of any subsequent date. As a result of a number of known and unknown risks and uncertainties, our actual results or performance may be materially different from those expressed or implied by these forward-looking statements. Some factors that could cause actual results to differ include our ability to demonstrate the efficacy and safety of our drug candidates; the clinical results for our drug candidates, which may not support further development or marketing approval; actions of regulatory agencies, which may affect the initiation, timing and progress of clinical trials and marketing approval; our ability to achieve commercial success for our drug candidates, if approved; our limited operating history and our ability to obtain additional funding for operations and to complete the development and commercialization of our drug candidates; and other risks and uncertainties set forth in ‚ÄúRisk Factors‚Äù in our most recent Annual Report on Form 10-K and any subsequent Quarterly Reports on Form 10-Q. In addition, statements that ‚Äúwe believe‚Äù and similar statements reflect our beliefs and opinions on the relevant subject. These statements are based upon information available to us as of the date of this press release, and while we believe such information forms a reasonable basis for such statements, such information may be limited or incomplete, and our statements should not be read to indicate that we have conducted an exhaustive inquiry into, or review of, all potentially available relevant information. These statements are inherently uncertain and you are cautioned not to rely unduly upon these statements. All information in this press release is as of the date of this press release. The information contained in any website referenced herein is not, and shall not be deemed to be, part of or incorporated into this press release."

tagged = predict_bio_tags(sample_text, model, tokenizer, id2tag, device)
# for t in tagged:
#     if(t[1] != "O"):
#         print(t, end=", ")
# print()
triplets = extract_and_form_triplets(sample_text, tagged)

# with open("temp.txt", "w") as f:
#     for t in triplets:
#         f.write(str(t))
#         f.write("\n")

print("Predicted Triplets:")
for t in triplets:
    print(t)

Predicted Triplets:
('PharmNovo', 'is', 'Karolinska Development AB')
('Viktor Drvota', 'said', 'FDA')
('PharmNovo', 'received', 'Phase')
('PharmNovo', 'announces', 'FDA')
('PharmNovo', 'announces', 'feedback')
('PharmNovo', 'is', 'Phase')
('PharmNovo', 'said', 'FDA')
('PharmNovo', 'announces', 'Karolinska Development AB')
('Viktor Drvota', 'said', 'Phase')
('PharmNovo', 'conducted', 'FDA')
('PharmNovo', 'plans to apply', 'FDA')
('Viktor Drvota', 'said', 'CEO')
('PharmNovo', 'plans to apply', 'feedback')
('PharmNovo', 'received', 'FDA')
('PharmNovo', 'said', 'Phase')
('PharmNovo', 'received', 'feedback')
('PharmNovo', 'said', 'CEO')
('PharmNovo', 'received', 'Karolinska Development AB')
('PharmNovo', 'is', 'FDA')
('PharmNovo', 'is', 'feedback')
