# Introduction

This notebook is an attempt to perform a literature validation of the top k scoring pathways using Natural Language Processing (NLP) with BioBERT. This is done to proof that the DRW based scoring algorithms provide more biologically correct results than the EG scoring method.

It is still to be decided if this will be used in the BOO report as this might be subjected to bias.

## Libs

In [1]:
from transformers import AutoTokenizer, AutoModel
import torch
import pandas as pd
import os
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from Bio import Entrez
from tqdm import tqdm

In [2]:
tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.1")
model = AutoModel.from_pretrained("dmis-lab/biobert-base-cased-v1.1")
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(28996, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

## Functions

In [3]:
def get_embedding(text):
    if not isinstance(text, str) or text.strip() == "":
        return np.zeros(768)
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512  # <-- belangrijk
    )
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

def cosine_sim(text1, text2):
    return cosine_similarity([get_embedding(text1)], [get_embedding(text2)])[0][0]

def fetch_abstracts(query, retmax=50):
    """This function ..."""
    handle = Entrez.esearch(db="pubmed", term=query, retmax=retmax)
    record = Entrez.read(handle)
    ids = record["IdList"]
    handle = Entrez.efetch(db="pubmed", id=ids, rettype="abstract", retmode="text")
    return handle.read()

def fetch_pubmed_abstracts(query, max_results=5):
    try:
        search = Entrez.esearch(db="pubmed", term=query, retmax=max_results)
        record = Entrez.read(search)
        ids = record["IdList"]
        if not ids:
            return []
        fetch = Entrez.efetch(db="pubmed", id=ids, rettype="abstract", retmode="text")
        abstracts = fetch.read().split("\n\n")
        return [ab.strip() for ab in abstracts if ab.strip()]
    except Exception as e:
        print(f"PubMed query failed for '{query}': {e}")
        return []

# RPTEC

In [4]:
data_dir = "C:/Users/semde/Documents/BOO_Scripts/Data/RPTEC_TXG-MAPr"

In [5]:
df_eg_RPTEC = pd.read_csv(os.path.join(data_dir, "eg_joined_RPTEC.csv"))
df_drw_RPTEC = pd.read_csv(os.path.join(data_dir, "drw_joined_RPTEC.csv"))

## Eigengene scoring

### Preprocessing

In [6]:
df_eg_RPTEC = df_eg_RPTEC[["sample_id", "abs_eg_score", "module_number", "annotation"]]

In [7]:
df_eg_RPTEC = df_eg_RPTEC.sort_values(by="abs_eg_score", ascending=False)
print(df_eg_RPTEC.head())

                                          sample_id  abs_eg_score  \
2487   LU_HRPTECTERT1_SINGLE_ARISTOLOCHICACID_T3_C2      9.060121   
53575       LU_HRPTECTERT1_SINGLE_LEADACETATE_T3_C3      8.701595   
2547   LU_HRPTECTERT1_SINGLE_ARISTOLOCHICACID_T2_C3      8.459835   
515         LU_HRPTECTERT1_SINGLE_OCHRATOXINA_T2_C2      7.878635   
525         LU_HRPTECTERT1_SINGLE_OCHRATOXINA_T2_C3      7.812291   

       module_number                                         annotation  
2487              11  immune(immune, natural killer cell, lymphocyte...  
53575            264  metabolism(metabolism), rna processing(transcr...  
2547              11  immune(immune, natural killer cell, lymphocyte...  
515                3  metabolism(metabolism), mitochondria(mitochond...  
525                3  metabolism(metabolism), mitochondria(mitochond...  


In [8]:
df_eg_RPTEC["annotation_term"] = df_eg_RPTEC["annotation"].str.extract(r"^([^()]+)") # This extracts the first term before the () for use in the query
df_eg_RPTEC["condition"] = df_eg_RPTEC["sample_id"].str.extract(r"SINGLE_(.*?)_T") # This extracts only the drug of the experimental condition

In [9]:
print(df_eg_RPTEC.head())

                                          sample_id  abs_eg_score  \
2487   LU_HRPTECTERT1_SINGLE_ARISTOLOCHICACID_T3_C2      9.060121   
53575       LU_HRPTECTERT1_SINGLE_LEADACETATE_T3_C3      8.701595   
2547   LU_HRPTECTERT1_SINGLE_ARISTOLOCHICACID_T2_C3      8.459835   
515         LU_HRPTECTERT1_SINGLE_OCHRATOXINA_T2_C2      7.878635   
525         LU_HRPTECTERT1_SINGLE_OCHRATOXINA_T2_C3      7.812291   

       module_number                                         annotation  \
2487              11  immune(immune, natural killer cell, lymphocyte...   
53575            264  metabolism(metabolism), rna processing(transcr...   
2547              11  immune(immune, natural killer cell, lymphocyte...   
515                3  metabolism(metabolism), mitochondria(mitochond...   
525                3  metabolism(metabolism), mitochondria(mitochond...   

      annotation_term         condition  
2487           immune  ARISTOLOCHICACID  
53575      metabolism       LEADACETATE  
2547    

### BioBERT literature validation

In [16]:
k = 10  # top k pathwats
df_topk = df_eg_RPTEC.sort_values(by="abs_eg_score", ascending=False).head(k)

In [17]:
results = []

Entrez.email = "semdegroot2003@gmail.com"

for _, row in tqdm(df_topk.iterrows(), total=len(df_topk)):
    condition = row["condition"]
    term = row["annotation_term"]
    abs_score = row["abs_eg_score"]

    # PubMed query string
    query = f'"{term}" AND "{condition}"'

    # Fetch abstracts
    abstracts = fetch_pubmed_abstracts(query, max_results=5)

    if not abstracts:
        avg_sim = np.nan
    else:
        term_emb = get_embedding(row["annotation"])
        sims = [cosine_sim(term_emb, get_embedding(ab)) for ab in abstracts]
        avg_sim = np.mean(sims)

    results.append({
        "condition": condition,
        "annotation_term": term,
        "abs_eg_score": abs_score,
        "bioBERT_lit_score": avg_sim,
        "n_abstracts": len(abstracts)
    })

df_biobert_validated = pd.DataFrame(results)

100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [01:18<00:00,  7.89s/it]


In [18]:
df_biobert_validated.sort_values("abs_eg_score", ascending=False)

Unnamed: 0,condition,annotation_term,abs_eg_score,bioBERT_lit_score,n_abstracts
0,ARISTOLOCHICACID,immune,9.060121,,0
1,LEADACETATE,metabolism,8.701595,0.0,38
2,ARISTOLOCHICACID,immune,8.459835,,0
3,OCHRATOXINA,metabolism,7.878635,0.0,36
4,OCHRATOXINA,metabolism,7.812291,0.0,36
5,OCHRATOXINA,energy,7.643661,0.0,14
6,OCHRATOXINA,hormone,7.382725,0.0,7
7,OCHRATOXINA,hormone,7.287448,0.0,7
8,OCHRATOXINA,signaling,7.20771,0.0,8
9,MITOMYCINC,lipid,7.03504,0.0,35


In [20]:
print("Abstract example:", abstracts[20])
print("Term embedding:", get_embedding(term)[:5])
print("Abstract embedding:", get_embedding(abstracts[0])[:5])

Abstract example: DOI: 10.1245/s10434-018-6628-x
PMCID: PMC6132421
PMID: 30027459 [Indexed for MEDLINE]
Term embedding: [ 0.14805877 -0.12710814 -0.02423691 -0.10718861 -0.3888849 ]
Abstract embedding: [ 0.07397994 -0.3922555  -0.06351098  0.0311538   0.35845697]


## Weighted Directed Random Walk (wDRW)

## Weighted Significant Directed Random Walk (s-wDRW)

# PHH

## Eigengene scoring

## Weighted Directed Random Walk (wDRW)

## Weighted Significant Directed Random Walk (s-wDRW)