In [4]:
import pandas as pd
from pathlib import Path

CSV_PATH = Path("../data/processed/biocup_chunks.csv")
df = pd.read_csv(CSV_PATH)

TEXT_COL = "chunk_text"
ID_COL = "chunk_id"

df[TEXT_COL] = df[TEXT_COL].astype(str).fillna("")
df = df[df[TEXT_COL].str.strip().ne("")].reset_index(drop=True)

print("rows:", len(df))
df.head(3)

rows: 5104


Unnamed: 0,chunk_id,case_id,primary_site,tcga_type,patient_id,section,original_section,chunk_index,sub_index,chunk_text,has_tnm,has_size,has_ihc,has_lymph,has_margins,has_tumor_size_cue,is_admin_noise
0,BIOCUP_00001|LYMPH_NODES|0|1|c6d9cbd012,BIOCUP_00001,lung,LUSC,TCGA-18-4086,LYMPH_NODES,GENERAL,0,1,[case_id=BIOCUP_00001 | site=lung | type=LUSC ...,False,False,False,True,False,False,False
1,BIOCUP_00001|DIAGNOSIS|0|0|81b72f36e4,BIOCUP_00001,lung,LUSC,TCGA-18-4086,DIAGNOSIS,DIAGNOSIS,0,0,[case_id=BIOCUP_00001 | site=lung | type=LUSC ...,False,False,False,False,False,False,False
2,BIOCUP_00001|LYMPH_NODES|0|1|a41a38b6f0,BIOCUP_00001,lung,LUSC,TCGA-18-4086,LYMPH_NODES,DIAGNOSIS,0,1,[case_id=BIOCUP_00001 | site=lung | type=LUSC ...,False,False,False,True,False,False,False


In [5]:


df_small = df.sample(100, random_state=42).reset_index(drop=True)
texts = df_small[TEXT_COL].tolist()
texts[0][:300]



'[case_id=BIOCUP_00355 | site=pancreas | type=PAAD | section=MARGINS] -5 pancreas with posterior margin, next to sections and -8 a single cross section of mass with pancreatic resection margin and common duct.'

In [6]:
from sentence_transformers import SentenceTransformer
import numpy as np

dense_model_name = "BAAI/bge-base-en-v1.5"   # ou "intfloat/e5-base-v2"
dense_model = SentenceTransformer(dense_model_name)

dense = dense_model.encode(texts, batch_size=32, normalize_embeddings=True, show_progress_bar=True)
dense = np.asarray(dense, dtype="float32")

print("dense shape:", dense.shape)
print("norm mean:", np.mean(np.linalg.norm(dense, axis=1)))


  from .autonotebook import tqdm as notebook_tqdm
Batches: 100%|██████████| 4/4 [00:11<00:00,  2.85s/it]

dense shape: (100, 768)
norm mean: 1.0





In [7]:
from sklearn.metrics.pairwise import cosine_similarity

q = dense[0].reshape(1, -1)
sims = cosine_similarity(q, dense)[0]
top = sims.argsort()[::-1][:5]

for idx in top:
    print("score:", round(float(sims[idx]), 3), "| id:", df_small.loc[idx, ID_COL], "| section:", df_small.loc[idx, "section"])
    print(df_small.loc[idx, TEXT_COL][:220], "\n")


score: 1.0 | id: BIOCUP_00355|MARGINS|19|1|57fdf19aa0 | section: MARGINS
[case_id=BIOCUP_00355 | site=pancreas | type=PAAD | section=MARGINS] -5 pancreas with posterior margin, next to sections and -8 a single cross section of mass with pancreatic resection margin and common duct. 

score: 0.793 | id: BIOCUP_00302|CLINICAL_HISTORY|17|0|6edea33336 | section: CLINICAL_HISTORY
[case_id=BIOCUP_00302 | site=pancreas | type=PAAD | section=CLINICAL_HISTORY] 24=multiple nodes from lesser curve. 25=representative from greater curve. 26=single anterior pancreatic node. 27=multiple nodes from posterio 

score: 0.785 | id: BIOCUP_00316|MICRO|3|0|548a66863f | section: MICRO
[case_id=BIOCUP_00316 | site=pancreas | type=PAAD | section=MICRO] Metastatic carcinoma to 5 of 14 peripancreatic lymph nodes (5/14). Perineural, lymphovascular and large vessel invasion identified. C. PANCREAS, NEW PANC 

score: 0.784 | id: BIOCUP_00335|MARGINS|9|0|7ff8a44710 | section: MARGINS
[case_id=BIOCUP_00335 | site=panc

In [8]:
query = "lymph nodes negative for malignancy"
qvec = dense_model.encode([query], normalize_embeddings=True)[0].reshape(1, -1)
sims = cosine_similarity(qvec, dense)[0]
top = sims.argsort()[::-1][:5]

for idx in top:
    print("score:", round(float(sims[idx]), 3), "| primary_site:", df_small.loc[idx, "primary_site"], "| section:", df_small.loc[idx, "section"])
    print(df_small.loc[idx, TEXT_COL][:220], "\n")


score: 0.786 | primary_site: colon | section: LYMPH_NODES
[case_id=BIOCUP_00104 | site=colon | type=COAD | section=LYMPH_NODES] Regional lymph nodes (pN) : Twenty-six lymph nodes are dissected from the specimen. They are all negative for metastatic tumor. (pN0). Non-lymph node  

score: 0.765 | primary_site: breast | section: LYMPH_NODES
[case_id=BIOCUP_00145 | site=breast | type=BRCA | section=LYMPH_NODES] Sentinel lymph node, left axillary, sentinel biopsy: A single left axillary sentinel lymph node with blue dye is negative for metastatic carcinoma [A 

score: 0.727 | primary_site: pancreas | section: SYNOPTIC
[case_id=BIOCUP_00323 | site=pancreas | type=PAAD | section=SYNOPTIC] Distal duodenal margin: Negative. Proximal or distal pancreatic margin: Negative. Peripancreatic soft tissue margin (including retroperitoneal and pos 

score: 0.724 | primary_site: breast | section: LYMPH_NODES
[case_id=BIOCUP_00155 | site=breast | type=BRCA | section=LYMPH_NODES] A. LYMPH NODE, SENTINEL #1

In [9]:
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM

splade_name = "prithivida/Splade_PP_en_v1"
tok = AutoTokenizer.from_pretrained(splade_name)
splade = AutoModelForMaskedLM.from_pretrained(splade_name)

device = "cuda" if torch.cuda.is_available() else "cpu"
splade = splade.to(device).eval()
device


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


'cpu'

In [10]:
@torch.no_grad()
def splade_encode(texts, batch_size=4, max_length=192, topk=128):
    out = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        enc = tok(batch, padding=True, truncation=True, max_length=max_length, return_tensors="pt").to(device)

        logits = splade(**enc).logits                    # (B, L, V)
        weights = torch.log1p(torch.relu(logits)).max(1).values  # (B, V)

        weights = weights.cpu()
        for w in weights:
            nz = torch.nonzero(w).squeeze(-1)
            vals = w[nz]
            if nz.numel() > topk:
                vals, idx = torch.topk(vals, k=topk)
                nz = nz[idx]
            out.append((nz.numpy(), vals.numpy()))
    return out

texts20 = texts[:20]
sparse20 = splade_encode(texts20, batch_size=4, max_length=192, topk=128)
print("encoded:", len(sparse20))
print("example nnz:", len(sparse20[0][0]))


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


encoded: 20
example nnz: 83


In [11]:
nz, vals = sparse20[0]
print("nonzero:", len(nz))
print("vals stats:", float(vals.min()), float(vals.mean()), float(vals.max()))


nonzero: 83
vals stats: 0.004286803305149078 0.6061486005783081 2.4616620540618896


In [12]:
top_idx = vals.argsort()[::-1][:15]
top_token_ids = nz[top_idx]
top_tokens = tok.convert_ids_to_tokens(top_token_ids.tolist())

list(zip(top_tokens, vals[top_idx].round(3).tolist()))


[('##cup', 2.4619998931884766),
 ('margin', 1.774999976158142),
 ('posterior', 1.7599999904632568),
 ('##cre', 1.7489999532699585),
 ('##ad', 1.725000023841858),
 ('pa', 1.6369999647140503),
 ('cross', 1.6260000467300415),
 ('pan', 1.5609999895095825),
 ('bio', 1.4819999933242798),
 ('duct', 1.4520000219345093),
 ('margins', 1.4079999923706055),
 ('section', 1.3270000219345093),
 ('case', 1.3009999990463257),
 ('5', 1.2640000581741333),
 ('##ection', 1.2630000114440918)]

In [13]:
def contains_token(nz, vals, token_str):
    # token_str est un token de tokenizer (ex: "lymph")
    tid = tok.convert_tokens_to_ids(token_str)
    return tid in set(nz.tolist())

q = "lymph nodes negative for malignancy"
q_sparse = splade_encode([q], batch_size=1, max_length=192, topk=128)[0]

print("query contains 'lymph' token?", contains_token(q_sparse[0], q_sparse[1], "lymph"))


query contains 'lymph' token? False
