In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/data-ontology/obi_test_types.txt


In [2]:
# ===============================================================
# STEP 0 ▸ Install & import
# ===============================================================
!pip install -q sentence-transformers scikit-learn tqdm unsloth

import os, json, random, re
import numpy as np
from tqdm.auto import tqdm
from sentence_transformers import SentenceTransformer
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics.pairwise import euclidean_distances

import torch
from unsloth import FastLanguageModel
from transformers import AutoTokenizer

# ===============================================================
# STEP 1 ▸ Load LLM (Qwen-3 via Unsloth)
# ===============================================================
model_name = "unsloth/Qwen3-1.7B-unsloth-bnb-4bit"
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=2048,
    load_in_4bit=True,
    dtype=None,
)
FastLanguageModel.for_inference(model)
device = "cuda" if torch.cuda.is_available() else "cpu"

# ===============================================================
# STEP 2 ▸ Load types.txt
# ===============================================================
with open("/kaggle/input/data-ontology/obi_test_types.txt") as f:
    types = [line.strip() for line in f if line.strip()]
print(f"✅ Loaded {len(types)} types")

# ===============================================================


2025-07-12 13:54:28.604534: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752328468.629329     276 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1752328468.636725     276 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered

Please restructure your imports with 'import unsloth' at the top of your file.
  from unsloth import FastLanguageModel


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.7.3: Fast Qwen3 patching. Transformers: 4.52.4.
   \\   /|    Tesla T4. Num GPUs = 2. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.1+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.3.1
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.31.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
✅ Loaded 2821 types


In [3]:
# ===============================================================
# STEP 0 ▸ Install & import
# ===============================================================

# ===============================================================

# ===============================================================
# STEP 3 ▸ Embed with BERT
# ===============================================================
bert = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", device=device)
embeddings = bert.encode(types, batch_size=64, convert_to_numpy=True, show_progress_bar=True)

# ===============================================================
# STEP 4 ▸ Cluster with KMeans
# ===============================================================
n_clusters = max(1, len(types) // 50)  # target ~50 terms/cluster
kmeans = MiniBatchKMeans(n_clusters=n_clusters, batch_size=256, random_state=42)
cluster_ids = kmeans.fit_predict(embeddings)

clusters = {}
for idx, cid in enumerate(cluster_ids):
    clusters.setdefault(cid, []).append(idx)

print(f"✅ Formed {len(clusters)} clusters")

# ===============================================================
# STEP 5 ▸ Create Top-K pairs within each cluster
# ===============================================================
MAX_CLUSTER_SIZE = 150
TOP_K = 5
candidate_pairs = []

for cid, idxs in clusters.items():
    if len(idxs) > MAX_CLUSTER_SIZE:
        idxs = random.sample(idxs, MAX_CLUSTER_SIZE)

    cluster_embeds = np.array([embeddings[i] for i in idxs])
    dists = euclidean_distances(cluster_embeds)

    for i, idx_i in enumerate(idxs):
        nearest = np.argsort(dists[i])[1:TOP_K+1]  # skip self
        for j in nearest:
            idx_j = idxs[j]
            parent = types[idx_i]
            child = types[idx_j]
            candidate_pairs.append((parent, child))

print(f"✅ Candidate pairs to test: {len(candidate_pairs):,}")

# ===============================================================



Batches:   0%|          | 0/45 [00:00<?, ?it/s]



✅ Formed 56 clusters
✅ Candidate pairs to test: 13,456


In [4]:


FEW_SHOT_EXAMPLES = [
    ("hemoglobin assay","cooximitery arterial blood hemoglobin assay", "Yes"),
    ("signal conversion function", "signal amplification function", "Yes"),
    ("exclusion criterion", "chemotherapy treatment exclusion criterion", "Yes"),
]

def build_few_shot_prompt(parent, child):
    examples_text = ""
    for i, (p, c, label) in enumerate(FEW_SHOT_EXAMPLES, 1):
        examples_text += f"Example {i}:\nParent: {p}\nChild: {c}\nAnswer: {label}\n\n"

    return (
        "You are a biomedical ontology expert.\n"
        "Decide if the following relation is an is-a (subclass) relationship.\n\n"
        f"{examples_text}"
        "Now decide:\n\n"
        f"Parent: {parent}\n"
        f"Child: {child}\n"
        f"Answer Yes or No."
    )

_yes_pattern = re.compile(r"^\s*(yes|true)\b", flags=re.I)

def make_prompt_msg(parent, child):
    return {
        "role": "user",
        "content": build_few_shot_prompt(parent, child)
    }

system_msg = {"role": "system", "content": "You are a biomedical ontology classifier."}
BATCH_SIZE = 8
results = []

for i in tqdm(range(0, len(candidate_pairs), BATCH_SIZE), desc="LLM few-shot"):
    batch = candidate_pairs[i:i+BATCH_SIZE]
    msgs = [[system_msg, make_prompt_msg(p, c)] for p, c in batch]

    try:
        input_ids = tokenizer.apply_chat_template(
            msgs, return_tensors="pt", padding=True, truncation=True
        ).to(device)

        attn_mask = (input_ids != tokenizer.pad_token_id)

        output_ids = model.generate(
            input_ids=input_ids,
            attention_mask=attn_mask,
            max_new_tokens=10,
            do_sample=False
        )

        outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)

        for (parent, child), output in zip(batch, outputs):
            last_line = output.strip().splitlines()[-1].lower()
            if _yes_pattern.match(last_line):
                results.append({"parent": parent, "child": child})

    except Exception as e:
        print("⚠️  Batch failed:", e)

print(f"✅ is-a pairs found: {len(results):,}")

# ===============================================================
# STEP 7 ▸ Save output as *_pairs.json with ONLY parent/child
# ===============================================================
os.makedirs("outputs", exist_ok=True)

with open("outputs/obi_pairs.json", "w") as f:
    json.dump(results, f, indent=2)

print("✅ Output written to outputs/obi_pairs.json")


LLM few-shot:   0%|          | 0/1682 [00:00<?, ?it/s]

The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p'

✅ is-a pairs found: 15
✅ Output written to outputs/obi_pairs.json


In [5]:
# import json
# from collections import defaultdict

# # Load flat is-a pairs
# with open("/kaggle/working/outputs/obi_pairs.json") as f:
#     flat_data = json.load(f)

# # Group by child
# child_to_parents = defaultdict(list)
# for entry in flat_data:
#     child_to_parents[entry["child"]].append(entry["parent"])

# # Format output
# grouped_data = [
#     {
#         "parent": ", ".join(sorted(set(parents))),
#         "child": child
#     }
#     for child, parents in child_to_parents.items()
# ]

# # Save to new grouped submission file
# with open("outputs/obi_pairs_grouped.json", "w") as f:
#     json.dump(grouped_data, f, indent=2)

# print("✅ Grouped output saved to outputs/obi_pairs_grouped.json")


# # [ output will now appear like this 
# #   {
# #     "parent": "mountain, hill, rock",
# #     "child": "karst area"
# #   },
# #   {
# #     "parent": "spot, building, farm",
# #     "child": "dike"
# #   }
# # ]

The full ontology detection pipeline starts by first embedding the 2,880 input types using MiniLM, a lightweight transformer-based model optimized for speed and semantic similarity tasks. Specifically, sentence-transformers/all-MiniLM-L6-v2 is used via the SentenceTransformer API, which encodes each type (a short phrase or term) into a fixed-length vector representation that captures its semantic meaning. These embeddings are then grouped using MiniBatchKMeans, a scalable variant of the K-Means clustering algorithm, to cluster semantically similar types together, reducing the search space and limiting comparisons to smaller subsets of related terms. Within each cluster, Euclidean distance is computed to find the top-k nearest neighbors for each type, and candidate parent-child pairs are formed by treating each term and its closest neighbors as possible "parent → child" combinations. These candidate pairs are then passed to a 4-bit quantized Qwen-3 1.7B model via the Unsloth framework for efficient inference. For each pair, a few-shot prompt is dynamically built that includes multiple hand-crafted examples of correct and incorrect “is-a” relationships to guide the LLM’s reasoning. The prompt ends with the test pair and a question asking if it represents a subclass relationship, to which Qwen responds with “Yes” or “No.” If the response starts with “Yes,” the pair is accepted as a valid taxonomic relation. Finally, all accepted pairs are saved in the required submission format. Together, this BERT+LLM hybrid approach combines MiniLM's fast semantic grouping with Qwen's deep reasoning to efficiently detect hierarchical (is-a) relationships from text-only type names.

