In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/data-ontology/obi_test_types.txt


In [2]:
!pip install unsloth
!pip install -q sentence-transformers scikit-learn

import torch, json, random, os, numpy as np
from tqdm.auto import tqdm

Collecting unsloth
  Downloading unsloth-2025.7.3-py3-none-any.whl.metadata (47 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.2/47.2 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting unsloth_zoo>=2025.7.4 (from unsloth)
  Downloading unsloth_zoo-2025.7.4-py3-none-any.whl.metadata (8.1 kB)
Collecting xformers>=0.0.27.post2 (from unsloth)
  Downloading xformers-0.0.31.post1-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (1.1 kB)
Collecting bitsandbytes (from unsloth)
  Downloading bitsandbytes-0.46.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting tyro (from unsloth)
  Downloading tyro-0.9.26-py3-none-any.whl.metadata (12 kB)
Collecting trl!=0.15.0,!=0.9.0,!=0.9.1,!=0.9.2,!=0.9.3,>=0.7.9 (from unsloth)
  Downloading trl-0.19.1-py3-none-any.whl.metadata (10 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets<4.0.0,>=3.4.1->unsloth)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Col

In [4]:
# ===============================================================
# STEP 0 ▸ Install & import
# ===============================================================
!pip install -q sentence-transformers scikit-learn tqdm

import os, json, random, hashlib, re
import numpy as np
from tqdm.auto import tqdm
from sentence_transformers import SentenceTransformer
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics.pairwise import euclidean_distances

import torch
from unsloth import FastLanguageModel
from transformers import AutoTokenizer

# ===============================================================
# STEP 1 ▸ Load LLM (Qwen-3 via Unsloth)
# ===============================================================
model_name = "unsloth/Qwen3-1.7B-unsloth-bnb-4bit"
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=2048,
    load_in_4bit=True,
    dtype=None,
)
FastLanguageModel.for_inference(model)
device = "cuda" if torch.cuda.is_available() else "cpu"

# ===============================================================
# STEP 2 ▸ Load types.txt
# ===============================================================
with open("/kaggle/input/data-ontology/obi_test_types.txt") as f:
    types = [line.strip() for line in f if line.strip()]
print(f"✅ Loaded {len(types)} types")

# ===============================================================
# STEP 3 ▸ Embed with BERT
# ===============================================================
bert = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", device=device)
embeddings = bert.encode(types, batch_size=64, convert_to_numpy=True, show_progress_bar=True)

# ===============================================================
# STEP 4 ▸ Cluster with KMeans
# ===============================================================
n_clusters = max(1, len(types) // 50)  # target ~50 terms/cluster
kmeans = MiniBatchKMeans(n_clusters=n_clusters, batch_size=256, random_state=42)
cluster_ids = kmeans.fit_predict(embeddings)

clusters = {}
for idx, cid in enumerate(cluster_ids):
    clusters.setdefault(cid, []).append(idx)

print(f"✅ Formed {len(clusters)} clusters")

# ===============================================================
# STEP 5 ▸ Create Top-K pairs within each cluster
# ===============================================================
MAX_CLUSTER_SIZE = 150
TOP_K = 5
candidate_pairs = []

for cid, idxs in clusters.items():
    if len(idxs) > MAX_CLUSTER_SIZE:
        idxs = random.sample(idxs, MAX_CLUSTER_SIZE)

    cluster_embeds = np.array([embeddings[i] for i in idxs])
    dists = euclidean_distances(cluster_embeds)

    for i, idx_i in enumerate(idxs):
        nearest = np.argsort(dists[i])[1:TOP_K+1]  # skip self
        for j in nearest:
            idx_j = idxs[j]
            parent = types[idx_i]
            child = types[idx_j]
            candidate_pairs.append((parent, child))

print(f"✅ Candidate pairs to test: {len(candidate_pairs):,}")

# ===============================================================
# STEP 6 ▸ Batch zero-shot prediction with Qwen
# ===============================================================
_prompt_template = (
    "You are a biomedical ontology expert.\n"
    "Decide if the following relation is an is-a (subclass) relationship.\n\n"
    "Parent: {parent}\n"
    "Child: {child}\n"
    "Answer Yes or No."
)

_yes_pattern = re.compile(r"^\s*(yes|true)\b", flags=re.I)

def make_prompt_msg(parent, child):
    return {
        "role": "user",
        "content": _prompt_template.format(parent=parent, child=child)
    }

system_msg = {"role": "system", "content": "You are a biomedical ontology classifier."}
BATCH_SIZE = 8
results = []

for i in tqdm(range(0, len(candidate_pairs), BATCH_SIZE), desc="LLM zero-shot"):
    batch = candidate_pairs[i:i+BATCH_SIZE]
    msgs = [[system_msg, make_prompt_msg(p, c)] for p, c in batch]

    try:
        input_ids = tokenizer.apply_chat_template(
            msgs, return_tensors="pt", padding=True, truncation=True
        ).to(device)

        attn_mask = (input_ids != tokenizer.pad_token_id)

        output_ids = model.generate(
            input_ids=input_ids,
            attention_mask=attn_mask,
            max_new_tokens=10,
            do_sample=False
        )

        outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)

        for (parent, child), output in zip(batch, outputs):
            last_line = output.strip().splitlines()[-1].lower()
            if _yes_pattern.match(last_line):
                results.append({"parent": parent, "child": child})

    except Exception as e:
        print("⚠️  Batch failed:", e)

print(f"✅ is-a pairs found: {len(results):,}")

# ===============================================================
# STEP 7 ▸ Save output as *_pairs.json with ID
# ===============================================================
os.makedirs("outputs", exist_ok=True)

def make_id(p, c):
    return "TR_" + hashlib.sha1(f"{p}->{c}".encode()).hexdigest()[:8]

final = [
    {
        # "ID": make_id(d["parent"], d["child"]),
        "parent": d["parent"],
        "child": d["child"]
    }
    for d in results
]

with open("outputs/obi_pairs.json", "w") as f:
    json.dump(final, f, indent=2)

print("✅ Output written to outputs/obi_pairs.json")


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


==((====))==  Unsloth 2025.7.3: Fast Qwen3 patching. Transformers: 4.52.4.
   \\   /|    Tesla T4. Num GPUs = 2. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.1+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.3.1
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.31.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
✅ Loaded 2821 types


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/45 [00:00<?, ?it/s]



✅ Formed 56 clusters
✅ Candidate pairs to test: 13,456


LLM zero-shot:   0%|          | 0/1682 [00:00<?, ?it/s]

The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p'

✅ is-a pairs found: 2,075
✅ Output written to outputs/obi_pairs.json


In [5]:
import json

# Load your existing file with IDs
with open("/kaggle/working/outputs/obi_pairs.json") as f:
    data_with_ids = json.load(f)

# Create new list without IDs
data_without_ids = [
    {"parent": entry["parent"], "child": entry["child"]}
    for entry in data_with_ids
]

# Save to a new submission file
with open("outputs/submission_pairs.json", "w") as f:
    json.dump(data_without_ids, f, indent=2)

print("✅ Submission file written to outputs/submission_pairs.json")


✅ Submission file written to outputs/submission_pairs.json
