In [None]:
import json
from pathlib import Path

INPUT_DIR = Path("/kaggle/input/fullfull/annotations")
OUTPUT_FILE = Path("merged_full.jsonl")


def _iter_annotated_records(source_dir):
    """Yield a trimmed copy of every record that carries at least one span.

    Scans the ``*_full.jsonl`` files directly under ``source_dir`` in sorted
    order. Blank lines and lines that are not valid JSON are reported on
    stdout and skipped; records whose ``spans`` entry is missing or empty are
    skipped without a message. Each yielded dict keeps only the ``text``,
    ``spans`` and ``relations`` entries of the source record.
    """
    for jsonl_path in sorted(source_dir.glob("*_full.jsonl")):
        origin = jsonl_path.name
        with jsonl_path.open("r", encoding="utf-8") as handle:
            for line_number, raw in enumerate(handle, start=1):
                payload = raw.strip()
                if payload == "":
                    print(f"Skipping empty line at {origin}:{line_number}")
                    continue
                try:
                    parsed = json.loads(payload)
                except json.JSONDecodeError as e:
                    print(f"JSON decode error at {origin}:{line_number} — {e}")
                    continue

                annotated_spans = parsed.get("spans", [])
                if annotated_spans:
                    yield {
                        "text": parsed.get("text", ""),
                        "spans": annotated_spans,
                        "relations": parsed.get("relations", []),
                    }


# Everything is collected in memory first so the record count can be reported.
merged = list(_iter_annotated_records(INPUT_DIR))

with OUTPUT_FILE.open("w", encoding="utf-8") as fw:
    fw.writelines(json.dumps(entry, ensure_ascii=False) + "\n" for entry in merged)

print(f"Merged and saved {len(merged)} records (text + spans + relations) to: {OUTPUT_FILE.resolve()}")

In [None]:
import json
from pathlib import Path

INPUT_DIR = Path("/kaggle/input/fullfull/annotations")
OUTPUT_FILE = Path("merged_full.jsonl")

# Marker returned for lines that must be skipped; a dedicated object is used
# because a JSON line may legitimately decode to None or another falsy value.
_SKIPPED = object()


def _decode_line(raw_line, location):
    """Decode one JSONL line, or return ``_SKIPPED`` after reporting why.

    ``location`` is the ``file:line`` label used in the printed message.
    Blank lines and lines that fail to parse as JSON are the two skip cases.
    """
    stripped = raw_line.strip()
    if not stripped:
        print(f"Skipping empty line at {location}")
        return _SKIPPED
    try:
        return json.loads(stripped)
    except json.JSONDecodeError as e:
        print(f"JSON decode error at {location} — {e}")
        return _SKIPPED


merged = []
relation_count = 0  # relations found in the records that are kept

for annotation_file in sorted(INPUT_DIR.glob("*_full.jsonl")):
    with annotation_file.open("r", encoding="utf-8") as stream:
        for position, raw_line in enumerate(stream, start=1):
            rec = _decode_line(raw_line, f"{annotation_file.name}:{position}")
            if rec is _SKIPPED:
                continue

            kept = {
                "text": rec.get("text", ""),
                "spans": rec.get("spans", []),
                "relations": rec.get("relations", []),
            }
            # Records without any span are dropped and do not count towards
            # the relation total.
            if kept["spans"]:
                relation_count += len(kept["relations"])
                merged.append(kept)

with OUTPUT_FILE.open("w", encoding="utf-8") as fw:
    fw.writelines(json.dumps(entry, ensure_ascii=False) + "\n" for entry in merged)

print(f"Merged and saved {len(merged)} records (text + spans + relations) to: {OUTPUT_FILE.resolve()}")
print(f"Total number of relation labels annotated: {relation_count}")


In [None]:
import json
from pathlib import Path
from itertools import product


INPUT_FILE = Path("merged_full.jsonl")
OUTPUT_FILE = Path("relation_binary.jsonl")

# Span labels that may appear as the child of a PATIENT head.
TARGET_LABELS = ("HPO_TERM", "GENE_VARIANT")


def typed_pair_instances(record):
    """Build one labelled instance per PATIENT -> HPO_TERM/GENE_VARIANT span pair.

    Args:
        record: dict with optional ``text``, ``spans`` and ``relations``
            entries. Every span must provide ``token_start``, ``token_end``,
            ``label`` and ``text``; every relation must provide ``head_span``
            and ``child_span`` dicts.

    Returns:
        A list of dicts with keys ``text``, ``head``, ``head_type``,
        ``child``, ``child_type``, ``relation`` and ``label``. ``label`` is 1
        and ``relation`` is the annotated relation label ("unknown" when the
        relation carries none) if the pair is annotated; otherwise ``label``
        is 0 and ``relation`` is "no_relation". Pairs are ordered by head
        span, then by child span, following the order of ``spans``.
    """
    text = record.get("text", "")
    spans = record.get("spans", [])
    relations = record.get("relations", [])

    # A span is identified by its index in the record. Relations point at
    # spans through their token range, so map token range -> index.
    pos2id = {
        (span["token_start"], span["token_end"]): idx
        for idx, span in enumerate(spans)
    }

    # (head index, child index) -> relation label for every annotated
    # PATIENT -> target relation whose two ends are known spans.
    rel_type_map = {}
    for rel in relations:
        head, child = rel["head_span"], rel["child_span"]
        if head.get("label") != "PATIENT" or child.get("label") not in TARGET_LABELS:
            continue
        h_key = (head["token_start"], head["token_end"])
        c_key = (child["token_start"], child["token_end"])
        if h_key in pos2id and c_key in pos2id:
            rel_type_map[(pos2id[h_key], pos2id[c_key])] = rel.get("label", "unknown")

    # Only PATIENT heads and target children can form a candidate pair, so
    # pair those two lists directly instead of filtering every span x span
    # combination. The two lists are disjoint, hence no self-pair check.
    patients = [(idx, span) for idx, span in enumerate(spans) if span["label"] == "PATIENT"]
    targets = [(idx, span) for idx, span in enumerate(spans) if span["label"] in TARGET_LABELS]

    instances = []
    for (head_id, head), (child_id, child) in product(patients, targets):
        pair = (head_id, child_id)
        instances.append({
            "text":       text,
            "head":       head["text"],
            "head_type":  head["label"],
            "child":      child["text"],
            "child_type": child["label"],
            "relation":   rel_type_map.get(pair, "no_relation"),
            "label":      1 if pair in rel_type_map else 0,
        })
    return instances


relation_data = []

with INPUT_FILE.open("r", encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            # Tolerate stray blank lines instead of crashing in json.loads.
            continue
        relation_data.extend(typed_pair_instances(json.loads(line)))

with OUTPUT_FILE.open("w", encoding="utf-8") as fw:
    for item in relation_data:
        fw.write(json.dumps(item, ensure_ascii=False) + "\n")

print(f" Generated {len(relation_data)} relation instances and saved to: {OUTPUT_FILE.resolve()}")

In [None]:
import json
from pathlib import Path

# Input: labelled pair instances, one JSON object per line
INPUT_FILE = Path("relation_binary.jsonl")

# Outputs: one file per class
POSITIVES_FILE = Path("positives_triples.jsonl")
NEGATIVES_FILE = Path("negatives_triples.jsonl")

# Fields copied from each instance into its triple, in output key order.
# Every one of them must be present in the input record.
_TRIPLE_KEYS = ("head", "head_type", "child", "child_type", "relation", "label")

with INPUT_FILE.open("r", encoding="utf-8") as infile, \
     POSITIVES_FILE.open("w", encoding="utf-8") as pos_out, \
     NEGATIVES_FILE.open("w", encoding="utf-8") as neg_out:

    for raw in infile:
        rec = json.loads(raw)
        triple = {key: rec[key] for key in _TRIPLE_KEYS}
        # Anything whose label is not equal to 1 goes to the negatives file.
        sink = pos_out if rec.get("label") == 1 else neg_out
        sink.write(json.dumps(triple, ensure_ascii=False) + "\n")

print(f" Positive triples saved to: {POSITIVES_FILE.resolve()}")
print(f" Negative triples saved to: {NEGATIVES_FILE.resolve()}")

In [None]:
import json
from pathlib import Path

POSITIVES_FILE = Path("positives_triples.jsonl")
NEGATIVES_FILE = Path("negatives_triples.jsonl")


def _count_with_label(jsonl_path, wanted):
    """Count the JSON lines of ``jsonl_path`` whose ``label`` equals ``wanted``."""
    with jsonl_path.open("r", encoding="utf-8") as handle:
        return sum(1 for row in handle if json.loads(row).get("label") == wanted)


# Each file is re-checked against its expected label rather than just counting
# lines, so a misrouted record does not inflate the totals.
positive_count = _count_with_label(POSITIVES_FILE, 1)
negative_count = _count_with_label(NEGATIVES_FILE, 0)
total_count = positive_count + negative_count

print(f"Total number of positive instances (label = 1): {positive_count}")
print(f"Total number of negative instances (label = 0): {negative_count}")
print(f"Total number of instances: {total_count}")


In [None]:




import json
from pathlib import Path
from itertools import product


INPUT_FILE = Path("merged_full_1.jsonl")
OUTPUT_FILE = Path("relation_binary_1.jsonl")

# Span labels that may appear as the child of a PATIENT head.
TARGET_LABELS = ("HPO_TERM", "GENE_VARIANT")

# merged_full_1.jsonl is written by the merge step, which this notebook only
# runs in a later cell. Fail with an actionable message instead of a bare
# "No such file or directory" when the cells are run top to bottom.
if not INPUT_FILE.exists():
    raise FileNotFoundError(
        f"{INPUT_FILE} does not exist yet - run the cell that merges the "
        "*_full.jsonl files into it before running this one."
    )


def _token_range(span):
    """Return the (token_start, token_end) pair that locates a span."""
    return (span["token_start"], span["token_end"])


relation_data = []
positive_count = 0
negative_count = 0

with INPUT_FILE.open("r", encoding="utf-8") as f:
    for line in f:
        record = json.loads(line)
        text = record.get("text", "")
        spans = record.get("spans", [])
        relations = record.get("relations", [])

        # Token ranges of every annotated PATIENT -> target relation, built
        # once per record so each candidate pair is a set lookup instead of a
        # fresh scan over all relations.
        annotated = set()
        for rel in relations:
            head, child = rel["head_span"], rel["child_span"]
            if head.get("label") == "PATIENT" and child.get("label") in TARGET_LABELS:
                annotated.add((_token_range(head), _token_range(child)))

        # Candidate pairs are PATIENT heads x target children, in span order.
        # The two lists are disjoint, so a span is never paired with itself.
        patients = [s for s in spans if s["label"] == "PATIENT"]
        targets = [s for s in spans if s["label"] in TARGET_LABELS]

        for patient, target in product(patients, targets):
            label = 1 if (_token_range(patient), _token_range(target)) in annotated else 0

            relation_data.append({
                "text": text,
                "head": patient["text"],
                "head_type": patient["label"],
                "child": target["text"],
                "child_type": target["label"],
                "relation": "relation" if label == 1 else "no_relation",
                "label": label,
            })

            if label == 1:
                positive_count += 1
            else:
                negative_count += 1


with OUTPUT_FILE.open("w", encoding="utf-8") as fw:
    for item in relation_data:
        fw.write(json.dumps(item, ensure_ascii=False) + "\n")

print(f"Generated {len(relation_data)} relation instances and saved to: {OUTPUT_FILE.resolve()}")
print(f"Total number of positive instances (label = 1): {positive_count}")
print(f"Total number of negative instances (label = 0): {negative_count}")


In [None]:
import json
from pathlib import Path
from itertools import product

# ---- Step 1: merge every *_full.jsonl file into merged_full_1.jsonl ----
INPUT_DIR = Path("/kaggle/input/dataset/1")
OUTPUT_FILE = Path("merged_full_1.jsonl")

merged = []
relation_count = 0  # relations found in the records that are kept

for full_path in sorted(INPUT_DIR.glob("*_full.jsonl")):
    filename = full_path.name
    with full_path.open("r", encoding="utf-8") as f:
        for lineno, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                print(f"Skipping empty line at {filename}:{lineno}")
                continue
            try:
                rec = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"JSON decode error at {filename}:{lineno} — {e}")
                continue

            spans = rec.get("spans", [])
            relations = rec.get("relations", [])

            # Records without any span are dropped entirely.
            if not spans:
                continue

            relation_count += len(relations)
            merged.append({
                "text": rec.get("text", ""),
                "spans": spans,
                "relations": relations,
            })

with OUTPUT_FILE.open("w", encoding="utf-8") as fw:
    for entry in merged:
        fw.write(json.dumps(entry, ensure_ascii=False) + "\n")

print(f"Merged and saved {len(merged)} records (text + spans + relations) to: {OUTPUT_FILE.resolve()}")
print(f"Total number of relation labels annotated: {relation_count}")


# ---- Step 2: turn the merged records into binary relation instances ----
# NOTE: OUTPUT_FILE is rebound here, as in the original cell; from this point
# on it names the relation file, not the merged file.
INPUT_FILE = Path("merged_full_1.jsonl")
OUTPUT_FILE = Path("relation_binary_1.jsonl")

# Span labels that may appear as the child of a PATIENT head.
TARGET_LABELS = ("HPO_TERM", "GENE_VARIANT")


def label_candidate_pairs(record):
    """Label every PATIENT -> HPO_TERM/GENE_VARIANT span pair of one record.

    Args:
        record: dict with optional ``text``, ``spans`` and ``relations``
            entries. PATIENT and target spans must provide ``token_start``,
            ``token_end``, ``label`` and ``text``; relations must provide
            ``head_span`` and ``child_span`` dicts.

    Returns:
        A list with one dict per candidate pair (head spans in span order,
        then child spans in span order). ``label`` is 1 and ``relation`` is
        "relation" when a PATIENT -> target relation with the same two token
        ranges is annotated, otherwise 0 and "no_relation".
    """
    text = record.get("text", "")
    spans = record.get("spans", [])
    relations = record.get("relations", [])

    # Token ranges of the annotated PATIENT -> target relations, collected
    # once so each candidate pair is a set lookup rather than a rescan of
    # the whole relation list.
    annotated = set()
    for rel in relations:
        head, child = rel["head_span"], rel["child_span"]
        if head.get("label") == "PATIENT" and child.get("label") in TARGET_LABELS:
            annotated.add((
                (head["token_start"], head["token_end"]),
                (child["token_start"], child["token_end"]),
            ))

    # The two lists are disjoint by label, so no span is paired with itself.
    patients = [s for s in spans if s["label"] == "PATIENT"]
    targets = [s for s in spans if s["label"] in TARGET_LABELS]

    instances = []
    for patient, target in product(patients, targets):
        key = (
            (patient["token_start"], patient["token_end"]),
            (target["token_start"], target["token_end"]),
        )
        label = 1 if key in annotated else 0
        instances.append({
            "text": text,
            "head": patient["text"],
            "head_type": patient["label"],
            "child": target["text"],
            "child_type": target["label"],
            "relation": "relation" if label == 1 else "no_relation",
            "label": label,
        })
    return instances


relation_data = []

with INPUT_FILE.open("r", encoding="utf-8") as f:
    for line in f:
        relation_data.extend(label_candidate_pairs(json.loads(line)))

# Labels are 0 or 1, so their sum is the number of positives.
positive_count = sum(item["label"] for item in relation_data)
negative_count = len(relation_data) - positive_count

with OUTPUT_FILE.open("w", encoding="utf-8") as fw:
    for item in relation_data:
        fw.write(json.dumps(item, ensure_ascii=False) + "\n")

print(f"Generated {len(relation_data)} relation instances and saved to: {OUTPUT_FILE.resolve()}")
print(f"Total number of positive instances (label = 1): {positive_count}")
print(f"Total number of negative instances (label = 0): {negative_count}")


In [None]:
import json
from pathlib import Path
from itertools import product

# Stage 1: gather every annotated record into merged_full_1.jsonl
INPUT_DIR = Path("/kaggle/input/dataset/1")
OUTPUT_FILE_MERGED = Path("merged_full_1.jsonl")  # destination of the merged records

merged = []
relation_count = 0  # running total of relations in the kept records

for source in sorted(INPUT_DIR.glob("*_full.jsonl")):
    with source.open("r", encoding="utf-8") as reader:
        for row_no, row in enumerate(reader, start=1):
            row = row.strip()
            if row:
                try:
                    rec = json.loads(row)
                except json.JSONDecodeError as e:
                    print(f"JSON decode error at {source.name}:{row_no} — {e}")
                else:
                    entry = {
                        "text": rec.get("text", ""),
                        "spans": rec.get("spans", []),
                        "relations": rec.get("relations", []),
                    }
                    # Only records with at least one span are kept and counted.
                    if entry["spans"]:
                        relation_count += len(entry["relations"])
                        merged.append(entry)
            else:
                print(f"Skipping empty line at {source.name}:{row_no}")

# Persist the merged records, one JSON object per line.
with OUTPUT_FILE_MERGED.open("w", encoding="utf-8") as fw:
    fw.write("".join(json.dumps(entry, ensure_ascii=False) + "\n" for entry in merged))

# Report what was merged.
print(f"Merged and saved {len(merged)} records (text + spans + relations) to: {OUTPUT_FILE_MERGED.resolve()}")
print(f"Total number of relation labels annotated: {relation_count}")

# Stage 2: derive binary relation instances from the merged file
INPUT_FILE = Path("merged_full_1.jsonl")
OUTPUT_FILE_RELATIONS = Path("relation_binary_1.jsonl")

relation_data = []

with INPUT_FILE.open("r", encoding="utf-8") as f:
    for line in f:
        record = json.loads(line)
        text = record.get("text", "")
        spans = record.get("spans", [])
        relations = record.get("relations", [])

        # Number the spans and remember which id sits at each token range.
        pos2id = {}
        for idx, span in enumerate(spans):
            span["id"] = idx
        for span in spans:
            pos2id[(span["token_start"], span["token_end"])] = span["id"]

        # Ids of the span pairs joined by an annotated
        # PATIENT -> HPO_TERM / GENE_VARIANT relation.
        pos_set = set()
        for rel in relations:
            head_lbl = rel["head_span"].get("label")
            child_lbl = rel["child_span"].get("label")
            if head_lbl != "PATIENT" or child_lbl not in ("HPO_TERM", "GENE_VARIANT"):
                continue
            h_key = (rel["head_span"]["token_start"], rel["head_span"]["token_end"])
            c_key = (rel["child_span"]["token_start"], rel["child_span"]["token_end"])
            if h_key not in pos2id or c_key not in pos2id:
                continue
            pos_set.add((pos2id[h_key], pos2id[c_key]))

        # Every PATIENT head is paired with every HPO_TERM / GENE_VARIANT child.
        patients = [s for s in spans if s["label"] == "PATIENT"]
        targets = [s for s in spans if s["label"] in ("HPO_TERM", "GENE_VARIANT")]
        for span1 in patients:
            for span2 in targets:
                if span1["id"] == span2["id"]:  # never pair a span with itself
                    continue

                # A pair is positive exactly when it was annotated.
                if (span1["id"], span2["id"]) in pos_set:
                    label, relation = 1, "relation"
                else:
                    label, relation = 0, "no_relation"

                relation_data.append({
                    "text": text,
                    "head": span1["text"],
                    "head_type": span1["label"],
                    "child": span2["text"],
                    "child_type": span2["label"],
                    "relation": relation,
                    "label": label,
                })

# Tally the two classes from the generated instances.
positive_count = sum(1 for item in relation_data if item["label"] == 1)
negative_count = len(relation_data) - positive_count

# Persist the instances, one JSON object per line.
with OUTPUT_FILE_RELATIONS.open("w", encoding="utf-8") as fw:
    fw.write("".join(json.dumps(item, ensure_ascii=False) + "\n" for item in relation_data))

print(f"Generated {len(relation_data)} relation instances and saved to: {OUTPUT_FILE_RELATIONS.resolve()}")
print(f"Total number of positive instances (label = 1): {positive_count}")
print(f"Total number of negative instances (label = 0): {negative_count}")

In [None]:
import json
from pathlib import Path
from itertools import product


# ---- Step 1: merge every *_full.jsonl file into merged_full_1.jsonl ----
INPUT_DIR = Path("/kaggle/input/dataset/1")
OUTPUT_FILE_MERGED = Path("merged_full_1.jsonl")

merged = []
relation_count = 0  # relations found in the records that are kept

for full_path in sorted(INPUT_DIR.glob("*_full.jsonl")):
    filename = full_path.name
    with full_path.open("r", encoding="utf-8") as f:
        for lineno, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                continue
            try:
                rec = json.loads(line)
            except json.JSONDecodeError as e:
                # Report malformed lines instead of dropping them silently,
                # so lost annotations are visible in the cell output.
                print(f"JSON decode error at {filename}:{lineno} — {e}")
                continue

            spans = rec.get("spans", [])
            relations = rec.get("relations", [])

            # Keep a record when it has spans OR relations; only records with
            # neither are dropped.
            if not spans and not relations:
                continue

            relation_count += len(relations)
            merged.append({
                "text": rec.get("text", ""),
                "spans": spans,
                "relations": relations,
            })

with OUTPUT_FILE_MERGED.open("w", encoding="utf-8") as fw:
    for entry in merged:
        fw.write(json.dumps(entry, ensure_ascii=False) + "\n")

print(f"Merged {len(merged)} records")
print(f"Total relations: {relation_count}")


# ---- Step 2: turn the merged records into binary relation instances ----
INPUT_FILE = Path("merged_full_1.jsonl")
OUTPUT_FILE_RELATIONS = Path("relation_binary_1.jsonl")

# Span labels that may appear as the child of a PATIENT head.
TARGET_LABELS = ("HPO_TERM", "GENE_VARIANT")


def is_target_relation(rel):
    """Return True when ``rel`` links a PATIENT head to an HPO_TERM/GENE_VARIANT child."""
    return (rel["head_span"]["label"] == "PATIENT"
            and rel["child_span"]["label"] in TARGET_LABELS)


def instances_for_record(record):
    """Build the labelled relation instances of one merged record.

    Args:
        record: dict with ``text``, ``spans`` and ``relations`` entries.

    Returns:
        A list of instance dicts (``text``, ``head``, ``head_type``,
        ``child``, ``child_type``, ``relation``, ``label``).

        * Record without spans: one positive instance per PATIENT -> target
          relation, taken straight from the relation's own ``head_span`` and
          ``child_span`` (which must then carry ``text``). No negatives can
          be produced because there are no spans to pair.
        * Record with spans: one instance per PATIENT x target span pair, in
          span order, positive when a PATIENT -> target relation with the
          same two token ranges is annotated and negative otherwise.
    """
    text = record["text"]
    spans = record["spans"]
    target_rels = [rel for rel in record["relations"] if is_target_relation(rel)]

    if not spans:
        return [
            {
                "text": text,
                "head": rel["head_span"]["text"],
                "head_type": rel["head_span"]["label"],
                "child": rel["child_span"]["text"],
                "child_type": rel["child_span"]["label"],
                "relation": "relation",
                "label": 1,
            }
            for rel in target_rels
        ]

    # A span is identified by its index; relations locate spans by token range.
    pos2id = {
        (span["token_start"], span["token_end"]): idx
        for idx, span in enumerate(spans)
    }

    positive_relations = set()
    for rel in target_rels:
        head_key = (rel["head_span"]["token_start"], rel["head_span"]["token_end"])
        child_key = (rel["child_span"]["token_start"], rel["child_span"]["token_end"])
        # Relations whose ends do not match a known span are ignored.
        if head_key in pos2id and child_key in pos2id:
            positive_relations.add((pos2id[head_key], pos2id[child_key]))

    # The two lists are disjoint by label, so no span is paired with itself.
    patients = [(idx, s) for idx, s in enumerate(spans) if s["label"] == "PATIENT"]
    targets = [(idx, s) for idx, s in enumerate(spans) if s["label"] in TARGET_LABELS]

    instances = []
    for (patient_id, patient), (target_id, target) in product(patients, targets):
        is_positive = (patient_id, target_id) in positive_relations
        instances.append({
            "text": text,
            "head": patient["text"],
            "head_type": patient["label"],
            "child": target["text"],
            "child_type": target["label"],
            "relation": "relation" if is_positive else "no_relation",
            "label": 1 if is_positive else 0,
        })
    return instances


relation_data = []
no_entity_records = 0           # merged records that have no spans
no_entity_relations = 0         # all relations found in those records
no_entity_target_relations = 0  # PATIENT -> target relations among them

with INPUT_FILE.open("r", encoding="utf-8") as f:
    for line in f:
        record = json.loads(line)
        instances = instances_for_record(record)

        if not record["spans"]:
            no_entity_records += 1
            no_entity_relations += len(record["relations"])
            # For a span-less record every instance is one target relation.
            no_entity_target_relations += len(instances)

        relation_data.extend(instances)

# Labels are 0 or 1, so their sum is the number of positives.
positive_count = sum(item["label"] for item in relation_data)
negative_count = len(relation_data) - positive_count

with OUTPUT_FILE_RELATIONS.open("w", encoding="utf-8") as fw:
    for item in relation_data:
        fw.write(json.dumps(item, ensure_ascii=False) + "\n")

print(f"Generated {len(relation_data)} relation instances")
print(f"Positive instances: {positive_count}")
print(f"Negative instances: {negative_count}")
print(f"Records without entities but with relations: {no_entity_records}")
print(f"Total relations in records without entities: {no_entity_relations}")
print(f"Target relations (PATIENT→HPO_TERM/GENE_VARIANT) in records without entities: {no_entity_target_relations}")

In [None]:
import json
from pathlib import Path
from itertools import product


# ---- Step 1: merge every *_full.jsonl file into merged_full_1.jsonl ----
INPUT_DIR = Path("/kaggle/input/dataset/1")
OUTPUT_FILE_MERGED = Path("merged_full_1.jsonl")

merged = []
relation_count = 0  # relations found in the records that are kept

for full_path in sorted(INPUT_DIR.glob("*_full.jsonl")):
    filename = full_path.name
    with full_path.open("r", encoding="utf-8") as f:
        for lineno, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                continue
            try:
                rec = json.loads(line)
            except json.JSONDecodeError as e:
                # Report malformed lines instead of dropping them silently,
                # so lost annotations are visible in the cell output.
                print(f"JSON decode error at {filename}:{lineno} — {e}")
                continue

            spans = rec.get("spans", [])
            relations = rec.get("relations", [])

            # Only records with neither spans nor relations are dropped.
            if not spans and not relations:
                continue

            relation_count += len(relations)
            merged.append({
                "text": rec.get("text", ""),
                "spans": spans,
                "relations": relations,
            })

with OUTPUT_FILE_MERGED.open("w", encoding="utf-8") as fw:
    for entry in merged:
        fw.write(json.dumps(entry, ensure_ascii=False) + "\n")

print(f"Merged {len(merged)} records")
print(f"Total relations: {relation_count}")


# ---- Step 2: turn the merged records into binary relation instances ----
INPUT_FILE = Path("merged_full_1.jsonl")
OUTPUT_FILE_RELATIONS = Path("relation_binary_1.jsonl")

# Span labels that may appear as the child of a PATIENT head.
TARGET_LABELS = ("HPO_TERM", "GENE_VARIANT")


def patient_target_instances(record):
    """Label every PATIENT -> HPO_TERM/GENE_VARIANT span pair of one record.

    Args:
        record: dict with ``text``, ``spans`` and ``relations`` entries.

    Returns:
        A list with one instance dict per PATIENT x target span pair, in span
        order (``text``, ``head``, ``head_type``, ``child``, ``child_type``,
        ``relation``, ``label``). A pair is positive (``label`` 1,
        ``relation`` "relation") when a PATIENT -> target relation with the
        same two token ranges is annotated; otherwise ``label`` is 0 and
        ``relation`` is "no_relation". A record without spans yields an
        empty list, whatever relations it carries.
    """
    text = record["text"]
    spans = record["spans"]
    relations = record["relations"]

    if not spans:
        return []

    # A span is identified by its index; relations locate spans by token range.
    pos2id = {
        (span["token_start"], span["token_end"]): idx
        for idx, span in enumerate(spans)
    }

    positive_relations = set()
    for rel in relations:
        head_span, child_span = rel["head_span"], rel["child_span"]
        if head_span["label"] != "PATIENT" or child_span["label"] not in TARGET_LABELS:
            continue
        head_key = (head_span["token_start"], head_span["token_end"])
        child_key = (child_span["token_start"], child_span["token_end"])
        # Relations whose ends do not match a known span are ignored.
        if head_key in pos2id and child_key in pos2id:
            positive_relations.add((pos2id[head_key], pos2id[child_key]))

    # The two lists are disjoint by label, so no span is paired with itself.
    patients = [(idx, s) for idx, s in enumerate(spans) if s["label"] == "PATIENT"]
    targets = [(idx, s) for idx, s in enumerate(spans) if s["label"] in TARGET_LABELS]

    instances = []
    for (patient_id, patient), (target_id, target) in product(patients, targets):
        is_positive = (patient_id, target_id) in positive_relations
        instances.append({
            "text": text,
            "head": patient["text"],
            "head_type": patient["label"],
            "child": target["text"],
            "child_type": target["label"],
            "relation": "relation" if is_positive else "no_relation",
            "label": 1 if is_positive else 0,
        })
    return instances


relation_data = []

with INPUT_FILE.open("r", encoding="utf-8") as f:
    for line in f:
        relation_data.extend(patient_target_instances(json.loads(line)))

# Labels are 0 or 1, so their sum is the number of positives.
positive_count = sum(item["label"] for item in relation_data)
negative_count = len(relation_data) - positive_count

with OUTPUT_FILE_RELATIONS.open("w", encoding="utf-8") as fw:
    for item in relation_data:
        fw.write(json.dumps(item, ensure_ascii=False) + "\n")

print(f"Generated {len(relation_data)} relation instances")
print(f"Positive instances: {positive_count}")
print(f"Negative instances: {negative_count}")