In [1]:
import json
from pathlib import Path

INPUT_DIR = Path("/kaggle/input/fullfull/annotations")
OUTPUT_FILE = Path("merged_spans_with_entities.jsonl")

merged = []

for span_path in sorted(INPUT_DIR.glob("*_spans.jsonl")):
    filename = span_path.name
    with span_path.open("r", encoding="utf-8") as f:
        for lineno, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                print(f"Skipping empty line at {filename}:{lineno}")
                continue
            try:
                rec = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"JSON decode error at {filename}:{lineno} — {e}")
                continue

            spans = rec.get("spans", [])
            if not spans:
                continue

            entry = {
                "text": rec.get("text", ""),
                "tokens": rec.get("tokens", []),
                "spans": spans,
            }
            merged.append(entry)

with OUTPUT_FILE.open("w", encoding="utf-8") as fw:
    for entry in merged:
        fw.write(json.dumps(entry, ensure_ascii=False) + "\n")

print(f"Merged and saved {len(merged)} entity-containing records to: {OUTPUT_FILE.resolve()}")


Skipping empty line at polg_16919951_spans.jsonl:1
JSON decode error at polg_16919951_spans.jsonl:2 — Expecting value: line 1 column 1 (char 0)
Skipping empty line at polg_16919951_spans.jsonl:3
Merged and saved 675 entity-containing records to: /kaggle/working/merged_spans_with_entities.jsonl


In [None]:
import json
from pathlib import Path

INPUT_DIR = Path("/kaggle/input/fullfull/annotations")      
OUTPUT_FILE = Path("merged_spans.jsonl")                 

merged = []

for span_path in sorted(INPUT_DIR.glob("*_spans.jsonl")):
    filename = span_path.name
    with span_path.open("r", encoding="utf-8") as f:
        for lineno, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                print(f"Skipping empty line at {filename}:{lineno}")
                continue
            try:
                rec = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"JSON decode error at {filename}:{lineno} — {e}")
                continue

            spans = rec.get("spans", [])
            if not spans:
                continue

            merged.append({
                "text": rec.get("text", ""),
                "spans": spans,
            })

with OUTPUT_FILE.open("w", encoding="utf-8") as fw:
    for entry in merged:
        fw.write(json.dumps(entry, ensure_ascii=False) + "\n")

print(f"Merged and saved {len(merged)} records (text + spans) to: {OUTPUT_FILE.resolve()}")


Skipping empty line at polg_16919951_spans.jsonl:1
JSON decode error at polg_16919951_spans.jsonl:2 — Expecting value: line 1 column 1 (char 0)
Skipping empty line at polg_16919951_spans.jsonl:3
Merged and saved 675 records (text + spans) to: /kaggle/working/merged_spans.jsonl


In [3]:
import json

def convert_to_spans(jsonl_path, output_path):
    def find_spans(text, entity_text):
        spans = []
        start = 0
        while True:
            idx = text.find(entity_text, start)
            if idx < 0:
                break
            spans.append((idx, idx + len(entity_text)))
            start = idx + len(entity_text)
        return spans

    output = []
    with open(jsonl_path, "r", encoding="utf-8") as fin:
        for lineno, line in enumerate(fin, 1):
            line = line.strip()
            if not line:
                continue
            try:
                entry = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"JSON parse error on line {lineno}: {e}")
                print(">>", line)
                continue  # skip bad line
            text = entry.get("text", "")
            entities = entry.get("entities", [])
            spans = []
            for ent in entities:
                for start, end in find_spans(text, ent["text"]):
                    spans.append({
                        "start": start,
                        "end": end,
                        "label": ent["label"]
                    })
            output.append({"text": text, "spans": spans})

    with open(output_path, "w", encoding="utf-8") as fout:
        for item in output:
            fout.write(json.dumps(item, ensure_ascii=False) + "\n")

# Usage
convert_to_spans(
    "/kaggle/input/silver-standard-data/temple.jsonl",
    "/kaggle/working/temple.jsonl"
)


In [5]:
# Automatically install `iterative-stratification` if missing
try:
    from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
except ModuleNotFoundError:
    import subprocess, sys
    subprocess.check_call([
        sys.executable, "-m", "pip", "install", "-q", "iterative-stratification"
    ])
    from iterstrat.ml_stratifiers import MultilabelStratifiedKFold


In [None]:
# ================================================================
#  Full pipeline for preparing BIO data with *stratified* splits
#  (English comments throughout for clarity)
#  ---------------------------------------------------------------
#  1)  Count global entity frequencies              (entity_stats)
#  2)  Multi‑label iterative–stratified split 8/1/1 (stratified_split)
#      – stratification **only** on four key labels:
#        HPO_TERM / GENE_VARIANT / AGE_ONSET / AGE_FOLLOWUP
#  3)  Print entity distributions for each split    (show_stats)
#  4)  Load silver‑standard corpora & extend TRAIN  (extend_train)
#      – DEV / TEST remain frozen for fair eval
# ================================================================

import json
from pathlib import Path
from collections import Counter, defaultdict

import pandas as pd            
from sklearn.preprocessing import MultiLabelBinarizer
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from transformers import AutoTokenizer


FILE_MERGED = Path("/kaggle/working/merged_spans_with_entities.jsonl")
PATH_PUBMED = Path("/kaggle/working/temple.jsonl")
DIR_SILVER  = Path("/kaggle/input/silver-standard-data")       # *.jsonl files
OUT_DIR     = Path("/kaggle/working/bio_outputs")
OUT_DIR.mkdir(parents=True, exist_ok=True)

TRAIN_BIO = OUT_DIR / "train.jsonl"
DEV_BIO   = OUT_DIR / "dev.jsonl"
TEST_BIO  = OUT_DIR / "test.jsonl"

# entity labels present in the annotation
ENTITY_TYPES = {
    "AGE_ONSET", "AGE_FOLLOWUP", "AGE_DEATH",
    "PATIENT", "HPO_TERM",
    "GENE", "GENE_VARIANT",
}

# only these are used for stratified splitting
KEY_TYPES = ["HPO_TERM", "GENE_VARIANT", "AGE_ONSET", "AGE_FOLLOWUP"]

tokenizer = AutoTokenizer.from_pretrained(
    "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract",
    use_fast=True
)


def iter_jsonl(path: Path):
    """Yield dicts from a .jsonl file, robust to blank / bad lines."""
    with path.open("r", encoding="utf-8") as fh:
        for line in fh:
            line = line.strip()
            if not line:
                continue
            try:
                yield json.loads(line)
            except json.JSONDecodeError:
                continue


def make_bio_labels(spans, enc):
    """Return list[str] BIO tags aligned to the tokenizer output."""
    tokens   = enc.tokens()
    offsets  = enc["offset_mapping"]
    word_ids = enc.word_ids()

    tags = ["O"] * len(tokens)
    span_to_tokens = []

    # mark all entity spans
    for sp in spans:
        s, e, typ = sp["start"], sp["end"], sp["label"]
        idxs = [
            i for i, (b, t) in enumerate(offsets)
            if not (t <= s or b >= e)          # overlap with char‑span
        ]
        span_to_tokens.append(idxs)
        if not idxs:
            continue
        tags[idxs[0]] = f"B-{typ}"
        for i in idxs[1:]:
            tags[i] = f"I-{typ}"

    # single‑token spans must be "B‑" not "I‑"
    for idxs in span_to_tokens:
        if len(idxs) == 1:
            tags[idxs[0]] = tags[idxs[0]].replace("I-", "B-")

    # continuation word‑pieces: O → I‑sameLabel
    prev_wid = None
    for i, wid in enumerate(word_ids):
        if (wid is not None and wid == prev_wid
                and tags[i] == "O"
                and tags[i - 1].startswith(("B-", "I-"))):
            tags[i] = "I-" + tags[i - 1][2:]
        prev_wid = wid

    return tags


def record_to_bio(rec):
    """Convert one raw JSON record to {'tokens': [...], 'labels': [...]}."""
    text  = rec.get("text", "")
    spans = [s for s in rec.get("spans", []) if s.get("label") in ENTITY_TYPES]
    if not spans:
        return None

    enc = tokenizer(
        text,
        add_special_tokens=False,
        return_offsets_mapping=True,
        truncation=True,
        max_length=512
    )
    return {
        "tokens": enc.tokens(),
        "labels": make_bio_labels(spans, enc)
    }


def dump_jsonl(path: Path, data):
    """Write list[dict] to disk as UTF‑8 JSON‑lines."""
    with path.open("w", encoding="utf-8") as fh:
        for obj in data:
            fh.write(json.dumps(obj, ensure_ascii=False) + "\n")


def all_entity_stats(bio_data):
    """Return (global_counter, per_type_counter_dict)."""
    global_cnt = Counter()
    per_type   = defaultdict(Counter)

    for item in bio_data:
        tokens, labels = item["tokens"], item["labels"]
        i = 0
        while i < len(labels):
            if labels[i].startswith("B-"):
                typ = labels[i][2:]
                j = i + 1
                while j < len(labels) and labels[j] == f"I-{typ}":
                    j += 1
                surf = tokenizer.convert_tokens_to_string(tokens[i:j]).strip()
                global_cnt[surf] += 1
                per_type[typ][surf] += 1
                i = j
            else:
                i += 1
    return global_cnt, per_type



print(">> Loading merged file …")
merged_bio = [
    bio for rec in iter_jsonl(FILE_MERGED)
    if (bio := record_to_bio(rec)) is not None
]
print(f"Loaded {len(merged_bio)} annotated records")

# Optional sanity‑check
GLOBAL_CNT, TYPE_CNT = all_entity_stats(merged_bio)
for t in KEY_TYPES:
    rare = {e: c for e, c in TYPE_CNT[t].items() if c < 3}
    print(f"{t:14s} : {len(rare):4d} entities appear < 3×")


def build_label_matrix(bio_data):
    """Binary multi‑label matrix   shape = (n_docs, len(KEY_TYPES))."""
    bags = []
    for it in bio_data:
        present = set(
            lab[2:] for lab in it["labels"]
            if lab.startswith("B-") and lab[2:] in KEY_TYPES
        )
        bags.append(list(present))

    mlb = MultiLabelBinarizer(classes=KEY_TYPES)
    return mlb.fit_transform(bags)


label_matrix = build_label_matrix(merged_bio)

# First split out TEST (10 %)
print(">> Stratified 4‑fold split for TEST …")
mskf = MultilabelStratifiedKFold(
    n_splits=4, shuffle=True, random_state=42
)
train_dev_idx, test_idx = next(
    mskf.split(range(len(merged_bio)), label_matrix)
)
test_m    = [merged_bio[i] for i in test_idx]
train_dev = [merged_bio[i] for i in train_dev_idx]

# Then split TRAIN / DEV (9:1 ⇒ 10 %)
print(">> Stratified 3‑fold split for DEV …")
label_matrix_td = label_matrix[train_dev_idx]
mskf2 = MultilabelStratifiedKFold(
    n_splits=3, shuffle=True, random_state=42
)
td_idx, dev_idx = next(
    mskf2.split(range(len(train_dev)), label_matrix_td)
)
train_m = [train_dev[i] for i in td_idx]
dev_m   = [train_dev[i] for i in dev_idx]

print(f"Split sizes →  TRAIN:{len(train_m)}  DEV:{len(dev_m)}  TEST:{len(test_m)}")


def entity_freq(bio_data):
    """Return nested Counter: {label_type -> Counter(surface -> freq)}."""
    out = defaultdict(Counter)
    for it in bio_data:
        toks, labs = it["tokens"], it["labels"]
        i = 0
        while i < len(labs):
            if labs[i].startswith("B-"):
                typ = labs[i][2:]
                j = i + 1
                while j < len(labs) and labs[j] == f"I-{typ}":
                    j += 1
                surf = tokenizer.convert_tokens_to_string(toks[i:j]).strip()
                out[typ][surf] += 1
                i = j
            else:
                i += 1
    return out


def print_stats(split_name, counter_dict, *, show_types=None, top_k=None):
    """
    Nicely print entity frequencies.

    Parameters
    ----------
    split_name   : str   – header printed before the table
    counter_dict : dict  – output of `entity_freq`
    show_types   : iterable[str] | None
                    Which label types to print.  None → print all.
    top_k        : int | None
                    How many rows per type.  None → print all.
    """
    print(f"\n===== {split_name} =====")

    types = show_types if show_types is not None else sorted(counter_dict)

    for typ in types:
        sub = counter_dict.get(typ, {})
        if not sub:          # skip empty categories
            continue

        rows = sub.most_common(top_k or len(sub))
        print(f"\n{typ}  (#unique={len(sub)})")
        for ent, freq in rows:
            print(f"  {ent:<45} {freq}")


split_entity_cnt = {
    "TRAIN": entity_freq(train_m),
    "DEV":   entity_freq(dev_m),
    "TEST":  entity_freq(test_m),
}

for name, cnt in split_entity_cnt.items():
    print_stats(name, cnt, show_types=None, top_k=None)


TRAIN_GOLD = OUT_DIR / "train_gold.jsonl"
dump_jsonl(TRAIN_GOLD, train_m)
print(f">> Saved gold-standard TRAIN ➜ {TRAIN_GOLD.name} ({len(train_m)} records)")

# extend_train  – add PubMed + silver‑standard corpora

def load_extra_bio(path: Path):
    """Convert an external jsonl file to BIO format."""
    extra = []
    if path.exists():
        print(f">> Converting {path.name}")
        extra.extend(
            bio for rec in iter_jsonl(path)
            if (bio := record_to_bio(rec)) is not None
        )
    else:
        print(f">> {path} not found – skipped")
    return extra


extra_train = load_extra_bio(PATH_PUBMED)

if DIR_SILVER.exists():
    for jf in sorted(DIR_SILVER.glob("*.jsonl")):
        if jf.name == "temple.jsonl":
            continue
        extra_train.extend(load_extra_bio(jf))
else:
    print(">> DIR_SILVER not found – skipped")

train_final = train_m + extra_train
print(f"TRAIN after extension : {len(train_final)} records (+{len(extra_train)} silver)")


dump_jsonl(TRAIN_BIO, train_final)
dump_jsonl(DEV_BIO,   dev_m)
dump_jsonl(TEST_BIO,  test_m)

print(f"\nSaved  ➜  {TRAIN_BIO.name},  {DEV_BIO.name},  {TEST_BIO.name}")


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

>> Loading merged file …
Loaded 675 annotated records
HPO_TERM       : 1689 entities appear < 3×
GENE_VARIANT   :  188 entities appear < 3×
AGE_ONSET      :   66 entities appear < 3×
AGE_FOLLOWUP   :   49 entities appear < 3×
>> Stratified 4‑fold split for TEST …
>> Stratified 3‑fold split for DEV …
Split sizes →  TRAIN:338  DEV:169  TEST:168

===== TRAIN =====

AGE_DEATH  (#unique=14)
  9 months after                                2
  5                                             2
  66                                            2
  17 years 9 months                             1
  17                                            1
  18 - month                                    1
  16 - months                                   1
  5. 5 years                                    1
  27 months                                     1
  2 weeks after                                 1
  polg1                                         1
  67                                            1
  32 months

In [7]:
import re, json, random
from collections import defaultdict
from typing import List, Tuple, Optional

raw = r"""===== TRAIN =====

AGE_DEATH  (#unique=21)
  17                                            2
  18 - month                                    2
  8 months                                      2
  17 years 9 months                             1
  59                                            1
  5. 5 months                                   1
  16 - months                                   1
  5. 5                                          1
  5½                                            1
  9 months after                                1
  27 months                                     1
  50 months                                     1
  3 years 6 months                              1
  5                                             1
  66                                            1
  67                                            1
  32 months                                     1
  8 - month                                     1
  7 months                                      1
  3 months after presenting                     1
  two months                                    1

AGE_FOLLOWUP  (#unique=48)
  32                                            3
  26                                            3
  17                                            2
  54                                            2
  58                                            2
  27                                            2
  16                                            2
  23                                            2
  30                                            2
  52                                            2
  29                                            2
  57                                            2
  69                                            2
  38                                            2
  28                                            1
  3. 5                                          1
  55                                            1
  8 months after                                1
  gestational age = 36 weeks                    1
  focal seizures refractory                     1
  multifocality of her seizures                 1
  focal motor seizure                           1
  visual sensory seizure                        1
  right homonymous hemianopsia                  1
  somatosensory seizure                         1
  cognitive side effects                        1
  focal status epilepticus                      1
  eeg with interictal left parieto - occipital sharp waves 1
  parieto - occipital sharp waves               1
  frequent migraine headaches                   1
  visual aura                                   1
  moderate obesity                              1
  abdominal striae                              1
  a 54                                          1
  45                                            1
  26 -                                          1
  74                                            1
  2008                                          1
  59                                            1
  79                                            1
  35                                            1
  80                                            1
  60                                            1
  1 year of follow - up                         1
  4                                             1
  14                                            1
  25                                            1
  37                                            1

AGE_ONSET  (#unique=61)
  15                                            3
  31 / 2 months                                 3
  16                                            3
  23                                            2
  2                                             2
  4                                             2
  64                                            2
  24                                            2
  72                                            2
  childhood                                     2
  33                                            2
  2 years pre viously                           1
  9                                             1
  12                                            1
  30                                            1
  36                                            1
  ﬁve                                           1
  22                                            1
  18 - month                                    1
  2 months                                      1
  a 5 - year history                            1
  of 5 months                                   1
  present for 10 years                          1
  14 - month                                    1
  80 - year - old - man presented with a 7 - year history 1
  infantile - onset                             1
  6 months                                      1
  16 months                                     1
  19                                            1
  17                                            1
  primary school                                1
  3 - year history                              1
  31 / 2 month                                  1
  eight year history                            1
  1year                                         1
  13                                            1
  43                                            1
  past ten years                                1
  18 months                                     1
  9 months                                      1
  39                                            1
  early childhood                               1
  four months                                   1
  4 - month                                     1
  4 months                                      1
  at 4 months                                   1
  forties                                       1
  5 - year history                              1
  48                                            1
  34                                            1
  neonate                                       1
  immediately after birth                       1
  49                                            1
  35                                            1
  19 months                                     1
  55                                            1
  27                                            1
  7                                             1
  first day                                     1
  child                                         1
  early 50s                                     1

GENE  (#unique=15)
  polg                                          127
  polg1                                         65
  polg - 1                                      3
  polymerase gamma                              3
  polymerase ' y                                2
  p. a467t                                      2
  polγ                                          2
  dna polymerase gamma1                         1
  dna polymerase ' y                            1
  p. g848s                                      1
  polymerase gamma protein                      1
  c. 1399g > a                                  1
  c. 3285c > g                                  1
  p. s1095r                                     1
  polγa                                         1

GENE_VARIANT  (#unique=201)
  w748s                                         17
  a467t                                         16
  c. 2243g > c                                  6
  t251i                                         6
  p. w748s                                      4
  c. 752c > t                                   4
  p. a467t                                      4
  c. 1399g > a                                  4
  p587l                                         4
  p. ala467thr                                  4
  p. r964c                                      4
  p. l83p                                       3
  c. 2662g > a                                  3
  c. 1288a > t                                  3
  p. g848s                                      3
  l623w                                         3
  c. 1760c > t                                  3
  p. p587l                                      3
  c. 2851t > a                                  3
  p. y951n                                      3
  c. 2993c > t                                  3
  p. 998s > l                                   3
  c. 3550g > c                                  3
  p. 1184d > h                                  3
  p. trp748ser                                  3
  c. 2209g > c                                  3
  p. gly737arg                                  3
  r807c                                         3
  p. thr251ile                                  3
  p. pro587leu                                  3
  c. 1796c > t                                  3
  p. thr599ile                                  3
  c. 3104 + 3a > t                              3
  r853w                                         2
  c. 248t > c                                   2
  p. g888s                                      2
  t851a                                         2
  e1143g                                        2
  c. 2752t > c                                  2
  p. w918r                                      2
  1868t > g                                     2
  2263a > g                                     2
  k755e                                         2
  p. t251i                                      2
  k1191n                                        2
  c. 2542g > a                                  2
  c. 3626 _ 3629dupgata                         2
  c. 3643 + 2tnc                                2
  p. a862t                                      2
  p. h277l                                      2
  c. 1399g? a                                   2
  c. 2243g? c                                   2
  c. 1190c > t                                  2
  p. arg1096cys                                 2
  i1185n                                        2
  c. 680g > a                                   2
  p. arg227gin                                  2
  c. 3098c > t                                  2
  p. ala1033val                                 2
  c. 926g > a                                   2
  p. arg309his                                  2
  p587 l                                        2
  c. 2591a > g                                  2
  p. asn864ser                                  2
  c. 3649g > c                                  2
  p. ala1217pro                                 2
  c. 590t > c                                   2
  c. 2740a > c                                  2
  c. 2543g > c                                  2
  p. g848a                                      2
  c. 452 t > c                                  2
  p. l151p                                      2
  p. gly23serfs∗236                             2
  c. 67 _ 88del                                 2
  g2491c                                        1
  g737r                                         1
  , c. 248t > c                                 1
  g888s                                         1
  r1047w.                                       1
  a2551g                                        1
  c3139t in                                     1
  ( r1047w                                      1
  q879h                                         1
  t885s                                         1
  p. m430l                                      1
  p. m430                                       1
  1399g! a                                      1
  c. 3311cg                                     1
  ( p. s1104c                                   1
  c. 2542ga                                     1
  the t9256g                                    1
  a11390g                                       1
  ( k755e                                       1
  c. 3572a > g                                  1
  k1191n.                                       1
  tyr955cys                                     1
  c752t                                         1
  c1760t                                        1
  p. t251i /                                    1
  , c. 1760c > t                                1
  g848s                                         1
  common c. 1399gna                             1
  a467t )                                       1
  cis p. [ w748s                                1
  p. [ w748s ; e1143g                           1
  c. 3643 + 2 tnc                               1
  patient 1                                     1
  p. [ w748s ; e1143g ]                         1
  r953c                                         1
  deletion encompassing exons 15 [UNK] 21       1
  intragenic deletion of ~ 4. 7 kb              1
  c. 1156c > t                                  1
  p. r386c                                      1
  c. 2794c > t                                  1
  p. h932y                                      1
  pt914p                                        1
  p. [ ala467thr ]                              1
  p. [ gly848ser ]                              1
  p. g848s.                                     1
  c. 2584g > a                                  1
  c. 830a > t                                   1
  ser998leu                                     1
  asp1184his                                    1
  p765t                                         1
  1399g > a                                     1
  , a467t                                       1
  2243g > c                                     1
  homozygous c. 2243g > c                       1
  to p. trp748ser                               1
  c. 3556g > c                                  1
  p. d1186h                                     1
  d1186h                                        1
  p587                                          1
  d1186                                         1
  deleted                                       1
  deletion                                      1
  deletion comprises the entire polg1 gene      1
  polg1 gene was deleted on one allele          1
  one deleted polg1 allele                      1
  c. 2564t > c ) 1                              1
  ( c. 2564t > c ) 1                            1
  p. k512m                                      1
  c. 1535a > t                                  1
  c. 2665g > a                                  1
  p. a889t                                      1
  c. 2669c [ a                                  1
  p. d890a                                      1
  c. 3286c > t                                  1
  , c. 3286c > t                                1
  ( p. a467t                                    1
  and c. 3285c > g                              1
  p. s1095r                                     1
  i1185t                                        1
  a957v                                         1
  c. 2870c > t                                  1
  p. a957v                                      1
  c. 3554t > c                                  1
  p. i1185t                                     1
  pathogenic p. w748s                           1
  p. his945leu                                  1
  : c. 2890c > t                                1
  : p. trp748ser                                1
  and c. 3554t > a                              1
  : p. ile1185asn                               1
  i1185n.                                       1
  w748s.                                        1
  c. 2840a > g                                  1
  p. lys947arg                                  1
  c. 2890c > t                                  1
  c. 895a > c                                   1
  and c. 3626 _ 3629dupgata                     1
  t252i                                         1
  and p587l                                     1
  c. 2864a > g                                  1
  p. tyr955cys                                  1
  heterozygous c. 2693t > c                     1
  p. i898t                                      1
  p. f197s                                      1
  p. t914p                                      1
  p. ( f197s                                    1
  p. ( t914p )                                  1
  c. 911t [ g                                   1
  p. leu304arg                                  1
  c. 3287g > t                                  1
  p. arg1096leu                                 1
  r1096l                                        1
  c. 1789c > t                                  1
  p. arg597trp                                  1
  p. his1134tyr )                               1
  c. 3400c > t                                  1
  p. his1134tyr                                 1
  polg                                          1
  c. 3305a > c                                  1
  p. gln1102pro                                 1
  large deletion                                1
  exons 7 and 21                                1
  c. 3218c > t                                  1
  p [ pro1073leu ]                              1
  p. ( gly23serfs∗236 )                         1
  c. 752 c > t                                  1
  c. 1760 c > t                                 1

HPO_TERM  (#unique=1513)
  bilateral ptosis                              26
  dysarthria                                    20
  parkinsonism                                  19
  ptosis                                        17
  peo                                           16
  ataxia                                        13
  ophthalmoplegia                               12
  seizures                                      11
  mtdna depletion                               11
  hypotonia                                     11
  bradykinesia                                  10
  alpers syndrome                               10
  died                                          9
  dysphagia                                     9
  vomiting                                      8
  multiple mtdna deletions                      8
  status epilepticus                            8
  encephalopathy                                8
  external ophthalmoplegia                      8
  progressive bilateral ptosis                  8
  ovarian dysfunction                           8
  cerebellar atrophy                            7
  progressive external ophthalmoplegia          7
  myoclonus                                     7
  ragged red fibers                             7
  peripheral neuropathy                         6
  ragged - red fibers                           6
  areflexia                                     6
  epilepsy                                      6
  tremor                                        6
  postural instability                          6
  palatal tremor                                6
  gait ataxia                                   6
  epilepsia partialis continua                  6
  ophthalmoparesis                              6
  sando                                         6
  diplopia                                      6
  dystonia                                      6
  myopathy                                      6
  hypophonia                                    5
  neuropathy                                    5
  acute liver failure                           5
  hepatomegaly                                  5
  alpers disease                                5
  hepatic failure                               5
  myopathic                                     5
  cerebellar ataxia                             5
  dysphonia                                     5
  mitochondrial myopathy                        5
  lethargy                                      5
  severe hypotonia                              5
  fatigue                                       5
  myoclonic jerks                               5
  ovarian dysgenesis                            5
  bilateral cataracts                           5
  neuropathic pain                              5
  sensory ataxia                                4
  cortical blindness                            4
  psychomotor regression                        4
  lactic acidosis                               4
  pneumonia                                     4
  failure to thrive                             4
  exercise intolerance                          4
  thalamus                                      4
  progressive ataxia                            4
  severe bilateral ptosis                       4
  hearing loss                                  4
  headaches                                     4
  myocerebrohepatopathy                         4
  hepatopathy                                   4
  ragged red ﬁbers                              4
  polyneuropathy                                4
  shuffling gait                                3
  mitochondrial proliferation                   3
  depression                                    3
  visual aura                                   3
  slurred speech                                3
  liver dysfunction                             3
  respiratory failure                           3
  gliosis                                       3
  microvesicular steatosis                      3
  resting tremor                                3
  muscle weakness                               3
  facial dyskinesia                             3
  cognitive decline                             3
  cox - deficient                               3
  ascites                                       3
  focal seizures                                3
  acute disseminated encephalomyelitis          3
  basal ganglia                                 3
  severely encephalopathic                      3
  positive romberg sign                         3
  migraine                                      3
  dysphasia                                     3
  dystonic ulnar deviation                      3
  steatosis                                     3
  astrogliosis                                  3
  atrophy                                       3
  cataracts                                     3
  adult - onset                                 3
  non - convulsive status epilepticus           3
  camptocormia                                  3
  emesis                                        3
  demyelinating sensorimotor polyneuropathy     3
  action tremor                                 2
  distal weakness                               2
  facial masking                                2
  rigidity                                      2
  cytochrome c oxidase [UNK] deficient fibers   2
  rrfs                                          2
  elevated lactate                              2
  headache                                      2
  elevated csf lactate                          2
  cox - deficient fibers                        2
  refractory seizures                           2
  astrocytosis                                  2
  demyelination                                 2
  coagulopathy                                  2
  jaundice                                      2
  neuronal loss                                 2
  migraines                                     2
  liver failure                                 2
  proximal muscle weakness                      2
  cerebellar and sensory ataxia                 2
  complete ophthalmoplegia                      2
  parkinsonian                                  2
  cognitive impairment                          2
  recurrent hypoketotic hypoglycaemia           2
  hypoglycemia                                  2
  restriction of eye movements                  2
  subsarcolemmal accumulation of abnormal mitochondria 2
  recurrent vomiting                            2
  psychomotor retardation                       2
  cerebral atrophy                              2
  severe depression                             2
  multiple focal areas of t2 prolongation       2
  somnolent                                     2
  generalized status epilepticus                2
  sensory ataxic neuropathy                     2
  double vision                                 2
  ragged red fiber                              2
  impaired joint position sense                 2
  sensory axonopathy                            2
  positive antinuclear antibodies               2
  csf protein content was elevated              2
  brain cortical atrophy                        2
  progressive hepatic failure                   2
  depletion of mtdna                            2
  fatal                                         2
  encephalopathic                               2
  focal occipital status epilepticus            2
  confusion                                     2
  epileptic seizures                            2
  sensory [UNK] motor axonal neuropathy         2
  hyperlactatemia                               2
  moderate ketosis                              2
  cirrhosis                                     2
  bile ductular proliferation                   2
  refractory mixed type seizures                2
  micronodular cirrhosis                        2
  spongiosis                                    2
  developmental delay                           2
  external ophthalmoparesis                     2
  distal muscle weakness                        2
  3 - methylglutaconic acid                     2
  diffuse encephalopathy                        2
  complete external ophthalmoplegia             2
  axonal neuropathy                             2
  multiple mitochondrial dna deletions          2
  clumsiness                                    2
  cortical atrophy                              2
  choreoathetotic                               2
  hyperintensities in the thalamus              2
  cerebellum                                    2
  jerky torticollis                             2
  head tremor                                   2
  aphasia                                       2
  bilateral hearing loss                        2
  severe ophthalmoparesis                       2
  lack of cox activity                          2
  with alpers syndrome                          2
  frequent falls                                2
  hypoesthesia                                  2
  sando syndrome                                2
  gastroenteritis                               2
  poor feeding                                  2
  early cirrhosis                               2
  focal necrosis                                2
  absent deep tendon reﬂexes                    2
  hepatocellular dysfunction                    2
  fatty degeneration                            2
  tube feeding                                  2
  bilateral hypertrophic olivary degeneration   2
  childish behaviour                            2
  oocyte was abnormal with the presence of two polar bodies and granular cytoplasm 2
  infertility                                   2
  menstrual cycle was irregular                 2
  wide - based gait                             2
  positive romberg test                         2
  mega cisterna magna                           2
  cox - negative                                2
  low set ears                                  2
  bilateral clubfeet                            2
  cleft palate                                  2
  basal ganglia shows hyperintensity within bilateral lentiform nuclei 2
  gait disturbance                              2
  recurrent bowel obstruction                   2
  leukoencephalopathy                           2
  pes cavus                                     2
  sensory axonal neuropathy                     2
  severe external ophthalmoplegia               2
  progressive bilateral eye weakness            2
  drooped eyelids                               2
  optic atrophy                                 2
  dyskinesia                                    2
  hypomimia                                     2
  severe epilepsy                               2
  secondarily generalized focal crisis          2
  weakness                                      2
  variability in muscle ﬁber size               2
  cytochrome c oxidase deﬁciency                2
  photophobia                                   2
  progressive gait instability                  2
  pectus excavatum                              2
  muscle atrophy                                2
  intrauterine growth restriction               2
  cox - deficient muscle fibers                 2
  mr - spectroscopy showed an increased lactate peak 2
  asymmetric tremor                             2
  paresthesia                                   2
  cox - negative fibers                         2
  clumsy                                        2
  myalgias                                      2
  dystonic toe curling,                         1
  reduced arm swing                             1
  hyporeflexia                                  1
  foot numbness,                                1
  axonal predominantly sensory neuropathy.      1
  cogwheeling                                   1
  stooping                                      1
  lactate was elevated                          1
  dystonic toe curling                          1
  stiffness                                     1
  numbness in her feet                          1
  stooped posture,                              1
  sensorimotor, predominantly sensory neuropathy 1
  generalized cerebral                          1
  respiratory chain complexes containing mtdna - encoded subunits were decreased 1
  cytochrome c oxidase deficient                1
  focal fiber [UNK] type grouping               1
  postural - action tremor                      1
  sensory polyneuropathy                        1
  anxiety                                       1
  cytochrome c oxidase [UNK] negative           1
  head jerking to the left                      1
  visual scintillations                         1
  right sided headache                          1
  problems with coordination                    1
  left homonymous hemi anopia                   1
  left - sided focal sei zures with secondary generalization 1
  sta tus epilepticus                           1
  cognitive deficits in memory                  1
  motor apraxia                                 1
  left - sided homonymous hemianopia            1
  cognitive deficit                             1
  csf showed a mild increase in protein         1
  right sided occipital lesion                  1
  eeg showed focal sharp waves                  1
  sen sory axonal peripheral neuropathy         1
  subsar colemmal accumulation of mitochondria  1
  cytochrome c oxidase ( cox ) - deficient fibers 1
  strokelike episode                            1
  occipital localization                        1
  melas                                         1
  exposure to valproic acid                     1
  occipital lobe showed neuronal loss           1
  spongiform degeneration                       1
  prominent white matter changes                1
  multifocal partial seizures                   1
  eegs showed slow activity                     1
  continuous spike wave over the central, parietal, and occipital 1
  edema                                         1
  decreased ﬁbrinogen                           1
  abnormalities of alanine transaminase         1
  aspartate transaminase                        1
  total bilirubin                               1
  direct bilirubin                              1
  total protein                                 1
  albumin                                       1
  ﬁbrinogen                                     1
  oligoclone bands were positive                1
  abnormal signal in white matter was more extensive 1
  diﬀuse brain atrophy                          1
  brainstem auditory evoked potential showed increased i [UNK] iii latency 1
  decreased sensory and motor nerve conduct velocity 1
  sponginess                                    1
  focal, multifocal, and generalized seizures   1
  intractable epilepsy                          1
  liver disease                                 1
  respiratory chain defect                      1
  cortical signal, particularly in the occipital lobes 1
  stepwise deterioration                        1
  memory impairment                             1
  left - sided hemiparesis                      1
  liver function test results became abnormal   1
  aminotransferase, 396 u / l                   1
  y - glutamyltransferase, 1234 u / l           1
  aspartate aminotransferase, 322 u / l         1
  electroencephalograms showed frequent epileptiform discharges 1
  continued diffuse slowing                     1
  cortical and subcortical white matter and basal ganglia 1
  cerebrospinal fluid showed persistently low glucose levels ( 11 - 65 mg / dl ) 1
  elevated protein levels                       1
  generalized aminoaciduria                     1
  perivascular lymphocytic cuffing              1
  neurologic decline                            1
  extensive neuronal loss                       1
  also in the basal ganglia and brainstem       1
  extensive steatosis and fresh necrosis        1
  marked deficiency of the respiratory chain enzymes 1
  cerebral cortex showing activated astrocytes  1
  after commencing sodium valproate             1
  eeg showed sharp and slow wave focus in the right 1
  persistent vomiting                           1
  gcs on admission was 3                        1
  blood sugar unrecordable                      1
  deranged liver function tests                 1
  prolonged clotting                            1
  elevated ammonia                              1
  high plasma lactate                           1
  plasma lactate remained elevated              1
  abnormal white matter signal in the occipital and medial temporal lobes 1
  hepatic dysfunction progressed                1
  alpers - huttenlocher disease                 1
  high lactate                                  1
  hepatic dysfunction                           1
  high signal intensity within occipital and temporal lobes 1
  orthostatic tremor                            1
  progressive dysphonia                         1
  rhinolalia                                    1
  wasting                                       1
  tendon are - flexia                           1
  dys - phagia                                  1
  chronic diarrhoea                             1
  severe body weight loss                       1
  extra - pyramidal rigidity                    1
  cogwheel sign                                 1
  hypomimic face                                1
  proximal muscle weak - ness                   1
  creatine kinase ( ck ) level was 2156         1
  axonal sensory neu - ropathy                  1
  standing tremor                               1
  severe anxiety                                1
  panic attacks                                 1
  progressive cognitive dysfunction             1
  psychotic features                            1
  variation of fiber caliber                    1
  nuclear centralization                        1
  absent reactivity to cytochrome c oxidase     1
  levodopa - responsive pseudo - orthostatic tremor 1
  rrf                                           1
  worsening of diplopia                         1
  limb myoclonus                                1
  loss of reﬂexes                               1
  sensory disturbance                           1
  focal epilepsy                                1
  inferior olivary nuclei                       1
  mild external ophthalmoplegia                 1
  facial dyskinesias                            1
  marked hypertrophic degeneration of the inferior olives 1
  refractory epilepsy                           1
  cerebral coma                                 1
  multiple mitochondrial ( mt ) dna deletions   1
  gradually deteriorated                        1
  progressive cerebellar signs,                 1
  worsening of the myoclonus                    1
  transient liver dysfunction                   1
  refractory focal motor status                 1
  ventilatory assistance                        1
  elevated liver transaminase                   1
  vegetative state                              1
  hyperintensities and swelling of deep grey matter nuclei 1
  coma                                          1
  multiorgan failure                            1
  basal ganglia show hyperintense and swollen deep grey matter nuclei 1
  hyperintensity and swelling of the cortical grey and subcortical white matter 1
  spongiform changes in the cerebrum            1
  white matter spongiosis of the cerebellum     1
  brainstem bleeding                            1
  multiple lacunar ischaemic cortical infarcts  1
  cox - negative areas                          1
  left - sided ptosis                           1
  progressed                                    1
  difficulty performing fine motor tasks        1
  increasing dysphagia                          1
  proximal myopathy                             1
  sensorimotor neuropathy                       1
  peg tube                                      1
  recurrent aspiration                          1
  deficient in the mtdna - encoded cytochrome c oxidase ( cox 1
  sn was almost devoid of pigment               1
  sn neuronal loss                              1
  pons there was mild loss of neurones          1
  cerebellum was affected                       1
  loss of purkinje cells                        1
  neurone loss in the dentate nucleus           1
  ragged - red fibres                           1
  fatal hepatic dysfunction                     1
  liver mtdna depletion                         1
  fatal liver dysfunction                       1
  hypoketotic hypoglycaemia                     1
  glycaemia                                     1
  mild ketonuria                                1
  hypoglycaemia                                 1
  lethargic                                     1
  hypotonic                                     1
  weight was 8. 5 kg ( < 3rd centile            1
  height was 75 cm ( < 3rd centile              1
  hyperecogenic liver                           1
  macro                                         1
  microvesicular ( 10 % ) steatoses             1
  intracytoplasmic microvesicles                1
  hyperplasia of kupffer cells                  1
  portal and periportal fibrosis                1
  diffuse steatosis                             1
  liver cells have foamy cytoplasm              1
  feeding difficulties                          1
  an influenza a infection                      1
  elevations of plasma tyrosine                 1
  glutamine                                     1
  alanine                                       1
  serum lactate was increased                   1
  increased lactate - to - pyruvate ratio       1
  abnormalities in liver synthetic function     1
  elevated prothrombin time                     1
  partial thromboplastin time of 58 seconds     1
  conjugated bilirubin increased                1
  elevations in liver hepatocellular enzymes    1
  worsening liver failure                       1
  elevated cerebrospinal fluid ( csf ) protein  1
  nonspecific extraaxial fluid collection       1
  enhancement of the nerve roots of the cauda equina 1
  neurodegenerative disorder                    1
  microsteatosis                                1
  accumulation of subsarcolemmal neutral lipid  1
  respiratory difficulty                        1
  pancreatitis                                  1
  renal tubulopathy                             1
  myofibers were abnormally small               1
  subsarcolemmal collections of mitochondria    1
  severe encephalopathy                         1
  choreo - athetoid                             1
  reduced complex iþiii                         1
  iiþiii                                        1
  iv in liver                                   1
  reduced complex iiþiii in muscle              1
  low - set ears                                1
  bilateral clubfoot                            1
  progressively blurred vision                  1
  diplopia,                                     1
  choking episodes                              1
  shortness of breath                           1
  symmetric ptosis                              1
  proximal limb weakness                        1
  reduced reflexes                              1
  multiple mitochondrial dna ( mtdna ) deletions 1
  myopathy.                                     1
  cytochrome c oxidase ( cox ) [UNK] deficient fibers 1
  mitochondrial cytopathy                       1
  multiple mtdna deletions in muscle            1
  of liver failure                              1
  200 u / l for ast                             1
  107 u / l for alt                             1
  507 u / l for ' y - glutamyl - transferase    1
  deterioration of liver function               1
  muscle tone was increased                     1
  serum lactate was slightly elevated           1
  persisting vomiting                           1
  reflux                                        1
  ileus                                         1
  small bowel obstruction                       1
  ast was still elevated                        1
  liver function rapidly detoriorated           1
  fibrinogen 80 mg / dl                         1
  thromboplastin time 15 %                      1
  ast up to about 1900 u / l                    1
  alt up to 350 u / l                           1
  total bilirubin 19, 1 mg / dl                 1
  eeg examination showed diffuse suppression    1
  , intestinal bleeding                         1
  haemorrhagic shock                            1
  fatal multi - organ failure                   1
  enlarged mitochondria                         1
  mitochondria with tubular cristae formations  1
  cytochrome - c - oxidase ( cox - ) activity was deficient 1
  accumulation of lipids                        1
  mitochondria were enlarged                    1
  irregular cristae                             1
  elevated                                      1
  csf neopterin,                                1
  il - 6                                        1
  il - 8                                        1
  ifn - c                                       1
  reduced csf 5 - methyltetrahydrofolate        1
  seizure                                       1
  cerebral folate deﬁciency                     1
  generalized brain edema                       1
  ischemic lesion in the left thalamus          1
  ammonia was increased intermittently          1
  lactate                                       1
  protein                                       1
  decreased 5 - methyltetrahydrofolate ( 5mthf ) concentration 1
  repeated status epilepticus                   1
  invasive ventilation                          1
  eeg pattern                                   1
  occipital rhythmic high - amplitude delta     1
  superimposed polyspikes                       1
  deteriorating clinical                        1
  intractable epileptic                         1
  swallowing difﬁculties                        1
  progressive facial masking                    1
  symmetric cogwheeling                         1
  shufﬂing gait                                 1
  positive pull test                            1
  incomplete cpeo                               1
  symmetric reduction of nigrostriatal dopamine transporters 1
  adem                                          1
  myelin basic protein                          1
  oligoclonal bands                             1
  neurologic degeneration                       1
  otitis media                                  1
  gadolinium enhancement                        1
  parietal lobe                                 1
  internal capsule                              1
  head of the caudate                           1
  middle cerebellar peduncle                    1
  elevated level of choline                     1
  depressed level of n - acetylaspartate        1
  elevated lactate doublet                      1
  loss of extraocular movement                  1
  spasticity                                    1
  hyperreflexia                                 1
  frontoparietal subcortical white matter       1
  genu of the corpus callosum                   1
  right internal capsule                        1
  anterior thalamus                             1
  bilateral cerebellar hemispheres              1
  bilateral cerebellar peduncles                1
  subacute hemorrhage                           1
  putamen                                       1
  elevated lactate level                        1
  depressed n - acetylaspartate level           1
  elevated choline level                        1
  gliotic white matter                          1
  foamy histiocytes                             1
  lymphocytes                                   1
  plasma cells                                  1
  loss of myelin                                1
  reactive astrocytes                           1
  glial fibrillary acidic protein               1
  active demyelinating process                  1
  increase in vanillate levels                  1
  myelin basic protein level was higher than 1000 ng / ml 1
  oligoclonal bands were present                1
  serum lactate level was slightly high         1
  diffuse infiltration by reactive astrocytes   1
  foamy macrophages                             1
  asymmetrical right ophthalmoplegia            1
  rapidly progressive demy -                    1
  elination                                     1
  diffuse atrophy of gray - matter structures   1
  electroencephalogram showed persistent generalized epileptiform discharges 1
  asymmetric myoclonic                          1
  progressively droopy eyelids                  1
  increasingly nasal                            1
  ragged blue fibers                            1
  cytochrome c oxidase [UNK] negative fibers    1
  paracrystalline ( ‘ parking lot ’ ) inclusions are noted within mitochondria 1
  lateral rectus palsies                        1
  nasal, ﬂaccid dysarthria                      1
  moderate facial weakness                      1
  tongue weakness                               1
  power was reduced in both deltoid muscles     1
  mild stocking loss to light touch             1
  absent vibratory sensation                    1
  decreased vibratory sensation                 1
  deep tendon reﬂexes were absent at the ankles 1
  gait was mildly wide - based                  1
  absent right sural and peroneal sensory nerve action potentials 1
  reduced superﬁcial radial snap                1
  non - irritative myopathy involving the proximal limbs 1
  myopathy involving the facial muscles         1
  mildly elevated glycosylated hemoglobin level 1
  free carnitine level was mildly elevated      1
  short - chain acyl carnitine level moderately increased 1
  rightsided clonic status epilepticus          1
  subcontinuous discharges of rythmic slow spike waves on the left hemisphere 1
  poor interaction                              1
  global hypotonia                              1
  no eye contact                                1
  elevated lactate / pyruvate ratio             1
  elevation of cerebrospinal ﬂuid ( csf ) lactate 1
  content of mtdna was slightly reduced in muscle 1
  recurrent episodes of tonic clonic seizures   1
  liver enlargement                             1
  increased level of gamma - glutamyltransferase 1
  prothrombine time and the clotting factors were decreased 1
  symmetrical thalamic t2 and flair hyperintense signals 1
  cytotoxic oedema within basal ganglia         1
  intractable epileptic encephalopathy          1
  t2 hyperintense signals of the cerebellar dentate nuclei 1
  spectrometry revealed an accumulation of lactic acid in csf 1
  diffuse hypotonia                             1
  lactate was mild elevated                     1
  defect of the activities of the complexes     1
  spasms                                        1
  increasing levels of transaminases            1
  ammoniemia                                    1
  hypoalbuminaemia                              1
  decrease of clotting factors                  1
  death                                         1
  meniere ’ s syndrome                          1
  episodic vertigo                              1
  tinnitus                                      1
  early age of onset                            1
  intractable seizures                          1
  global neurological deterioration             1
  partial motor status epilepticus              1
  abnormal liver function tests                 1
  focal area of restricted diffusion            1
  subtle gyral swelling                         1
  continuous paroxysmal lateralizing epileptiform discharges 1
  focal status                                  1
  complex i deﬁciency                           1
  generalized tonic clonic seizures             1
  rightsided homonymous paracentral scotoma     1
  bilateral lesions with increased signal intensity in the occipital cortex 1
  eeg showed slowing of the background activity 1
  continuous epileptic activity over the occipital areas 1
  condition worsened                            1
  simple left - sided partial motor seizures    1
  eegs remained highly abnormal                 1
  encephalopathic changes                       1
  continuous epileptic activity                 1
  impaired vision                               1
  suspected convulsions                         1
  generalized tonic - clonic seizures           1
  right - sided hemiparesis                     1
  partial motor seizures                        1
  hyperintense lesions of the left occipital cortex 1
  mesial temporal lobe                          1
  alpers                                        1
  paresis                                       1
  focal convulsions                             1
  convulsions                                   1
  post - ) ictal paresis                        1
  hyperkinetic movement disorder                1
  movement - induced pain                       1
  recurrent anxiety attacks                     1
  secondary generalized seizures                1
  premature amenorrhoea                         1
  below - average cognitive                     1
  sensory neuropathy                            1
  progressive cognitive deﬁcits                 1
  severe cognitive deﬁcits                      1
  recurrent headaches                           1
  personality changes                           1
  movement disorder                             1
  action - triggered myoclonus                  1
  progressive cerebellar ataxia                 1
  wheel - chair bound                           1
  chronic progressive external ophthalmoplegia  1
  severe axonal sensorimotor neuropathy         1
  mitochondrial recessive ataxia syndrome       1
  hyperkinetic movements                        1
  choreic                                       1
  jerky wrist and ﬁnger                         1
  polymini - myoclonus                          1
  intermittent facial and jaw opening dystonia  1
  trunk ataxia                                  1
  severe dysarthria                             1
  horizontal and vertical external ophthalmoplegia 1
  stand assisted                                1
  distal myopathy                               1
  depletion of muscle mitochondrial dna         1
  progressive weakness of the distal upper limbs 1
  reduced muscle strength                       1
  deep tendon reflexes were reduced in the upper extremities 1
  creatine kinase level was mildly increased    1
  electromyography consistently showed myopathic 1
  multiple system atrophy                       1
  progressive cerebellar syndrome               1
  slowing of vertical saccades                  1
  limb dysmetria                                1
  slowing of foot taps bilaterally              1
  impaired tandem gait                          1
  signiﬁcant weight loss                        1
  pontine                                       1
  t2 hyperintensities in the middle cerebellar peduncles 1
  fell                                          1
  hip fracture                                  1
  postural dizziness                            1
  orthostatic hypotension                       1
  urinary urgency                               1
  nocturia                                      1
  drooling of saliva                            1
  intermittent dysphagia                        1
  polyminimyoclonus                             1
  positive glabellar tap                        1
  brisk deep tendon reﬂexes                     1
  unable to walk unaided                        1
  jerky saccades                                1
  parieto - occipital lobe epilepsy             1
  focal parieto - occipital lobe seizures       1
  migraine headaches                            1
  impairment of visual perception               1
  deﬁcits in visual perception                  1
  other cognitive domains                       1
  multifocal cognitive dysfunction              1
  downbeat and horizontal nystagmus             1
  ocular dysmetria                              1
  right homonymous hemianopsia                  1
  atrophic optic nerves                         1
  right central facial nerve palsy              1
  bilateral weakness of foot dorsiﬂexion        1
  sensation was diminished                      1
  unable to perform ﬁnger - to - nose           1
  gait was unsteady, wide - based               1
  ataxic                                        1
  reﬂexes were hypoactive                       1
  mitochondrial spinocerebellar ataxia and epilepsy 1
  generalized epilepsy                          1
  long - standing learning difﬁculties          1
  myoclonic arm jerks                           1
  pancerebellar syndrome                        1
  progressive cognitive impairment              1
  thalamic and dentate nuclei t2 hyperintensity 1
  sensory axonal peripheral neuropathy          1
  adolescent - onset                            1
  cerebellar signs                              1
  infantile                                     1
  leigh ’ s encephalopathy                      1
  severe hypoglycemia                           1
  fasting hypoglycemia                          1
  liver insufficiency                           1
  alpha - foeto - protein ( afp ) levels were elevated 1
  isolated complex iv defect                    1
  mitochondrial dna depletion                   1
  infantile myocerebrohepatopathy               1
  respiratory insufficiency                     1
  liver edge palpable                           1
  generalized hypotonia                         1
  progressive jaundice                          1
  abdominal distension                          1
  increased liver echogenicity                  1
  increased alanine                             1
  alpha - foetoprotein levels were increased    1
  annular type fibrosis                         1
  cortical necrosis                             1
  reactive ( gemistocytic ) astrocytes          1
  neuronal cell dropout                         1
  chromatolysis                                 1
  fibrillary gliosis                            1
  pallor within the neuropil                    1
  focal loss of large - size neurons            1
  inferior colliculi displayed symmetric peculiar necrotizing lesions 1
  focal loss of purkinje cells                  1
  pallor of the internal granular cell layer    1
  dentate nucleus showed neuronal depletion     1
  spinal cord, myelin pallor                    1
  fasting - induced hypoketotic hypoglycemia    1
  nasogastric feeding                           1
  elevated csf protein                          1
  abnormal eeg                                  1
  high amplitude slow wave activity             1
  polyspike discharges                          1
  liver enzymatic respiratory chain defects     1
  complex iv showed a decreased activity        1
  degeneration in the brain                     1
  cerebellar features                           1
  depressed and absent reﬂexes in all limbs     1
  distal sensory loss of proprioception and vibration 1
  slowing with fatigability                     1
  urinary stress incontinence                   1
  bilateral total hip replacements              1
  sleep apnoea                                  1
  atrophy of the superior cerebellar vermis     1
  delayed pharyngeal phase                      1
  increasing sacral pains                       1
  increased drowsiness                          1
  intermittent nausea                           1
  worsening mobility                            1
  diﬃculty coping                               1
  drowsy                                        1
  febrile                                       1
  loss of pigmented neurons in the substantia nigra 1
  ventrolateral posterior nuclei of the thalamus 1
  caudate nucleus                               1
  loss of the purkinje cells in the superior cerebellar vermis 1
  atrophy of the gracile and cuneate nuclei     1
  atrophic dorsal nerve roots                   1
  axonal and myelin loss in the dorsal columns  1
  anterior nerve roots also appeared mildly atrophic 1
  severe axonal loss in dorsal columns          1
  substantia nigra showing severe loss of pigmented neurons and gliosis 1
  progressive sensory ataxic neuropathy         1
  ragged - red ” ﬁbres                          1
  cytochrome oxidase - negative ﬁbres           1
  3 - methylglutaconic aciduria                 1
  distal muscle weakness and atrophy            1
  syncopal episode                              1
  progressive muscle weakness                   1
  muscular atrophy                              1
  tonic [UNK] clonic convulsions                1
  post - ictal confusion                        1
  thin                                          1
  atrophy of arm and hand muscles               1
  pupils were irregular and sluggish            1
  lower extremities were diffusely thin         1
  power to be 0 / 5 at wrist ﬂexors and extensors 1
  0 / 5 at biceps                               1
  3 / 5 at triceps                              1
  and 4 / 5 at tibialis anterior                1
  absent reﬂexes in upper extremities           1
  ncv revealed diffusely decreased amplitude of compound motor action potentials 1
  chronic myopathic                             1
  severe myoﬁber degeneration                   1
  early - onset distal muscle weakness          1
  symptoms were progressive                     1
  severely cognitively impaired                 1
  cytochrome c oxidase - negative fibres        1
  moderate cortical atrophy                     1
  echolalia                                     1
  automatic laughter                            1
  general cognitive slowness                    1
  problems in understanding and following commands 1
  disorientation                                1
  tendon reflexes were weak                     1
  definite progression of the cognitive         1
  dementia                                      1
  progressive symmetric limb muscle weakness    1
  not able to move unaided                      1
  cytochrome c oxidase ( cox ) - negative fibres 1
  facial muscles                                1
  blood pyruvate was 172 μmol / litre           1
  cox - deficient fibres                        1
  progressive encephalopathy                    1
  cox - negative fibres                         1
  dementing                                     1
  sensory [UNK] ataxic neuropathy               1
  gastroparesis                                 1
  optic discs atrophy                           1
  ﬂaccid type dysarthria                        1
  lower limb muscle weakness                    1
  deep tendon jerks were absent                 1
  impaired vibratory and position sensation     1
  gait was staggering and wide - based          1
  cortical [UNK] subcortical atrophy            1
  absent sural snap                             1
  motor conduction velocities were decreased    1
  markedly delayed gastric emptying             1
  gastric and bowel distention                  1
  cox negative muscle ﬁbers                     1
  inferior olives                               1
  difficulties in balancing                     1
  broad - based gait                            1
  paresthesias                                  1
  mild slowing of nerve conduction velocities   1
  absent sensory potentials in hands and feet   1
  increased signal intensity                    1
  thalami                                       1
  cerebellar hemispheres                        1
  complete horizontal gaze palsy                1
  severe ataxia                                 1
  reflexes were absent                          1
  complete absence of proprioceptive sensation  1
  choreoathetotic movements                     1
  progressive shaking of the head               1
  feeling of imbalance on walking               1
  torticollis                                   1
  jerky head tremor                             1
  broad - based                                 1
  gait difficulty continued to progress         1
  jerks of her left arm                         1
  sensory disturbances in both arms             1
  generalized tonic [UNK] clonic seizure        1
  epileptic — myoclonic jerks                   1
  mild torticollis                              1
  myoclonic jerks of the left arm               1
  ataxic finger chase and heel - shin           1
  absent tendon reflexes                        1
  cerebellar white matter that were hyperintense 1
  electroencephalography showed mild diffuse slowing 1
  electromyography indicated a sensory neuronopathy 1
  symmetric hyperintense signal changes in cerebellum, dorsal of dentate nucleus 1
  atrophy of the cerebellum                     1
  hyperkinetic movement disorders               1
  severe headache                               1
  visual ﬂashing                                1
  speech difﬁculty                              1
  generalised seizures                          1
  parieto - occipital t2 - hyperintensities     1
  slightly confused                             1
  white cell count                              1
  myoglobin                                     1
  creatine kinase values were elevated          1
  polyspike - and - delta wave                  1
  t2 - hyperintense, oedemic lesions            1
  parieto - occipital region                    1
  homonymous right - sided visual ﬁeld defect   1
  cerebellar t2 - hyperintense white matter lesions 1
  thalamic and parieto - occipital lesions in t2 1
  speech disorder                               1
  left side weakness                            1
  difficulties in swallowing                    1
  mild dysphagia                                1
  myopathic pattern                             1
  mild increase in cholesterol levels           1
  left facial nerve palsy                       1
  mild left side hemiparesis                    1
  right pre - rolandic hyperintensity on t2weighted 1
  bilateral focal hyperintensities              1
  stabilized ischaemic lesions                  1
  increased level of serum creatine kinase      1
  moderate sensorimotor bilateral hypoacusia    1
  progressive worsening of ptosis               1
  sudden onset of aphasia                       1
  global aphasia                                1
  right hemiparesis                             1
  right babinski sign                           1
  bilateral opthalmoparesis                     1
  food dysphagia                                1
  small corticalsubcortical left temporo - occipital lesion with dwi restriction 1
  cortico - subcortical frontoparietal hyperintensity 1
  acute ischaemic lesion                        1
  bilateral old ischaemic lesions               1
  slight fiber size variability                 1
  nuclear centralizations                       1
  several fiber splittings                      1
  necrosis was observed in a few fibers         1
  cytochrome c oxidase ( cox ) - negative       1
  slightly retarded motor development           1
  balance problems                              1
  progressive fatal liver failure               1
  induced by treatment                          1
  motor development was delayed                 1
  gait was wide based                           1
  unsteady                                      1
  absences                                      1
  generalized spike and slow waves              1
  focal sharp waves in frontal                  1
  generalized fatigue                           1
  increased serum levels of transaminases       1
  ammonia                                       1
  had elevated blood lactate                    1
  acute, fulminant, and noncholestatic liver failure 1
  severe coagulopathy                           1
  abnormalities in thalamus and basal ganglia   1
  edema of thalamus and caput nucleus caudatus  1
  changes also present in globus pallidus       1
  signs of frontal and temporal atrophy         1
  demyelinating                                 1
  axonal sensitive polyneuropathy               1
  bilateral progressive ptosis                  1
  chronic axonal sensory polyneuropathy         1
  eo                                            1
  progressively worsened                        1
  complete eo                                   1
  proximal muscular deﬁcit                      1
  hyperlactacidemia                             1
  axonal sensory polyneuropathy                 1
  multiple dna deletions                        1
  acute visual loss                             1
  central scotoma                               1
  acute blurred vision                          1
  retro - ocular pain                           1
  thoracic myelitis                             1
  bilateral papillary loss of nerve ﬁbers       1
  ) ragged - red ﬁbers                          1
  pathological glycogen accumulation            1
  accumulation of subsarcolemmal mitochondria   1
  10 % negative cytochrome c oxidase            1
  progressive ophthalmoplegia                   1
  slowness of movements                         1
  progression of his rigidity                   1
  cytochrome c oxidase negative muscle fibers   1
  reduction of activities of complex i and iv   1
  cognition was mildly affected                 1
  anxiety and obsessive disorder                1
  creatine kinase levels were 217 u / l         1
  myoglobin was 360 u / l                       1
  late - onset parkinsonism                     1
  mood disorder                                 1
  ragged red ﬁber                               1
  - dopa responsive parkinsonism                1
  of ac                                         1
  ac, akinetic crisis                           1
  ac                                            1
  infections                                    1
  chronic diarrhea                              1
  weight loss                                   1
  dermatitis herpetiformis                      1
  weakness of upper extremities                 1
  cytochrome c oxidase ( cox ) negative fibers  1
  weakness continued to progress                1
  impaired upgaze                               1
  cachexia                                      1
  chronic persistent diarrhea                   1
  chronic myopathy                              1
  severe sensory neuropathy                     1
  hypogonadism                                  1
  hypothyroidism                                1
  osteopenia                                    1
  derangement of liver enzymes                  1
  alpers [UNK] huttenlocher syndrome            1
  reduction in activity in complexes i and iii  1
  low activity in complex iv                    1
  hepato - cerebral                             1
  bilateral enhancement of cranial nerves       1
  mild atrophy                                  1
  elevated transaminases                        1
  irritable                                     1
  mild ptosis                                   1
  poor head control                             1
  palpable liver edge                           1
  decreased muscle mass                         1
  central hypotonia                             1
  head circumference had decreased              1
  elevated levels of direct bilirubin           1
  aspartate aminotransferase 448 u / l          1
  alanine aminotransferase 167 u / l            1
  lactate dehydrogenase 2221 u / l              1
  plasma lactate level peaked at 12. 3 mmol /   1
  cerebrospinal ﬂuid protein level was 1113 mg / dl 1
  gallbladder sludge                            1
  bilateral enhancement of cranial nerves ( cn ) iii and v - x 1
  dorsal nerve root enhancement                 1
  prominent subarachnoid spaces                 1
  cortical and white matter atrophy             1
  decreasing head circumference                 1
  hypoxia                                       1
  tachycardia                                   1
  intermittent fevers                           1
  sepsis                                        1
  complex iii deﬁciency                         1
  severely cytochrome oxidase - deﬁcient muscle ﬁbers 1
  deﬁciencies in all etc complexes              1
  bilateral enhancement of cns iii, v, vi, vii, viii, and x 1
  bilateral cn ix enhancement                   1
  bilateral enhancement of multiple dorsal cervical nerve roots 1
  mild cortical atrophy                         1
  mild white matter atrophy                     1
  poor sucking                                  1
  frequent vomiting                             1
  developmental regression                      1
  signiﬁcant hypotonia                          1
  elevated protein                              1
  lactate levels in the cerebrospinal           1
  neurologic condition exacerbated              1
  ﬁbrosis were observed in the liver            1
  alzheimer type ii glia and loss of myelin     1
  poor weight gain                              1
  proximal dominant muscular weakness           1
  deep tendon reﬂexes were weak                 1
  variation in ﬁber type                        1
  diﬃculty in feeding                           1
  bouts of diarrhea                             1
  consciousness level decreased progressively   1
  mild cerebral atrophy                         1
  died of multiple organ failure                1
  caudal necrosis                               1
  diﬀuse foam cells                             1
  hepatic ﬁbrosis                               1
  spongy change was noted predominantly in the cerebral white matter 1
  neuronal loss in the cerebral and cerebellar cortex 1
  alzheimer type ii glia                        1
  sponginess were prominent in the substantia nigra 1
  linear necrosis was present in the bilateral caudate nucleus 1
  hepatocytes containing lipid droplets         1
  bile plugs                                    1
  alzheimer type ii astrocytosis                1
  hypertrophic olivary degeneration             1
  symmetric hyperintense signal abnormality and enlarged inferior olives 1
  signal changes in the thalami and deep cerebellar nuclei 1
  cerebellar vermis atrophy                     1
  sensorineural hearing loss                    1
  somnolence                                    1
  unsteady gait                                 1
  chronic hepatitis c                           1
  scanning speech                               1
  horizontal nystagmus                          1
  symmetric hyporeflexia                        1
  distal sensory loss                           1
  obsessive disorder                            1
  striatal dopamine deﬁciency                   1
  ragged red ﬁbres                              1
  multiple deletions of mtdna                   1
  behavioural                                   1
  cognitive abnormalities                       1
  bilateral cataract                            1
  learning difﬁculties                          1
  compulsive habits                             1
  progressive cognitive decline                 1
  degree of social impairment                   1
  hand rest tremor                              1
  loss of facial expression                     1
  right hand rest tremor                        1
  bilateral mild rigidity                       1
  behavioural and cognitive abnormalities       1
  irregular menstrual cycles                    1
  abnormal oocyte                               1
  non - syndromic ovarian dysfunction           1
  wheelchair - bound                            1
  cerebellar white matter signal abnormalities along with atrophy 1
  bilateral t2 hyperintense enlarged olives     1
  hyperintense inferior olives                  1
  moderate cerebellar and parietal cortical atrophy 1
  lesions in the thalamus                       1
  inferior olivary nucleus                      1
  generalized areﬂexia                          1
  abnormal leg pallesthesia                     1
  dysmetria                                     1
  left dysdiadochokinesia                       1
  distal symmetric sensorimotor neuropathy      1
  somatosensory evoked potentials were absent   1
  bilateral lesions in the dorsal thalami       1
  cerebellar white matter                       1
  left inferior olivary nucleus                 1
  hyperintense lesions in left inferior olivary nucleus 1
  progressive balance difficulties              1
  impaired gait and coordination                1
  hypoacusis                                    1
  esophoria                                     1
  positive romberg ’ s sign                     1
  chorea                                        1
  loss of vibration sense                       1
  broken up smooth pursuit                      1
  nystagmus                                     1
  dysconjugation of lateral eye movements       1
  hypometric saccades                           1
  restriction of vertical gaze                  1
  deficits in information processing speed      1
  mild sensorineuronal hearing loss             1
  mcp sign                                      1
  sensory axonal polyneuropathy                 1
  variation of fiber size                       1
  reduced activity was found in complex i       1
  fxtas                                         1
  symmetric mcp hyperintensities                1
  middle cerebellar peduncles                   1
  influenza - like symptoms                     1
  headaches with visual disturbances            1
  electroencephalogram ( eeg ) demonstrated continuous rhythmic ( 1 [UNK] 1. 5 hz ) high amplitude delta with superimposed spikes 1
  brain mri showed hyperintensities             1
  lumbar puncture revealed a lymphocytic pleocytosis 1
  refractory status epilepticus                 1
  progressive liver failure                     1
  repeated hypoglycaemias                       1
  liver transplantation                         1
  low ceruloplasmin and copper                  1
  liver showed a marked necrosis                1
  neurological deterioration                    1
  tetraparesis                                  1
  multifocal myoclonus                          1
  refractory partial seizures                   1
  t2 / flair hyperintensities                   1
  bilateral cerebellum                          1
  right frontal cortex                          1
  bilateral parietal cortex                     1
  lactate doublet peak                          1
  elevated lactate concentration                1
  partial status epilepticus                    1
  hyperintensities in cerebellum                1
  hyperintense lesion in right occipital lobe   1
  symmetrical hyperintense lesions in cerebellum 1
  generalized mild muscle weakness              1
  sensory deficit                               1
  absence of sensory response in both arms and legs 1
  neurogenic motor units                        1
  cerebellar and cerebral atrophy               1
  impaired balance and gait                     1
  blurred vision                                1
  eye movements were restricted in horizontal   1
  diffuse weakness was noted in arms and legs   1
  romberg was positive                          1
  generalized cerebral and cerebellar atrophy   1
  severe bilateral vestibulopathy ( bv          1
  serum ck level was 1. 5 normal                1
  electromyography was myopathic                1
  lactate peak in cerebrospinal fluid detected on spectroscopy 1
  fat deposition was prominent in superficial muscles 1
  poor oocyte quality                           1
  very poor respiratory effort                  1
  prompt intubation                             1
  mechanical ventilation                        1
  dysmorphic features                           1
  micrognathia                                  1
  maternal polyhydramnious                      1
  weak fetal movement                           1
  compromising her ventilation and feeding      1
  basal ganglia demonstrated decreased naa peak and increased choline peak 1
  poor respiratory effort                       1
  demyelinating peripheral neuropathy           1
  lower intelligence level                      1
  mild saccadic eye movement                    1
  speech was slurred                            1
  slightly weak in the neck and all four limbs  1
  areflexia in the lower limbs                  1
  muscle tonus was hypotonic                    1
  finger - nose - finger test demonstrated dysmetria 1
  romberg test was positive                     1
  mri revealed diffuse leukoencephalopathy      1
  nerve conduction studies showed reduced conduction velocities 1
  prolonged distal motor latencies              1
  decreased amplitude of compound muscle action potentials 1
  f wave minimum latency of these nerves was prolonged 1
  sural nerves were not evoked bilaterally      1
  diffuse leukoencephalopathy                   1
  mngie - like phenotype                        1
  axonal cmt                                    1
  progressive balance disturbance               1
  trouble walking heel - to - toe               1
  reduced vibration sensation                   1
  pinprick sensation ( reduced to the mid - calves ) 1
  supranuclear vertical gaze paresis            1
  right internuclear ophthalmoparesis           1
  reduced sensory amplitudes                    1
  chronic denervation in distal muscles         1
  cox negative fibres                           1
  type 2 diabetes                               1
  arterial hypertension                         1
  progressive distal lower limb weakness        1
  bilateral partial ptosis                      1
  limitation in lateral eye version             1
  decreased sensation on the plantar surface of the feet 1
  inability to walk in tandem                   1
  proximal paraparesis                          1
  generalised areflexia                         1
  distal hypoesthesia                           1
  hypopallesthesia of upper and lower limbs     1
  vertical diplopia in upgaze                   1
  a pill - rolling tremor                       1
  periventricular white matter disease          1
  absence of sensory nerve action potentials    1
  sensory neuronopathy                          1
  myopathic potentials                          1
  cytochrome c oxidase ( cox ) negative muscle fibres 1
  generalized tonic [UNK] clonic seizures       1
  nausea                                        1
  reduced vision                                1
  paraesthesia in both upper limbs              1
  continuous 2 hz polyspike - and - slow waves  1
  multifocal seizures                           1
  multifocal, asynchronous myoclonus            1
  difficulty in swallowing                      1
  progressive proximal limb weakness            1
  serum creatine kinase level was mildly elevated 1
  ragged - red “ fibers                         1
  dominance of type i muscle fibers             1
  increased focal punctate lipid inclusions within some of the myofibers 1
  upper limb mixed rest and postural tremor     1
  tremor gradually progressed                   1
  muscles weakness                              1
  cognitive deficits                            1
  bilateral blepharoptosis                      1
  hypopsia                                      1
  symmetrical proximal muscle weakness          1
  diminished deep tendon reflexes               1
  rest tremor in all limbs                      1
  reduced bilateral arm swing                   1
  bilateral optic atrophy                       1
  retinal nerve fiber layer thickness loss      1
  diffuse cortical atrophy                      1
  hypometabolism in parietal and occipital lobe 1
  severely decreased visual acuity              1
  loss in retinal nerve fiber layer thickness   1
  secondarily generalized focal crises          1
  epileptic event                               1
  infectious process                            1
  igm antiborrellia titers were high            1
  hypertonia                                    1
  motor deterioration                           1
  high blood lactate                            1
  increase of the subarachnoid space            1
  atrophy of the left hemisphere                1
  pale eye fundus                               1
  motor focal seizures                          1
  upper extremity myoclonus                     1
  motor disturbances                            1
  inexpressive facies                           1
  facial hypomimia                              1
  axial hypotonia                               1
  hypo / areflexia                              1
  distal limb increased tone                    1
  right tetraparesis                            1
  deambulation                                  1
  difficulties in handling objects              1
  drooling                                      1
  electroencephalogram disclosed spontaneously slow and poorly organized trace 1
  frequent paroxysmal outbreaks of highvoltage delta waves 1
  right predominance activated during sleep     1
  deep cerebral folate deficiency 3 nmol / l ( rv, 35 [UNK] 124 ) 1
  severe consciousness level                    1
  motor manifestations                          1
  infection with borrelia                       1
  focal motor status epilepticus                1
  lumbar puncture that showed elevated protein  1
  b2 microglobulin                              1
  progressive neurological symptoms             1
  persistent left upper and lower extremity focal seizures 1
  mental status ﬂuctuations                     1
  hypermetabolic activity in the right parietal lobe 1
  left cerebellum                               1
  extreme fatigue                               1
  nausea with vomiting                          1
  mental status changes                         1
  serum ast 161 iu / l                          1
  alt 283 iu / l                                1
  inr of 3. 1 with                              1
  microvascular steatosis                       1
  drug - induced hepatotoxicity                 1
  liver function continued to decline           1
  focal aware - motor seizures                  1
  arpeo                                         1
  severely reduced horizontal and vertical gaze 1
  limitation in both horizontal and vertical gaze 1
  complete horizontal ophthalmoplegia           1
  no upward gaze                                1
  limited downward gaze                         1
  bilateral sensory - neuronal hearing impairment 1
  atrophy of the mesencephalon, pedunculus cerebelli superior, and frontotemporal 1
  hearing deﬁcit                                1
  brain atrophy                                 1
  distance visual acuity was 0. 50 and 0. 40 logmar 1
  generalized rod on - bipolar dysfunction      1
  worsening anorexia                            1
  limitation of gross motor activities          1
  weakness of his respiratory muscles           1
  obstructive sleep apnea                       1
  chronic cough                                 1
  fed through a g - tube                        1
  wheelchair bound                              1
  aspiration pneumonia                           1
  bilateral hand contractures                   1
  failing to thrive                             1
  difficulty with extraocular eye movements     1
  corrected visual acuity was 0. 50 and 0. 40 logmar 1
  ergs showed an electronegative configuration  1
  sporadic headaches                            1
  difficulty eliciting tendon reflexes          1
  progressive myopathy                          1
  areflexia /                                   1
  selective rod on - bipolar cell dysfunction   1
  bilateral thalamic                            1
  cerebellar lesions                            1
  ataxic gait                                   1
  bilateral complete ophthalmoplegia            1
  global areflexia                              1
  hyperintense lesions in cerebellar white matter 1
  dorso - medial thalami                        1
  sensory motor ataxic neuropathy               1
  progressively unsteady                        1
  falls                                         1
  painful paresthesia                           1
  speech was progressively becoming more slurred 1
  speech was dysarthric                         1
  atrophy involving mainly the distal muscles   1
  proprioceptive impairment                     1
  severe axonal predominantly sensory peripheral neuropathy 1
  elevation in serum lactate                    1
  pyruvate 21. 66 mg / l                        1
  asymmetric ptosis                             1
  migraine type headache                        1
  secondary generalized seizure                 1
  decreased diffusion on right occipital lobe   1
  electroencephalography ( eeg ) presented focal slowing 1
  intractable focal                             1
  focal status epilepticus                      1
  generalized myoclonus                         1
  severe muscle pain                            1
  apnea                                         1
  prenatal onset                                1
  respiratory chain dysfunction                 1
  p. thr251ile                                  1
  pro587leu                                     1
  birth weight was 2380 g ( - 2. 5 sd ),        1
  head circumference was 33 cm ( - 1 sd ).      1
  severely floppy                               1
  sucking difficulties                          1
  blood lactate levels were increased           1
  base excess at −5                             1
  serum creatine kinase level ( s - ck ) was elevated 1
  anemia                                        1
  hypocalcemia                                  1
  absence of eye contact                        1
  infolded thumbs                               1
  backward bending of the head                  1
  dyskinetic movements                          1
  generalized muscular hypotonus                1
  no head control                               1
  absence of muscle tendon reflexes             1
  eeg was abnormal                              1
  decreased background activity of the left hemisphere 1
  deteriorating liver function                  1
  increasing lactic acidosis                    1
  serum levels of alanine transaminase increased 1
  aspartate transaminase to 142. 5 u / l        1
  international normalized ratio to 1. 4        1
  multi - organ failure                         1
  calcifications in the basal ganglia           1
  frontal white matter                          1
  calcifications in globus pallidus             1
  cytochrome c oxidase deficiency               1
  large fibers with mitochondrial proliferation 1
  deficiency of mtdna encoded complex iv        1
  dysfunction of the respiratory chain          1
  unable to keep up with his peers              1
  fatigue after exercise                        1
  bilateral eyelid drooping                     1
  numbness in both feet                         1
  weighed only 45 kg                            1
  nasal, flaccid dysarthria                     1
  horizontal ophthalmoparesis                   1
  muscle power was reduced in neck flexors      1
  bilateral sternocleidomastoid muscular atrophy 1
  fasciculation was observed in the bilateral pectoralis major 1
  excessive sweating                            1
  deep tendon reflexes were absent              1
  absent vibratory sensations                   1
  romberg sign were positive                    1
  elevated levels of lactic acid                1
  reduced compound muscle action potentials ( cmap ) in the lower limbs 1
  high - amplitude, longduration motor unit action potentials 1
  lower limb somatosensory evoked potentials were absent 1
  periventricular white matter hyperintensities 1
  hand tremors                                  1
  and dystonia                                  1
  sensory symptoms                              1
  impairment in balance and coordination        1
  leg tremor became more pronounced             1
  involuntary inversion of the feet             1
  right thumb flexion                           1
  head posturing                                1
  subtle rotatory nystagmus                     1
  bilateral dystonic inversion of the feet      1
  bradykinetic                                  1
  bilateral reduced vibration to the level of the ankles 1
  impairment of joint proprioception            1
  reflexes were absent with                     1
  high amplitude rhythmic tremor                1
  gait was broad - based                        1
  dystonic foot inversion                       1
  right toe became extensor                     1
  romberg ’ s test was positive                 1
  generalized sensory length - dependent axonal polyneuropathy 1
  bilateral foot drop                           1
  dystonic inversion of the feet                1
  joint proprioception was also impaired distally 1
  strabismus                                    1
  exotropia                                     1
  reduced cox activity                          1
  progressive anteroflexion of the trunk        1
  poor coordination                             1
  slow movement                                 1
  constipation                                  1
  bilateral bradykinesia                        1
  dopamine transporter inactivity in both putamina 1
  psychomotor delay                             1
  parkinsonian symptoms                         1
  sensorineural mixed hearing loss              1
  mild sensorineural hearing loss               1
  less athletic than her peers                  1
  diﬃculties climbing high steps                1
  distal lower extremity weakness               1
  after a streptococcus infection               1
  nonfatigable bilateral ptosis                 1
  vertical and horizontal ophthalmoplegia       1
  bifacial weakness                             1
  weakness in the proximal                      1
  intention tremor                              1
  electromyography demonstrated a myopathy      1
  moderate ﬁber size variation                  1
  atrophic and hypertrophic ﬁbers               1
  increased internal nuclei                     1
  fatty replacement of endomysial connective tissue 1
  ragged blue ﬁbers                             1
  absent cytochrome c oxidase ( cox ) activity  1
  mitochondria with abnormal cristae and crystalline inclusions 1
  cpeo                                          1
  ocular myopathy                               1
  cramps in extremities                         1
  numbness                                      1
  burning                                       1
  cramps in her feet                            1
  sensitivity to temperature                    1
  cold exacerbated                              1
  burning pain in her feet                      1
  action hand tremors                           1
  balance had deteriorated                      1
  aspiration                                    1
  retrosternal spasms during meals              1
  sleep apnea                                   1
  creatine kinase was mildly elevated           1
  baseline lactate was high                     1
  axonal, and sensory polyneuropathy            1
  sensory nerve action potentials of both sural nerves were absent 1
  neuropathic features                          1
  subsarcolemmal accumulation of mitochondria   1
  cox - negative muscle fibers                  1
  impairment of extraocular motility            1
  bilateral, adult - onset ptosis               1
  elevated serum lactic acid                    1
  subsarcolemmal mitochondrial accumulation     1

PATIENT  (#unique=42)
  patient                                       95
  case                                          42
  patient 1                                     10
  patient 2                                     10
  woman                                         9
  girl                                          7
  case 1                                        4
  proband                                       4
  child                                         3
  man                                           2
  patients 1                                    1
  patient demonstrates                          1
  polg                                          1
  child,                                        1
  female                                        1
  this girl                                     1
  growth decreased                              1
  boy                                           1
  12 months                                     1
  patient had                                   1
  right sided diffusion changes                 1
  right occipital                               1
  parietal lobes                                1
  right posterior thalamus                      1
  left sided epilepsia partialis continua       1
  progressive hepatopathy                       1
  unresponsive                                  1
  right arm tonic [UNK] clonic jerking          1
  extensive left hemispheric involvement        1
  neurological deterioration                    1
  seizures were intractable                     1
  tube feeding                                  1
  cortically blind                              1
  index patient                                 1
  our patient                                   1
  thirties                                      1
  . patient                                     1
  a patient                                     1
  other patient                                 1
  proband of family 1                           1
  proband in family 1                           1
  proband in family 2                           1"""

def parse_section(raw_text: str, section_name: str) -> List[Tuple[str, int]]:
    pat = re.compile(
        rf"{re.escape(section_name)}\s+\(#unique=.*?\)\n(?P<body>.*?)(?=\n\n[A-Z_ ]+\s+\(#unique=|\Z)",
        re.S,
    )
    m = pat.search(raw_text)
    if not m:
        return []
    body = m.group("body")
    items: List[Tuple[str, int]] = []
    for ln in body.splitlines():
        if not ln.strip():
            continue
        m2 = re.match(r"^(.*\S)\s+(\d+)\s*$", ln)
        if not m2:
            continue
        term = m2.group(1).strip()   
        cnt  = int(m2.group(2))
        items.append((term, cnt))
    return items

sections = {
    "AGE_DEATH":    parse_section(raw, "AGE_DEATH"),
    "AGE_FOLLOWUP": parse_section(raw, "AGE_FOLLOWUP"),
    "AGE_ONSET":    parse_section(raw, "AGE_ONSET"),
    "GENE":         parse_section(raw, "GENE"),
    "GENE_VARIANT": parse_section(raw, "GENE_VARIANT"),
    "HPO_TERM":     parse_section(raw, "HPO_TERM"),
    "PATIENT":      parse_section(raw, "PATIENT"),
}

class QuotaRoundRobinDeck:
    def __init__(self, items_with_counts: List[Tuple[str, int]]):
        if not items_with_counts:
            raise ValueError("empty items")
        self.items = [t for t, _ in items_with_counts]
        self.quota: Dict[str, int] = {t: c for t, c in items_with_counts}
        self.order = list(self.items) 
        self.idx = 0
        self.last: Optional[str] = None

    def _advance(self):
        self.idx = (self.idx + 1) % len(self.order)

    def _has_remaining(self) -> bool:
        return any(c > 0 for c in self.quota.values())

    def next_one(self) -> str:
        if not self._has_remaining():
            self.quota = {t: self.quota.get(t, 0) for t in self.items} 
        tried = 0
        n = len(self.order)
        while tried < n:
            t = self.order[self.idx]
            if self.quota.get(t, 0) > 0 and t != self.last:
                self.quota[t] -= 1
                self.last = t
                self._advance()
                return t
            self._advance()
            tried += 1
        tried = 0
        while tried < n:
            t = self.order[self.idx]
            if self.quota.get(t, 0) > 0:
                self.quota[t] -= 1
                self.last = t
                self._advance()
                return t
            self._advance()
            tried += 1
        self.quota = {t: c for t, c in zip(self.items, [1]*len(self.items))}
        return self.next_one()

    def next_k_unique_row(self, k: int) -> List[str]:
        if k <= 0:
            return []
        picked: List[str] = []
        seen = set()
        temp_last = self.last
        saved_idx, saved_last = self.idx, self.last
        saved_quota = dict(self.quota)

        tries = 0
        max_tries = len(self.order) * (k + 1)
        while len(picked) < k and tries < max_tries:
            t = self.next_one()
            if t not in seen and t != temp_last:
                picked.append(t)
                seen.add(t)
                temp_last = t
            tries += 1

        if len(picked) < k:
            self.idx, self.last, self.quota = saved_idx, saved_last, saved_quota
            while len(picked) < k:
                t = self.next_one()
                if t not in seen:
                    picked.append(t)
                    seen.add(t)
                else:
                    picked.append(t)
        self.last = picked[-1]
        return picked


hpo_pairs = sections["HPO_TERM"]
hpo_high = [(t, c) for t, c in hpo_pairs if c >= 5]
hpo_mid  = [t for t, c in hpo_pairs if 2 <= c <= 4]
hpo_low  = [t for t, c in hpo_pairs if c == 1]

deck_hpo_high = QuotaRoundRobinDeck(hpo_high)

def cycle_pick_unique(seq: List[str], start_idx: int, k: int) -> Tuple[List[str,], int]:
    if not seq or k <= 0:
        return [], start_idx
    n = len(seq)
    picked: List[str] = []
    seen = set()
    i = start_idx
    tries = 0
    max_tries = n + k
    while len(picked) < k and tries < max_tries:
        v = seq[i]
        i = (i + 1) % n
        if v not in seen:
            seen.add(v)
            picked.append(v)
        tries += 1
    while len(picked) < k:
        picked.append(seq[i])
        i = (i + 1) % n
    return picked, i


deck_patient    = QuotaRoundRobinDeck(sections["PATIENT"])
deck_age_death  = QuotaRoundRobinDeck(sections["AGE_DEATH"])
deck_gene       = QuotaRoundRobinDeck(sections["GENE"])
deck_age_onset  = QuotaRoundRobinDeck(sections["AGE_ONSET"])
deck_age_follow = QuotaRoundRobinDeck(sections["AGE_FOLLOWUP"])
deck_variants   = QuotaRoundRobinDeck(sections["GENE_VARIANT"])


K_PATIENT     = 1   
K_AGE_DEATH   = 1
K_GENE        = 1
K_AGE_ONSET   = 2
K_AGE_FOLLOW  = 2
K_VARIANTS    = 5
K_HPO_HIGH    = 3
K_HPO_MID     = 3
K_HPO_LOW     = 3

n_groups = 3000
out_path = "polg_synth_3000.jsonl"

mid_idx = 0
low_idx = 0

with open(out_path, "w", encoding="utf-8") as f:
    for _ in range(n_groups):
        patient       = deck_patient.next_k_unique_row(K_PATIENT)
        age_onset     = deck_age_onset.next_k_unique_row(K_AGE_ONSET)
        age_followup  = deck_age_follow.next_k_unique_row(K_AGE_FOLLOW)
        age_death     = deck_age_death.next_k_unique_row(K_AGE_DEATH)
        gene          = deck_gene.next_k_unique_row(K_GENE)
        gene_variants = deck_variants.next_k_unique_row(K_VARIANTS)

        hpo_high_row  = deck_hpo_high.next_k_unique_row(K_HPO_HIGH)
        hpo_mid_row,  mid_idx = cycle_pick_unique(hpo_mid,  mid_idx, K_HPO_MID)
        hpo_low_row,  low_idx = cycle_pick_unique(hpo_low,  low_idx, K_HPO_LOW)

        def maybe_scalar(lst: List[str], k: int):
            return lst[0] if k == 1 else lst

        rec = {
            "patient":      maybe_scalar(patient, K_PATIENT),
            "age_onset":    age_onset,
            "age_followup": age_followup,
            "age_death":    maybe_scalar(age_death, K_AGE_DEATH),
            "gene":         maybe_scalar(gene, K_GENE),
            "gene_variants": gene_variants,
            "hpo_high":     hpo_high_row,
            "hpo_mid":      hpo_mid_row,
            "hpo_low":      hpo_low_row,
        }
        f.write(json.dumps(rec, ensure_ascii=False) + "\n")

print("Done ->", out_path)

Done -> polg_synth_3000.jsonl


In [9]:
!pip install seqeval evaluate torchcrf

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Collecting torchcrf
  Downloading TorchCRF-1.1.0-py3-none-any.whl.metadata (2.3 kB)
Collecting fsspec>=2021.05.0 (from fsspec[http]>=2021.05.0->evaluate)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.0.0->torchcrf)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.0.0->torchcrf)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.0.0->torchcrf)
  Downloading nvidia_cuda_cupti_cu1

In [10]:
# import json
from pathlib import Path

from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    DataCollatorForTokenClassification,
    TrainingArguments,
    Trainer,
)
import evaluate


# 1. Load BIO datasets produced previously
BIO_DIR = Path("/kaggle/working/bio_outputs")

def load_jsonl(path: Path):
    """Return a list of dicts from a JSONL file."""
    with path.open(encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]

train_examples = load_jsonl(BIO_DIR / "train.jsonl")
dev_examples   = load_jsonl(BIO_DIR / "dev.jsonl")
test_examples  = load_jsonl(BIO_DIR / "test.jsonl")

ds_splits = DatasetDict({
    "train":      Dataset.from_list(train_examples),
    "validation": Dataset.from_list(dev_examples),
    "test":       Dataset.from_list(test_examples),
})
print("Loaded dataset sizes:", {k: len(v) for k, v in ds_splits.items()})

# 2. Tokenizer & label mapping
tokenizer = AutoTokenizer.from_pretrained(
    "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract",
    use_fast=True,
)

unique_labels = sorted({lab for ex in train_examples + dev_examples + test_examples
                        for lab in ex["labels"]})
label2id = {lab: i for i, lab in enumerate(unique_labels)}
id2label = {i: lab for lab, i in label2id.items()}

def tokenize_and_align_labels(ex):
    enc = tokenizer(
        ex["tokens"],
        is_split_into_words=True,
        truncation=True,
        max_length=512,
        return_attention_mask=True,
    )
    enc["labels"] = [label2id[l] for l in ex["labels"]]
    return enc

ds_splits = ds_splits.map(
    tokenize_and_align_labels,
    batched=False,
    remove_columns=["tokens", "labels"],
)

# 3. Model
model = AutoModelForTokenClassification.from_pretrained(
    "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract",
    num_labels=len(unique_labels),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True,
)

# 4. Metrics
seqeval = evaluate.load("seqeval")

def compute_metrics(p):
    preds = p.predictions.argmax(-1)
    refs = p.label_ids
    true_labels = [
        [id2label[lid] for lid in seq if lid != -100] for seq in refs
    ]
    pred_labels = [
        [id2label[pid] for pid, lid in zip(pred_seq, ref_seq) if lid != -100]
        for pred_seq, ref_seq in zip(preds, refs)
    ]
    result = seqeval.compute(predictions=pred_labels, references=true_labels)
    return {
        "overall_precision": result["overall_precision"],
        "overall_recall":    result["overall_recall"],
        "overall_f1":        result["overall_f1"],
        "overall_accuracy":  result["overall_accuracy"],
    }

data_collator = DataCollatorForTokenClassification(tokenizer)


# 5. Training arguments and Trainer
training_args = TrainingArguments(
    output_dir="ner_pubmedbert",
    eval_strategy="steps",
    eval_steps=50,
    save_steps=500,
    logging_strategy="steps",
    logging_steps=50,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    learning_rate=3e-5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="overall_f1",
    greater_is_better=True,
    report_to=["none"],
    save_total_limit=1,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds_splits["train"],
    eval_dataset=ds_splits["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# 6. Train, evaluate, test
trainer.train()
trainer.evaluate()

test_metrics = trainer.predict(ds_splits["test"]).metrics
print("Test set metrics:", test_metrics)
predictions, labels, _ = trainer.predict(ds_splits["test"])
preds = predictions.argmax(-1)

true_labels = [
    [id2label[label_id] for label_id in seq if label_id != -100]
    for seq in labels
]
pred_labels = [
    [id2label[pred_id] for pred_id, label_id in zip(pred_seq, label_seq) if label_id != -100]
    for pred_seq, label_seq in zip(preds, labels)
]

detailed_result = seqeval.compute(predictions=pred_labels, references=true_labels)

print("\n📋 Per-label classification report:")
for label, metrics in detailed_result.items():
    if label.startswith("overall_"):
        continue
    print(f"{label:20} | Precision: {metrics['precision']:.3f} | Recall: {metrics['recall']:.3f} | F1: {metrics['f1']:.3f}")


2025-07-31 20:56:28.974250: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753995389.151900      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753995389.205674      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Loaded dataset sizes: {'train': 20394, 'validation': 169, 'test': 168}


Map:   0%|          | 0/20394 [00:00<?, ? examples/s]

Map:   0%|          | 0/169 [00:00<?, ? examples/s]

Map:   0%|          | 0/168 [00:00<?, ? examples/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading builder script: 0.00B [00:00, ?B/s]

  trainer = Trainer(


Step,Training Loss,Validation Loss,Overall Precision,Overall Recall,Overall F1,Overall Accuracy
50,1.2526,0.613347,0.011494,0.003138,0.00493,0.844662
100,0.927,0.586129,0.044444,0.006276,0.010999,0.846289
150,0.8871,0.544769,0.101266,0.050209,0.067133,0.841815
200,0.8175,0.525279,0.116959,0.041841,0.061633,0.851216
250,0.7339,0.537705,0.11811,0.047071,0.067315,0.839194
300,0.751,0.482755,0.150735,0.042887,0.066775,0.854063
350,0.7166,0.471697,0.204082,0.062762,0.096,0.855374
400,0.6808,0.475432,0.175573,0.048117,0.075534,0.851939
450,0.6809,0.458547,0.238739,0.055439,0.089983,0.857543
500,0.6432,0.453757,0.182692,0.059623,0.089905,0.858086


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Test set metrics: {'test_loss': 0.13088427484035492, 'test_overall_precision': 0.5764705882352941, 'test_overall_recall': 0.4542294322132097, 'test_overall_f1': 0.5081011017498379, 'test_overall_accuracy': 0.9474973580555288, 'test_runtime': 2.7443, 'test_samples_per_second': 61.217, 'test_steps_per_second': 4.008}



📋 Per-label classification report:
AGE_DEATH            | Precision: 0.750 | Recall: 0.600 | F1: 0.667
AGE_FOLLOWUP         | Precision: 0.846 | Recall: 0.579 | F1: 0.688
AGE_ONSET            | Precision: 0.652 | Recall: 0.652 | F1: 0.652
GENE                 | Precision: 0.686 | Recall: 0.547 | F1: 0.609
GENE_VARIANT         | Precision: 0.472 | Recall: 0.321 | F1: 0.382
HPO_TERM             | Precision: 0.536 | Recall: 0.432 | F1: 0.478
PATIENT              | Precision: 0.828 | Recall: 0.632 | F1: 0.716


Save NER model

In [12]:
# Save NER model & tokenizer
trainer.save_model("ner_pubmedbert_saved")
tokenizer.save_pretrained("ner_pubmedbert_saved")

('ner_pubmedbert_saved/tokenizer_config.json',
 'ner_pubmedbert_saved/special_tokens_map.json',
 'ner_pubmedbert_saved/vocab.txt',
 'ner_pubmedbert_saved/added_tokens.json',
 'ner_pubmedbert_saved/tokenizer.json')

In [13]:
predictions, labels, _ = trainer.predict(ds_splits["test"])
preds = predictions.argmax(-1)

true_labels = [
    [id2label[label_id] for label_id in seq if label_id != -100]
    for seq in labels
]
pred_labels = [
    [id2label[pred_id] for pred_id, label_id in zip(pred_seq, label_seq) if label_id != -100]
    for pred_seq, label_seq in zip(preds, labels)
]

detailed_result = seqeval.compute(predictions=pred_labels, references=true_labels)

print("\n Per-label classification report:")
for label, metrics in detailed_result.items():
    if label.startswith("overall_"):
        continue
    print(f"{label:20} | Precision: {metrics['precision']:.3f} | Recall: {metrics['recall']:.3f} | F1: {metrics['f1']:.3f}")


 Per-label classification report:
AGE_DEATH            | Precision: 0.750 | Recall: 0.600 | F1: 0.667
AGE_FOLLOWUP         | Precision: 0.846 | Recall: 0.579 | F1: 0.688
AGE_ONSET            | Precision: 0.652 | Recall: 0.652 | F1: 0.652
GENE                 | Precision: 0.686 | Recall: 0.547 | F1: 0.609
GENE_VARIANT         | Precision: 0.472 | Recall: 0.321 | F1: 0.382
HPO_TERM             | Precision: 0.536 | Recall: 0.432 | F1: 0.478
PATIENT              | Precision: 0.828 | Recall: 0.632 | F1: 0.716


In [14]:
from collections import defaultdict

def extract_entities(labels):
    spans = []
    start = None
    current_label = None
    for i, lab_id in enumerate(labels):
        label = id2label.get(lab_id, "O")
        if label.startswith("B-"):
            if current_label:
                spans.append((start, i - 1, current_label))
            start = i
            current_label = label[2:]
        elif label.startswith("I-") and current_label:
            continue
        else:
            if current_label:
                spans.append((start, i - 1, current_label))
                current_label = None
                start = None
    if current_label:
        spans.append((start, len(labels) - 1, current_label))
    return spans

def iou(a, b):
    inter = max(0, min(a[1], b[1]) - max(a[0], b[0]) + 1)
    union = max(a[1], b[1]) - min(a[0], b[0]) + 1
    return inter / union

def relaxed_match(pred_span, true_span):
    ps, pe, plabel = pred_span
    ts, te, tlabel = true_span
    if plabel != tlabel:
        return False
    if abs(ps - ts) <= 4 and abs(pe - te) <= 4:
        return True
    if iou((ps, pe), (ts, te)) >= 0.4:
        return True
    return False

def relaxed_compute_metrics(preds, refs):
    tp, fp, fn = 0, 0, 0
    label_metrics = defaultdict(lambda: {"tp": 0, "fp": 0, "fn": 0})

    for pred_seq, ref_seq in zip(preds, refs):
        pred_ents = extract_entities(pred_seq)
        true_ents = extract_entities(ref_seq)
        matched = set()

        for pred_ent in pred_ents:
            match_found = False
            for i, true_ent in enumerate(true_ents):
                if i in matched:
                    continue
                if relaxed_match(pred_ent, true_ent):
                    tp += 1
                    label_metrics[pred_ent[2]]["tp"] += 1
                    matched.add(i)
                    match_found = True
                    break
            if not match_found:
                fp += 1
                label_metrics[pred_ent[2]]["fp"] += 1

        for i, true_ent in enumerate(true_ents):
            if i not in matched:
                fn += 1
                label_metrics[true_ent[2]]["fn"] += 1

    precision = tp / (tp + fp + 1e-10)
    recall    = tp / (tp + fn + 1e-10)
    f1        = 2 * precision * recall / (precision + recall + 1e-10)

    print("\n Relaxed Per-label classification report:")
    for label, m in label_metrics.items():
        lp = m["tp"] / (m["tp"] + m["fp"] + 1e-10)
        lr = m["tp"] / (m["tp"] + m["fn"] + 1e-10)
        lf1 = 2 * lp * lr / (lp + lr + 1e-10)
        print(f"{label:20} | Precision: {lp:.3f} | Recall: {lr:.3f} | F1: {lf1:.3f}")

    return {
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }

filtered_preds = []
filtered_labels = []

for pred_seq, label_seq in zip(preds, labels):
    filtered_pred = [p for p, l in zip(pred_seq, label_seq) if l != -100]
    filtered_label = [l for l in label_seq if l != -100]
    filtered_preds.append(filtered_pred)
    filtered_labels.append(filtered_label)

def clean_prediction_structure(labels):
    cleaned = []
    prev = "O"
    for i, label in enumerate(labels):
        if label.startswith("I-") and prev == "O":
            label = "B-" + label[2:]
        if label == "O" and i+2 < len(labels) and labels[i+1].startswith("B-") and labels[i+2].startswith("I-"):
            label = "I-" + labels[i+1][2:]
        cleaned.append(label)
        prev = label
    return cleaned

def fix_illegal_I(labels):
    fixed = []
    prev_type = "O"
    for label in labels:
        if label.startswith("I-"):
            if prev_type != label[2:]:
                label = "B-" + label[2:]
        fixed.append(label)
        if label.startswith("B-"):
            prev_type = label[2:]
        elif label.startswith("I-"):
            pass
        else:
            prev_type = "O"
    return fixed

def clean_and_fix_prediction_sequence(label_ids):
    labels = [id2label.get(lid, "O") for lid in label_ids]
    labels = clean_prediction_structure(labels)
    labels = fix_illegal_I(labels)
    return [label2id.get(l, 0) for l in labels]

filtered_preds_cleaned = [clean_and_fix_prediction_sequence(seq) for seq in filtered_preds]

print("\n Running relaxed evaluation on test set (with boundary repair & illegal-I fix)...")
relaxed_metrics = relaxed_compute_metrics(filtered_preds_cleaned, filtered_labels)
print("\n Relaxed test set metrics:", relaxed_metrics)



 Running relaxed evaluation on test set (with boundary repair & illegal-I fix)...

 Relaxed Per-label classification report:
PATIENT              | Precision: 0.812 | Recall: 0.684 | F1: 0.743
AGE_ONSET            | Precision: 0.548 | Recall: 0.739 | F1: 0.630
HPO_TERM             | Precision: 0.621 | Recall: 0.754 | F1: 0.681
GENE                 | Precision: 0.528 | Recall: 0.734 | F1: 0.614
GENE_VARIANT         | Precision: 0.556 | Recall: 0.566 | F1: 0.561
AGE_FOLLOWUP         | Precision: 0.800 | Recall: 0.632 | F1: 0.706
AGE_DEATH            | Precision: 0.571 | Recall: 0.800 | F1: 0.667

 Relaxed test set metrics: {'precision': 0.6182902584492427, 'recall': 0.7207415990729176, 'f1': 0.6655965756591568}


Relation extraction model

In [1]:
import json
from pathlib import Path

INPUT_DIR = Path("/kaggle/input/fullfull/annotations")
OUTPUT_FILE = Path("merged_full.jsonl")

merged = []

for full_path in sorted(INPUT_DIR.glob("*_full.jsonl")):
    filename = full_path.name
    with full_path.open("r", encoding="utf-8") as f:
        for lineno, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                print(f"Skipping empty line at {filename}:{lineno}")
                continue
            try:
                rec = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"JSON decode error at {filename}:{lineno} — {e}")
                continue

            text = rec.get("text", "")
            spans = rec.get("spans", [])
            relations = rec.get("relations", [])

            if not spans:
                continue

            merged.append({
                "text": text,
                "spans": spans,
                "relations": relations
            })

with OUTPUT_FILE.open("w", encoding="utf-8") as fw:
    for entry in merged:
        fw.write(json.dumps(entry, ensure_ascii=False) + "\n")

print(f"Merged and saved {len(merged)} records (text + spans + relations) to: {OUTPUT_FILE.resolve()}")

Merged and saved 83 records (text + spans + relations) to: /kaggle/working/merged_full.jsonl


In [2]:
import json
from pathlib import Path
from itertools import product

INPUT_FILE = Path("merged_full.jsonl")
OUTPUT_FILE = Path("relation_binary.jsonl")

relation_data = []

with INPUT_FILE.open("r", encoding="utf-8") as f:
    for line in f:
        record = json.loads(line)
        text = record.get("text", "")
        spans = record.get("spans", [])
        relations = record.get("relations", [])

        for idx, span in enumerate(spans):
            span["id"] = idx

        pos2id = {
            (span["token_start"], span["token_end"]): span["id"]
            for span in spans
        }

        pos_set = set()
        rel_type_map = {}
        for rel in relations:
            head_lbl = rel["head_span"].get("label")
            child_lbl = rel["child_span"].get("label")
            if not (head_lbl == "PATIENT" and child_lbl in ("HPO_TERM", "GENE_VARIANT")):
                continue

            h_key = (rel["head_span"]["token_start"], rel["head_span"]["token_end"])
            c_key = (rel["child_span"]["token_start"], rel["child_span"]["token_end"])
            if h_key not in pos2id or c_key not in pos2id:
                continue

            head_id  = pos2id[h_key]
            child_id = pos2id[c_key]
            pos_set.add((head_id, child_id))
            rel_type_map[(head_id, child_id)] = rel.get("label", "unknown")

        for span1, span2 in product(spans, spans):
            if span1["id"] == span2["id"]:
                continue
            if span1["label"] != "PATIENT":
                continue
            if span2["label"] not in ("HPO_TERM", "GENE_VARIANT"):
                continue

            pair = (span1["id"], span2["id"])
            label = 1 if pair in pos_set else 0
            relation = rel_type_map.get(pair, "no_relation")

            relation_data.append({
                "text":       text,
                "head":       span1["text"],
                "head_type":  span1["label"],
                "child":      span2["text"],
                "child_type": span2["label"],
                "relation":   relation,
                "label":      label,
            })

with OUTPUT_FILE.open("w", encoding="utf-8") as fw:
    for item in relation_data:
        fw.write(json.dumps(item, ensure_ascii=False) + "\n")

print(f" Generated {len(relation_data)} relation instances and saved to: {OUTPUT_FILE.resolve()}")

 Generated 244 relation instances and saved to: /kaggle/working/relation_binary.jsonl


In [3]:
import json
from pathlib import Path

# Input file
INPUT_FILE = Path("relation_binary.jsonl")

# Output files
POSITIVES_FILE = Path("positives_triples.jsonl")
NEGATIVES_FILE = Path("negatives_triples.jsonl")

with INPUT_FILE.open("r", encoding="utf-8") as infile, \
     POSITIVES_FILE.open("w", encoding="utf-8") as pos_out, \
     NEGATIVES_FILE.open("w", encoding="utf-8") as neg_out:

    for line in infile:
        rec = json.loads(line)
        triple = {
            "head":       rec["head"],
            "head_type":  rec["head_type"],
            "child":      rec["child"],
            "child_type": rec["child_type"],
            "relation":   rec["relation"],
            "label":      rec["label"],
        }
        if rec.get("label") == 1:
            pos_out.write(json.dumps(triple, ensure_ascii=False) + "\n")
        else:
            neg_out.write(json.dumps(triple, ensure_ascii=False) + "\n")

print(f" Positive triples saved to: {POSITIVES_FILE.resolve()}")
print(f" Negative triples saved to: {NEGATIVES_FILE.resolve()}")

 Positive triples saved to: /kaggle/working/positives_triples.jsonl
 Negative triples saved to: /kaggle/working/negatives_triples.jsonl


In [4]:
import json
import random
from pathlib import Path
import pandas as pd

# --- CONFIGURATION ---
# Path to your Excel file
INPUT_XLSX = Path("/kaggle/input/polgtable/subset_POLG.xlsx")
# Path where the JSONL output will be saved
OUTPUT_FILE = Path("polg_relations.jsonl")

# Fix random seed for reproducibility
random.seed(42)

# --- STEP 1: LOAD DATA ---
# Read the Excel sheet into a pandas DataFrame
df = pd.read_excel(INPUT_XLSX, engine="openpyxl")

# --- STEP 2: COLLECT POSITIVE RELATIONS PER ROW ---
# We'll build a list of dicts, one per row (patient),
# containing the patient ID, its HPO terms, and its gene variants.
row_positives = []
for _, row in df.iterrows():
    # Extract patient ID
    patient = str(row["Patient_ID_PMID_1"]).strip()

    # Split the HPO_Term column by semicolon into a list
    hpos = [
        term.strip()
        for term in str(row["HPO_Term"]).split(";")
        if term.strip()
    ]
    # Extract the two variant columns into a list
    variants = []
    for col in ("Paper_variant1", "Paper_variant2"):
        var = str(row.get(col, "")).strip()
        # Skip empty or 'nan' entries
        if var and var.lower() != "nan":
            variants.append(var)

    # Store this row's positives
    row_positives.append({
        "patient":  patient,
        "hpos":     hpos,
        "variants": variants
    })

# --- STEP 3: WRITE OUT POSITIVES + SAMPLED NEGATIVES ---
with OUTPUT_FILE.open("w", encoding="utf-8") as fout:
    # Iterate over each patient record
    for idx, rec in enumerate(row_positives):
        p = rec["patient"]
        pos_h = set(rec["hpos"])
        pos_v = set(rec["variants"])

        # 3a) Write all true (positive) PATIENT→HPO_TERM relations
        for h in pos_h:
            fout.write(json.dumps({
                "head":       p,
                "head_type":  "PATIENT",
                "child":      h,
                "child_type": "HPO_TERM",
                "relation":   "PATIENT_HPO_TERM",
                "label":      1
            }, ensure_ascii=False) + "\n")

        # 3b) Write all true (positive) PATIENT→GENE_VARIANT relations
        for v in pos_v:
            fout.write(json.dumps({
                "head":       p,
                "head_type":  "PATIENT",
                "child":      v,
                "child_type": "GENE_VARIANT",
                "relation":   "PATIENT_GENE_VARIANT",
                "label":      1
            }, ensure_ascii=False) + "\n")

        # 3c) Sample one other row (excluding current) for negatives
        other_idx = random.choice([i for i in range(len(row_positives)) if i != idx])
        other = row_positives[other_idx]

        # 3d) For each HPO in the sampled other row,
        #     if it's not in the current patient's positives,
        #     write a negative example PATIENT→HPO_TERM
        for h in other["hpos"]:
            if h in pos_h:
                continue
            fout.write(json.dumps({
                "head":       p,
                "head_type":  "PATIENT",
                "child":      h,
                "child_type": "HPO_TERM",
                "relation":   "no_relation",
                "label":      0
            }, ensure_ascii=False) + "\n")

        # 3e) Similarly, for each variant in the other row,
        #     if not a positive for this patient, write a negative
        for v in other["variants"]:
            if v in pos_v:
                continue
            fout.write(json.dumps({
                "head":       p,
                "head_type":  "PATIENT",
                "child":      v,
                "child_type": "GENE_VARIANT",
                "relation":   "no_relation",
                "label":      0
            }, ensure_ascii=False) + "\n")

print(f" Wrote positives + sampled negatives to {OUTPUT_FILE.resolve()}")

 Wrote positives + sampled negatives to /kaggle/working/polg_relations.jsonl


In [20]:
import json
import re
import random
from pathlib import Path

# --- CONFIG ---
# File containing existing sentences with head/child annotations
EXISTING_REL_FILE = Path("/kaggle/working/relation_binary.jsonl")
# File containing new triples from the spreadsheet
TRIPLES_FILE      = Path("/kaggle/working/polg_relations.jsonl")
# Where to write the augmented sentences
OUTPUT_FILE       = Path("/kaggle/working/augmented_from_templates.jsonl")

# Fix random seed for reproducibility
random.seed(0)

def make_template(text: str, head: str, child: str) -> str:
    """
    Replace exact occurrences of head and child in text with {head} and {child}
    to form a template. Returns None if either placeholder not inserted.
    """
    # escape regex metachars in head/child
    head_pat  = r"\b" + re.escape(head)  + r"\b"
    child_pat = r"\b" + re.escape(child) + r"\b"
    tpl = re.sub(head_pat, "{head}", text)
    tpl = re.sub(child_pat, "{child}", tpl)
    if "{head}" in tpl and "{child}" in tpl:
        return tpl
    return None

# --- STEP 1: EXTRACT TEMPLATES FROM EXISTING SENTENCES ---
templates = []
with EXISTING_REL_FILE.open("r", encoding="utf-8") as fin:
    for line in fin:
        rec = json.loads(line)
        text  = rec["text"]
        head  = rec["head"]
        child = rec["child"]
        tpl = make_template(text, head, child)
        if tpl:
            templates.append(tpl)

# Deduplicate templates
templates = list(dict.fromkeys(templates))
print(f"Extracted {len(templates)} unique templates")

# --- STEP 2: LOAD NEW TRIPLES ---
triples = []
with TRIPLES_FILE.open("r", encoding="utf-8") as fin:
    for line in fin:
        rec = json.loads(line)
        triples.append(rec)

# --- STEP 3: FILL TEMPLATES WITH NEW TRIPLES ---
with OUTPUT_FILE.open("w", encoding="utf-8") as fout:
    for tri in triples:
        # randomly choose a template
        tpl = random.choice(templates)
        # fill in
        sent = tpl.format(head=tri["head"], child=tri["child"])
        # emit
        out = {
            "text":       sent,
            "head":       tri["head"],
            "head_type":  tri["head_type"],
            "child":      tri["child"],
            "child_type": tri["child_type"],
            "relation":   tri["relation"],
            "label":      tri["label"]
        }
        fout.write(json.dumps(out, ensure_ascii=False) + "\n")

print(f" Generated augmented sentences saved to: {OUTPUT_FILE.resolve()}")

Extracted 232 unique templates
 Generated augmented sentences saved to: /kaggle/working/augmented_from_templates.jsonl


In [21]:
import json
from pathlib import Path

INPUT_FILES = [
    Path("relation_binary.jsonl"),
    Path("augmented_from_templates.jsonl")
]
OUTPUT_FILE = Path("train_relation_schema_numeric.jsonl")

def locate_span(text: str, substr: str):
    idx = text.find(substr)
    if idx < 0:
        raise ValueError(f"Cannot find '{substr}' in text")
    return idx, idx + len(substr)

with OUTPUT_FILE.open("w", encoding="utf-8") as fout:
    for inpath in INPUT_FILES:
        with inpath.open("r", encoding="utf-8") as fin:
            for line in fin:
                rec = json.loads(line)
                text  = rec["text"]
                head  = rec["head"]
                child = rec["child"]
                
                hs, he = locate_span(text, head)
                cs, ce = locate_span(text, child)
                
                entities = [
                    {"start": hs, "end": he, "label": rec["head_type"],  "text": head},
                    {"start": cs, "end": ce, "label": rec["child_type"], "text": child}
                ]
                
                rel_label = int(rec.get("label", 0)) 
                relations = [{
                    "head":  0, 
                    "child": 1,  
                    "label": rel_label
                }]
                
                out = {
                    "text":      text,
                    "entities":  entities,
                    "relations": relations
                }
                fout.write(json.dumps(out, ensure_ascii=False) + "\n")

print(" Wrote numeric‐relation schema to", OUTPUT_FILE.resolve())

 Wrote numeric‐relation schema to /kaggle/working/train_relation_schema_numeric.jsonl


Ensure that there are only two relationships: 0/1

In [22]:
import json
from pathlib import Path

INPUT = Path("train_relation_schema_numeric.jsonl")
OUTPUT = Path("train_relation_re.jsonl")

with INPUT.open(encoding="utf-8") as fin, OUTPUT.open("w", encoding="utf-8") as fout:
    for line in fin:
        rec = json.loads(line)
        rel = rec["relations"][0]
        numeric = int(rec.get("label", rel.get("label", 0)))
        rel["label"] = numeric
        if "label" in rec:
            del rec["label"]
        fout.write(json.dumps(rec, ensure_ascii=False) + "\n")

print(" Wrote cleaned RE dataset to", OUTPUT.resolve())

 Wrote cleaned RE dataset to /kaggle/working/train_relation_re.jsonl


In [6]:
import json
from pathlib import Path

# Input files: original annotations and generated data
INPUT_FILES = [
    Path("relation_binary.jsonl"),
    Path("/kaggle/input/relation-generate/relation_generate.jsonl")
]
# Temporary and final output files
INTERMEDIATE = Path("train_relation.jsonl")
OUTPUT       = Path("train_re.jsonl")

def locate_span(text: str, substr: str):
    """
    Find the start and end character offsets of `substr` within `text`.
    Raises ValueError if `substr` is not found.
    """
    idx = text.find(substr)
    if idx < 0:
        raise ValueError(f"Cannot find '{substr}' in text")
    return idx, idx + len(substr)

# Phase 1: Build numeric-relation schema, skipping records where spans can't be located
skipped = 0
with INTERMEDIATE.open("w", encoding="utf-8") as fout:
    for inpath in INPUT_FILES:
        with inpath.open("r", encoding="utf-8") as fin:
            for line in fin:
                rec = json.loads(line)
                text, head, child = rec["text"], rec["head"], rec["child"]
                try:
                    hs, he = locate_span(text, head)
                    cs, ce = locate_span(text, child)
                except ValueError as e:
                    skipped += 1
                    # Optionally log the error:
                    # print("Skipping record:", e)
                    continue

                entities = [
                    {"start": hs, "end": he, "label": rec["head_type"],  "text": head},
                    {"start": cs, "end": ce, "label": rec["child_type"], "text": child}
                ]
                rel_label = int(rec.get("label", 0))
                relations = [{"head": 0, "child": 1, "label": rel_label}]
                out = {"text": text, "entities": entities, "relations": relations}
                fout.write(json.dumps(out, ensure_ascii=False) + "\n")

print(f"Phase 1 complete, skipped {skipped} records where spans could not be located.")

# Phase 2: Clean up relation labels and remove any top-level 'label' key
with INTERMEDIATE.open("r", encoding="utf-8") as fin, OUTPUT.open("w", encoding="utf-8") as fout:
    for line in fin:
        rec = json.loads(line)
        rel = rec["relations"][0]
        # Ensure the relation label is numeric and remove any top-level label
        numeric = int(rec.get("label", rel.get("label", 0)))
        rel["label"] = numeric
        rec.pop("label", None)
        fout.write(json.dumps(rec, ensure_ascii=False) + "\n")

print("Wrote cleaned RE dataset to", OUTPUT.resolve())


Phase 1 complete, skipped 664 records where spans could not be located.
Wrote cleaned RE dataset to /kaggle/working/train_re.jsonl


Training RE model

In [8]:
!pip install seqeval evaluate

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Collecting fsspec>=2021.05.0 (from fsspec[http]>=2021.05.0->evaluate)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading evaluate-0.4.5-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16

In [14]:
import json
from pathlib import Path

from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
)
import evaluate

# Load relation dataset (text/entities/relations with numeric label
REL_FILE = Path("train_re.jsonl")
rel_examples = [json.loads(line) for line in REL_FILE.open(encoding="utf-8")]

# Build flat examples: text, head_text, child_text, label
flat = []
for rec in rel_examples:
    text = rec["text"]
    rel = rec["relations"][0]
    head_ent  = rec["entities"][rel["head"]]["text"]
    child_ent = rec["entities"][rel["child"]]["text"]
    flat.append({"text": text, "head": head_ent, "child": child_ent, "label": rel["label"]})

random.seed(42)
random.shuffle(flat)

rel_ds = Dataset.from_list(flat).train_test_split(test_size=0.3, seed=42)
# further split train into train/dev
train_test = rel_ds["train"].train_test_split(test_size=0.22222, seed=42)
rel_ds = DatasetDict({
    "train":      train_test["train"],
    "validation": train_test["test"],
    "test":       rel_ds["test"]
})

model_dir = Path("/kaggle/input/ner-pubmed/ner_pubmedbert_saved")

re_tokenizer = AutoTokenizer.from_pretrained(
    model_dir,
    use_fast=True,
    local_files_only=True
)

re_model = AutoModelForSequenceClassification.from_pretrained(
    model_dir,
    num_labels=2,
    ignore_mismatched_sizes=True,
    local_files_only=True
)

def preprocess_re(batch):
    inputs = [
        f"head: {h} [SEP] child: {c} [SEP] sentence: {t}"
        for t,h,c in zip(batch["text"], batch["head"], batch["child"])
    ]
    enc = re_tokenizer(inputs, padding="max_length", truncation=True, max_length=256)
    enc["labels"] = batch["label"]
    return enc

rel_ds = rel_ds.map(preprocess_re, batched=True,
                   remove_columns=["text","head","child","label"])

# 4. Metrics for RE
accuracy = evaluate.load("accuracy")
precision = evaluate.load("precision")
recall = evaluate.load("recall")
f1 = evaluate.load("f1")

def compute_re_metrics(p):
    preds = p.predictions.argmax(-1)
    labels = p.label_ids
    return {
        "accuracy":  accuracy.compute(predictions=preds, references=labels)["accuracy"],
        "precision": precision.compute(predictions=preds, references=labels, average="binary")["precision"],
        "recall":    recall.compute(predictions=preds, references=labels, average="binary")["recall"],
        "f1":        f1.compute(predictions=preds, references=labels, average="binary")["f1"],
    }

# 5. Train RE
re_args = TrainingArguments(
    output_dir="re_pubmedbert",
    eval_strategy="epoch",
    save_strategy="epoch",
    save_only_model=True,     
    save_total_limit=1,      
    num_train_epochs=10,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    logging_strategy="steps",
    logging_steps=20,
    report_to=[]
)


re_trainer = Trainer(
    model=re_model,
    args=re_args,
    train_dataset=rel_ds["train"],
    eval_dataset=rel_ds["validation"],
    tokenizer=re_tokenizer,
    compute_metrics=compute_re_metrics
)

re_trainer.train()
# Evaluate on test
test_metrics = re_trainer.evaluate(eval_dataset=rel_ds["test"])
print("RE Test set metrics:", test_metrics)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/ner-pubmed/ner_pubmedbert_saved and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/ner-pubmed/ner_pubmedbert_saved and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([15]) in the checkpoint and torch.Size([2]) in the model instantiated
- classifier.weight: found shape torch.Size([15, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/497 [00:00<?, ? examples/s]

Map:   0%|          | 0/143 [00:00<?, ? examples/s]

Map:   0%|          | 0/275 [00:00<?, ? examples/s]

  re_trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5344,0.212439,0.909091,0.896907,0.966667,0.930481
2,0.1521,0.238978,0.923077,0.891089,1.0,0.942408
3,0.1241,0.443731,0.909091,0.873786,1.0,0.932642
4,0.1026,0.193775,0.958042,0.9375,1.0,0.967742
5,0.0286,0.289638,0.951049,0.927835,1.0,0.962567
6,0.0577,0.24877,0.944056,0.93617,0.977778,0.956522
7,0.0156,0.352252,0.944056,0.93617,0.977778,0.956522
8,0.0387,0.315926,0.937063,0.926316,0.977778,0.951351
9,0.0203,0.342263,0.944056,0.93617,0.977778,0.956522
10,0.0267,0.374507,0.944056,0.93617,0.977778,0.956522




RE Test set metrics: {'eval_loss': 0.21820537745952606, 'eval_accuracy': 0.9636363636363636, 'eval_precision': 0.9661016949152542, 'eval_recall': 0.9771428571428571, 'eval_f1': 0.9715909090909091, 'eval_runtime': 2.4896, 'eval_samples_per_second': 110.458, 'eval_steps_per_second': 3.615, 'epoch': 10.0}


In [15]:
from sklearn.metrics import classification_report
import numpy as np

preds_output = re_trainer.predict(rel_ds["test"])
preds = np.argmax(preds_output.predictions, axis=-1)
labels = preds_output.label_ids

print(classification_report(labels, preds, digits=4))




              precision    recall  f1-score   support

           0     0.9592    0.9400    0.9495       100
           1     0.9661    0.9771    0.9716       175

    accuracy                         0.9636       275
   macro avg     0.9626    0.9586    0.9605       275
weighted avg     0.9636    0.9636    0.9636       275

