In [1]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import json, re, argparse, torch
from datasets import load_dataset
from tqdm import tqdm


In [4]:
ENTITY_RE = re.compile(
    r"\[\s*(?P<text>.+?)\s*\|\s*(?P<etype>[^|\]]+?)\s*(?:\|\s*(?P<rels>.+?))?\s*\]"
)
REL_PAIR_RE = re.compile(r"(?P<rname>[a-zA-Z_]+)\s*=\s*(?P<arg>[^;|]+)")

In [5]:
def parse_tanl(tanl_text):
    """
    Returns: entities(list), relations(list)
    entities: [{text, type, canonical}]
    relations: [{subj_idx, pred, obj_idx or obj_text}]
    """
    entities = []
    # capture entities in order of appearance
    for m in ENTITY_RE.finditer(tanl_text):
        text = m.group('text').strip()
        etype = m.group('etype').strip()
        entities.append({
            "text": text, "type": etype,
            "canonical": text.lower()
        })
    # naive relation recovery: within each bracket, relations are attribute-like.
    # We link by nearest surface-form match to an entity (can be improved).
    # Build name->idx map (many-to-one: use first occurrence).
    name2idx = {}
    for i,e in enumerate(entities):
        name2idx.setdefault(e["canonical"], i)

    relations = []
    for m in ENTITY_RE.finditer(tanl_text):
        subj_text = m.group('text').strip().lower()
        subj_idx = name2idx.get(subj_text, None)
        rels = m.group('rels')
        if subj_idx is None or not rels:
            continue
        for r in REL_PAIR_RE.finditer(rels):
            rname = r.group('rname').strip()
            arg = r.group('arg').strip()
            obj_idx = name2idx.get(arg.lower(), None)
            relations.append({
                "subj_idx": subj_idx,
                "pred": rname,
                # keep both: an index if we have one, and raw text for fallback
                "obj_idx": obj_idx,
                "obj_text": arg
            })
    return entities, relations

In [6]:
parse_tanl("We propose a [ convolutional model | method | used for = chemical interactions ] to detect [ chemical interactions | other scientific term ] in [ scientific abstracts | material | evaluate for = chemical interactions ]. The [ model | generic | compare = SOTA models ] prove to be more efficient than other [ SOTA models | method ] like [ Graph-based model ones | method | hyponym of = SOTA models ]")

([{'text': 'convolutional model',
   'type': 'method',
   'canonical': 'convolutional model'},
  {'text': 'chemical interactions',
   'type': 'other scientific term',
   'canonical': 'chemical interactions'},
  {'text': 'scientific abstracts',
   'type': 'material',
   'canonical': 'scientific abstracts'},
  {'text': 'model', 'type': 'generic', 'canonical': 'model'},
  {'text': 'SOTA models', 'type': 'method', 'canonical': 'sota models'},
  {'text': 'Graph-based model ones',
   'type': 'method',
   'canonical': 'graph-based model ones'}],
 [{'subj_idx': 0,
   'pred': 'for',
   'obj_idx': 1,
   'obj_text': 'chemical interactions'},
  {'subj_idx': 2,
   'pred': 'for',
   'obj_idx': 1,
   'obj_text': 'chemical interactions'},
  {'subj_idx': 3, 'pred': 'compare', 'obj_idx': 4, 'obj_text': 'SOTA models'},
  {'subj_idx': 5, 'pred': 'of', 'obj_idx': 4, 'obj_text': 'SOTA models'}])

In [3]:
ds = load_dataset("allenai/scico")['test']

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


0000.parquet:   0%|          | 0.00/4.70M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


0000.parquet:   0%|          | 0.00/2.34M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


0000.parquet:   0%|          | 0.00/4.96M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/200 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/100 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/221 [00:00<?, ? examples/s]

In [2]:
torch.cuda.is_available()

True