This script converts brat annotation files (.txt/.ann pairs) into UIMA CAS XMI files that can be imported into INCEpTION.
The file typesystem.xml is mandatory: it contains the description of the ADR entity annotations and relations.

Procedure:
1 - create a new INCEpTION project
2 - import both layers (ADR_Entity, ADR_Relation) into the new INCEpTION project
3 - import both constraint sets (ADR_Constrains, ADR_Relation_Constrains) into the new INCEpTION project
4 - set entry_folder in the script to the folder that contains the .txt and .ann files, then run the script (it writes one .xmi file per .txt file into that folder)
5 - import the newly generated .xmi documents using the "UIMA CAS XMI (XML 1.0)" format

In [1]:
# TODO: support entities with multiple (discontinuous) spans; split the text into sentences
#!pip install dkpro-cassis

In [2]:
# Folder that holds the brat documents. Each <name>.txt is read together with
# its <name>.ann, and the generated <name>.xmi is written back into this folder.
# This is a machine-specific absolute path: change it before running.
entry_folder = "/Users/abdelkaderalkadour/Desktop/DFKI/Inception+/ADR_project_entities_relations/brat_files_entities_and_relations"
# UIMA type system description that is loaded before the conversion starts.
type_file = "./typesystem.xml"

In [3]:
from cassis import load_typesystem, Cas
import os

In [4]:
class Attribute:
    """One attribute line of a brat ``.ann`` file.

    The line has the form ``<id> TAB <type> SPACE <target> [SPACE <value>]``.
    Binary attributes carry no value; for those ``value`` is the empty string.

    Attributes:
        id: annotation id of the attribute line (first tab-separated field).
        type: attribute name.
        target: id of the annotation the attribute is attached to.
        value: attribute value, or ``""`` when the line has none.
    """

    def __init__(self, atr):
        """Parse the raw attribute line ``atr`` (a trailing newline is allowed).

        Raises:
            IndexError: if the line has no tab-separated body or no target.
        """
        fields = atr.split("\t")
        # Strip *before* splitting on spaces: for a value-less attribute the
        # target is the last token, and it would otherwise keep the line's
        # trailing newline ("T1\n") and never match any entity id.
        parts = fields[1].strip().split(" ")

        self.id = fields[0]
        self.type = parts[0]
        self.target = parts[1]
        # A third token is present only for attributes that carry a value.
        self.value = parts[2] if len(parts) > 2 else ""

    def __repr__(self):
        return "Attribute(id='{}', type='{}', target={}, value={})".format(self.id, self.type, self.target, self.value)

class Entity:
    """One text-bound annotation line (``T...``) of a brat ``.ann`` file.

    The line has three tab-separated fields: the id, a header made of the
    type followed by the character offsets, and the covered text. Several
    fragments are written as ``start end;start end``.

    Attributes:
        id: annotation id (first field).
        type: entity type, lower-cased.
        span: list of ``(begin, end)`` integer tuples, one per fragment.
        value: covered text with surrounding whitespace removed.
        attributes: starts empty; filled by whoever links attributes to entities.
        relations: starts empty; filled by whoever links relations to entities.
    """

    def __init__(self, ann):
        """Parse the raw entity line ``ann``."""
        fields = ann.split("\t")
        header = fields[1]
        header_tokens = header.split(" ")

        self.id = fields[0]
        self.type = header_tokens[0].lower()

        if len(header_tokens) > 3:
            # Multiple fragments: turn "type b1 e1;b2 e2" into a flat token
            # list and pair the offsets up two at a time (index 0 is the type).
            flat = header.replace(";", " ").split(" ")
            self.span = [
                (int(flat[pos]), int(flat[pos + 1]))
                for pos in range(1, len(flat), 2)
            ]
        else:
            # Single fragment: "type begin end".
            self.span = [(int(header_tokens[1]), int(header_tokens[2]))]

        self.value = fields[2].strip()
        self.attributes = []
        self.relations = []

    def __repr__(self):
        # The format string is kept exactly as it was (including the quote
        # after the relations placeholder) so printed output does not change.
        template = "Entity(id='{}', type='{}', span={}, value='{}', attributes={}, relations={}')"
        return template.format(
            self.id, self.type, self.span, self.value, self.attributes, self.relations
        )

class Relation:
    """One relation line (``R...``) of a brat ``.ann`` file.

    The line has the form ``<id> TAB <type> SPACE <role>:<id1> SPACE <role>:<id2>``.

    Attributes:
        id: annotation id of the relation line.
        type: relation type.
        arg1: annotation id named by the first argument (role prefix removed).
        arg2: annotation id named by the second argument (role prefix removed).
    """

    def __init__(self, rel):
        """Parse the raw relation line ``rel``."""
        columns = rel.split("\t")
        body = columns[1].split(" ")

        self.id = columns[0]
        self.type = body[0]
        # Each argument looks like "Role:T12"; only the id after the colon is kept.
        first_argument = body[1]
        self.arg1 = first_argument.split(":")[1]
        # The second argument may still carry the line ending, hence the strip.
        second_argument = body[2].strip()
        self.arg2 = second_argument.split(":")[1]

    def __repr__(self):
        template = "Relation(id='{}', type='{}', arg1={}, arg2={})"
        return template.format(self.id, self.type, self.arg1, self.arg2)


In [5]:
class BratParser:
    """Read a brat ``.ann`` file and link its attributes and relations to entities.

    Lines starting with ``T`` become ``Entity`` objects, lines starting with
    ``A`` become ``Attribute`` objects and lines starting with ``R`` become
    ``Relation`` objects; every other line is ignored. Afterwards each
    attribute is appended to the ``attributes`` list of its target entity and
    each relation is appended to the ``relations`` list of both of its
    argument entities.

    Attributes:
        entities: the parsed entities, in file order.
    """

    def __init__(self, path):
        """Parse the annotation file at ``path``.

        Raises:
            IndexError: if a relation refers to an entity id that does not
                occur in the file (same exception type as before, now with a
                message that names the relation and the missing id).
        """
        entities = []
        attributes = []
        relations = []

        # The context manager closes the handle even if a line fails to parse.
        # brat stores its files as UTF-8, so do not rely on the platform default.
        with open(path, "r", encoding="utf-8") as handle:
            for line in handle:
                if line.startswith("A"):
                    attributes.append(Attribute(line))
                elif line.startswith("T"):
                    entities.append(Entity(line))
                elif line.startswith("R"):
                    relations.append(Relation(line))

        # Index the entities once instead of rescanning the list for every
        # attribute/relation. setdefault keeps the first entity for an id,
        # which is the one the previous list scan would have picked.
        by_id = {}
        for entity in entities:
            by_id.setdefault(entity.id, entity)

        for att in attributes:
            # Attributes whose target is not an entity of this file are skipped.
            owner = by_id.get(att.target)
            if owner is not None:
                owner.attributes.append(att)

        for rel in relations:
            # A relation is registered on both of its endpoints.
            for endpoint in (rel.arg1, rel.arg2):
                if endpoint not in by_id:
                    raise IndexError(
                        "relation {} in {} refers to unknown entity {}".format(
                            rel.id, path, endpoint
                        )
                    )
                by_id[endpoint].relations.append(rel)

        self.entities = entities

    def __repr__(self):
        return "Entities({})".format(self.entities)


In [6]:
# Names of the text documents to convert. The extension is tested as a suffix:
# a substring test would also select names such as "notes.txt.bak".
files = [name for name in os.listdir(entry_folder) if name.endswith(".txt")]

def do_anno(_ann, _cas, _type):
    """Add the entity ``_ann`` to ``_cas`` as ``ADR`` annotation(s) labelled ``_type``.

    Only the first span of the entity is used. An entity without attributes
    yields a single annotation; otherwise one annotation is added per
    attribute, with the attribute's value stored in the ``Time`` feature.
    """
    attached = _ann.attributes
    begin, end = _ann.span[0][0], _ann.span[0][1]

    if not attached:
        # Plain entity: label only.
        _cas.add(ADR(begin=begin, end=end, Label=_type))
        return

    # Same offsets and label for every attribute; only the Time value varies.
    for attribute in attached:
        annotation = ADR(begin=begin, end=end, Label=_type, Time=attribute.value)
        _cas.add(annotation)

# Load the type system description once; it is shared by every CAS created below.
with open(type_file, "rb") as f:
    ts = load_typesystem(f)

# Annotation types looked up by their fully qualified names in the type system.
# Sentence/Token are the segmentation types; ADR/ADR_REL are the custom types
# (presumably the ADR_Entity / ADR_Relation layers named in the procedure above
# - confirm against typesystem.xml).
Sentence = ts.get_type("de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence")
Token = ts.get_type("de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token")
ADR = ts.get_type("webanno.custom.ADR_entity")
ADR_REL = ts.get_type("webanno.custom.ADR_relation")

# Entity types whose relations are exported. Relations that are attached only
# to entities of other types are not written (same selection as before).
RELATION_ENTITY_TYPES = {"timex3", "drug", "disorder", "evaluation"}

# Convert every text document: tokens, entities, relations and one sentence
# covering the whole text go into a fresh CAS that is written next to the input.
for file in files:
    print(file)

    # Build the sibling file names from the stem so that only the extension
    # changes; str.replace("txt", ...) would also rewrite "txt" inside the name.
    stem = os.path.splitext(file)[0]

    cas = Cas(typesystem=ts)  # empty CAS bound to the loaded type system
    with open(os.path.join(entry_folder, file), "r", encoding="utf-8") as text_file:
        # NOTE(review): strip() also removes *leading* whitespace, which would
        # shift the text relative to the brat offsets - confirm that the input
        # files never start with whitespace.
        text = text_file.read().strip()
    cas.sofa_string = text

    # Naive tokenisation on single spaces (one Token per piece, including
    # empty pieces between consecutive spaces).
    tokens = text.split(" ")
    k = 0
    for token in tokens:
        cas.add(Token(begin=k, end=k + len(token)))  # add token
        # Advance past the token AND the space that separated it from the
        # next one; without the +1 every later token starts too early.
        k = k + len(token) + 1

    # Relation endpoints are written as numeric ids. The counter starts after
    # the tokens plus two leading ids and assumes ids follow insertion order -
    # NOTE(review): confirm against the cassis version in use.
    ann_num = len(tokens) + 2
    annotations = BratParser(os.path.join(entry_folder, stem + ".ann"))

    all_relations = []
    id_relations = dict()
    for ann in annotations.entities:
        # Only "timex3" is renamed; every other type is used as its own label.
        label = "time" if ann.type == "timex3" else ann.type
        do_anno(ann, cas, label)
        id_relations[ann.id] = ann_num  # id of the entity's first annotation
        # do_anno adds one annotation per attribute (exactly one when there
        # are no attributes), so the counter must advance by that many.
        ann_num = ann_num + max(1, len(ann.attributes))
        if ann.type in RELATION_ENTITY_TYPES:
            all_relations.extend(ann.relations)

    # Every relation is attached to both of its endpoints, so it can show up
    # twice; keep the first occurrence of each id, in order of appearance.
    unique_relations = {}
    for rel in all_relations:
        unique_relations.setdefault(rel.id, rel)

    for rel in unique_relations.values():
        cas.add(
            ADR_REL(
                Dependent=id_relations[rel.arg1],
                Governor=id_relations[rel.arg2],
                relation=rel.type,
            )
        )

    # A single sentence spanning the whole document.
    cas.add(Sentence(begin=0, end=len(text)))
    cas.to_xmi(os.path.join(entry_folder, stem + ".xmi"))


CATAFLAM.3.txt
ARTHROTEC.112.txt
