In [None]:
!unzip -q VLSP2020_RE_train.zip

## Preprocessor

There are 5 annotation errors that needs fixing:
1. Entity with punctuation: Normally, the annotation splits each word into each row by space. However if a word is followed immediately by a punctuation and that word is a part of an entity, the annotator will duplicate that word into the next row.
```
# 23357000.conll
1-173	807-810	văn	*[8]	ORGANIZATION[8]	AFFILIATION	1-166[7_8]
1-174	811-816	phòng	*[8]	ORGANIZATION[8]	_	_
1-175	817-821	kiến	*[8]	ORGANIZATION[8]	_	_
1-176	822-826	trúc	*[8]	ORGANIZATION[8]	_	_
1-177	827-833	1+1>2;	_	_	_	_
1-177.1	827-832	1+1>2	*[8]	ORGANIZATION[8]	AFFILIATION	1-166[7_8]
```
2. Not separated by space: There are some cases that the annotator left two words with a space in between in a row. *(currently ignored the whole file)*
```
# 23352816.conll
1-153	756-763	. VMISS	*	ORGANIZATION	_	_	_
```
3. Inter-sentence relations: The task is intra-sentence but there are still some relations between entities belong to different sentences.
4. Relation annotation not only in the first row of the entity: If an entity is annotated with a relation, that relation should be inserted to the first row of that entity. But there are some cases, such as when getting the entity with punctuation error (see the example in that case), the entity annotation is interrupted and then continue with reinserting the relation.
5. Relation not link to the first word of the entity: A relation should be linked to the first word of the other entity but when the other entity annotation is interrupted by the first error, the relation is linked to the row of the duplicated word with no punctuation.
```
1-738	3318-3324	Trưởng	*[18]	ORGANIZATION[18]	PART – WHOLE	1-733[17_18]
1-739	3325-3330	phòng	*[18]	ORGANIZATION[18]	_	_
1-740	3331-3335	Giáo	*[18]	ORGANIZATION[18]	_	_
1-741	3336-3339	dục	*[18]	ORGANIZATION[18]	_	_
1-742	3340-3346	chuyên	*[18]	ORGANIZATION[18]	_	_
1-743	3347-3354	nghiệp,	_	_	_	_
1-743.1	3347-3353	nghiệp	*[18]	ORGANIZATION[18]	PART – WHOLE	1-733[17_18]
1-744	3355-3357	Sở	*[19]	ORGANIZATION[19]	AFFILIATION|PART – WHOLE	1-733[17_19]|1-743.1[18_19]
1-745	3358-3363	GD-ĐT	*[19]	ORGANIZATION[19]	_	_
1-746	3364-3368	tỉnh	*[19]	ORGANIZATION[19]	_	_
1-747	3369-3373	Nghệ	*[19]	ORGANIZATION[19]	_	_
1-748	3374-3376	An	*[19]	ORGANIZATION[19]	_	_
```

In [19]:
import os
import pandas as pd
import numpy as np

"""
final_format:

0 for <entity1> <sep> <entity2> <sep> <sentence>
where all instances of <entity1> and <entity2> are replaced by the corresponding tokens in <sentence>.

1 for open tag and close tag for each entity are inserted into the sentence.
"""

class VlspPreprocessor:
  def __init__(self, src_dir, drop_samples_with_no_relations=True, final_format=0):
    self.dataset = []
    self.label2id = {}

    for root, _, files in os.walk(src_dir):
      for file in files: # currently the structure is one tsv file per subfolder, but this loop is in case there are more
        if os.path.join("VLSP2020_RE_train", "23352816.conll") in root: # the tsv file in this subfolder is heavily corrupted, just drop it for now
          return
        if file.endswith(".tsv"):
          self.root = root
          self.process_tsv(os.path.join(root, file))

    if drop_samples_with_no_relations:
      self._drop_samples_with_no_relations()
    self._handle_inter_sentence_relations()
    self._build_id2label()

  def process_tsv(self, tsv_dir):
    df = pd.read_csv(tsv_dir, sep="\t", comment="#", quotechar="\t", header=None)
    if len(df.columns) < 8: # relation columns are missing
      return
    df.columns = ["ann_idx", "range", "word", "param", "entity", "relation", "rel_heads", "-"]

    self._handle_word_with_entity_subword(df)

    dataset_offset = len(self.dataset)
    self._extract_sentences(df)
    self._extract_entities(df, dataset_offset)
    self._extract_relations(df, dataset_offset)

  def _handle_word_with_entity_subword(self, df):
    error_indices = df[df["ann_idx"].shift(-1).apply(lambda i: i is not None and ".1" in i)]["word"].index
    offset = 0
    for idx in error_indices:
      idx += offset
      entity_word = df.iloc[idx + 1]["word"]
      prefix, suffix = df.iloc[idx]["word"].split(entity_word, 1)

      if idx > 0 and df.iloc[idx - 1]["entity"] == df.iloc[idx + 1]["entity"]:
        df.loc[idx + 1, "relation"] = "_"
        df.loc[idx + 1, "rel_heads"] = "_"

      if suffix != "":
        df.loc[idx + 1.5] = ["_", "_", suffix, "_", "_", "_", "_", np.nan]
        offset += 1

      if prefix != "":
        df.loc[idx, "word"] = prefix
      else:
        df = df.drop(idx)
        offset -= 1

      df = df.sort_index().reset_index(drop=True)

  def _extract_sentences(self, df):
    sentence = []
    for word in df["word"].values:
      word = str(word) #  in VLSP2020_RE_dev/23352623.conll, the word "nan" counts as a float ¯\_(ツ)_/¯
      sentence.append(word)
      if word.endswith("."):
        self.dataset.append({ "sentence": sentence })
        sentence = []
    self.dataset.append({ "sentence": sentence })

  def _extract_entities(self, df, dataset_offset):
    offset = 0
    sample_idx = dataset_offset
    entity_df = df[df["entity"] != "_"]["entity"]
    for entity, idx in zip(entity_df.values, entity_df.index):
      while idx >= offset + len(self.dataset[sample_idx]["sentence"]):
        offset += len(self.dataset[sample_idx]["sentence"])
        sample_idx += 1

      if "entities" not in self.dataset[sample_idx]:
        self.dataset[sample_idx]["entities"] = {}

      if entity in self.dataset[sample_idx]["entities"]:
        self.dataset[sample_idx]["entities"][entity].append(idx - offset)
      else:
        self.dataset[sample_idx]["entities"][entity] = [idx - offset]

  def _extract_relations(self, df, dataset_offset):
    offset = 0
    sample_idx = dataset_offset
    relation_df = df[df["relation"] != "_"][["relation", "entity", "rel_heads"]]
    for (relations, entity, rel_heads), idx in zip(relation_df.values, relation_df.index):
      relations = relations.split("|")
      rel_heads = rel_heads.split("|")
      for i in range(len(relations)):
        while idx >= offset + len(self.dataset[sample_idx]["sentence"]):
          offset += len(self.dataset[sample_idx]["sentence"])
          sample_idx += 1

        if "relations" not in self.dataset[sample_idx]:
          self.dataset[sample_idx]["relations"] = []

        other_entity = df[df["ann_idx"] == rel_heads[i].split("[")[0]]["entity"].values[0]
        self.dataset[sample_idx]["relations"].append((entity, other_entity, relations[i]))

        if relations[i] not in self.label2id:
          self.label2id[relations[i]] = len(self.label2id)

        if relations[i] == "*":
          print(sample_idx, dataset_offset, len(relations), rel_heads, self.root)
          print((entity, other_entity, relations[i]))

  def _handle_inter_sentence_relations(self, dataset_offset=0):
    for sample in self.dataset[dataset_offset:]:
      if "relations" not in sample:
        continue
      intra_relations = []
      for x, y, r in sample["relations"]:
        if x in sample["entities"] and y in sample["entities"]:
          intra_relations.append((x, y, r))
      sample["relations"] = intra_relations

  def _drop_samples_with_no_relations(self, dataset_offset=0):
    clean_dataset = []
    for sample in self.dataset[dataset_offset:]:
      if "relations" in sample:
        clean_dataset.append(sample)
    self.dataset = [*self.dataset[:dataset_offset], *clean_dataset]

  def _build_id2label(self):
    self.id2label = {v: k for k, v in self.label2id.items()}

In [20]:
dir = os.path.join("VLSP2020_RE_train", "VLSP2020", "VLSP2020_RE_train")
preprocessor = VlspPreprocessor(dir)
preprocessor.label2id

544 535 4 ['1-315[18_19]', '1-301[15_19]', '1-307[16_19]', '1-310[17_19]'] VLSP2020_RE_train\VLSP2020\VLSP2020_RE_train\23351515.conll
('LOCATION[19]', 'LOCATION[17]', '*')
1668 1662 1 ['1-198[0_13]'] VLSP2020_RE_train\VLSP2020\VLSP2020_RE_train\23351856.conll
('LOCATION[13]', 'PERSON', '*')
1668 1662 1 ['1-198[0_14]'] VLSP2020_RE_train\VLSP2020\VLSP2020_RE_train\23351856.conll
('LOCATION[14]', 'PERSON', '*')


{'AFFILIATION': 0,
 'LOCATED': 1,
 'PART – WHOLE': 2,
 'PERSONAL - SOCIAL': 3,
 '*': 4}

In [11]:
from transformer import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments

tokenizer = AutoTokenizer.from_pretrained("albert-base-v2", use_fast=False)
model = AutoModelForSequenceClassification.from_pretrained("albert-base-v2", num_labels=4)

3376