# VLSP Dataset Preparation

## Preparing Environment

In [1]:
import py_vncorenlp
import os

# Locations used by this notebook.
project_dir = "D:/Projects/albert-imdb"
vncorenlp_dir = "D:/Projects/albert-imdb/vncorenlp"


# First run only: uncomment the next line if VnCoreNLP has not been downloaded yet.
# py_vncorenlp.download_model(save_dir=vncorenlp_dir)

# Load VnCoreNLP with only the word-segmentation ("wseg") annotator.
vncorenlp_model = py_vncorenlp.VnCoreNLP(annotators=["wseg"], save_dir=vncorenlp_dir)

# Change directory back to the project so that the relative dataset paths below resolve.
os.chdir(project_dir)

## Preprocessor Implementation

There are 5 annotation errors that need fixing:
1. Entity with punctuation: Normally, the annotation puts each space-separated word on its own row. However, if a word is immediately followed by a punctuation mark and that word is part of an entity, the annotator duplicates that word (without the punctuation) into the next row.
```
# 23357000.conll
1-173	807-810	văn	*[8]	ORGANIZATION[8]	AFFILIATION	1-166[7_8]
1-174	811-816	phòng	*[8]	ORGANIZATION[8]	_	_
1-175	817-821	kiến	*[8]	ORGANIZATION[8]	_	_
1-176	822-826	trúc	*[8]	ORGANIZATION[8]	_	_
1-177	827-833	1+1>2;	_	_	_	_
1-177.1	827-832	1+1>2	*[8]	ORGANIZATION[8]	AFFILIATION	1-166[7_8]
```
2. Not separated by space: In some cases the annotator left two words, with a space in between, in a single row. *(currently the whole file is ignored)*
```
# 23352816.conll
1-153	756-763	. VMISS	*	ORGANIZATION	_	_	_
```
3. Inter-sentence relations: The task is intra-sentence, but there are still some relations between entities that belong to different sentences.
4. Relation annotation not only in the first row of the entity: If an entity is annotated with a relation, that relation should appear only in the first row of that entity. But in some cases, such as the entity-with-punctuation error (see the example for that case), the entity annotation is interrupted and then continues with the relation inserted again.
5. Relation not linked to the first word of the entity: A relation should be linked to the first word of the other entity, but when the other entity's annotation is interrupted by the first error, the relation is linked to the row of the duplicated word without punctuation.
```
1-738	3318-3324	Trưởng	*[18]	ORGANIZATION[18]	PART – WHOLE	1-733[17_18]
1-739	3325-3330	phòng	*[18]	ORGANIZATION[18]	_	_
1-740	3331-3335	Giáo	*[18]	ORGANIZATION[18]	_	_
1-741	3336-3339	dục	*[18]	ORGANIZATION[18]	_	_
1-742	3340-3346	chuyên	*[18]	ORGANIZATION[18]	_	_
1-743	3347-3354	nghiệp,	_	_	_	_
1-743.1	3347-3353	nghiệp	*[18]	ORGANIZATION[18]	PART – WHOLE	1-733[17_18]
1-744	3355-3357	Sở	*[19]	ORGANIZATION[19]	AFFILIATION|PART – WHOLE	1-733[17_19]|1-743.1[18_19]
1-745	3358-3363	GD-ĐT	*[19]	ORGANIZATION[19]	_	_
1-746	3364-3368	tỉnh	*[19]	ORGANIZATION[19]	_	_
1-747	3369-3373	Nghệ	*[19]	ORGANIZATION[19]	_	_
1-748	3374-3376	An	*[19]	ORGANIZATION[19]	_	_
```

In [31]:
import pandas as pd
import numpy as np

"""
format_code:

0 for <entity1> <sep> <entity2> <sep> <sentence>
where all instances of <entity1> and <entity2> are replaced by the corresponding tokens in <sentence>.

1 for open tag and close tag for each entity are inserted into the sentence.
"""
class VlspPreprocessor:
  """Load VLSP relation-extraction annotations and format them as text samples.

  Samples are collected in `self.dataset` by `load` and turned into strings
  (`self.sentences`) with integer labels (`self.labels`) by `format`.

  format_code selects the output layout:
    0: "<entity1> <sep> <entity2> <sep> <sentence>", where the words of both
       entities are replaced inside the sentence by their placeholder tokens.
    1: the sentence with an opening and a closing tag inserted around each entity.
  """
  # Maps internal keys to the literal special tokens written into the output text.
  special_token = {
    # for format code 0
    "SEP": "<sep>",
    "PERSON[1]": "<person1/>",
    "PERSON[2]": "<person2/>",
    "ORGANIZATION[1]": "<organization1/>",
    "ORGANIZATION[2]": "<organization2/>",
    "LOCATION[1]": "<location1/>",
    "LOCATION[2]": "<location2/>",
    
    # for format code 1
    "<PERSON>": "<person>",
    "</PERSON>": "</person>",
    "<ORGANIZATION>": "<organization>",
    "</ORGANIZATION>": "</organization>",
    "<LOCATION>": "<location>",
    "</LOCATION>": "</location>",
  }
  
  # A word whose last character is one of these closes the current sentence.
  eos_punctuation = [".", "?", "!"] # need updating end of sentence punctuations to be more legit
  
  def __init__(self, drop_no_relation_samples: bool=True, format_code: {0, 1}=0, run_vncorenlp_wseg: bool=False):
    self.dataset = []
    
    self.label2id = {}
    self.id2label = {}
    self.drop_no_relation_samples = drop_no_relation_samples
    
    self.unformatted_offset = 0
    self.format_code = format_code
    self.run_vncorenlp_wseg = run_vncorenlp_wseg
    
    self.sentences = []
    self.special_token_lists = []
    self.labels = []
    
  def __len__(self) -> int:
    return len(self.sentences)
  
  def execute_all(self, src_dir: str, drop_no_relation_samples: bool=None, format_code: {0, 1}=None, run_vncorenlp_wseg: bool=None):
    print(f"Executing {src_dir}...")
    self.load(src_dir, drop_no_relation_samples=drop_no_relation_samples)
    print("Done loading.")
    self.format(format_code=format_code, run_vncorenlp_wseg=run_vncorenlp_wseg)
    print("Done formatting.")
    print("✅ Done all.")
    
  def load(self, src_dir: str, drop_no_relation_samples: bool=None): # this function is designed to execute many times on many directories
    if drop_no_relation_samples is not None:
      self.drop_no_relation_samples = drop_no_relation_samples
    
    corpus_offset = len(self.dataset)
    
    for root, _, files in os.walk(src_dir):
      for file in files: # currently the structure is one tsv file per subfolder, but this loop is in case there are more
        if os.path.join("VLSP2020_RE_train", "23352816.conll") in root: # the tsv file in this subfolder is heavily corrupted, just drop it for now
          continue
        if file.endswith(".tsv"):
          self.process_tsv(os.path.join(root, file))

    if self.drop_no_relation_samples:
      self._drop_no_relation_samples()
    self._handle_inter_sentence_relations(corpus_offset)
    
    self._build_id2label()
  
  def format(self, format_code: {0, 1}=None, run_vncorenlp_wseg: bool=None):
    if format_code is not None:
      self.format_code = format_code
    if run_vncorenlp_wseg is not None:
      self.run_vncorenlp_wseg = run_vncorenlp_wseg
    
    self._format(self.unformatted_offset)
    if run_vncorenlp_wseg:
      self._run_vncorenlp_wseg(self.unformatted_offset)
    self.unformatted_offset = len(self.dataset)

  def process_tsv(self, tsv_dir: str):
    df = pd.read_csv(tsv_dir, sep="\t", comment="#", quotechar="\t", header=None)
    if len(df.columns) < 8: # relation columns are missing
      return
    df.columns = ["ann_idx", "range", "word", "param", "entity", "relation", "rel_heads", "-"]

    self._handle_word_with_entity_subword(df)

    dataset_offset = len(self.dataset)
    self._extract_sentences(df)
    self._extract_entities(df, dataset_offset)
    self._extract_relations(df, dataset_offset)

  def _handle_word_with_entity_subword(self, df):
    error_indices = df[df["ann_idx"].shift(-1).apply(lambda i: i is not None and ".1" in str(i))]["word"].index
    offset = 0
    for idx in error_indices:
      idx += offset
      entity_word = df.iloc[idx + 1]["word"]
      prefix, suffix = df.iloc[idx]["word"].split(entity_word, 1)

      if idx > 0 and df.iloc[idx - 1]["entity"] == df.iloc[idx + 1]["entity"]:
        df.loc[idx + 1, "relation"] = "_"
        df.loc[idx + 1, "rel_heads"] = "_"

      if suffix != "":
        df.loc[idx + 1.5] = ["_", "_", suffix, "_", "_", "_", "_", np.nan]
        offset += 1

      if prefix != "":
        df.loc[idx, "word"] = prefix
      else:
        df = df.drop(idx)
        offset -= 1

      df = df.sort_index().reset_index(drop=True)

  def _extract_sentences(self, df):
    sentence = []
    for word in df["word"].values:
      word = str(word) #  in VLSP2020_RE_dev/23352623.conll, the word "nan" counts as a float ¯\_(ツ)_/¯
      sentence.append(word)
      if word[-1] in VlspPreprocessor.eos_punctuation:
        self.dataset.append({ "word_list": sentence })
        sentence = []
    self.dataset.append({ "word_list": sentence })

  def _extract_entities(self, df, dataset_offset: int):
    offset = 0
    sample_idx = dataset_offset
    entity_df = df[df["entity"] != "_"]["entity"]
    for entity, idx in zip(entity_df.values, entity_df.index):
      while idx >= offset + len(self.dataset[sample_idx]["word_list"]):
        offset += len(self.dataset[sample_idx]["word_list"])
        sample_idx += 1

      if "entities" not in self.dataset[sample_idx]:
        self.dataset[sample_idx]["entities"] = {}

      if entity in self.dataset[sample_idx]["entities"]:
        self.dataset[sample_idx]["entities"][entity].append(idx - offset)
      else:
        self.dataset[sample_idx]["entities"][entity] = [idx - offset]

  def _extract_relations(self, df, dataset_offset: int):
    offset = 0
    sample_idx = dataset_offset
    relation_df = df[df["relation"] != "_"][["relation", "entity", "rel_heads"]]    
    for (relations, entity, rel_heads), idx in zip(relation_df.values, relation_df.index):
      relations = relations.split("|")
      rel_heads = rel_heads.split("|")
      for i in range(len(relations)):
        if relations[i] == "*": # some errors occured when annotating relations in VLSP2020_RE_train\23351515.conll, VLSP2020_RE_train\23351856.conll. drop those for now
          continue
        
        while idx >= offset + len(self.dataset[sample_idx]["word_list"]):
          offset += len(self.dataset[sample_idx]["word_list"])
          sample_idx += 1

        if "relations" not in self.dataset[sample_idx]:
          self.dataset[sample_idx]["relations"] = []

        other_entity = df[df["ann_idx"] == rel_heads[i].split("[")[0]]["entity"].values[0]
        self.dataset[sample_idx]["relations"].append((entity, other_entity, relations[i]))

        if relations[i] not in self.label2id:
          self.label2id[relations[i]] = len(self.label2id)

  def _handle_inter_sentence_relations(self, dataset_offset: int=0):
    for sample in self.dataset[dataset_offset:]:
      if "relations" not in sample:
        continue
      assert "entities" in sample
      
      intra_relations = []
      for x, y, r in sample["relations"]:
        if x in sample["entities"] and y in sample["entities"] \
          and "MISCELLANEOUS" not in x and "MISCELLANEOUS" not in y: # ignore miscellaneous entities for now
          intra_relations.append((x, y, r))
      sample["relations"] = intra_relations

  def _drop_no_relation_samples(self, dataset_offset: int=0):
    clean_dataset = []
    for sample in self.dataset[dataset_offset:]:
      if "relations" in sample:
        clean_dataset.append(sample)
    self.dataset = [*self.dataset[:dataset_offset], *clean_dataset]
  
  def _build_id2label(self):
    self.id2label = {v: k for k, v in self.label2id.items()}
  
  def _format(self, dataset_offset: int):
    """Turn every relation of the samples from `dataset_offset` onward into one text sample.

    For each `(entity 1, entity 2, relation)` triple this appends
      - the relation id to `self.labels`,
      - the space-joined token sequence to `self.sentences`,
      - the special tokens used for that sentence to `self.special_token_lists`.

    With format code 0 the sequence starts with
    "<entity 1 token> <sep> <entity 2 token> <sep>" and every contiguous run of
    entity positions in the sentence is replaced by one placeholder token.
    With format code 1 the words are kept and an opening/closing tag is placed
    around every contiguous run of entity positions.

    Raises:
      AssertionError: a sample has relations but no entities, or a relation
        names an entity that the sample does not contain.
      ValueError: `self.format_code` is neither 0 nor 1.
      KeyError: an entity type has no entry in `special_token`.
    """
    for sample in self.dataset[dataset_offset:]:
      if "relations" not in sample:
        continue
      assert "entities" in sample
      
      for ent_1, ent_2, rel_type in sample["relations"]:
        assert ent_1 in sample["entities"] and ent_2 in sample["entities"]
        
        # NOTE: the label is appended before the format code is checked below,
        # so a ValueError leaves `labels` one entry longer than `sentences`.
        self.labels.append(self.label2id[rel_type])
        
        x = 0 # entity 1 position pointer
        y = 0 # entity 2 position pointer
        # word positions (inside this sample) covered by each entity
        ent_1_list = sample["entities"][ent_1]
        ent_2_list = sample["entities"][ent_2]
        
        # strip the annotation id: "ORGANIZATION[18]" -> "ORGANIZATION"
        ent_1 = ent_1.split("[")[0]
        ent_2 = ent_2.split("[")[0]
        sentence = []
        
        if self.format_code == 0:
          # entities of the same type are told apart by the suffix [1] / [2];
          # entities of different types both use the [1] token of their type
          if ent_1 == ent_2:
            ent_1 = f"{ent_1}[1]"
            ent_2 = f"{ent_2}[2]"
          else:
            ent_1 = f"{ent_1}[1]"
            ent_2 = f"{ent_2}[1]"
          
          # header: <entity 1> <sep> <entity 2> <sep>
          sentence = [
            VlspPreprocessor.special_token[ent_1],
            VlspPreprocessor.special_token["SEP"],
            VlspPreprocessor.special_token[ent_2],
            VlspPreprocessor.special_token["SEP"],
          ]

        for idx, word in enumerate(sample["word_list"]):
          if self.format_code == 0:
            if x < len(ent_1_list) and ent_1_list[x] == idx:
              # emit the placeholder once, at the first position of a contiguous run
              if x == 0 or ent_1_list[x - 1] < ent_1_list[x] - 1:
                sentence.append(VlspPreprocessor.special_token[ent_1])
              x += 1
            # NOTE: because of `elif`, a position that belongs to both entities
            # only advances `x`; `y` then stays behind for the rest of the loop.
            elif y < len(ent_2_list) and ent_2_list[y] == idx:
              if y == 0 or ent_2_list[y - 1] < ent_2_list[y] - 1:
                sentence.append(VlspPreprocessor.special_token[ent_2])
              y += 1
            else:
              sentence.append(word)
          elif self.format_code == 1:
            # opening tags: emitted when this position starts a contiguous run
            if x < len(ent_1_list) and ent_1_list[x] == idx:
              if x == 0 or ent_1_list[x - 1] < ent_1_list[x] - 1:
                sentence.append(VlspPreprocessor.special_token[f"<{ent_1}>"])
                
            if y < len(ent_2_list) and ent_2_list[y] == idx:
              if y == 0 or ent_2_list[y - 1] < ent_2_list[y] - 1:
                sentence.append(VlspPreprocessor.special_token[f"<{ent_2}>"])

            sentence.append(word)
            
            # closing tags: emitted when this position ends a contiguous run;
            # the pointers are advanced only here, after the word was appended
            if x < len(ent_1_list) and ent_1_list[x] == idx:
              if x == len(ent_1_list) - 1 or ent_1_list[x] + 1 < ent_1_list[x + 1]:
                sentence.append(VlspPreprocessor.special_token[f"</{ent_1}>"])
              x += 1

            if y < len(ent_2_list) and ent_2_list[y] == idx:
              if y == len(ent_2_list) - 1 or ent_2_list[y] + 1 < ent_2_list[y + 1]:
                sentence.append(VlspPreprocessor.special_token[f"</{ent_2}>"])
              y += 1
          else:
            raise ValueError("VlspPreprocessor is using some non-predefined format code.")
        
        self.sentences.append(" ".join(sentence))
        # remember which special tokens occur in this sentence
        # (read by `_run_vncorenlp_wseg`)
        if self.format_code == 0:
          self.special_token_lists.append([
            VlspPreprocessor.special_token[ent_1],
            VlspPreprocessor.special_token[ent_2],
            VlspPreprocessor.special_token["SEP"],
          ])
        elif self.format_code == 1:
          self.special_token_lists.append([
            VlspPreprocessor.special_token[f"<{ent_1}>"],
            VlspPreprocessor.special_token[f"</{ent_1}>"],
            VlspPreprocessor.special_token[f"<{ent_2}>"],
            VlspPreprocessor.special_token[f"</{ent_2}>"],
          ])
        else:
          raise ValueError("VlspPreprocessor is using some non-predefined format code.")

  def _run_vncorenlp_wseg(self, dataset_offset: int=0):
    for idx in range(dataset_offset, len(self.sentences)):
      self.sentences[idx] = vncorenlp_model.word_segment(self.sentences[idx])[0]
      for token in self.special_token_lists[idx]:
        wseg_token = vncorenlp_model.word_segment(token)[0]
        self.sentences[idx] = self.sentences[idx].replace(wseg_token, token)
  
  def clear(self):
    self.dataset.clear()
    self.label2id.clear()
    self.id2label.clear()
    self.unformatted_offset = 0
    self.sentences.clear()
    self.labels.clear()

## Execution

In [32]:
# Path of the training split, relative to the project directory.
# Named `train_dir` so that the builtin `dir()` is not shadowed.
train_dir = os.path.join("VLSP2020", "VLSP2020_RE_train")
preprocessor = VlspPreprocessor()
preprocessor.execute_all(train_dir, run_vncorenlp_wseg=True, format_code=0)

Executing VLSP2020\VLSP2020_RE_train...
Done loading.
Done formatting.
✅ Done all.


In [33]:
len(preprocessor)

2613

In [34]:
# Preview only the first few formatted samples instead of dumping the whole list.
preprocessor.sentences[:5]

['<organization1/> <sep> <person1/> <sep> Ảnh minh_hoạ Thứ_trưởng <organization1/> <person1/> đã có ý_kiến về vấn_đề này .',
 '<organization1/> <sep> <person1/> <sep> Liên_quan đến vấn_đề lãnh_đạo ngành Giáo_dục trao_đổi , thầy <person1/> – Hiệu_trưởng <organization1/> , Đồng_Tháp – chia_sẻ : Nội_dung phối_hợp nhà_trường và gia_đình có nhiều lĩnh_vực với mục_đích cuối là nâng cao chất_lượng giáo_dục .',
 '<location1/> <sep> <organization1/> <sep> Liên_quan đến vấn_đề lãnh_đạo ngành Giáo_dục trao_đổi , thầy Nguyễn_Văn_Định – Hiệu_trưởng <organization1/> , <location1/> – chia_sẻ : Nội_dung phối_hợp nhà_trường và gia_đình có nhiều lĩnh_vực với mục_đích cuối là nâng cao chất_lượng giáo_dục .',
 '<organization1/> <sep> <person1/> <sep> Cũng về câu_chuyện lạm_thu liên_quan đến Ban đại_diện cha_mẹ học_sinh , TS <person1/> – Chủ_tịch <organization1/> , nguyên Hiệu_trưởng Trường THPT Đinh_Tiên_Hoàng ( Hà_Nội ) – cho rằng : Câu_chuyện về lạm_thu phải giải_quyết được từ 3 phía , đó là : Người đứn

In [38]:
# Show the relation-name -> id mapping, then how many formatted samples carry each label id.
print(preprocessor.label2id)
pd.Series(preprocessor.labels).value_counts()

{'AFFILIATION': 0, 'LOCATED': 1, 'PART – WHOLE': 2, 'PERSONAL - SOCIAL': 3}


2    1167
0     745
1     601
3     100
dtype: int64

In [None]:
# TODO: Export json files