In [16]:
import pandas as pd


def read_iob2_file(path):
    """Read a tab-separated IOB2 file into a DataFrame with one row per sentence.

    Each line is stripped of surrounding whitespace first. A blank line ends
    the current sentence, a line starting with ``#`` is skipped, and any other
    line is split on tabs, taking column 1 as the word and column 2 as the tag
    (column 0 is ignored).

    Args:
        path: Path of the file to read; it is opened as UTF-8 text.

    Returns:
        A pandas DataFrame with the columns ``id`` (a copy of the row index),
        ``words`` (list of words of the sentence) and ``tags`` (list of tags,
        parallel to ``words``).

    Raises:
        IndexError: If a non-comment, non-blank line has fewer than three
            tab-separated columns.
    """
    data = []
    current_words = []
    current_tags = []

    # Use a context manager so the file is closed even if a malformed line
    # raises part-way through parsing.
    with open(path, encoding='utf-8') as handle:
        for line in handle:
            line = line.strip()

            if not line:
                # Blank line: sentence boundary. Runs of consecutive blank
                # lines must not produce empty sentences.
                if current_words:
                    data.append((current_words, current_tags))
                current_words = []
                current_tags = []
                continue

            if line[0] == '#':
                continue

            tok = line.split('\t')
            current_words.append(tok[1])
            current_tags.append(tok[2])

    # The last sentence is not necessarily followed by a blank line.
    if current_words:
        data.append((current_words, current_tags))

    df = pd.DataFrame(data, columns=['words', 'tags'])
    df['id'] = df.index
    df = df[['id', 'words', 'tags']]

    return df

# creating the dictionary of labels and their indices
class Vocab():
    """Two-way mapping between items and consecutive integer indices.

    ``word2idx`` maps an item to its index and ``idx2word`` lists the items in
    the order they were added, so ``idx2word[word2idx[w]] == w``. The
    ``pad_unk`` marker itself is never stored in either structure.
    """

    def __init__(self, pad_unk='<PAD>'):
        """Create an empty vocabulary that uses ``pad_unk`` as its marker."""
        self.pad_unk = pad_unk
        self.word2idx = {}
        self.idx2word = []

    def getIdx(self, word, add=False):
        """Return the index of ``word``, or ``None`` when it has none.

        ``None`` is returned for ``word=None``, for the ``pad_unk`` marker, and
        for an unseen word when ``add`` is false. With ``add=True`` an unseen
        word is registered under the next free index, which is returned.
        """
        if word is None or word == self.pad_unk:
            return None

        try:
            return self.word2idx[word]
        except KeyError:
            pass

        if not add:
            return None

        # Next free index equals the number of items stored so far.
        new_idx = len(self.idx2word)
        self.idx2word.append(word)
        self.word2idx[word] = new_idx
        return new_idx

    def getWord(self, idx):
        """Return the item stored at position ``idx``."""
        return self.idx2word[idx]

In [23]:
# IOB2 files whose label sets are printed below.
# The list has its own name so that the loop, which binds `train_data` to the
# DataFrame of the file being processed, does not overwrite the list it is
# iterating over.
train_files = [r"data\UNER\ceb_gja-ud-test.iob2",
               r"data\UNER\da_ddt-ud-test.iob2",
               r"data\UNER\de_pud-ud-test.iob2",
               r"data\UNER\en_ewt-ud-test.iob2",
               r"data\UNER\hr_set-ud-test.iob2",
               r"data\UNER\pt_pud-ud-test.iob2",
               r"data\UNER\ru_pud-ud-test.iob2",
               r"data\UNER\sk_snk-ud-test.iob2",
               r"data\UNER\sr_set-ud-test.iob2",
               r"data\UNER\sv_pud-ud-test.iob2",
               r"data\UNER\tl_trg-ud-test.iob2",
               r"data\UNER\zh_pud-ud-test.iob2",
]


for train in train_files:
    train_data = read_iob2_file(train)

    # Label the output with the first four characters of the file name.
    # Taking them from the file name (the part after the last backslash)
    # rather than from a fixed character offset keeps the label correct if the
    # directory prefix changes; for the paths above the result is unchanged.
    lang = train.rsplit("\\", 1)[-1][:4]

    # get the tag dictionary
    label_indices = Vocab()
    tags_column = train_data["tags"]

    # Register every tag of every sentence; repeated tags keep their index.
    for tags in tags_column:
        for tag in tags:
            label_indices.getIdx(tag, add=True)

    label_list = sorted(label_indices.idx2word)
    print("Labels for", lang, ": ", label_list)

Labels for ceb_ :  ['B-LOC', 'B-ORG', 'B-PER', 'I-LOC', 'I-PER', 'O']
Labels for da_d :  ['B-LOC', 'B-ORG', 'B-PER', 'I-LOC', 'I-ORG', 'I-PER', 'O']
Labels for de_p :  ['B-LOC', 'B-ORG', 'B-PER', 'I-LOC', 'I-ORG', 'I-PER', 'O']
Labels for en_e :  ['B-LOC', 'B-ORG', 'B-PER', 'I-LOC', 'I-ORG', 'I-PER', 'O']
Labels for hr_s :  ['B-LOC', 'B-ORG', 'B-OTH', 'B-PER', 'I-LOC', 'I-ORG', 'I-OTH', 'I-PER', 'O']
Labels for pt_p :  ['B-LOC', 'B-ORG', 'B-PER', 'I-LOC', 'I-ORG', 'I-PER', 'O']
Labels for ru_p :  ['B-LOC', 'B-ORG', 'B-PER', 'I-LOC', 'I-ORG', 'I-PER', 'O']
Labels for sk_s :  ['B-LOC', 'B-ORG', 'B-PER', 'I-LOC', 'I-ORG', 'I-PER', 'O']
Labels for sr_s :  ['B-LOC', 'B-ORG', 'B-OTH', 'B-PER', 'I-LOC', 'I-ORG', 'I-OTH', 'I-PER', 'O']
Labels for sv_p :  ['B-LOC', 'B-ORG', 'B-PER', 'I-LOC', 'I-ORG', 'I-PER', 'O']
Labels for tl_t :  ['B-LOC', 'B-PER', 'I-LOC', 'O']
Labels for zh_p :  ['B-LOC', 'B-ORG', 'B-PER', 'I-LOC', 'I-ORG', 'I-PER', 'O']
