In [3]:
import pandas as pd
import numpy as np
import os
import glob
from tqdm import tqdm
from nltk.tokenize import word_tokenize
from nltk.tokenize import WhitespaceTokenizer

In [4]:
# NOTE(review): the two commented-out lines below are not used anywhere in the
# visible cells; presumably left over from an earlier data layout — confirm
# before deleting.
# LABELED_DATA_PATH = "./data/Baseline/"
# persons_files = glob.glob(os.path.join(LABELED_DATA_PATH, "PER*"))
TRAIN_PATH = "train_sents.csv"
TEST_PATH = "test_sents.csv"

# Load each split (the first CSV column becomes the index) and discard the
# "gender" column so only the remaining columns are passed on.
train_df = pd.read_csv(TRAIN_PATH, index_col=0).drop(columns=["gender"])
test_df = pd.read_csv(TEST_PATH, index_col=0).drop(columns=["gender"])

In [5]:
def get_bio_labels(sentences):
    """Split tagged token sequences into plain tokens and BIO-style labels.

    In the input, an entity token is immediately followed by a marker token:
    ``"|O|"`` (labelled ``ORG``), ``"|L|"`` (``LOC``) or ``"|P|"`` (``PER``).
    Marker tokens are removed from the output. A token gets the ``B_<entity>``
    label when it starts a run and ``I_<entity>`` when the previously emitted
    label belongs to the same entity; every other token is labelled ``"O"``.

    Args:
        sentences: Iterable of token sequences (each indexable, with ``len``).

    Returns:
        A ``(all_sentences, all_labels)`` pair of parallel lists: the cleaned
        token list and the label list for every input sentence.
    """
    # Marker token -> entity suffix used when building the label strings.
    entity_by_marker = {"|O|": "ORG", "|L|": "LOC", "|P|": "PER"}

    all_sentences = []
    all_labels = []
    for sentence in tqdm(sentences):
        words = []
        tags = []
        last_index = len(sentence) - 1
        for position, token in enumerate(sentence):
            # Markers only annotate the preceding token; they are never emitted.
            if token in entity_by_marker:
                continue
            words.append(token)

            # Look one token ahead to see whether this token is annotated.
            following = sentence[position + 1] if position < last_index else None
            entity = entity_by_marker.get(following)
            if entity is None:
                tags.append("O")
                continue

            # Continue the current run only if the previous label is the same entity.
            continues_run = bool(tags) and tags[-1] in ("B_" + entity, "I_" + entity)
            tags.append(("I_" if continues_run else "B_") + entity)

        all_labels.append(tags)
        all_sentences.append(words)

    return all_sentences, all_labels

In [6]:
def markup_sentences(sents: pd.DataFrame):
    """Tokenize every column of ``sents`` and derive BIO labels for it.

    Each column is handled independently: its cells are split on whitespace
    and handed to ``get_bio_labels``, which removes the inline marker tokens
    and returns the remaining tokens together with one label per token.

    Args:
        sents: Frame whose columns hold tagged sentences as strings. Cells
            that are not strings (e.g. the NaN that ``read_csv`` produces for
            an empty field) are treated as empty sentences, so the sentence
            positions stay aligned across columns instead of the tokenizer
            raising ``TypeError``.

    Returns:
        A DataFrame with one column per input column and two rows indexed
        ``"tokens"`` and ``"labels"``. Each cell is a list holding one inner
        list per sentence.

    Raises:
        AssertionError: If tokens and labels are misaligned for a column.
    """
    # Built once and reused for every column rather than once per column.
    tokenizer = WhitespaceTokenizer()

    def _tokenize(cell):
        # The tokenizer only accepts strings; a missing value becomes an
        # empty sentence rather than an exception.
        return tokenizer.tokenize(cell) if isinstance(cell, str) else []

    languages = {}
    for language in sents.columns:
        tokenized = sents[language].apply(_tokenize)
        tokens, labels = get_bio_labels(tokenized)

        # Sanity checks: same number of sentences, and one label per token.
        assert len(tokens) == len(labels)
        assert all(len(t) == len(l) for t, l in zip(tokens, labels))

        languages[language] = {
            "tokens": tokens,
            "labels": labels
        }

    return pd.DataFrame(languages)

In [7]:
# Build the token/label markup for both splits; train is processed first, then test.
train_markup, test_markup = map(markup_sentences, (train_df, test_df))

100%|██████████| 8595/8595 [00:00<00:00, 74821.18it/s]
100%|██████████| 8595/8595 [00:00<00:00, 81499.41it/s]
100%|██████████| 8595/8595 [00:00<00:00, 81496.83it/s]
100%|██████████| 8595/8595 [00:00<00:00, 94794.20it/s]
100%|██████████| 8595/8595 [00:00<00:00, 72344.47it/s]
100%|██████████| 8595/8595 [00:00<00:00, 36551.64it/s]
100%|██████████| 8595/8595 [00:00<00:00, 72071.69it/s]
100%|██████████| 8595/8595 [00:00<00:00, 68836.28it/s]
100%|██████████| 8595/8595 [00:00<00:00, 64080.19it/s]
100%|██████████| 8595/8595 [00:00<00:00, 34378.72it/s]
100%|██████████| 8595/8595 [00:00<00:00, 75136.82it/s]
100%|██████████| 2149/2149 [00:00<00:00, 82628.01it/s]
100%|██████████| 2149/2149 [00:00<00:00, 79286.78it/s]
100%|██████████| 2149/2149 [00:00<00:00, 79433.52it/s]
100%|██████████| 2149/2149 [00:00<00:00, 76910.78it/s]
100%|██████████| 2149/2149 [00:00<00:00, 76745.11it/s]
100%|██████████| 2149/2149 [00:00<00:00, 86344.22it/s]
100%|██████████| 2149/2149 [00:00<00:00, 65333.16it/s]
100%|█████

In [8]:
# Persist both markup frames next to the notebook; the index is written too
# (pandas default), so it appears as the first column of each file.
train_markup.to_csv(path_or_buf="train_markup.csv", index=True)
test_markup.to_csv(path_or_buf="test_markup.csv", index=True)