In [2]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize

Понадобятся pandas, а также `re` (для разметки) и `train_test_split` из scikit-learn (для разбиения на train/test). Файлы устроены следующим образом:

1. PER: PER_sents_TEN - предложения с пропусками. Пропуски могут быть [male] или [female]. Все колонки названы по языкам. PER_names - соответствующие имена, в дополнительной колонке gender указывается пол имени. 
2. LOC: LOC_NOM_1984sents - контексты. Пропуски выглядят всегда как [???], все колонки названы по языкам. LOC_NOM_names - соответствующие имена. 
3. ORG: ORG_1095sents - предложения, пропуски тоже [???]. ORG_names - имена.

Все датафреймы полностью параллельны, все колонки называются одинаково (но порядок может быть разный). 

Разметку можно добавлять по-разному, как удобно, но добавляться она должна в момент, когда собираются предложения. Можно сперва все токенизировать (по словам или BPE сразу), а потом собирать. Я написала только примерный код, чтобы сама идея была наглядной. Он собирает предложения в сырые строки и после каждого токена подставленного имени ставит метку сущности (`|P|`, `|O|` или `|L|`).

Можно совмещать имена с предложениями по уникальности, такой код написан ниже. 

In [30]:
def replacer(sents, names, r, entity):
    """Fill placeholder ``r`` in every cell of ``sents`` with a tagged name, in place.

    For the row labelled ``index`` and every column of ``sents``, the name is
    taken from ``names.loc[index % len(names), column]``. The name is stripped
    of surrounding whitespace and each of its space-separated tokens is
    followed by the entity tag, e.g. ``"New York"`` -> ``"New |L| York |L| "``.
    Every occurrence of ``r`` in the cell is then replaced by that string.

    Parameters
    ----------
    sents : pandas.DataFrame
        Sentences with placeholders; all cells must be strings and the index
        labels must be integers (they are used with ``%``). Modified in place.
    names : pandas.DataFrame
        Names to insert. Must contain every column of ``sents`` and be
        labelled ``0 .. len(names) - 1`` (e.g. after ``reset_index(drop=True)``).
    r : str
        Placeholder text to replace, such as ``"[male]"`` or ``"[???]"``.
    entity : str
        One of ``"person"``, ``"organization"`` or ``"location"``.

    Returns
    -------
    None
        The result is written into ``sents``.

    Raises
    ------
    NotImplementedError
        If ``entity`` is not one of the supported kinds. This is checked up
        front, so it is raised even when ``sents`` has no rows.
    """
    tags = {"person": "|P|", "organization": "|O|", "location": "|L|"}
    if entity not in tags:
        raise NotImplementedError
    marker = " " + tags[entity] + " "

    n_names = len(names)
    for row_pos, index in enumerate(sents.index):
        for col_pos, column in enumerate(sents.columns):
            name = names.loc[index % n_names, column]
            # Trailing space so that the last token of the name gets a tag too.
            name = name.strip() + " "
            name = name.replace(" ", marker)

            # Write straight into the frame. The Series yielded by iterrows()
            # may be a copy, so assigning to it can be silently lost.
            sents.iat[row_pos, col_pos] = sents.iat[row_pos, col_pos].replace(r, name)

In [33]:
# Split settings; the LOC and ORG cells further down reuse these two constants.
RANDOM_STATE = 42
TEST_SIZE = 0.2

# PER data: the name table and the sentences with [male]/[female] placeholders.
person_names = pd.read_csv("readyTEN/PER_names.csv")
person_sents = pd.read_csv("readyTEN/PER_sents_TEN.csv")

# Names and sentences are split independently, with the same seed and test
# share, so each of train/test gets its own set of names and of sentences.
train_person_names, test_person_names = train_test_split(
    person_names,
    random_state=RANDOM_STATE,
    test_size=TEST_SIZE,
)
train_person_sents, test_person_sents = train_test_split(
    person_sents,
    random_state=RANDOM_STATE,
    test_size=TEST_SIZE,
)

In [34]:
def merge_sents_names(sents, names, random_state=42):
    """Fill the ``[male]``/``[female]`` placeholders of person sentences with names.

    Each sentence row is classified by its ``'pl'`` column: rows whose ``'pl'``
    text contains ``'[female]'`` are female, all others male. Male sentences
    are filled from the male names and female sentences from the female names
    (via ``replacer`` with ``entity="person"``). The two groups are then
    concatenated and shuffled.

    Parameters
    ----------
    sents : pandas.DataFrame
        Person sentences; must have a string ``'pl'`` column. Not modified:
        the ``'gender'`` column is added to a copy.
    names : pandas.DataFrame
        Person names with a ``'gender'`` column holding ``'m'`` / ``'f'``.
    random_state : int or None, default 42
        Seed for the final shuffle, so that re-running the notebook yields the
        same row order (42 mirrors the notebook's ``RANDOM_STATE``). Pass
        ``None`` for an unseeded shuffle.

    Returns
    -------
    pandas.DataFrame
        Filled sentences with an extra ``'gender'`` column, shuffled and
        re-indexed from 0.

    Raises
    ------
    ValueError
        If there are sentences of a gender for which ``names`` has no rows.
    """
    # assign() returns a new frame, so the caller's DataFrame (typically a
    # train_test_split slice) does not gain a 'gender' column as a side effect.
    sents = sents.assign(
        gender=sents['pl'].apply(lambda x: 'f' if '[female]' in x else 'm')
    )

    filled = []
    for gender, placeholder in (('m', '[male]'), ('f', '[female]')):
        # reset_index: replacer pairs rows via ``index % len(names)`` on labels.
        gender_names = names[names.gender == gender].reset_index(drop=True)
        gender_sents = sents[sents.gender == gender].reset_index(drop=True)
        if len(gender_sents) and not len(gender_names):
            raise ValueError(
                "no names with gender %r to fill %s placeholders" % (gender, placeholder)
            )
        replacer(gender_sents, gender_names, placeholder, entity="person")
        filled.append(gender_sents)

    merged = pd.concat(filled)
    return merged.sample(frac=1, random_state=random_state).reset_index(drop=True)

In [35]:
# Fill the [male]/[female] placeholders of each split with names from the
# same split. NOTE: the *_person_sents names are rebound to the merged result,
# so re-running this cell alone feeds already-filled frames back in; re-run
# the load/split cell first.
train_person_sents = merge_sents_names(train_person_sents, train_person_names)
test_person_sents = merge_sents_names(test_person_sents, test_person_names)

In [38]:
# Save the filled person sentences to the working directory. to_csv is called
# with its defaults, so the row index is written as the first column.
train_person_sents.to_csv('train_person_sents.csv')
test_person_sents.to_csv('test_person_sents.csv')

In [18]:
# LOC ("NOM" in the file names is presumably the nominative case; confirm)
location_names = pd.read_csv("readyTEN/LOC_NOM_names.csv")
location_sents = pd.read_csv("readyTEN/LOC_NOM_1984sents.csv")

# Split names and sentences independently, then renumber every part from 0:
# replacer looks names up by label with `index % len(names)`.
train_location_names, test_location_names = (
    part.reset_index(drop=True)
    for part in train_test_split(location_names, test_size=TEST_SIZE, random_state=RANDOM_STATE)
)
train_location_sents, test_location_sents = (
    part.reset_index(drop=True)
    for part in train_test_split(location_sents, test_size=TEST_SIZE, random_state=RANDOM_STATE)
)

# replacer's return value is not used; the sentence frames are expected to be
# filled in place.
replacer(train_location_sents, train_location_names, '[???]', entity="location")
replacer(test_location_sents, test_location_names, '[???]', entity="location")

In [40]:
# Save the filled location sentences to the working directory. to_csv is
# called with its defaults, so the row index is written as the first column.
train_location_sents.to_csv('train_location_sents.csv')
test_location_sents.to_csv('test_location_sents.csv')

In [31]:
# ORG
organization_names = pd.read_csv("readyTEN/ORG_names.csv")
organization_sents = pd.read_csv("readyTEN/ORG_1095sents.csv")

# Split names and sentences independently, then renumber every part from 0:
# replacer looks names up by label with `index % len(names)`.
train_organization_names, test_organization_names = (
    part.reset_index(drop=True)
    for part in train_test_split(organization_names, test_size=TEST_SIZE, random_state=RANDOM_STATE)
)
train_organization_sents, test_organization_sents = (
    part.reset_index(drop=True)
    for part in train_test_split(organization_sents, test_size=TEST_SIZE, random_state=RANDOM_STATE)
)

# replacer's return value is not used; the sentence frames are expected to be
# filled in place.
replacer(train_organization_sents, train_organization_names, '[???]', entity="organization")
replacer(test_organization_sents, test_organization_names, '[???]', entity="organization")

In [41]:
# Save the filled organization sentences to the working directory. to_csv is
# called with its defaults, so the row index is written as the first column.
train_organization_sents.to_csv('train_organization_sents.csv')
test_organization_sents.to_csv('test_organization_sents.csv')

In [44]:
# Stack the person, location and organization sentences of each split.
# Every input frame is indexed from 0, so ignore_index=True is needed to give
# the combined frame unique row labels instead of repeated ones.
# The person frames carry a 'gender' column added by merge_sents_names; if the
# LOC/ORG frames lack it, concat fills it with NaN for their rows.
train_sents = pd.concat(
    [train_person_sents, train_location_sents, train_organization_sents],
    ignore_index=True,
)
test_sents = pd.concat(
    [test_person_sents, test_location_sents, test_organization_sents],
    ignore_index=True,
)

In [45]:
# Save the combined train/test sentences to the working directory. to_csv is
# called with its defaults, so the row index is written as the first column.
train_sents.to_csv("train_sents.csv")
test_sents.to_csv("test_sents.csv")