In [1]:
import json
from copy import deepcopy
from pathlib import Path
from random import choice, seed, shuffle

import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Base directory of the NER data, with its `raw` and `processed` subdirectories.
root = Path('../../../data/NER/')
raw_root = root.joinpath('raw')
processed_root = root.joinpath('processed')

In [3]:
# Mapping title -> name, built by joining the two reference tables.
ydf1 = pd.read_csv(raw_root / 'you_anime_characters_refs.csv', sep='\t')
# The last path segment of each link is used as the join key.
ydf1['anime_url'] = ydf1.page_link.apply(lambda s: s.split('/')[-1])

ydf2 = pd.read_csv(raw_root / 'you_anime_refs.csv', sep='\t')
ydf2['anime_url'] = ydf2.link.apply(lambda s: s.split('/')[-1])

# Build the result in a single chain so that no column is assigned into a
# slice of the merged frame (which triggers SettingWithCopyWarning), and use
# explicit ['col'] access instead of attribute-style assignment.
ydf = (
    pd.merge(ydf1, ydf2, on='anime_url')[['title', 'name']]
    # Rows lacking a title or a name cannot be stripped and are dropped.
    .dropna(subset=['title', 'name'])
    .assign(
        title=lambda df: df['title'].str.strip(),
        name=lambda df: df['name'].str.strip(),
    )
)

ydf.head(3)

Unnamed: 0,title,name
0,Атака титанов,Эрен Йегер
1,Атака титанов,Микаса Аккерман
2,Атака титанов,Армин Арлерт


In [4]:
# De-duplicated names, kept in order of first appearance.
names = ydf['name'].drop_duplicates().tolist()
print(len(names))
names[:5]

63065


['Эрен Йегер', 'Микаса Аккерман', 'Армин Арлерт', 'Леви', 'Саша Блауз']

# Read data

In [5]:
def read_conll(path):
    """
    Returns list of sentences. Sentence is a list of pairs (word, NE-label)

    The file is read as UTF-8 and split into chunks on blank lines. The
    first chunk is discarded; every remaining chunk is one sentence. For
    each line, the word is the first whitespace-separated column and the
    label is the last one.
    """
    # `with` guarantees the handle is closed even if reading fails.
    with open(path, encoding="utf8") as f:
        chunks = f.read().split('\n\n')[1:]

    sentences = []
    for chunk in chunks:
        sentence = []
        for line in chunk.splitlines():
            columns = line.split()
            sentence.append((columns[0], columns[-1]))
        sentences.append(sentence)
    return sentences

In [6]:
def with_character(sent):
    """Tell whether at least one token of the sentence is labelled 'PER'."""
    return any(token[-1] == 'PER' for token in sent)


sents = read_conll(processed_root / 'comments/original/raw.txt')
sents_characters = [sent for sent in sents if with_character(sent)]

# Fraction of sentences that contain at least one 'PER' token.
len(sents_characters) / len(sents)

0.11923411662315056

In [18]:
# Print the first two sentences that contain a 'PER' token, then display
# how many such sentences there are.
print(sents_characters[:2])
len(sents_characters)

[[('2', 'O'), ('фрейм', 'O'), ('у', 'O'), ('Канаме', 'PER'), ('встал', 'O'), ('на', 'O'), ('себя', 'O'), ('самого', 'O'), ('Оо', 'O')], [('Амур', 'PER'), ('толль', 'PER')]]


137

# Augment

In [7]:
def get_rand_character(characters):
    """
    Draw a random entry from `characters`. If the entry consists of several
    whitespace-separated words, return one of those words chosen at random;
    otherwise return the entry unchanged.
    """
    picked = choice(characters)
    words = picked.split()
    # Nothing to choose between: hand back the entry as drawn.
    if len(words) <= 1:
        return picked
    return choice(words)

In [25]:
AUGMENTATION_ROUNDS = 10
AUGMENTATION_SEED = 42

# Seed the global RNG so that re-running this cell yields the same augmented
# sentences (get_rand_character draws from the module-level `random` state).
seed(AUGMENTATION_SEED)

augmented_sents_characters = []
for _ in range(AUGMENTATION_ROUNDS):
    for sent in sents_characters:
        # Build a new sentence in which the word of every 'PER' token is
        # replaced by a randomly drawn name; all other tokens are reused
        # unchanged. The source sentence is never mutated.
        augmented_sents_characters.append([
            (get_rand_character(names), *token[1:]) if token[-1] == 'PER' else token
            for token in sent
        ])

[('2', 'O'), ('фрейм', 'O'), ('у', 'O'), ('Канаме', 'PER'), ('встал', 'O'), ('на', 'O'), ('себя', 'O'), ('самого', 'O'), ('Оо', 'O')]
---------------------
[('2', 'O'), ('фрейм', 'O'), ('у', 'O'), ('Ryou', 'PER'), ('встал', 'O'), ('на', 'O'), ('себя', 'O'), ('самого', 'O'), ('Оо', 'O')]
[('2', 'O'), ('фрейм', 'O'), ('у', 'O'), ('Канаме', 'PER'), ('встал', 'O'), ('на', 'O'), ('себя', 'O'), ('самого', 'O'), ('Оо', 'O')]
---------------------
[('2', 'O'), ('фрейм', 'O'), ('у', 'O'), ('Хибики', 'PER'), ('встал', 'O'), ('на', 'O'), ('себя', 'O'), ('самого', 'O'), ('Оо', 'O')]
[('2', 'O'), ('фрейм', 'O'), ('у', 'O'), ('Канаме', 'PER'), ('встал', 'O'), ('на', 'O'), ('себя', 'O'), ('самого', 'O'), ('Оо', 'O')]
---------------------
[('2', 'O'), ('фрейм', 'O'), ('у', 'O'), ('Эйсаку', 'PER'), ('встал', 'O'), ('на', 'O'), ('себя', 'O'), ('самого', 'O'), ('Оо', 'O')]
[('2', 'O'), ('фрейм', 'O'), ('у', 'O'), ('Канаме', 'PER'), ('встал', 'O'), ('на', 'O'), ('себя', 'O'), ('самого', 'O'), ('Оо', 'O')

In [9]:
# The augmentation loop appends one sentence per source sentence per round,
# so this should equal AUGMENTATION_ROUNDS * len(sents_characters).
len(augmented_sents_characters)

1370

# Mix original data with augmented data and save

In [10]:
# Augmented sentences first, then every original sentence; shuffled in place.
mixed = [*augmented_sents_characters, *sents]
shuffle(mixed)

In [11]:
def token_to_str(token):
    """Format a (word, label) pair as one line: word, two '_' placeholder columns, label."""
    return '%s _ _ %s' % token


def sent_to_str_tokens(sent):
    """Format a sentence (list of (word, label) pairs) as newline-joined token lines."""
    return '\n'.join(map(token_to_str, sent))


# NOTE: this rebinds `mixed` from a list of sentences to a list of strings,
# so the cell that builds `mixed` must be re-run before re-running this one.
mixed = list(map(sent_to_str_tokens, mixed))

out_path = processed_root / f'comments/augmented_{AUGMENTATION_ROUNDS}/raw.txt'
# The directory name depends on AUGMENTATION_ROUNDS, so it may not exist yet.
out_path.parent.mkdir(parents=True, exist_ok=True)

# Write as UTF-8 explicitly: read_conll reads with encoding="utf8", and the
# platform default encoding may differ.
# NOTE(review): read_conll discards the first blank-line-separated chunk, and
# no header chunk is written here - confirm what downstream readers expect.
with open(out_path, 'w', encoding='utf8') as f:
    f.write('\n\n'.join(mixed))