Не особо удачная попытка найти пейринги в метаинформации манги на её главной странице. Такие данные указываются очень редко, но могут быть полезны для создания датасета и валидации алгоритмов NER.

In [1]:
import pandas as pd
import json
import matplotlib.pyplot as plt
from pairing_utils import get_pairing

In [None]:
# Locate and read the merged table of titles.
root = '../characters_parser/data/'
path = root + 'merged_grouple_data.csv'
df = pd.read_csv(path)
print(df.shape)

# Keep only the rows that reference a volumes data file and report the new size.
df = df[df.volumes_data_file.notna()]
print(df.shape)
df.head()

In [3]:
def load_data(page_file, volumes_file):
    """Load the page and volumes JSON documents of one title.

    Args:
        page_file: File name inside ``root + 'grouple_manga_pages/'``.
        volumes_file: File name inside ``root + 'grouple_merged_volumes/'``.

    Returns:
        A dict with the parsed documents under the keys ``'page'`` and
        ``'volumes'``.

    Raises:
        OSError: If either file cannot be opened.
        json.JSONDecodeError: If either file is not valid JSON.
    """
    page_path = root + 'grouple_manga_pages/' + page_file
    volumes_path = root + 'grouple_merged_volumes/' + volumes_file

    # Context managers close the handles deterministically; the encoding is
    # pinned so reading does not depend on the platform default.
    with open(page_path, encoding='utf-8') as page_fh:
        page = json.load(page_fh)
    with open(volumes_path, encoding='utf-8') as volumes_fh:
        volumes = json.load(volumes_fh)

    return {'page': page, 'volumes': volumes}

In [4]:
# Preview: print the URL and the parsed pairing of titles whose description
# mentions a pairing, stopping once the counter exceeds 5.
c = 0
for i, (name, page, volumes) in enumerate(df.to_records(index=False)):
    if c > 5:
        break

    data = load_data(page, volumes)
    page_info = data['page']
    url = page_info['url']
    descr = page_info['description'].replace('\n', ' ')
    is_pairing = 'пейринг' in descr.lower()
    if not is_pairing:
        continue

    # The counter advances for every mention, whatever get_pairing returns.
    print(url, get_pairing(descr))
    c += 1

https://mintmanga.live/free__dj___awkward_sleepover ('Харука', 'Рин')
https://mintmanga.live/yowamushi_pedal_dj___juicy_hold_hole ('Тодо', 'Макисима')
https://mintmanga.live/katekyo_hitman_reborn__dj___a_game_for_new_suits ('Ямамото', 'Гокудера')
https://mintmanga.live/durarara___dj___biting_because_you_re_mine ('Shizuo', 'Izaya')
https://mintmanga.live/vse_vmeste ('Кенсей', 'Шуухей')
https://mintmanga.live/black_butler_dj___cheshire None


In [5]:
class Manga:
    """Simple holder for a title's characters and its comments."""

    def __init__(self, characters, comments):
        # Both arguments are stored as given, without copying.
        self.characters, self.comments = characters, comments

In [16]:
dataset = []

# Scan titles until the first one whose pairing can actually be parsed;
# `data`, `p1` and `p2` keep that title's values once the loop stops.
for i, (name, page, volumes) in enumerate(df.to_records(index=False)):
    data = load_data(page, volumes)
    page_info = data['page']
    url = page_info['url']
    descr = page_info['description']
    is_pairing = 'пейринг' in descr.lower()
    if not is_pairing:
        continue

    res = get_pairing(descr)
    if res is None:
        continue

    p1, p2 = res
    print(p1, '+', p2)
    break

Харука + Рин


In [18]:
# `data` still holds whatever title the most recently executed cell loaded;
# take its comments and preview the first ten.
comms = data['volumes']['comments']
comms[:10]

['Какой Хару на первом фрейме... Вах просто *о*',
 'зубы .ахаха',
 'опечаточка забавная. особенно в тему к "плохому японскому"',
 'Какая страница',
 'правильное решение',
 'ща поможешь',
 'какой цёёмик :З',
 'молодец помог*О*',
 'прям помесьХаруки и Соске @_@',
 'Щас все будет ринушка ты только подожди']

In [72]:
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import TweetTokenizer
# Shared NLTK helpers used by the Comment class: a TweetTokenizer for
# splitting comments into tokens and a Snowball stemmer set up for Russian.
tokenizer = TweetTokenizer()
stemmer = SnowballStemmer("russian")

In [108]:
class Comment:
    """Cleaned comments of one title with a label list per comment.

    Every comment is normalised with :meth:`clear`. Each whitespace-separated
    token of the cleaned comment starts with the label ``'O'``; the label
    becomes ``'PER'`` when the stemmed, lower-cased token equals a character
    name stem or is a substring of it that is at least three characters long.

    The class relies on the module-level ``tokenizer`` and ``stemmer``
    objects. Constructed without arguments it holds no comments and can be
    used purely for its text-cleaning methods.
    """

    # Characters kept by del_spec_symbols. 'ё' and 'Ё' are listed explicitly
    # because they lie outside the contiguous а-я / А-Я code point ranges.
    _RANGES = ('ая', 'АЯ', 'az', 'AZ', '09')
    _ALLOWED = frozenset(
        chr(code)
        for first, last in _RANGES
        for code in range(ord(first), ord(last) + 1)
    ) | frozenset('ёЁ')
    _PUNCT = frozenset(',.!? ')

    def __init__(self, characters=(), comments=()):
        """Clean ``comments`` and label the tokens that match ``characters``.

        Args:
            characters: Iterable of character names (strings).
            comments: Iterable of raw comment strings.
        """
        # Cleaned comment strings, in input order.
        self.comments = [self.clear(comment) for comment in comments]
        # One label per whitespace-separated token of each cleaned comment.
        self.labels = [['O'] * len(comment.split()) for comment in self.comments]

        # Lower-case the stems so they compare against the lower-cased
        # comment tokens regardless of how the stemmer treats case.
        stems = [stemmer.stem(character).lower() for character in characters]
        if not stems:
            return

        for labels, comment in zip(self.labels, self.comments):
            # Stem each comment once instead of once per character.
            tokens = self.stem(comment).lower().split()
            # zip over the label positions so a token/label count mismatch
            # can never index past the label list.
            for position, word in zip(range(len(labels)), tokens):
                if any(
                    word == stem or (len(word) >= 3 and word in stem)
                    for stem in stems
                ):
                    labels[position] = 'PER'

    def clear(self, text, stem=False):
        """Normalise one comment.

        Applies :meth:`split_pairings`, :meth:`tokenize` and
        :meth:`del_spec_symbols` in that order, then :meth:`stem` when
        ``stem`` is true.
        """
        text = self.split_pairings(text)
        text = self.tokenize(text)
        text = self.del_spec_symbols(text)
        if stem:
            text = self.stem(text)
        return text

    def split_pairings(self, text):
        """Insert a space before an upper-case letter glued to a word.

        A space is added between two adjacent alphabetic characters when the
        second is upper-case and the first is not, e.g. ``'ХаруРин'`` becomes
        ``'Хару Рин'``. An empty string is returned unchanged.
        """
        if not text:
            return text

        pieces = [text[0]]
        for prev, cur in zip(text, text[1:]):
            glued_capital = (
                cur.isalpha() and prev.isalpha()
                and cur.isupper() and not prev.isupper()
            )
            if glued_capital:
                pieces.append(' ')
            pieces.append(cur)
        return ''.join(pieces)

    def tokenize(self, text):
        """Tokenize ``text`` with the shared tokenizer and re-join with spaces."""
        return ' '.join(tokenizer.tokenize(text))

    def del_spec_symbols(self, text):
        """Drop characters other than letters, digits and basic punctuation.

        Kept are Cyrillic letters (including ё/Ё), Latin letters, digits and
        the characters ``,.!?`` and space. A punctuation character is skipped
        when the character right before it in the input is punctuation too,
        and a leading punctuation character is always dropped. The result is
        stripped of surrounding whitespace. An empty string yields ``''``.
        """
        if not text:
            return ''

        kept = [text[0]] if text[0] in self._ALLOWED else []
        # `prev` is the preceding character of the input, not of the output.
        for prev, cur in zip(text, text[1:]):
            if prev in self._PUNCT and cur in self._PUNCT:
                # do not allow sequential punctuation such as ...
                continue
            if cur in self._ALLOWED or cur in self._PUNCT:
                kept.append(cur)
        return ''.join(kept).strip()

    def stem(self, text):
        """Stem every whitespace-separated word of ``text``.

        A stem is capitalised again when the original word started with an
        upper-case character. Words are re-joined with single spaces.
        """
        def stem_cased(word):
            stemmed = stemmer.stem(word)
            return stemmed.capitalize() if word[0].isupper() else stemmed

        # str.split() never yields empty strings, so no filtering is needed.
        return ' '.join(stem_cased(word) for word in text.split())

In [109]:
dataset = []  # Comment objects, one per title with a parsed pairing
n = 0  # number of titles appended to `dataset` so far

def hotfix(pname):
    """Cut `pname` right before its first upper-case character after index 0.

    The name is returned unchanged when no such character exists.
    """
    cut = next(
        (pos for pos in range(1, len(pname)) if pname[pos].isupper()),
        None,
    )
    return pname if cut is None else pname[:cut]
        
# Collect labelled comments for titles whose pairing can be parsed.
for i, (name, page, volumes) in enumerate(df.to_records(index=False)):
    data = load_data(page, volumes)
    page_info = data['page']
    url = page_info['url']
    descr = page_info['description']
    is_pairing = 'пейринг' in descr.lower()

    # The limit is checked only after the current title has been loaded.
    if n > 5:
        break
    if not is_pairing:
        continue

    res = get_pairing(descr)
    if res is None:
        continue

    n += 1
    p1, p2 = res
    # Only the second name goes through hotfix.
    p2 = hotfix(p2)
    print(p1, '+', p2)
    dataset.append(Comment((p1, p2), data['volumes']['comments']))

Харука + Рин
Тодо + Макисима
Ямамото + Гокудера
Shizuo + Izaya
Кенсей + Шуухей
Эспада + Гриммиджоу


In [None]:
# For every collected title, pair each cleaned comment with its label list.
[list(zip(item.comments, item.labels)) for item in dataset]

In [1]:
# Stand-alone cleaner: built with no characters and no comments, it is used
# only for its text-normalisation methods.
cleaner = Comment((), [])


def clean(text):
    """Return `text` split, tokenized, stripped of special symbols and stemmed."""
    # Comment.clear runs the same four steps in the same order when stem=True.
    return cleaner.clear(text, stem=True)


list(map(clean, comms))

NameError: name 'Comment' is not defined