Первое задание

Загружаем тексты

In [1]:
import gzip

from dataclasses import dataclass
from typing import Iterator

@dataclass
class Text:
    label: str
    title: str
    text: str


def read_texts(fn: str) -> Iterator[Text]:
    with gzip.open(fn, "rt", encoding="utf-8") as f:
        for line in f:
            yield Text(*line.strip().split("\t"))

texts = list(read_texts("data/news.txt.gz"))


In [2]:
from yargy import Parser, rule, and_, or_
from yargy.predicates import gram, gte, lte, is_capitalized
from yargy.pipelines import morph_pipeline
from yargy.interpretation import fact
from yargy.relations import gnc_relation

ENTRY = fact("Entry", ["full_name", "birthday", "birth_place"])
NAME = fact("Name", ["first", "last", "patronimic"])
BIRTHDAY = fact("date", ["d", "m", "y"])
BIRTH_PLACE = fact("Birth_place", ["place"])


DAY = and_(gte(1), lte(31)) # >= 1 и <=31
MONTH = morph_pipeline(["Январь", "Февраль", "Март", "Апрель", "Май", "Июнь", "Июль", "Август", "Сентябрь", "Октябрь", "Ноябрь", "Декабрь"])
YEAR = and_(gte(1), lte(2024)) # >= 1 и <= 2024

# Немного учтем возможность разного порядка написания ФИО

name_surn_patr = rule(
    gram("Name").interpretation(NAME.first.inflected()),
    gram("Surn").interpretation(NAME.last.inflected()),
    gram("Patr").interpretation(NAME.patronimic.inflected()).optional()
).interpretation(NAME)

surn_name_patr = rule(
    gram("Surn").interpretation(NAME.last.inflected()),
    gram("Name").interpretation(NAME.first.inflected()),
    gram("Patr").interpretation(NAME.patronimic.inflected()).optional()
).interpretation(NAME)

birthday_words = rule(
    DAY.interpretation(BIRTHDAY.d).optional(),
    MONTH.interpretation(BIRTHDAY.m).optional(),
    morph_pipeline(["в"]).optional(),
    YEAR.interpretation(BIRTHDAY.y).optional(),
    morph_pipeline(["Году"]).optional()
).interpretation(BIRTHDAY).optional()

birthday_nums = rule(
    DAY.interpretation(BIRTHDAY.d).optional(),
    '.',
    and_(gte(1), lte(12)).interpretation(BIRTHDAY.m).optional(),
    '.',
    YEAR.interpretation(BIRTHDAY.y).optional(),
    morph_pipeline(["Года"]).optional()
).interpretation(BIRTHDAY).optional()

birth_place = rule(
    morph_pipeline(["в"]),
    is_capitalized().interpretation(BIRTH_PLACE.place).optional()
).interpretation(BIRTH_PLACE).optional()

entry = rule(
    or_(
    name_surn_patr.interpretation(ENTRY.full_name),
    surn_name_patr.interpretation(ENTRY.full_name)
    ),
    morph_pipeline(["Родился", "был рожден", "появился на свет"]),

    # Так как могут идти в любом порядке

    or_(
        or_(
            birthday_nums.interpretation(ENTRY.birthday),
            birthday_words.interpretation(ENTRY.birthday),
        ),
        birth_place.interpretation(ENTRY.birth_place)
    ),
    or_(
        or_(
             birthday_nums.interpretation(ENTRY.birthday),
            birthday_words.interpretation(ENTRY.birthday),
        ),
        birth_place.interpretation(ENTRY.birth_place)
    )
).interpretation(ENTRY)

parser = Parser(entry)


In [3]:
text = "Иванов Иван Иванович родился 18.01.1978 в Белграде"

for match in parser.findall(text):
    print(match.fact)

Entry(full_name=Name(first='иван', last='иванов', patronimic='иванович'), birthday=date(d='18', m='01', y='1978'), birth_place=Birth_place(place='Белграде'))


In [4]:
text = "Иванов Иван Иванович родился 18 января 1978 года в Белграде"

for match in parser.findall(text):
    print(match.fact)

Entry(full_name=Name(first='иван', last='иванов', patronimic='иванович'), birthday=date(d='18', m='января', y='1978'), birth_place=Birth_place(place='Белграде'))


In [3]:
from tqdm import tqdm 

for text in tqdm(texts, disable=False):
    try:
        for match in parser.findall(text.text):
         print(match.fact)
    except:
       # empty
       pass

  0%|          | 0/10000 [00:00<?, ?it/s]

  4%|▍         | 436/10000 [00:18<06:29, 24.58it/s]

Entry(full_name=Name(first='андрей', last='курносенко', patronimic=None), birthday=None, birth_place=Birth_place(place='Севастополе'))


  5%|▌         | 544/10000 [00:23<05:44, 27.47it/s]

Entry(full_name=Name(first='иосиф', last='кобзон', patronimic=None), birthday=date(d=None, m=None, y=None), birth_place=None)


 11%|█         | 1074/10000 [00:48<06:07, 24.27it/s]

Entry(full_name=Name(first='анна', last='матисон', patronimic=None), birthday=None, birth_place=None)


 18%|█▊        | 1841/10000 [01:25<06:58, 19.51it/s]

Entry(full_name=Name(first='яковлевюрий', last='яковлев', patronimic=None), birthday=date(d=None, m=None, y='1928'), birth_place=Birth_place(place='Москве'))


 25%|██▌       | 2522/10000 [01:59<05:09, 24.16it/s]

Entry(full_name=Name(first='николай', last='караченцов', patronimic=None), birthday=date(d='27', m='октября', y='1944'), birth_place=Birth_place(place='Москве'))


 30%|██▉       | 2997/10000 [02:21<05:23, 21.67it/s]

Entry(full_name=Name(first='сергей', last='довлатов', patronimic=None), birthday=date(d=None, m=None, y='1941'), birth_place=None)


 33%|███▎      | 3295/10000 [02:36<04:40, 23.92it/s]

Entry(full_name=Name(first='аня', last='титов', patronimic=None), birthday=date(d=None, m=None, y='1984'), birth_place=None)


 38%|███▊      | 3783/10000 [03:00<03:58, 26.04it/s]

Entry(full_name=Name(first='игорь', last='доценко', patronimic=None), birthday=date(d=None, m=None, y='1953'), birth_place=Birth_place(place='Хмельницкой'))


 39%|███▉      | 3904/10000 [03:06<04:03, 25.01it/s]

Entry(full_name=Name(first='андрей', last='сердюков', patronimic=None), birthday=date(d=None, m=None, y='1962'), birth_place=None)


 39%|███▉      | 3940/10000 [03:08<05:09, 19.60it/s]

Entry(full_name=Name(first='анатолий', last='сморгонский', patronimic=None), birthday=date(d=None, m=None, y='1979'), birth_place=None)


 46%|████▌     | 4619/10000 [03:41<03:58, 22.60it/s]

Entry(full_name=Name(first='инна', last='лиснянская', patronimic=None), birthday=date(d=None, m=None, y='1928'), birth_place=Birth_place(place='Баку'))


 48%|████▊     | 4823/10000 [03:52<04:29, 19.17it/s]

Entry(full_name=Name(first='колин', last='дэвис', patronimic=None), birthday=date(d=None, m=None, y=None), birth_place=None)


 60%|██████    | 6011/10000 [04:50<03:04, 21.59it/s]

Entry(full_name=Name(first='эрик', last='хобсбаум', patronimic=None), birthday=date(d=None, m=None, y='1917'), birth_place=None)


 64%|██████▎   | 6373/10000 [05:08<03:07, 19.35it/s]

Entry(full_name=Name(first='татьяна', last='самойлов', patronimic=None), birthday=date(d='4', m='мая', y='1934'), birth_place=Birth_place(place='Ленинграде'))


 65%|██████▌   | 6528/10000 [05:15<02:57, 19.55it/s]

Entry(full_name=Name(first='владимир', last='высоцкий', patronimic=None), birthday=date(d=None, m=None, y='1938'), birth_place=Birth_place(place='Москве'))


 69%|██████▉   | 6886/10000 [05:33<02:21, 22.06it/s]

Entry(full_name=Name(first='мэри', last='дональдсон', patronimic=None), birthday=None, birth_place=Birth_place(place='Хобарте'))


 74%|███████▍  | 7375/10000 [05:56<01:57, 22.32it/s]

Entry(full_name=Name(first='борис', last='васильев', patronimic=None), birthday=date(d='21', m='мая', y='1924'), birth_place=Birth_place(place='Смоленске'))


 74%|███████▍  | 7401/10000 [05:57<01:51, 23.33it/s]

Entry(full_name=Name(first='расул', last='гамзатов', patronimic=None), birthday=date(d=None, m=None, y='1923'), birth_place=None)


 87%|████████▋ | 8690/10000 [07:00<00:52, 25.07it/s]

Entry(full_name=Name(first='евгений', last='гришковец', patronimic=None), birthday=date(d=None, m=None, y='1967'), birth_place=Birth_place(place='Кемерово'))


 87%|████████▋ | 8719/10000 [07:01<00:53, 23.99it/s]

Entry(full_name=Name(first='лев', last='дуров', patronimic=None), birthday=date(d='23', m='декабря', y='1931'), birth_place=None)


 88%|████████▊ | 8808/10000 [07:06<00:53, 22.44it/s]

Entry(full_name=Name(first='дэниел', last='эдельман', patronimic=None), birthday=None, birth_place=Birth_place(place='Нью'))


 89%|████████▉ | 8927/10000 [07:12<00:48, 22.06it/s]

Entry(full_name=Name(first='зинаида', last='серебрякова', patronimic=None), birthday=date(d='28', m='июня', y='1913'), birth_place=Birth_place(place='Царском'))


 92%|█████████▏| 9171/10000 [07:24<00:37, 22.29it/s]

Entry(full_name=Name(first='игорь', last='кваша', patronimic=None), birthday=date(d=None, m=None, y='1933'), birth_place=Birth_place(place='Москве'))


 93%|█████████▎| 9253/10000 [07:29<00:32, 22.98it/s]

Entry(full_name=Name(first='полина', last='жеребцов', patronimic=None), birthday=date(d=None, m=None, y='1985'), birth_place=Birth_place(place='Грозном'))


 93%|█████████▎| 9306/10000 [07:31<00:31, 21.86it/s]

Entry(full_name=Name(first='юрий', last='трофимов', patronimic=None), birthday=date(d=None, m=None, y='1940'), birth_place=None)


 98%|█████████▊| 9791/10000 [07:55<00:09, 22.18it/s]

Entry(full_name=Name(first='в', last='брак', patronimic=None), birthday=None, birth_place=None)


 99%|█████████▊| 9862/10000 [07:59<00:07, 18.07it/s]

Entry(full_name=Name(first='алексей', last='ремизов', patronimic=None), birthday=date(d=None, m=None, y='1877'), birth_place=Birth_place(place='Москве'))


100%|██████████| 10000/10000 [08:05<00:00, 20.58it/s]
