## Load RAW text

In [1]:
from __future__ import annotations

from pathlib import Path
import re
import pandas as pd
from tqdm import tqdm


# What comes in:
# - Nothing (this cell defines configuration and folders).
#
# What it does:
# - Defines file paths, creates output directories, sets sampling size and seed.
#
# What goes out:
# - Constants: RAW_PATH, PROCESSED_DIR, ARTIFACTS_DIR, SEED, SAMPLE_PARAGRAPHS


# Paths
RAW_PATH = Path("../data/raw/war_and_peace_ru.txt")
PROCESSED_DIR = Path("data/processed")
ARTIFACTS_DIR = Path("data/artifacts")
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
ARTIFACTS_DIR.mkdir(parents=True, exist_ok=True)

# Sampling
SEED = 42
SAMPLE_PARAGRAPHS = 50

In [2]:
# quick check raw text

# What comes in:
# - RAW_PATH (text file path)
#
# What it does:
# - Reads raw text and prints basic diagnostics and a preview.
#
# Example input:
# - "data/raw/war_and_peace_ru.txt" containing the novel text
#
# Example output:
# - chars: 1901009
# - lines: 7687
# - preview: first ~800 characters

raw_text = RAW_PATH.read_text(encoding="utf-8")
print("chars:", len(raw_text))
print("lines:", raw_text.count("\n") + 1)
print("preview:\n", raw_text[:800])

chars: 1901009
lines: 7687
preview:
 Лев Николаевич Толстой
Война и мир

Первый вариант романа

От автора

  
   Я пишу до сих пор только о князьях, графах, министрах, сенаторах и их детях и боюсь, что и вперед не будет других лиц в моей истории.
   Может быть, это нехорошо и не нравится публике; может быть, для нее интереснее и поучительнее история мужиков, купцов, семинаристов, но, со всем моим желанием иметь как можно больше читателей, я не могу угодить такому вкусу, по многим причинам.
   Во-первых, потому, что памятники истории того времени, о котором я пишу, остались только в переписке и записках людей высшего круга грамотных; даже интересные и умные рассказы, которые мне удалось слышать, слышал я только от людей того же круга.
   Во-вторых, потому, что жизнь купцов, кучеров, семинаристов, каторжников и мужиков для меня


## Clean + segment paragraphs

In [3]:
# What comes in:
# - raw_text (full book text as one string)
#
# What it does:
# - Splits the text into paragraph-like segments using blank lines.
# - Normalizes whitespace inside segments.
# - Produces a DataFrame with (segment_id, text).
#
# Example input:
# - "Лев Николаевич Толстой\nВойна и мир\n\nПервый вариант романа\n\n..."
#
# Example output:
# - DataFrame rows:
#   segment_id=0, text="Лев Николаевич Толстой\nВойна и мир"
#   segment_id=1, text="Первый вариант романа"
#   ...

def split_into_paragraphs(text: str) -> List[str]:
    chunks = re.split(r"\n\s*\n+", text.strip())
    cleaned: List[str] = []
    for ch in chunks:
        ch = ch.strip()
        if not ch:
            continue
        # Keep line breaks inside a paragraph, but remove excessive spaces
        ch = re.sub(r"[ \t]+", " ", ch)
        cleaned.append(ch)
    return cleaned

paragraphs = split_into_paragraphs(raw_text)

df_segments = pd.DataFrame(
    {
        "segment_id": list(range(len(paragraphs))),
        "text": paragraphs,
    }
)

print("segments:", len(df_segments))
display(df_segments.head(20))

segments: 392


Unnamed: 0,segment_id,text
0,0,Лев Николаевич Толстой\nВойна и мир
1,1,Первый вариант романа
2,2,От автора
3,3,"Я пишу до сих пор только о князьях, графах, ми..."
4,4,ЧАСТЬ ПЕРВАЯ
5,5,I
6,6,"-- Ну что, князь, Генуя и Лукка стали не больш..."
7,7,II
8,8,Гостиная Анны Павловны начала понемногу наполн...
9,9,III


In [4]:
# What comes in:
# - df_segments: DataFrame with all paragraph-like segments
#
# What it does:
# - Defines helpers to:
#   1) drop front matter by cutting everything before "ЧАСТЬ ПЕРВАЯ"
#   2) remove structural markers like "I", "II", "ЧАСТЬ ПЕРВАЯ"
#
# Example input:
# - df_segments with rows containing "ЧАСТЬ ПЕРВАЯ", "I", "II"
#
# Example output:
# - boolean decisions for is_structural_segment(...)
# - cut_to_main_text_df(df) returns only story segments

def is_structural_segment(text: str) -> bool:
    t = text.strip()

    # Part headers like "ЧАСТЬ ПЕРВАЯ"
    if re.fullmatch(r"ЧАСТЬ\s+[А-ЯЁ]+", t):
        return True

    # Roman numerals used as chapter markers: I, II, III, IV, V, ...
    if re.fullmatch(r"[IVXLCDM]+", t):
        return True

    return False


In [5]:
# - cut_to_main_text_df(df) returns only story segments

def cut_to_main_text_df(df: pd.DataFrame) -> pd.DataFrame:
    # Keep segments strictly after the first occurrence of "ЧАСТЬ ПЕРВАЯ" if present
    mask = df["text"].str.strip().eq("ЧАСТЬ ПЕРВАЯ")
    if not mask.any():
        return df.copy()
    start_seg_id = int(df.loc[mask, "segment_id"].iloc[0])
    return df[df["segment_id"] > start_seg_id].copy()

In [6]:
# What comes in:
# - df_segments: all segments
# - cut_to_main_text_df, is_structural_segment
#
# What it does:
# - Cuts off front matter (everything before "ЧАСТЬ ПЕРВАЯ").
# - Removes structural segments (roman numerals and part headers).
# - Saves the resulting "story-only" segments to CSV for reuse.
#
# Example input:
# - df_segments including: "Лев Николаевич Толстой...", "ЧАСТЬ ПЕРВАЯ", "I", "-- Ну что, князь..."
#
# Example output:
# - df_segments_story head() shows story paragraphs like dialogues and narration
# - data/processed/segments_story.csv is created

df_segments_story = cut_to_main_text_df(df_segments)
df_segments_story = df_segments_story[~df_segments_story["text"].apply(is_structural_segment)].copy()

print("segments total:", len(df_segments))
print("segments story:", len(df_segments_story))

display(df_segments_story.head(15))

df_segments_story.to_csv(PROCESSED_DIR / "segments_story.csv", index=False)


segments total: 392
segments story: 239


Unnamed: 0,segment_id,text
6,6,"-- Ну что, князь, Генуя и Лукка стали не больш..."
8,8,Гостиная Анны Павловны начала понемногу наполн...
10,10,Вечер Анны Павловны был пущен. Веретена с разн...
12,12,Новое лицо это был молодой князь Андрей Болкон...
14,14,Анна Павловна попросила виконта подождать ее и...
16,16,"Конец истории виконта был следующий:\n ""Герцог..."
18,18,"-- Вся нация умрет за своего императора, за ве..."
20,20,Поблагодарив Анну Павловну за ее прелестный ве...
22,22,"Пьер, приехав вперед, как домашний человек, пр..."
23,23,Х


## NER extraction (PERSON)

In [7]:
# What comes in:
# - Nothing
#
# What it does:
# - Downloads (once) and loads the Stanza Russian NER pipeline.
# - Prepares an NLP object for PERSON extraction.
#
# Example input:
# - sentence: "Так говорила Анна Павловна Шерер..."
#
# Example output:
# - nlp object that can produce entities with type "PER"

import stanza

# Download models once (cached in ~/.stanza)
stanza.download("ru")

nlp = stanza.Pipeline(
    lang="ru",
    processors="tokenize,ner",
    tokenize_no_ssplit=True,  # We already control paragraph/sentence splitting
    use_gpu=False
)

nlp

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

2026-02-01 12:17:44 INFO: Downloaded file to /Users/newuser/stanza_resources/resources.json
2026-02-01 12:17:44 INFO: Downloading default packages for language: ru (Russian) ...
2026-02-01 12:17:45 INFO: File exists: /Users/newuser/stanza_resources/ru/default.zip
2026-02-01 12:17:47 INFO: Finished downloading models and saved to /Users/newuser/stanza_resources
2026-02-01 12:17:47 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

2026-02-01 12:17:47 INFO: Downloaded file to /Users/newuser/stanza_resources/resources.json
2026-02-01 12:17:48 INFO: Loading these models for language: ru (Russian):
| Processor | Package   |
-------------------------
| tokenize  | syntagrus |
| ner       | wikiner   |

2026-02-01 12:17:48 INFO: Using device: cpu
2026-02-01 12:17:48 INFO: Loading: tokenize
  checkpoint = torch.load(filename, lambda storage, loc: storage)
2026-02-01 12:17:51 INFO: Loading: ner
  checkpoint = torch.load(filename, lambda storage, loc: storage)
  data = torch.load(self.filename, lambda storage, loc: storage)
  state = torch.load(filename, lambda storage, loc: storage)
2026-02-01 12:17:52 INFO: Done loading processors!


<stanza.pipeline.core.Pipeline at 0x15a225510>

In [8]:
# What comes in:
# - df_segments_story: DataFrame with story paragraphs
# - nlp: Stanza pipeline
#
# What it does:
# - Runs PERSON NER on each paragraph.
# - Extracts mention-level entities with character offsets.
#
# Example input:
# - text="Так говорила ... Анна Павловна Шерер ..."
#
# Example output:
# - rows like:
#   segment_id=6
#   span_start=16
#   span_end=36
#   mention_text="Анна Павловна Шерер"
#   label="PER"

rows = []

for r in tqdm(df_segments_story.itertuples(index=False), total=len(df_segments_story)):
    seg_id = int(r.segment_id)
    text = str(r.text)

    doc = nlp(text)

    for ent in doc.ents:
        if ent.type == "PER":
            rows.append(
                {
                    "segment_id": seg_id,
                    "span_start": ent.start_char,
                    "span_end": ent.end_char,
                    "mention_text": ent.text,
                    "label": ent.type,
                }
            )

df_mentions = pd.DataFrame(rows)

print("mentions:", len(df_mentions))
display(df_mentions.head(30))


100%|██████████| 239/239 [28:03<00:00,  7.04s/it]

mentions: 12747





Unnamed: 0,segment_id,span_start,span_end,mention_text,label
0,6,79,89,Буонапарте,PER
1,6,477,496,Анна Павловна Шерер,PER
2,6,534,549,Марии Федоровны,PER
3,6,586,593,Василия,PER
4,6,627,640,Анна Павловна,PER
5,6,1087,1097,Аннa Шерер,PER
6,6,1550,1563,Анне Павловне,PER
7,6,2045,2058,Анна Павловна,PER
8,6,2338,2343,Ежели,PER
9,6,2553,2565,Новосильцева,PER


## Analysis (counts, top names, examples, errors)

In [9]:
# What comes in:
# - df_mentions: mention-level entities
# - df_segments_story: source paragraphs
#
# What it does:
# - Computes top person surface forms by frequency.
# - Shows example paragraphs for the most frequent mentions.
#
# Example input:
# - df_mentions with many rows for "Анна Павловна", "князь Василий", "Пьер"
#
# Example output:
# - top_mentions DataFrame with mention_text and counts
# - printed examples of paragraphs for the top mentions

top_mentions = (
    df_mentions["mention_text"]
    .value_counts()
    .rename_axis("mention_text")
    .reset_index(name="count")
)

display(top_mentions.head(30))

# Show a few example paragraphs for the top 5 mentions
top5 = top_mentions.head(5)["mention_text"].tolist()

for name in top5:
    seg_ids = df_mentions.loc[df_mentions["mention_text"] == name, "segment_id"].unique()[:3]
    print("\n---")
    print("mention_text:", name)
    print("example segment_ids:", seg_ids.tolist())
    for sid in seg_ids:
        para = df_segments_story.loc[df_segments_story["segment_id"] == sid, "text"].iloc[0]
        print("\nsegment_id:", int(sid))
        print(para[:600])


Unnamed: 0,mention_text,count
0,Пьер,769
1,Андрей,547
2,Наташа,448
3,Николай,296
4,Князь Андрей,253
5,Андрея,224
6,Борис,182
7,Кутузов,168
8,Долохов,164
9,Пьерa,153



---
mention_text: Пьер
example segment_ids: [8, 12, 16]

segment_id: 8
Гостиная Анны Павловны начала понемногу наполняться. Приехала высшая знать Петербурга, люди самые разнородные по возрастам и характерам, но одинаковые по обществу, в каком все жили; приехал дипломат граф З. в звездах и орденах всех иностранных дворов, княгиня Л., отцветающая красавица, жена посланника; вошел дряхлый генерал, стуча саблей и кряхтя; вошла дочь князя Василия, красавица Элен, заехавшая за отцом, чтобы с ним вместе ехать на праздник посланника. Она была в шифре и бальном платье. Приехала и известная как самая обворожительная женщина Петербурга молодая, маленькая княгиня Болконская,

segment_id: 12
Новое лицо это был молодой князь Андрей Болконский, муж маленькой княгини. Не столько по тому, что молодой князь приехал так поздно, но все-таки был принят хозяйкой самым любезным образом, сколько по тому, как он вошел в комнату, было видно, что он один из тех светских молодых людей, которые так избалованы све

## Export to CSV

In [10]:
# What comes in:
# - df_mentions: person mentions
# - df_segments_story: story segments
#
# What it does:
# - Writes mention-level CSV.
# - Writes surface-form frequency CSV.
#
# Example output files:
# - data/artifacts/person_mentions.csv
# - data/artifacts/person_surface_counts.csv

mentions_path = ARTIFACTS_DIR / "person_mentions.csv"
counts_path = ARTIFACTS_DIR / "person_surface_counts.csv"

df_mentions.to_csv(mentions_path, index=False, encoding="utf-8")
top_mentions.to_csv(counts_path, index=False, encoding="utf-8")

print("saved:", mentions_path)
print("saved:", counts_path)

saved: data/artifacts/person_mentions.csv
saved: data/artifacts/person_surface_counts.csv


## Normalize mentions -> build person candidates (first name, last name, title)

In [11]:
# What comes in:
# - df_mentions: columns [segment_id, span_start, span_end, mention_text, label]
#
# What it does:
# - Cleans surface forms (fix latin lookalikes, normalize whitespace).
# - Extracts title/role prefix (knyaz, graf, etc) if present.
# - Lemmatizes name tokens (using pymorphy2) to reduce case inflections.
# - Produces normalized mention table suitable for grouping into persons.
#
# Example input:
# - mention_text = "Князь Андрея"
# - mention_text = "Пьерa" (latin 'a')
# - mention_text = "Ах"
#
# Example output:
# - mention_text_clean="Князь Андрея"
#   title="князь"
#   name_core="Андрея"
#   name_lemma="Андрей"
#   canonical_key="андрей"
#   is_noise=False
#
# - mention_text_clean="Пьера"
#   title=None
#   name_core="Пьера"
#   name_lemma="Пьер"
#   canonical_key="пьер"
#   is_noise=False
#
# - mention_text_clean="Ах"
#   is_noise=True

from typing import Dict, List, Optional, Tuple
import pymorphy3

morph = pymorphy3.MorphAnalyzer()

# Titles / roles to detect as a prefix
TITLE_PREFIXES = [
    "князь", "княгиня", "княжна",
    "граф", "барон", "виконт",
    "император", "государь",
    "фрейлина", "адъютант",
    "мсье", "мадам",
]

# Noise words that sometimes appear as PER by mistake
NOISE_STOPWORDS = {
    "ах", "эжели", "ну", "да", "нет", "о", "эй",
}

# Latin lookalikes often found in OCR or mixed encodings
LATIN_TO_CYR = str.maketrans({
    "A": "А", "a": "а",
    "B": "В",
    "C": "С", "c": "с",
    "E": "Е", "e": "е",
    "H": "Н", "h": "н",
    "K": "К",
    "M": "М",
    "O": "О", "o": "о",
    "P": "Р", "p": "р",
    "T": "Т",
    "X": "Х", "x": "х",
    "Y": "У", "y": "у",
})

CYRILLIC_RE = re.compile(r"[А-ЯЁа-яё]")

def fix_latin_lookalikes(text: str) -> str:
    # Replace latin letters with similar Cyrillic letters
    return text.translate(LATIN_TO_CYR)

def normalize_whitespace(text: str) -> str:
    # Collapse whitespace and normalize quotes
    t = text.strip()
    t = re.sub(r"\s+", " ", t)
    return t

def split_title_prefix(text: str) -> Tuple[Optional[str], str]:
    # Extract a single title prefix if present at the beginning
    t = text.strip()
    t_lower = t.lower()
    for title in TITLE_PREFIXES:
        if t_lower.startswith(title + " "):
            return title, t[len(title):].strip()
    return None, t

def lemmatize_token(token: str) -> str:
    # Lemmatize one token using pymorphy2
    # Keep proper noun capitalization as in lemma when possible
    parses = morph.parse(token)
    if not parses:
        return token
    lemma = parses[0].normal_form
    # Capitalize lemma for readability if original looks like a name
    if token[:1].isupper():
        lemma = lemma[:1].upper() + lemma[1:]
    return lemma

def lemmatize_name(name_core: str) -> str:
    # Lemmatize each token in the core name
    tokens = [t for t in re.split(r"[\s\-]+", name_core) if t]
    lemmas = [lemmatize_token(t) for t in tokens]
    return " ".join(lemmas).strip()

def is_noise_name(name_core: str) -> bool:
    # Fast noise checks
    if not name_core:
        return True
    if not CYRILLIC_RE.search(name_core):
        return True
    if name_core.lower() in NOISE_STOPWORDS:
        return True
    # Too short tokens are often noise
    if len(name_core) <= 2:
        return True
    return False

norm_rows: List[Dict] = []

for r in tqdm(df_mentions.itertuples(index=False), total=len(df_mentions)):
    raw = str(r.mention_text)
    clean = normalize_whitespace(fix_latin_lookalikes(raw))

    title, core = split_title_prefix(clean)

    noise = is_noise_name(core)

    lemma = lemmatize_name(core) if not noise else core
    canonical_key = lemma.lower() if not noise else None

    norm_rows.append({
        "segment_id": int(r.segment_id),
        "span_start": int(r.span_start),
        "span_end": int(r.span_end),
        "mention_text": raw,
        "mention_text_clean": clean,
        "title": title,
        "name_core": core,
        "name_lemma": lemma,
        "canonical_key": canonical_key,
        "is_noise": bool(noise),
    })

df_mentions_norm = pd.DataFrame(norm_rows)

print("mentions_norm:", len(df_mentions_norm))
print("noise_rate:", df_mentions_norm["is_noise"].mean())
display(df_mentions_norm.head(30))


100%|██████████| 12747/12747 [00:01<00:00, 9012.04it/s]


mentions_norm: 12747
noise_rate: 0.015376166941241077


Unnamed: 0,segment_id,span_start,span_end,mention_text,mention_text_clean,title,name_core,name_lemma,canonical_key,is_noise
0,6,79,89,Буонапарте,Буонапарте,,Буонапарте,Буонапарта,буонапарта,False
1,6,477,496,Анна Павловна Шерер,Анна Павловна Шерер,,Анна Павловна Шерер,Анна Павлович Шерер,анна павлович шерер,False
2,6,534,549,Марии Федоровны,Марии Федоровны,,Марии Федоровны,Мария Фёдорович,мария фёдорович,False
3,6,586,593,Василия,Василия,,Василия,Василий,василий,False
4,6,627,640,Анна Павловна,Анна Павловна,,Анна Павловна,Анна Павлович,анна павлович,False
5,6,1087,1097,Аннa Шерер,Анна Шерер,,Анна Шерер,Анна Шерер,анна шерер,False
6,6,1550,1563,Анне Павловне,Анне Павловне,,Анне Павловне,Анне Павлович,анне павлович,False
7,6,2045,2058,Анна Павловна,Анна Павловна,,Анна Павловна,Анна Павлович,анна павлович,False
8,6,2338,2343,Ежели,Ежели,,Ежели,Ежели,ежели,False
9,6,2553,2565,Новосильцева,Новосильцева,,Новосильцева,Новосильцев,новосильцев,False


## Group mentions into person candidates (v0): first name, last name, titles


In [12]:
# What comes in:
# - df_mentions_norm with canonical_key, title, name_lemma
#
# What it does:
# - Builds person candidates by grouping normalized mentions.
# - Heuristic surname extraction:
#   - If name_lemma has 2+ tokens, treat the last token as last_name.
#   - Otherwise last_name is None (to be filled later).
# - Aggregates title frequencies per person candidate.
# - Computes a simple confidence score [0..1].
#
# Example input:
# - name_lemma="Кутузов" -> first="Кутузов", last=None (single token, likely a surname used alone)
# - name_lemma="Андрей Болконский" -> first="Андрей", last="Болконский"
#
# Example output:
# - persons_v0 DataFrame with:
#   person_id, first_name, last_name, canonical_full_name, titles_json, evidence_count, confidence

import json

def split_first_last(name_lemma: str) -> Tuple[Optional[str], Optional[str]]:
    tokens = [t for t in name_lemma.split(" ") if t]
    if len(tokens) == 1:
        return tokens[0], None
    # Use first token as first name and last token as last name (v0 heuristic)
    return tokens[0], tokens[-1]

df_work = df_mentions_norm[~df_mentions_norm["is_noise"]].copy()

# Add first_name / last_name columns
first_last = df_work["name_lemma"].apply(split_first_last)
df_work["first_name"] = first_last.apply(lambda x: x[0])
df_work["last_name"] = first_last.apply(lambda x: x[1])

# Person key: prefer full lemma form (more stable than only first name)
df_work["person_key"] = df_work["name_lemma"].str.lower()

# Aggregate titles per person_key
title_counts = (
    df_work.dropna(subset=["title"])
    .groupby(["person_key", "title"])
    .size()
    .reset_index(name="count")
)

title_map: Dict[str, List[Dict]] = {}
for r in title_counts.itertuples(index=False):
    title_map.setdefault(r.person_key, []).append({"title": r.title, "count": int(r.count)})

# Aggregate evidence count and pick canonical_full_name as the most frequent clean mention
person_agg = (
    df_work.groupby("person_key")
    .agg(
        evidence_count=("person_key", "size"),
        canonical_full_name=("mention_text_clean", lambda s: s.value_counts().index[0]),
        first_name=("first_name", lambda s: s.value_counts().index[0]),
        last_name=("last_name", lambda s: s.value_counts().index[0] if s.notna().any() else None),
    )
    .reset_index()
)

# Confidence heuristic (v0)
# - more evidence -> higher confidence
# - having last name -> higher confidence
def confidence_v0(evidence_count: int, has_last: bool) -> float:
    # Evidence saturates quickly
    base = min(1.0, (evidence_count / 50.0) ** 0.5)
    bonus = 0.15 if has_last else 0.0
    return float(min(1.0, base + bonus))

person_ids = [f"P{str(i).zfill(5)}" for i in range(1, len(person_agg) + 1)]
person_agg["person_id"] = person_ids

person_agg["titles_roles"] = person_agg["person_key"].map(lambda k: title_map.get(k, []))
person_agg["confidence"] = person_agg.apply(
    lambda r: confidence_v0(int(r.evidence_count), r.last_name is not None),
    axis=1
)

# Reorder columns
persons_v0 = person_agg[[
    "person_id",
    "first_name",
    "last_name",
    "canonical_full_name",
    "titles_roles",
    "evidence_count",
    "confidence",
]].sort_values(["evidence_count"], ascending=False)

display(persons_v0.head(50))
print("persons_v0:", len(persons_v0))


Unnamed: 0,person_id,first_name,last_name,canonical_full_name,titles_roles,evidence_count,confidence
66,P00067,Андрей,,Андрей,"[{'title': 'князь', 'count': 254}]",1180,1.0
1199,P01200,Пьер,,Пьер,"[{'title': 'мсье', 'count': 3}]",1157,1.0
968,P00969,Наташа,,Наташа,[],687,1.0
1000,P01001,Николай,,Николай,[],425,1.0
851,P00852,Марья,,Марья,"[{'title': 'княжна', 'count': 108}]",314,1.0
204,P00205,Борис,,Борис,[],301,1.0
738,P00739,Кутузов,,Кутузов,"[{'title': 'адъютант', 'count': 1}]",296,1.0
950,P00951,Наполеон,,Наполеон,"[{'title': 'император', 'count': 4}]",291,1.0
1329,P01330,Соня,,Соня,[],233,1.0
244,P00245,Василий,,Василий,"[{'title': 'князь', 'count': 40}]",214,1.0


persons_v0: 1612


## Export persons_v0 and aliases_v0 to CSV/JSON

In [13]:
# What comes in:
# - persons_v0 DataFrame
# - df_work DataFrame (normalized mentions without noise)
#
# What it does:
# - Exports:
#   1) persons_v0.csv (one row per person candidate)
#   2) aliases_v0.csv (aliases and counts per person)
#   3) persons_v0.jsonl (optional, convenient for later LLM-as-judge)
#
# Example output files:
# - data/artifacts/persons_v0.csv
# - data/artifacts/aliases_v0.csv
# - data/artifacts/persons_v0.jsonl

# Map person_key -> person_id
key_to_pid = dict(zip(persons_v0["person_id"], persons_v0["canonical_full_name"]))
# We also need person_key -> person_id, so build it from persons_v0
person_key_to_id = dict(zip(person_agg["person_key"], person_agg["person_id"]))

# Alias counts
aliases_v0 = (
    df_work.groupby(["person_key", "mention_text_clean"])
    .size()
    .reset_index(name="count")
)
aliases_v0["person_id"] = aliases_v0["person_key"].map(person_key_to_id)
aliases_v0 = aliases_v0[["person_id", "mention_text_clean", "count"]].sort_values(["count"], ascending=False)

# Save
persons_csv_path = ARTIFACTS_DIR / "persons_v0.csv"
aliases_csv_path = ARTIFACTS_DIR / "aliases_v0.csv"
persons_jsonl_path = ARTIFACTS_DIR / "persons_v0.jsonl"

persons_v0.to_csv(persons_csv_path, index=False, encoding="utf-8")
aliases_v0.to_csv(aliases_csv_path, index=False, encoding="utf-8")

with persons_jsonl_path.open("w", encoding="utf-8") as f:
    for r in persons_v0.itertuples(index=False):
        obj = {
            "person_id": r.person_id,
            "first_name": r.first_name,
            "last_name": r.last_name,
            "canonical_full_name": r.canonical_full_name,
            "titles_roles": r.titles_roles,
            "evidence_count": int(r.evidence_count),
            "confidence": float(r.confidence),
        }
        f.write(json.dumps(obj, ensure_ascii=False) + "\n")

print("saved:", persons_csv_path)
print("saved:", aliases_csv_path)
print("saved:", persons_jsonl_path)

display(aliases_v0.head(30))


saved: data/artifacts/persons_v0.csv
saved: data/artifacts/aliases_v0.csv
saved: data/artifacts/persons_v0.jsonl


Unnamed: 0,person_id,mention_text_clean,count
1702,P01200,Пьер,769
89,P00067,Андрей,547
1388,P00969,Наташа,448
1430,P01001,Николай,296
92,P00067,Князь Андрей,254
1703,P01200,Пьера,232
91,P00067,Андрея,224
330,P00205,Борис,182
1058,P00739,Кутузов,168
689,P00466,Долохов,164
