In [None]:
# ============================================================
# COLAB: "BEST-IN-CLASS" DATASET BUILDER (Lomonosov events)
# Тема: ИС планирования и анализа научных мероприятий (МУ им. Витте)
# Источник: https://lomonosov-msu.ru/<lang>/event/<id>/ (+ /menu/)
# ============================================================
!pip -q install beautifulsoup4 lxml pandas tqdm python-dateutil dateparser faker pyarrow

import re
import time
import json
import random
import hashlib
from dataclasses import dataclass
from pathlib import Path
from typing import Optional, Tuple, List, Dict, Any

import requests
import pandas as pd
from bs4 import BeautifulSoup
from tqdm.auto import tqdm
import dateparser
from faker import Faker
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/315.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m315.5/315.5 kB[0m [31m19.0 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.0 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m69.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# -------------------------
# SETTINGS (adjust to your needs)
# -------------------------
# Site root, interface language and the search page used to discover event ids.
BASE = "https://lomonosov-msu.ru"
LANG = "rus"  # rus / eng / ukr
SEARCH_URL = f"{BASE}/{LANG}/event/search"

TARGET_EVENTS = 350
MIN_FOCUS_EVENTS = 12          # we want at least N events related to Witte University (matched by keywords)
SCAN_BACK_IDS = 6500           # larger => slower, but wider coverage
SLEEP_SEC = 0.35               # pause (seconds) used with time.sleep between scan iterations
TIMEOUT = 25                   # per-request timeout passed to session.get

# Enable the on-disk HTML cache (important for repeated runs)
USE_DISK_CACHE = True

# Case-insensitive substrings that mark an event as a "focus" (Witte University) event.
FOCUS_ORG_KEYWORDS = [
    "МУ имени С. Ю. Витте",
    "Московский университет имени С.Ю. Витте",
    "Московский университет имени С. Ю. Витте",
    "muiv",
    "Витте",
]

# Synthetic data
SYN_SEED = 42
AVG_EVENTS_PER_PARTICIPANT = (1, 3)  # a participant attends 1..3 events (for a realistic network)
SYN_PARTICIPANTS_PER_EVENT = (35, 160)

# Output directory and the sub-directory that holds cached raw HTML pages.
OUT_DIR = Path("/content/science_events_dataset")
RAW_DIR = OUT_DIR / "_raw_html"
OUT_DIR.mkdir(parents=True, exist_ok=True)
RAW_DIR.mkdir(parents=True, exist_ok=True)

# Provenance stamps written into every events row.
PARSER_VERSION = "v2.0-menu+main-privacy-safe"
# NOTE(review): Timestamp.utcnow() is reported as deprecated in recent pandas releases — confirm
# against the installed version and consider pd.Timestamp.now("UTC").
BUILD_TS = pd.Timestamp.utcnow().strftime("%Y-%m-%d %H:%M:%S UTC")

# Seed both RNG sources so the synthetic part is reproducible.
random.seed(SYN_SEED)
Faker.seed(SYN_SEED)
fake = Faker("ru_RU")

In [None]:
# -------------------------
# HTTP с ретраями + "вежливые" запросы
# -------------------------
def make_session() -> requests.Session:
    """Build a requests session with automatic retries and polite headers.

    Returns:
        A ``requests.Session`` whose http:// and https:// transports share one
        adapter that retries GET/HEAD up to 4 times (backoff factor 0.7) on
        429/500/502/503/504 without raising on the final status, and that
        sends a descriptive User-Agent plus a Russian-first Accept-Language.
    """
    retry_policy = Retry(
        total=4,
        backoff_factor=0.7,
        status_forcelist=(429, 500, 502, 503, 504),
        allowed_methods=("GET", "HEAD"),
        raise_on_status=False,
    )
    shared_adapter = HTTPAdapter(max_retries=retry_policy)

    http = requests.Session()
    for scheme in ("http://", "https://"):
        http.mount(scheme, shared_adapter)

    polite_headers = {
        "User-Agent": "Mozilla/5.0 (Colab; academic dataset builder; respectful scraping)",
        "Accept-Language": "ru,en;q=0.8",
    }
    http.headers.update(polite_headers)
    return http

# Shared module-level session; fetch_html() sends its requests through it.
session = make_session()

def _cache_path(url: str) -> Path:
    """Return the cache file location for *url* inside RAW_DIR.

    The file name is the first 24 hex characters of the SHA-256 digest of the
    UTF-8 encoded URL, followed by ``.html``.
    """
    digest = hashlib.sha256(url.encode("utf-8")).hexdigest()
    file_name = digest[:24] + ".html"
    return RAW_DIR / file_name

def fetch_html(url: str) -> Optional[str]:
    """Return the HTML of *url*, or None when it cannot be obtained.

    - With USE_DISK_CACHE enabled, an existing cache file is returned without
      touching the network, and fresh 200 responses are written to the cache.
    - Requests go through the module-level ``session`` (which carries the
      retry policy) with the TIMEOUT setting.
    - Any non-200 status and any exception raised while requesting or caching
      yields None.
    """
    cache_file = _cache_path(url)
    if USE_DISK_CACHE and cache_file.exists():
        return cache_file.read_text(encoding="utf-8", errors="ignore")

    try:
        response = session.get(url, timeout=TIMEOUT)
        if response.status_code == 200:
            body = response.text
            if USE_DISK_CACHE:
                cache_file.write_text(body, encoding="utf-8")
            return body
        return None
    except Exception:
        # Best-effort fetch: callers treat None as "page unavailable".
        return None

In [None]:
# -------------------------
# ДАТЫ / ТЕКСТ / PII-редакция
# -------------------------
# Abbreviated Russian month tokens (lower case, no trailing dot) -> genitive month name.
MONTH_ABBR = {
    "янв": "января", "фев": "февраля", "мар": "марта", "апр": "апреля",
    "май": "мая", "июн": "июня", "июл": "июля", "авг": "августа",
    "сен": "сентября", "сент": "сентября", "окт": "октября", "ноя": "ноября", "дек": "декабря",
}

def normalize_ru_text(s: str) -> str:
    """Normalise a Russian date-like string before parsing.

    Steps:
      1. Trim and collapse all whitespace runs to a single space (None -> "").
      2. Turn en dashes into em dashes, and a space-surrounded hyphen into
         " — " (hyphens inside words/URLs are left alone).
      3. Expand 3-4 letter month abbreviations listed in MONTH_ABBR
         ("Апр" -> "апреля"). A dot that belongs to the abbreviation is
         dropped ("Апр. 2025" -> "апреля 2025"); if the dot was glued to the
         next token ("сент.2025") a space is inserted instead.

    Tokens that are not month abbreviations are returned unchanged, including
    their trailing dot.
    """
    s = re.sub(r"\s+", " ", (s or "").strip())
    s = s.replace("–", "—")            # en dash -> em dash
    # Only replace " - " so that hyphens inside URLs/words survive.
    s = re.sub(r"\s-\s", " — ", s)

    def repl(m):
        full = MONTH_ABBR.get(m.group(1).lower())
        if full is None:
            return m.group(0)
        following = m.string[m.end():m.end() + 1]
        # The consumed dot was the only separator: keep the tokens apart.
        if m.group(2) and following.isalnum():
            return full + " "
        return full

    # The word boundary sits BEFORE the optional dot; placing it after the dot
    # would prevent the dot from matching when whitespace follows.
    s = re.sub(r"\b([А-Яа-я]{3,4})\b(\.?)", repl, s)
    return s

def safe_lines_from_soup(soup: BeautifulSoup) -> List[str]:
    """Return the non-empty text lines of *soup*.

    The text is extracted with newline separators; inside each line every
    whitespace run is collapsed to one space and the line is trimmed. Lines
    that end up empty are dropped.
    """
    result: List[str] = []
    for raw_line in soup.get_text("\n").splitlines():
        squeezed = re.sub(r"\s+", " ", raw_line).strip()
        if squeezed:
            result.append(squeezed)
    return result

def parse_time_range(line: str) -> Optional[Tuple[str, str]]:
    """Extract an "HH:MM - HH:MM" time range from *line*.

    The separator may be a hyphen, an en dash or an em dash (raw page lines
    are passed here without dash normalisation), optionally surrounded by
    whitespace.

    Returns:
        ``(start, end)`` as the matched strings, or None when *line* is
        empty/None or contains no time range.
    """
    m = re.search(r"(\d{1,2}:\d{2})\s*[-–—]\s*(\d{1,2}:\d{2})", line or "")
    return (m.group(1), m.group(2)) if m else None

def dp_parse_date(s: str) -> Optional[pd.Timestamp]:
    """Parse a single Russian date string into a pandas Timestamp.

    The input is first passed through normalize_ru_text, then handed to
    dateparser with Russian as the only language, day-month-year order,
    timezone-naive output and "first" as the preferred day of month.

    Returns:
        The parsed Timestamp, or None when dateparser yields nothing.
    """
    parser_settings = {
        "RETURN_AS_TIMEZONE_AWARE": False,
        "PREFER_DAY_OF_MONTH": "first",
        "DATE_ORDER": "DMY",
    }
    parsed = dateparser.parse(
        normalize_ru_text(s),
        languages=["ru"],
        settings=parser_settings,
    )
    if not parsed:
        return None
    return pd.Timestamp(parsed)

def parse_date_range(s: str) -> Tuple[Optional[pd.Timestamp], Optional[pd.Timestamp]]:
    """Parse a date or date range into ``(start, end)`` timestamps.

    Understands:
    - "1 Декабря — 20 Февраля 2026"  (year only on the right-hand side)
    - "14 — 17 Сентября 2026"        (two days sharing month and year)
    - "14-17 Сентября 2026"          (same, tight hyphen)
    - "11 — 25 Апр 2025"
    - "28 Апр 2026"                  (single date -> start == end)

    When the left part has no year it borrows the year of the right part. If
    that puts the start after the end (a range crossing New Year, e.g.
    December — February), the start is moved back by one year.

    Returns:
        ``(start, end)``; either element is None when it could not be parsed.
    """
    s0 = normalize_ru_text(s)

    # 1) "14 — 17 сентября 2026": two day numbers, one month, one year.
    m = re.search(r"\b(\d{1,2})\s*[-—]\s*(\d{1,2})\s+([а-я]+)\s+(\d{4})\b", s0.lower())
    if m:
        d1, d2, mon, yy = m.groups()
        return dp_parse_date(f"{d1} {mon} {yy}"), dp_parse_date(f"{d2} {mon} {yy}")

    # 2) Two parts split by an em dash; the year may be present only on the right.
    parts = [p.strip() for p in s0.split("—") if p.strip()]
    if len(parts) >= 2:
        left, right = parts[0], parts[1]
        yr = re.search(r"\b(\d{4})\b", right)
        year_borrowed = False
        if yr and not re.search(r"\b(\d{4})\b", left):
            left = f"{left} {yr.group(1)}"
            year_borrowed = True
        start, end = dp_parse_date(left), dp_parse_date(right)
        # A borrowed year can only be wrong by -1: the range wrapped over New Year.
        if year_borrowed and start is not None and end is not None and start > end:
            start = start - pd.DateOffset(years=1)
        return start, end

    # 3) A single date.
    one = dp_parse_date(s0)
    return one, one

def extract_pair_value(lines: List[str], key: str) -> Optional[str]:
    """Return the line that follows the first line equal to *key*.

    The comparison ignores case and surrounding whitespace. The returned value
    is trimmed. None is returned when no line matches or when the matching
    line is the last one.
    """
    hit = next(
        (pos for pos, text in enumerate(lines) if text.strip().lower() == key.lower()),
        None,
    )
    if hit is None or hit + 1 >= len(lines):
        return None
    return lines[hit + 1].strip()

def redact_pii(text: str) -> str:
    """Mask e-mail addresses and phone-like digit runs in *text*.

    E-mails become "[REDACTED_EMAIL]" and phone-like sequences become
    "[REDACTED_PHONE]"; the result is trimmed. None is treated as "".
    """
    email_pattern = r"[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}"
    phone_pattern = r"\+?\d[\d\-\s\(\)]{7,}\d"

    cleaned = re.sub(email_pattern, "[REDACTED_EMAIL]", text or "")
    cleaned = re.sub(phone_pattern, "[REDACTED_PHONE]", cleaned)
    return cleaned.strip()

def looks_like_contact_line(s: str) -> bool:
    """Tell whether *s* contains an "@" or a phone-like digit sequence."""
    has_at_sign = "@" in s
    return has_at_sign or re.search(r"\+?\d[\d\-\s\(\)]{7,}\d", s) is not None

def looks_like_person_name(s: str) -> bool:
    """Rough heuristic for a Russian personal name.

    True only when the trimmed text is 5..80 characters long, contains no "@"
    or digits, and consists of exactly 2 or 3 words, each being a capital
    Cyrillic letter followed by lower-case Cyrillic letters or hyphens.
    """
    candidate = (s or "").strip()
    if not 5 <= len(candidate) <= 80:
        return False
    if re.search(r"[@\d]", candidate):
        return False

    words = candidate.split()
    if len(words) not in (2, 3):
        return False
    return all(re.match(r"^[А-ЯЁ][а-яё\-]+$", word) is not None for word in words)

In [None]:
# -------------------------
# Структуры данных
# -------------------------
@dataclass
class EventParsed:
    # One scraped event: core fields come from the /menu/ page, the rest from the main page.
    event_id: int
    url_main: str
    url_menu: str
    title: str
    subtitle: Optional[str]

    status: Optional[str]           # site status label, e.g. "Проводится" / "Прошло" / "Планируется"
    event_start: Optional[pd.Timestamp]
    event_end: Optional[pd.Timestamp]
    start_time: Optional[str]
    end_time: Optional[str]

    city: Optional[str]
    venue: Optional[str]

    reg_status: Optional[str]       # open/closed/unknown
    reg_start: Optional[pd.Timestamp]
    reg_end: Optional[pd.Timestamp]
    reg_deadline_dt: Optional[pd.Timestamp]  # set when the page says "Регистрация до ..."

    cost: Optional[str]
    organizers: List[str]
    languages: Optional[str]        # raw string taken from "Рабочие языки: ..." when found
    format: Optional[str]           # attendance format tag derived from the description (if found)

    description: str
    tracks: List[str]

    menu_pages: List[Dict[str, Any]]   # internal pages listed in the menu
    menu_links: List[Dict[str, Any]]   # external links listed in the menu
    file_links: List[Dict[str, Any]]   # /file/ links (information letters and the like)

    scraped_at: str
    parse_warnings: List[str]


def is_focus_event(ev: EventParsed) -> bool:
    """Tell whether *ev* mentions any of the FOCUS_ORG_KEYWORDS.

    Title, subtitle, city, venue, organizers and description are joined into
    one lower-cased text; the match is a case-insensitive substring test.
    """
    searchable_parts = (
        ev.title or "",
        ev.subtitle or "",
        ev.city or "",
        ev.venue or "",
        " ".join(ev.organizers or []),
        ev.description or "",
    )
    haystack = " ".join(searchable_parts).lower()
    for keyword in FOCUS_ORG_KEYWORDS:
        if keyword.lower() in haystack:
            return True
    return False

In [None]:
# -------------------------
# Парсер /menu/ (стабильные поля + меню ссылок)
# -------------------------
def parse_menu_page(html: str, url_menu: str) -> Tuple[Optional[Dict[str, Any]], List[Dict[str, Any]], List[Dict[str, Any]]]:
    """Parse an event's /menu/ page: stable core fields plus its menu links.

    Args:
        html: Raw HTML of the menu page.
        url_menu: URL the page was fetched from; used as the base for
            resolving relative and protocol-relative hrefs.

    Returns:
        ``(core, menu_pages, menu_links)`` where
        - ``core`` is a dict with title, subtitle, status, event_start,
          event_end, start_time, end_time, city, venue, reg_status, reg_start,
          reg_end and reg_deadline_dt, or None when the page has no <h1>;
        - ``menu_pages`` are links into this site's ``/{LANG}/event/`` area,
          as ``{"page_title", "url"}`` dicts;
        - ``menu_links`` are all remaining links, as
          ``{"link_title", "url", "link_type"}`` dicts with link_type
          "calendar" for Google Calendar links and "external" otherwise.
        Both link lists are de-duplicated by URL, keeping the first occurrence.
    """
    from urllib.parse import urljoin  # local import keeps this block self-contained

    soup = BeautifulSoup(html, "lxml")
    lines = safe_lines_from_soup(soup)

    # title
    h1 = soup.find("h1")
    title = h1.get_text(" ", strip=True) if h1 else None

    # subtitle: often the text line right after the title
    subtitle = None
    if title:
        idx = next((i for i, ln in enumerate(lines) if ln.strip() == title.strip()), None)
        if idx is not None and idx + 1 < len(lines):
            cand = lines[idx + 1].strip()
            if cand and cand != title and len(cand) < 220:
                subtitle = cand

    # status: a standalone label near the top of the page
    status = next(
        (ln for ln in lines[:35] if ln in {"Проводится", "Прошло", "Планируется"}),
        None,
    )

    # date line (+ optional time line right below it).
    # "D month YYYY" also matches the tail of "D1 — D2 month YYYY", so a single
    # pattern covers both the single-date and the day-range layouts.
    date_re = re.compile(r"\b\d{1,2}\s+[А-Яа-я]{3,10}\s+\d{4}\b")
    date_line = None
    time_line = None
    for i, ln in enumerate(lines[:60]):
        if date_re.search(normalize_ru_text(ln)):
            date_line = ln
            if i + 1 < len(lines) and parse_time_range(lines[i + 1]):
                time_line = lines[i + 1]
            break

    event_start, event_end = (None, None)
    if date_line:
        event_start, event_end = parse_date_range(date_line)

    start_time, end_time = (None, None)
    if time_line:
        tr = parse_time_range(time_line)
        if tr:
            start_time, end_time = tr

    city = extract_pair_value(lines, "Город")
    venue = extract_pair_value(lines, "Место")
    reg_raw = extract_pair_value(lines, "Регистрация")

    reg_status, reg_start, reg_end, reg_deadline = (None, None, None, None)
    if reg_raw:
        rr = normalize_ru_text(reg_raw)
        low = rr.lower()
        if "закрыта" in low:
            reg_status = "closed"
        elif "открыта" in low:
            reg_status = "open"
        elif "до" in low:
            # "Регистрация до 6 апреля 23:59" often has no year:
            # borrow it from event_start when available.
            year_hint = event_start.year if isinstance(event_start, pd.Timestamp) else None
            m = re.search(r"до\s+(.+)$", rr, flags=re.I)
            if m:
                tail = m.group(1).strip()
                if year_hint and not re.search(r"\b\d{4}\b", tail):
                    tail = f"{tail} {year_hint}"
                reg_deadline = dp_parse_date(tail)
            reg_status = "unknown"
        else:
            reg_start, reg_end = parse_date_range(rr)
            reg_status = "unknown"

    # Site navigation / auth links that are never part of an event's menu.
    skipped_paths = [
        f"/{LANG}/login", f"/{LANG}/register", f"/{LANG}/event/search",
        f"/{LANG}/policy", f"/{LANG}/feedback", f"/{LANG}/promo",
    ]
    profile_re = re.compile(rf"/{LANG}/(user|profile|people|account)/")
    scheme_re = re.compile(r"[a-zA-Z][a-zA-Z0-9+.\-]*:")
    base_url = url_menu or BASE

    menu_pages = []   # 1) internal event pages (rules, about, programme, ...)
    menu_links = []   # 2) everything else (forms, disks, calendars, ...)

    for a in soup.select("a[href]"):
        href = (a.get("href") or "").strip()
        text = (a.get_text(" ", strip=True) or "").strip()
        if not href or href.startswith("#"):
            continue
        # Contact and script pseudo-links carry no navigable target.
        if href.lower().startswith(("mailto:", "tel:", "javascript:")):
            continue

        # Absolute URLs are kept verbatim; "/path", "path" and "//host/path"
        # are resolved against the page URL.
        full = href if scheme_re.match(href) else urljoin(base_url, href)

        if any(fragment in full for fragment in skipped_paths):
            continue

        if BASE in full and f"/{LANG}/event/" in full:
            # internal event page (never user/profile pages)
            if profile_re.search(full):
                continue
            menu_pages.append({
                "page_title": text[:240] if text else None,
                "url": full
            })
        else:
            menu_links.append({
                "link_title": text[:240] if text else None,
                "url": full,
                "link_type": "calendar" if "google.com/calendar" in full else "external"
            })

    def first_by_url(items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        # Keep the first entry for every URL, preserving order.
        unique: Dict[str, Dict[str, Any]] = {}
        for item in items:
            unique.setdefault(item.get("url") or "", item)
        return list(unique.values())

    menu_pages = first_by_url(menu_pages)
    menu_links = first_by_url(menu_links)

    core = None
    if title:
        core = dict(
            title=title,
            subtitle=subtitle,
            status=status,
            event_start=event_start,
            event_end=event_end,
            start_time=start_time,
            end_time=end_time,
            city=city,
            venue=venue,
            reg_status=reg_status,
            reg_start=reg_start,
            reg_end=reg_end,
            reg_deadline_dt=reg_deadline,
        )
    return core, menu_pages, menu_links

In [None]:
# -------------------------
# Парсер основной страницы (описание/организаторы/стоимость/треки/файлы) БЕЗ ПДн
# -------------------------
# Lower-cased section headings that terminate a text block on the main page.
STOP_HEADERS = {
    "организаторы",
    "контактная информация",
    "о партнёрах",
    "стоимость участия",
    "файлы",
    "списки",
    "поданные заявки",
}

def extract_block(lines: List[str], header: str) -> List[str]:
    """Collect the lines that follow *header* up to the next major heading.

    The header is located by a case-insensitive comparison of trimmed lines
    (first occurrence wins). Collection stops at the first later line whose
    lower-cased text is in STOP_HEADERS. Returned lines are trimmed and empty
    ones are dropped; an unknown header yields an empty list.
    """
    wanted = header.lower()
    trimmed = [ln.strip() for ln in lines]

    first = None
    for pos, text in enumerate(trimmed):
        if text.lower() == wanted:
            first = pos + 1
            break
    if first is None:
        return []

    collected = []
    for text in trimmed[first:]:
        if text.lower() in STOP_HEADERS:
            break
        if text:
            collected.append(text)
    return collected

def parse_main_page(html: str, url_main: str, year_hint: Optional[int]) -> Dict[str, Any]:
    """Parse an event's main page without keeping personal data.

    Args:
        html: Raw HTML of the main event page.
        url_main: URL of the page (currently not used by the parser).
        year_hint: Year appended to a "Регистрация до ..." date that carries
            no year of its own; may be None.

    Returns:
        A dict with:
        - ``description``: the "О мероприятии" block with e-mails/phones masked;
        - ``organizers``: unique "Организаторы" lines, minus contact lines and
          lines that look like personal names;
        - ``cost``: the line following "Стоимость участия", or None;
        - ``tracks``: de-duplicated section titles (numbered lines, falling
          back to h2/h3 headings), capped at 200;
        - ``languages``: text after "Рабочие языки...:" in the description;
        - ``format``: "remote_or_hybrid" / "onsite_or_hybrid" / None;
        - ``file_links``: unique ``/file/`` links;
        - ``reg_deadline_dt_extra``: deadline parsed from a
          "Регистрация до ..." line, or None.
    """
    soup = BeautifulSoup(html, "lxml")
    lines = safe_lines_from_soup(soup)

    # Description: "О мероприятии" block up to the next major heading.
    description = redact_pii("\n".join(extract_block(lines, "О мероприятии")))

    # Organizers: drop contact lines and personal names, keep first occurrences.
    organizers = list(dict.fromkeys(
        x.strip()
        for x in extract_block(lines, "Организаторы")
        if x.strip() and not looks_like_contact_line(x) and not looks_like_person_name(x)
    ))

    # Participation cost: the line right after the "Стоимость участия" label.
    cost = extract_pair_value(lines, "Стоимость участия")

    # Working languages / attendance format, taken from the description text.
    languages = None
    m = re.search(r"рабочие языки[^:]*:\s*([^\n\.]+)", description, flags=re.I)
    if m:
        languages = m.group(1).strip()

    # The second check overrides the first one: a description mentioning both
    # forms ends up as "onsite_or_hybrid".
    fmt = None
    if re.search(r"\bдистанционн", description, flags=re.I):
        fmt = "remote_or_hybrid"
    if re.search(r"\bочный\b", description, flags=re.I):
        fmt = "onsite_or_hybrid"

    # Tracks, attempt 1: numbered lines such as "1. Title" / "2) Title".
    tracks = []
    for ln in lines:
        mm = re.match(r"^\s*(\d{1,3})[\.\)]\s*(.+)$", ln)
        if mm:
            t = mm.group(2).strip()
            if t and 2 < len(t) < 220:
                tracks.append(t)

    # Tracks, attempt 2: fall back to h2/h3 headings when numbering found too little.
    if len(tracks) < 2:
        for tag in soup.find_all(["h2", "h3"]):
            tx = (tag.get_text(" ", strip=True) or "").strip()
            if not tx:
                continue
            low = tx.lower()
            if low in STOP_HEADERS:
                continue
            # Skip short date-like headings such as "11 апреля".
            if re.search(r"\b\d{1,2}\s+[а-я]+\b", low) and len(tx) < 30:
                continue
            if 2 < len(tx) < 140:
                tracks.append(tx)

    # Case-insensitive de-duplication, first spelling wins.
    unique_tracks: Dict[str, str] = {}
    for t in tracks:
        t2 = re.sub(r"\s+", " ", t).strip()
        if t2:
            unique_tracks.setdefault(t2.lower(), t2)
    tracks = list(unique_tracks.values())[:200]  # safety cap

    # Files: only links containing /file/ are collected, unique by URL.
    files_by_url: Dict[str, Dict[str, Any]] = {}
    for a in soup.select('a[href*="/file/"]'):
        href = (a.get("href") or "").strip()
        if not href:
            continue
        full = BASE + href if href.startswith("/") else href
        link_title = (a.get_text(" ", strip=True) or "").strip()[:240] or None
        files_by_url.setdefault(full, {
            "link_title": link_title,
            "url": full,
            "link_type": "file"
        })
    file_links = list(files_by_url.values())

    # "Регистрация до ..." is sometimes repeated on the main page.
    # The tail is taken right after that phrase (case-insensitively) instead of
    # after the first "до" found anywhere in the line.
    deadline_re = re.compile(r"регистрация\s+до\b[\s:]*(.+)$", flags=re.I)
    reg_deadline_dt = None
    for ln in lines[:120]:
        found = deadline_re.search(ln)
        if not found:
            continue
        tail = found.group(1).strip()
        if year_hint and not re.search(r"\b\d{4}\b", tail):
            tail = f"{tail} {year_hint}"
        dt = dp_parse_date(tail)
        if dt is not None:
            reg_deadline_dt = dt
            break

    return dict(
        description=description,
        organizers=organizers,
        cost=cost,
        tracks=tracks,
        languages=languages,
        format=fmt,
        file_links=file_links,
        reg_deadline_dt_extra=reg_deadline_dt
    )

In [None]:
# -------------------------
# 1) Находим max event_id (по выдаче search)
# -------------------------
def guess_max_event_id() -> int:
    """Estimate the highest event id from the search results page.

    Returns the largest id found in ``/{LANG}/event/<id>/`` links on
    SEARCH_URL, or 11000 when the page cannot be fetched or has no such links.
    """
    fallback = 11000
    page = fetch_html(SEARCH_URL)
    if not page:
        return fallback
    found_ids = [int(num) for num in re.findall(rf"/{LANG}/event/(\d+)/", page)]
    return max(found_ids, default=fallback)

# Upper bound of the id scan (guess_max_event_id falls back to 11000 on failure).
max_id = guess_max_event_id()
print("Max event_id (по search):", max_id)

Max event_id (по search): 10167


In [None]:
# -------------------------
# 2) Сканируем ID назад: собираем /menu/ + main
# -------------------------
# Accumulates one EventParsed per successfully parsed event id.
events: List[EventParsed] = []

def parse_event_id_from_url(url: str) -> Optional[int]:
    """Return the numeric id from an ``/event/<id>/`` URL, or None if absent."""
    found = re.search(r"/event/(\d+)/", url)
    if found is None:
        return None
    return int(found.group(1))

# Scan state: number of focus events found and the current run of unavailable ids.
focus_count = 0
consecutive_misses = 0
MAX_CONSEC_MISSES = 900  # safety valve: stop after a long run of unavailable pages

# NOTE(review): if max_id < SCAN_BACK_IDS the range reaches non-positive ids — confirm this cannot happen.
pbar = tqdm(range(max_id, max_id - SCAN_BACK_IDS, -1), total=SCAN_BACK_IDS, desc=f"Scan {LANG}/event/<id>")

for eid in pbar:
    # Done: enough events overall AND enough focus events.
    if len(events) >= TARGET_EVENTS and focus_count >= MIN_FOCUS_EVENTS:
        break
    # Give up on a long gap, but only once a minimum number of events is collected.
    if consecutive_misses >= MAX_CONSEC_MISSES and len(events) >= max(50, TARGET_EVENTS // 4):
        break

    url_main = f"{BASE}/{LANG}/event/{eid}/"
    url_menu = f"{BASE}/{LANG}/event/{eid}/menu/"

    # The /menu/ page is mandatory: without it the id is counted as a miss.
    # NOTE(review): the sleep below also runs when fetch_html served the page from the disk cache.
    html_menu = fetch_html(url_menu)
    if not html_menu:
        consecutive_misses += 1
        time.sleep(SLEEP_SEC)
        continue

    consecutive_misses = 0
    core, menu_pages, menu_links = parse_menu_page(html_menu, url_menu)
    if not core or not core.get("title"):
        time.sleep(SLEEP_SEC)
        continue

    # year hint for dates on the main page that come without a year
    year_hint = core["event_start"].year if isinstance(core["event_start"], pd.Timestamp) else None

    html_main = fetch_html(url_main)
    if not html_main:
        # sometimes /menu/ exists but the main page is not served — keep the minimum anyway
        main_data = dict(description="", organizers=[], cost=None, tracks=[], languages=None, format=None, file_links=[], reg_deadline_dt_extra=None)
    else:
        main_data = parse_main_page(html_main, url_main, year_hint=year_hint)

    # merge reg_deadline: the menu value has priority, then the main-page extra
    reg_deadline_dt = core.get("reg_deadline_dt") or main_data.get("reg_deadline_dt_extra")

    # collect data-quality warnings
    warnings = []
    if core.get("event_start") is None:
        warnings.append("no_event_date")
    if core.get("city") is None:
        warnings.append("no_city")
    if core.get("venue") is None:
        warnings.append("no_venue")

    # assemble the event object
    ep = EventParsed(
        event_id=eid,
        url_main=url_main,
        url_menu=url_menu,
        title=core["title"],
        subtitle=core.get("subtitle"),
        status=core.get("status"),
        event_start=core.get("event_start"),
        event_end=core.get("event_end"),
        start_time=core.get("start_time"),
        end_time=core.get("end_time"),
        city=core.get("city"),
        venue=core.get("venue"),
        reg_status=core.get("reg_status"),
        reg_start=core.get("reg_start"),
        reg_end=core.get("reg_end"),
        reg_deadline_dt=reg_deadline_dt,
        cost=main_data.get("cost"),
        organizers=main_data.get("organizers") or [],
        languages=main_data.get("languages"),
        format=main_data.get("format"),
        description=main_data.get("description") or "",
        tracks=main_data.get("tracks") or [],
        menu_pages=menu_pages or [],
        menu_links=menu_links or [],
        file_links=main_data.get("file_links") or [],
        scraped_at=BUILD_TS,
        parse_warnings=warnings
    )

    events.append(ep)
    if is_focus_event(ep):
        focus_count += 1

    pbar.set_postfix({"events": len(events), "focus": focus_count, "eid": eid})
    time.sleep(SLEEP_SEC)

print("Collected events:", len(events), "| focus events:", focus_count)

Scan rus/event/<id>:   0%|          | 0/6500 [00:00<?, ?it/s]

Collected events: 714 | focus events: 12


In [None]:
# -------------------------
# 3) Таблицы: events / tracks / event_pages / links(files+external)
# -------------------------
def safe_days(a, b):
    """Return the whole-day difference ``b - a`` (time of day ignored).

    Both values are normalised to midnight before subtracting. None is
    returned when either value is None or NaT/NaN.
    """
    endpoints = (a, b)
    if any(v is None for v in endpoints) or any(pd.isna(v) for v in endpoints):
        return None
    delta = pd.Timestamp(b).normalize() - pd.Timestamp(a).normalize()
    return int(delta.days)

# Row buffers for the four output tables.
events_rows, tracks_rows, pages_rows, links_rows = [], [], [], []

for ev in events:
    is_focus = is_focus_event(ev)

    # Event-level completeness: share of key fields that are actually filled.
    key_fields = [
        ev.title, ev.event_start, ev.city, ev.venue,
        ev.reg_status, ev.description
    ]
    filled = [value for value in key_fields if value not in (None, "", pd.NaT)]
    completeness = len(filled) / len(key_fields)

    event_row = {
        "event_id": ev.event_id,
        "source": "lomonosov-msu.ru",
        "lang": LANG,
        "url_main": ev.url_main,
        "url_menu": ev.url_menu,
        "title": ev.title,
        "subtitle": ev.subtitle,
        "status": ev.status,
        "city": ev.city,
        "venue": ev.venue,
        "event_start": ev.event_start,
        "event_end": ev.event_end,
        "start_time": ev.start_time,
        "end_time": ev.end_time,
        "event_duration_days": safe_days(ev.event_start, ev.event_end),
        "reg_status": ev.reg_status,
        "reg_start": ev.reg_start,
        "reg_end": ev.reg_end,
        "reg_deadline_dt": ev.reg_deadline_dt,
        "reg_window_days": safe_days(ev.reg_start, ev.reg_end),
        "cost": ev.cost,
        "organizers": "; ".join(ev.organizers) if ev.organizers else None,
        "languages": ev.languages,
        "format": ev.format,
        "description_len": len(ev.description or ""),
        "tracks_count": len(ev.tracks or []),
        "is_focus_university": is_focus,
        "parse_warnings": ";".join(ev.parse_warnings) if ev.parse_warnings else None,
        "parser_version": PARSER_VERSION,
        "scraped_at": ev.scraped_at,
        "completeness_score": round(float(completeness), 4),
    }
    events_rows.append(event_row)

    # Tracks, numbered from 1 within the event.
    tracks_rows.extend(
        {"event_id": ev.event_id, "track_no": number, "track_title": track_title}
        for number, track_title in enumerate(ev.tracks or [], start=1)
    )

    # Internal menu pages.
    pages_rows.extend(
        {
            "event_id": ev.event_id,
            "page_title": page.get("page_title"),
            "url": page.get("url"),
            "page_type": "internal_page",
        }
        for page in ev.menu_pages or []
    )

    # External menu links first, then files (only /file/ links).
    links_rows.extend(
        {
            "event_id": ev.event_id,
            "link_type": link.get("link_type", "external"),
            "link_title": link.get("link_title"),
            "url": link.get("url"),
            "source_part": "menu",
        }
        for link in ev.menu_links or []
    )
    links_rows.extend(
        {
            "event_id": ev.event_id,
            "link_type": "file",
            "link_title": link.get("link_title"),
            "url": link.get("url"),
            "source_part": "main_file",
        }
        for link in ev.file_links or []
    )

# Events are unique by id; the other tables drop exact duplicate rows.
events_df = pd.DataFrame(events_rows).drop_duplicates(subset=["event_id"])
tracks_df = pd.DataFrame(tracks_rows).drop_duplicates()
pages_df = pd.DataFrame(pages_rows).drop_duplicates()
links_df = pd.DataFrame(links_rows).drop_duplicates()

print(events_df.shape, tracks_df.shape, pages_df.shape, links_df.shape)
for preview in (events_df, tracks_df, links_df):
    display(preview.head(5))

(714, 31) (2016, 3) (981, 4) (4399, 5)


Unnamed: 0,event_id,source,lang,url_main,url_menu,title,subtitle,status,city,venue,...,organizers,languages,format,description_len,tracks_count,is_focus_university,parse_warnings,parser_version,scraped_at,completeness_score
0,10167,lomonosov-msu.ru,rus,https://lomonosov-msu.ru/rus/event/10167/,https://lomonosov-msu.ru/rus/event/10167/menu/,Севастопольская гавань-2026,-,Проводится,"Севастополь, Россия",СевГУ,...,Севастопольский государственный университет; М...,,remote_or_hybrid,5141,0,False,,v2.0-menu+main-privacy-safe,2025-12-18 15:56:26 UTC,1.0
1,10166,lomonosov-msu.ru,rus,https://lomonosov-msu.ru/rus/event/10166/,https://lomonosov-msu.ru/rus/event/10166/menu/,Универсиада по методам обработки информации в ...,,Проводится,"Москва, Россия",МГУ,...,Московский государственный университет имени М...,,,1048,0,False,,v2.0-menu+main-privacy-safe,2025-12-18 15:56:26 UTC,1.0
2,10165,lomonosov-msu.ru,rus,https://lomonosov-msu.ru/rus/event/10165/,https://lomonosov-msu.ru/rus/event/10165/menu/,Универсиада по предпринимательству и управлени...,-,Проводится,"Москва, Россия",МГУ,...,Московский государственный университет имени М...,,onsite_or_hybrid,808,0,False,,v2.0-menu+main-privacy-safe,2025-12-18 15:56:26 UTC,1.0
3,10164,lomonosov-msu.ru,rus,https://lomonosov-msu.ru/rus/event/10164/,https://lomonosov-msu.ru/rus/event/10164/menu/,Универсиада по Физике Частиц и Атомных Ядер «Л...,-,Проводится,"Москва, Россия",МГУ,...,Московский государственный университет имени М...,,,1285,0,False,,v2.0-menu+main-privacy-safe,2025-12-18 15:56:26 UTC,1.0
4,10163,lomonosov-msu.ru,rus,https://lomonosov-msu.ru/rus/event/10163/,https://lomonosov-msu.ru/rus/event/10163/menu/,От Пушкина до наших дней,-,Проводится,"Казань, Россия",КФУ,...,Казанский (Приволжский) федеральный университе...,,,350,0,False,,v2.0-menu+main-privacy-safe,2025-12-18 15:56:26 UTC,1.0


Unnamed: 0,event_id,track_no,track_title
0,10157,1,«Философия»
1,10157,2,«Религиоведение»
2,10157,3,«Культурология»
3,10157,4,«Реклама и связи с общественностью»
4,10157,5,«Прикладная этика»


Unnamed: 0,event_id,link_type,link_title,url,source_part
0,10167,external,Ломоносов,https://lomonosov-msu.ru/rus/,menu
1,10167,external,Eng,https://lomonosov-msu.ru/eng/event/10167/menu/,menu
2,10167,external,Укр,https://lomonosov-msu.ru/ukr/event/10167/menu/,menu
3,10167,calendar,В Google календарь,http://www.google.com/calendar/event?action=TE...,menu
4,10167,external,Страница события,https://www.sevsu.ru/nauka/konferentsii-sevgu/...,menu


In [None]:
# -------------------------
# 4) Синтетика под ИС: participants / registrations / submissions / reviews
#    (без настоящих ФИО/почт; только хэши и агрегаты)
# -------------------------
# Value pools sampled uniformly (random.choice) when generating synthetic participants.
SYN_UNIS = [
    "МУ имени С. Ю. Витте",
    "МГУ имени М.В. Ломоносова",
    "НИУ ВШЭ",
    "МФТИ",
    "СПбГУ",
    "РУДН",
    "РЭУ им. Г.В. Плеханова",
    "ИТМО",
    "КФУ",
    "СевГУ",
]
ROLES = ["student", "master", "phd", "researcher", "teacher", "industry"]
COUNTRIES = ["Россия", "Казахстан", "Беларусь", "Армения", "Узбекистан", "Кыргызстан", "Азербайджан"]

def hash_token(s: str) -> str:
    """Return the first 16 hex characters of the SHA-256 digest of ``s`` (UTF-8 encoded)."""
    hasher = hashlib.sha256()
    hasher.update(s.encode("utf-8"))
    full_hex = hasher.hexdigest()
    return full_hex[:16]

# 4.1 Build a shared pool of participants (so the same people recur across events).
# Pool size: number of events times the midpoint of SYN_PARTICIPANTS_PER_EVENT
# (sum / 2), divided by sum(AVG_EVENTS_PER_PARTICIPANT), with a floor of 600.
# NOTE(review): the divisor is the *sum* of the AVG_EVENTS_PER_PARTICIPANT bounds,
# not their midpoint, so the pool is sized for more events per participant than
# the midpoint of that range would suggest — confirm this is intended.
# Presumably SYN_PARTICIPANTS_PER_EVENT is a (min, max) pair defined earlier in
# the notebook; it is unpacked into random.randint further down.
target_participants_pool = max(600, int(events_df.shape[0] * sum(SYN_PARTICIPANTS_PER_EVENT) / 2 / sum(AVG_EVENTS_PER_PARTICIPANT)))
participants = []
for pid in range(1, target_participants_pool + 1):
    # The order of the random/fake calls below determines the generated values,
    # so keep it stable if reproducible output matters.
    uni = random.choice(SYN_UNIS)
    role = random.choice(ROLES)
    country = random.choice(COUNTRIES)
    # `fake` is defined outside this cell — presumably a Faker instance; verify.
    city = fake.city()
    # Only the hash is stored; the generated user name itself is discarded.
    token = hash_token(f"{pid}-{fake.user_name()}-{SYN_SEED}")
    participants.append({
        "participant_id": pid,
        "participant_uid": token,
        "university": uni,
        "role": role,
        "country": country,
        "city": city,
    })
participants_df = pd.DataFrame(participants)

# 4.2 Distribute attendance across events.
#
# For every event this cell samples a set of distinct participants from the
# shared pool, derives a registration window, and then generates one
# registration per participant plus, with some probability, a submission
# carrying two reviews. All draws use the `random` module in a fixed call
# order, so the result is reproducible whenever `random` was seeded beforehand.
registrations_rows = []
submissions_rows = []
reviews_rows = []

reg_id = 1
subm_id = 1
rev_id = 1

# Helper map: list of track titles per event.
tracks_by_event = tracks_df.groupby("event_id")["track_title"].apply(list).to_dict()

# Loop-invariant lookups, built once instead of re-scanning participants_df
# for every draw and every registration.
FOCUS_UNIVERSITY = "МУ имени С. Ю. Витте"
pool_ids = participants_df["participant_id"].tolist()
vitte_ids = participants_df.loc[participants_df["university"] == FOCUS_UNIVERSITY, "participant_id"].tolist()
# Keep the first row per id so the dictionaries agree with a "first match" lookup.
_profiles = participants_df.drop_duplicates("participant_id").set_index("participant_id")
role_by_pid = _profiles["role"].to_dict()
uni_by_pid = _profiles["university"].to_dict()
# The sampling loop collects *distinct* ids, so it can never gather more than this.
distinct_pool_size = len(set(pool_ids))

for _, ev in events_df.iterrows():
    eid = int(ev["event_id"])
    is_focus = ev["is_focus_university"]

    # Cap the draw at the pool size; without the cap the while-loop below
    # would spin forever when the requested count exceeds the pool.
    n = min(random.randint(*SYN_PARTICIPANTS_PER_EVENT), distinct_pool_size)

    # Participant selection: for focus events, 55% of the draws are replaced
    # by a draw from the focus-university participants.
    chosen = set()
    while len(chosen) < n:
        pid = random.choice(pool_ids)
        if is_focus and random.random() < 0.55:
            if vitte_ids:
                pid = random.choice(vitte_ids)
        chosen.add(pid)

    chosen = list(chosen)

    # Registration window: prefer scraped start/end, then the scraped
    # deadline, then a window ending shortly before the event, and finally a
    # fixed fallback window.
    ev_date = ev["event_start"]
    if pd.notna(ev["reg_start"]) and pd.notna(ev["reg_end"]):
        reg_start = pd.Timestamp(ev["reg_start"])
        reg_end = pd.Timestamp(ev["reg_end"])
    elif pd.notna(ev["reg_deadline_dt"]):
        reg_end = pd.Timestamp(ev["reg_deadline_dt"])
        reg_start = reg_end - pd.Timedelta(days=random.randint(20, 110))
    elif pd.notna(ev_date):
        reg_end = pd.Timestamp(ev_date) - pd.Timedelta(days=random.randint(1, 5))
        reg_start = reg_end - pd.Timedelta(days=random.randint(20, 120))
    else:
        reg_end = pd.Timestamp("2025-01-01")
        reg_start = reg_end - pd.Timedelta(days=60)

    # Window length in seconds, at least 1 so randint always has a valid range
    # (also covers an inverted scraped window).
    delta_sec = max(1, int((reg_end - reg_start).total_seconds()))

    # Tracks of this event (may be empty).
    tr_list = tracks_by_event.get(eid, [])
    has_tracks = len(tr_list) > 0

    for pid in chosen:
        role = role_by_pid[pid]
        uni = uni_by_pid[pid]

        # registered_at: uniform within the registration window
        registered_at = reg_start + pd.Timedelta(seconds=random.randint(0, delta_sec))

        # Attendance probability: base rate plus bonuses for focus-university
        # participants at focus events and for senior roles.
        attended_prob = 0.52
        if is_focus and uni == FOCUS_UNIVERSITY:
            attended_prob += 0.12
        if role in {"phd", "researcher", "teacher"}:
            attended_prob += 0.08

        attended = 1 if random.random() < min(0.92, max(0.05, attended_prob)) else 0

        # Feedback exists only for attendees; rounded Gaussian clamped to 1..5.
        feedback_score = None
        if attended:
            feedback_score = int(min(5, max(1, round(random.gauss(4.05, 0.85)))))

        registrations_rows.append({
            "registration_id": reg_id,
            "event_id": eid,
            "participant_id": int(pid),
            "registered_at": registered_at,
            "attended": attended,
            "feedback_score": feedback_score,
        })
        reg_id += 1

        # Submissions: a share of participants submits an abstract/talk.
        submit_prob = 0.34 if attended else 0.10
        if role in {"phd", "researcher"}:
            submit_prob += 0.10

        if random.random() < min(0.65, submit_prob):
            track = random.choice(tr_list) if has_tracks else None
            submitted_at = registered_at + pd.Timedelta(days=random.randint(0, 12))

            # Programme-committee decision: driven by a simulated "quality" draw.
            base_q = random.gauss(0.0, 1.0)
            accept_prob = 0.55 + 0.10 * (1 if attended else 0) + 0.08 * (1 if role in {"phd","researcher"} else 0) + 0.05 * base_q
            accepted = 1 if random.random() < min(0.92, max(0.08, accept_prob)) else 0

            submissions_rows.append({
                "submission_id": subm_id,
                "event_id": eid,
                "participant_id": int(pid),
                "track_title": track,
                "submitted_at": submitted_at,
                "decision": "accepted" if accepted else "rejected",
            })

            # Reviews: two per submission, scores clamped to 1..10 and centred
            # higher for accepted submissions.
            for _ in range(2):
                score = int(min(10, max(1, round(random.gauss(7.2 if accepted else 5.2, 1.4)))))
                reviews_rows.append({
                    "review_id": rev_id,
                    "submission_id": subm_id,
                    "score": score,
                    "confidence": int(min(5, max(1, round(random.gauss(3.6, 1.0)))))
                })
                rev_id += 1

            subm_id += 1

registrations_df = pd.DataFrame(registrations_rows)
submissions_df = pd.DataFrame(submissions_rows)
reviews_df = pd.DataFrame(reviews_rows)

print(participants_df.shape, registrations_df.shape, submissions_df.shape, reviews_df.shape)
display(registrations_df.head(5))
display(submissions_df.head(5))
display(reviews_df.head(5))

(17403, 6) (70906, 6) (19003, 6) (38006, 4)


Unnamed: 0,registration_id,event_id,participant_id,registered_at,attended,feedback_score
0,1,10167,15874,2026-02-10 04:44:20,1,3.0
1,2,10167,4105,2026-02-02 21:37:14,1,4.0
2,3,10167,5132,2026-01-20 06:07:41,1,4.0
3,4,10167,8717,2026-02-18 16:37:51,0,
4,5,10167,1550,2026-02-13 17:19:24,1,4.0


Unnamed: 0,submission_id,event_id,participant_id,track_title,submitted_at,decision
0,1,10167,15874,,2026-02-19 04:44:20,rejected
1,2,10167,4105,,2026-02-12 21:37:14,rejected
2,3,10167,5132,,2026-01-27 06:07:41,accepted
3,4,10167,8717,,2026-02-27 16:37:51,accepted
4,5,10167,16420,,2026-02-10 11:39:54,accepted


Unnamed: 0,review_id,submission_id,score,confidence
0,1,1,3,5
1,2,1,4,5
2,3,2,4,4
3,4,2,4,3
4,5,3,7,5


In [None]:
from pathlib import Path
import pandas as pd
import numpy as np
import re, json, zipfile

# Output folder for the cleaned tables; created up front (including parents),
# and reused as-is if it already exists.
PROCESSED_DIR = Path("/content/science_events_dataset_processed")
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

def to_dt(df: pd.DataFrame, cols):
    """Parse the listed columns to datetimes in place and return ``df``.

    Names in ``cols`` that are not columns of ``df`` are skipped. Values that
    cannot be parsed become ``NaT`` (``errors="coerce"``).
    """
    for name in cols:
        if name not in df.columns:
            continue
        parsed = pd.to_datetime(df[name], errors="coerce")
        df[name] = parsed
    return df

def to_int(df: pd.DataFrame, cols):
    """Convert the listed columns to nullable ``Int64`` in place and return ``df``.

    Columns absent from ``df`` are ignored; non-numeric values become ``<NA>``.
    """
    targets = [name for name in cols if name in df.columns]
    for name in targets:
        numeric = pd.to_numeric(df[name], errors="coerce")
        df[name] = numeric.astype("Int64")
    return df

def to_float(df: pd.DataFrame, cols):
    """Convert the listed columns to nullable ``Float64`` in place and return ``df``.

    Columns absent from ``df`` are ignored; non-numeric values become missing.
    """
    for name in cols:
        if name not in df.columns:
            continue
        as_number = pd.to_numeric(df[name], errors="coerce")
        df[name] = as_number.astype("Float64")
    return df

def clean_str(df: pd.DataFrame, cols):
    """Normalise the listed text columns in place and return ``df``.

    Each present column is cast to the pandas ``string`` dtype and stripped of
    surrounding whitespace; the placeholder values ``""``, ``"-"`` and
    ``"None"`` are then replaced with ``pd.NA``.
    """
    placeholders = {"": pd.NA, "-": pd.NA, "None": pd.NA}
    for name in cols:
        if name not in df.columns:
            continue
        stripped = df[name].astype("string").str.strip()
        df[name] = stripped.replace(placeholders)
    return df

def dedup(df: pd.DataFrame, subset=None):
    """Drop duplicate rows and report how many were removed.

    Returns ``(deduplicated_frame, dropped_count)``; the returned frame has a
    fresh 0..n-1 index. ``subset`` is passed through to ``drop_duplicates``.
    """
    unique_rows = df.drop_duplicates(subset=subset)
    dropped = len(df) - len(unique_rows)
    return unique_rows.reset_index(drop=True), dropped

In [None]:
# Type normalisation and de-duplication of every table. Each section follows
# the same pattern: copy the frame, coerce id/date/number/text columns with
# the helpers above, drop duplicate rows on the table's key, and print how
# many rows were dropped.

# ---- events ----
events_df = events_df.copy()
events_df = to_int(events_df, ["event_id"])
events_df = to_dt(events_df, ["event_start","event_end","reg_start","reg_end","reg_deadline_dt"])
events_df = to_float(events_df, ["completeness_score"])
events_df = clean_str(events_df, ["source","lang","url_main","url_menu","title","subtitle","status","city","venue",
                                 "start_time","end_time","reg_status","cost","organizers","languages","format",
                                 "parse_warnings","parser_version","scraped_at"])
# NOTE(review): astype(bool) turns a missing value into True — this assumes the
# flag column has no gaps; confirm against the scraper output.
events_df["is_focus_university"] = events_df["is_focus_university"].astype(bool)

# If the end date is missing, use end = start.
events_df["event_end"] = events_df["event_end"].fillna(events_df["event_start"])

# Durations (recomputed to be on the safe side): whole days between the
# midnight-normalised end and start.
events_df["event_duration_days"] = (events_df["event_end"].dt.normalize() - events_df["event_start"].dt.normalize()).dt.days
events_df.loc[events_df["event_start"].isna(), "event_duration_days"] = pd.NA

# Registration window length in whole days; blank when either bound is missing.
events_df["reg_window_days"] = (events_df["reg_end"].dt.normalize() - events_df["reg_start"].dt.normalize()).dt.days
events_df.loc[events_df["reg_start"].isna() | events_df["reg_end"].isna(), "reg_window_days"] = pd.NA

# One row per event id.
events_df, d = dedup(events_df, subset=["event_id"])
print("events: dropped duplicates =", d, "| shape:", events_df.shape)

# ---- tracks ----
tracks_df = tracks_df.copy()
tracks_df = to_int(tracks_df, ["event_id","track_no"])
tracks_df = clean_str(tracks_df, ["track_title"])
tracks_df, d = dedup(tracks_df, subset=["event_id","track_no","track_title"])
print("tracks: dropped duplicates =", d, "| shape:", tracks_df.shape)

# ---- pages ----
pages_df = pages_df.copy()
pages_df = to_int(pages_df, ["event_id"])
pages_df = clean_str(pages_df, ["page_title","url","page_type"])
# A page is identified by (event, url).
pages_df, d = dedup(pages_df, subset=["event_id","url"])
print("event_pages: dropped duplicates =", d, "| shape:", pages_df.shape)

# ---- links ----
links_df = links_df.copy()
links_df = to_int(links_df, ["event_id"])
links_df = clean_str(links_df, ["link_type","link_title","url","source_part"])
# A link is identified by (event, url); rows that differ only in title, type
# or source part are collapsed to the first occurrence.
links_df, d = dedup(links_df, subset=["event_id","url"])
print("links: dropped duplicates =", d, "| shape:", links_df.shape)

# ---- participants ----
participants_df = participants_df.copy()
participants_df = to_int(participants_df, ["participant_id"])
participants_df = clean_str(participants_df, ["participant_uid","university","role","country","city"])
participants_df, d = dedup(participants_df, subset=["participant_id"])
print("participants: dropped duplicates =", d, "| shape:", participants_df.shape)

# ---- registrations ----
registrations_df = registrations_df.copy()
registrations_df = to_int(registrations_df, ["registration_id","event_id","participant_id"])
registrations_df = to_dt(registrations_df, ["registered_at"])
registrations_df = to_float(registrations_df, ["feedback_score"])
# Attendance flag stored as a nullable integer.
registrations_df["attended"] = registrations_df["attended"].astype("Int64")
registrations_df, d = dedup(registrations_df, subset=["registration_id"])
print("registrations: dropped duplicates =", d, "| shape:", registrations_df.shape)

# ---- submissions ----
submissions_df = submissions_df.copy()
submissions_df = to_int(submissions_df, ["submission_id","event_id","participant_id"])
submissions_df = to_dt(submissions_df, ["submitted_at"])
submissions_df = clean_str(submissions_df, ["track_title","decision"])
submissions_df, d = dedup(submissions_df, subset=["submission_id"])
print("submissions: dropped duplicates =", d, "| shape:", submissions_df.shape)

# ---- reviews ----
reviews_df = reviews_df.copy()
reviews_df = to_int(reviews_df, ["review_id","submission_id","score","confidence"])
reviews_df, d = dedup(reviews_df, subset=["review_id"])
print("reviews: dropped duplicates =", d, "| shape:", reviews_df.shape)

# Quick visual check of the cleaned events table.
display(events_df.head(3))

events: dropped duplicates = 0 | shape: (714, 31)
tracks: dropped duplicates = 0 | shape: (2016, 3)
event_pages: dropped duplicates = 0 | shape: (981, 4)
links: dropped duplicates = 12 | shape: (4387, 5)
participants: dropped duplicates = 0 | shape: (17403, 6)
registrations: dropped duplicates = 0 | shape: (70906, 6)
submissions: dropped duplicates = 0 | shape: (19003, 6)
reviews: dropped duplicates = 0 | shape: (38006, 4)


Unnamed: 0,event_id,source,lang,url_main,url_menu,title,subtitle,status,city,venue,...,organizers,languages,format,description_len,tracks_count,is_focus_university,parse_warnings,parser_version,scraped_at,completeness_score
0,10167,lomonosov-msu.ru,rus,https://lomonosov-msu.ru/rus/event/10167/,https://lomonosov-msu.ru/rus/event/10167/menu/,Севастопольская гавань-2026,,Проводится,"Севастополь, Россия",СевГУ,...,Севастопольский государственный университет; М...,,remote_or_hybrid,5141,0,False,,v2.0-menu+main-privacy-safe,2025-12-18 15:56:26 UTC,1.0
1,10166,lomonosov-msu.ru,rus,https://lomonosov-msu.ru/rus/event/10166/,https://lomonosov-msu.ru/rus/event/10166/menu/,Универсиада по методам обработки информации в ...,,Проводится,"Москва, Россия",МГУ,...,Московский государственный университет имени М...,,,1048,0,False,,v2.0-menu+main-privacy-safe,2025-12-18 15:56:26 UTC,1.0
2,10165,lomonosov-msu.ru,rus,https://lomonosov-msu.ru/rus/event/10165/,https://lomonosov-msu.ru/rus/event/10165/menu/,Универсиада по предпринимательству и управлени...,,Проводится,"Москва, Россия",МГУ,...,Московский государственный университет имени М...,,onsite_or_hybrid,808,0,False,,v2.0-menu+main-privacy-safe,2025-12-18 15:56:26 UTC,1.0


In [None]:
def integrity_report():
    """Count referential-integrity and date-order problems across the tables.

    Reads the module-level frames (events, tracks, pages, links, participants,
    registrations, submissions, reviews) and returns a dict of integer
    counters: child rows whose foreign key has no parent row, and events whose
    end (or registration end) precedes the corresponding start.
    """

    def _orphans(child, key, valid_ids):
        # An empty child table has nothing to check.
        if not len(child):
            return 0
        unmatched = ~child[key].isin(valid_ids)
        return int(unmatched.sum())

    def _inverted_ranges(start_col, end_col):
        # Rows where both bounds are present and the end lies before the start.
        starts = events_df[start_col]
        ends = events_df[end_col]
        return int(((starts.notna()) & (ends.notna()) & (ends < starts)).sum())

    rep = {}

    # Children of events.
    ev_ids = set(events_df["event_id"].dropna().astype(int).tolist())
    rep["tracks_orphan_rows"] = _orphans(tracks_df, "event_id", ev_ids)
    rep["pages_orphan_rows"] = _orphans(pages_df, "event_id", ev_ids)
    rep["links_orphan_rows"] = _orphans(links_df, "event_id", ev_ids)
    rep["regs_orphan_event"] = _orphans(registrations_df, "event_id", ev_ids)
    rep["subs_orphan_event"] = _orphans(submissions_df, "event_id", ev_ids)

    # Children of submissions.
    sub_ids = set(submissions_df["submission_id"].dropna().astype(int).tolist())
    rep["reviews_orphan_submission"] = _orphans(reviews_df, "submission_id", sub_ids)

    # Children of participants.
    pid_set = set(participants_df["participant_id"].dropna().astype(int).tolist())
    rep["regs_orphan_participant"] = _orphans(registrations_df, "participant_id", pid_set)
    rep["subs_orphan_participant"] = _orphans(submissions_df, "participant_id", pid_set)

    # Logical date checks.
    rep["bad_event_date_ranges"] = _inverted_ranges("event_start", "event_end")
    rep["bad_reg_date_ranges"] = _inverted_ranges("reg_start", "reg_end")

    return rep

# Run the integrity checks once and print them as JSON; `rep` is kept as a
# module-level name because the dataset card written by the export cell embeds it.
rep = integrity_report()
print(json.dumps(rep, ensure_ascii=False, indent=2))

# ---- PII scan (just in case) ----
# E-mail: local part, "@", then a domain ending in a dot and 2+ letters.
EMAIL_RE = re.compile(r"[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}")
# Phone-like run: optional "+", a digit, 7+ digits/separators, a closing digit.
# NOTE(review): this also matches non-phone digit runs such as a year range
# ("2025-2026") — confirm the over-matching is acceptable for this dataset.
PHONE_RE = re.compile(r"\+?\d[\d\-\s\(\)]{7,}\d")

def pii_count_series(s: pd.Series) -> dict:
    """Count how many non-missing values of ``s`` contain an e-mail / a phone-like run.

    Returns ``{"email_hits": int, "phone_hits": int}``; each value is counted
    at most once per pattern, however many matches it contains.
    """
    texts = s.dropna().astype(str)
    email_rows = sum(1 for value in texts if EMAIL_RE.search(value))
    phone_rows = sum(1 for value in texts if PHONE_RE.search(value))
    return {
        "email_hits": int(email_rows),
        "phone_hits": int(phone_rows),
    }

# Count PII-looking rows in every free-text events column that the redaction
# pass rewrites, so the printed report reflects what is about to be changed.
# The scan no longer depends on the unrelated `description_len` column.
# Columns missing from the frame are skipped; `links.link_title` is always
# reported (with zero hits when the column is absent).
pii = {}
for col in ["organizers", "cost", "title", "subtitle", "city", "venue"]:
    if col in events_df.columns:
        pii[f"events.{col}"] = pii_count_series(events_df[col])
pii["links.link_title"] = pii_count_series(links_df["link_title"]) if "link_title" in links_df.columns else {"email_hits":0,"phone_hits":0}

print(json.dumps(pii, ensure_ascii=False, indent=2))

# Auto-redaction (in case anything was found in the text fields)
def redact_text(x: Any) -> Any:
    """Mask e-mail addresses and phone-like numbers in a single cell value.

    Missing scalars (``None``, ``NaN``, ``pd.NA``, ``NaT``) are returned
    unchanged. Without that check a missing cell of a ``string``-dtype column
    would be stringified and stored as the literal text ``"<NA>"``.
    Any other value is converted with ``str`` and returned with matches
    replaced by ``[REDACTED_EMAIL]`` / ``[REDACTED_PHONE]``.
    """
    # is_scalar guards pd.isna, which would return an array for list-likes.
    if pd.api.types.is_scalar(x) and pd.isna(x):
        return x
    t = str(x)
    t = EMAIL_RE.sub("[REDACTED_EMAIL]", t)
    t = PHONE_RE.sub("[REDACTED_PHONE]", t)
    return t

# Apply the redaction to every listed free-text column that exists, first on
# the events table and then on the links table.
for frame, text_cols in (
    (events_df, ["organizers", "cost", "title", "subtitle", "city", "venue"]),
    (links_df, ["link_title"]),
):
    for col in text_cols:
        if col not in frame.columns:
            continue
        frame[col] = frame[col].map(redact_text)

print("PII redaction pass done.")

{
  "tracks_orphan_rows": 0,
  "pages_orphan_rows": 0,
  "links_orphan_rows": 0,
  "regs_orphan_event": 0,
  "subs_orphan_event": 0,
  "reviews_orphan_submission": 0,
  "regs_orphan_participant": 0,
  "subs_orphan_participant": 0,
  "bad_event_date_ranges": 0,
  "bad_reg_date_ranges": 0
}
{
  "events.organizers": {
    "email_hits": 0,
    "phone_hits": 0
  },
  "links.link_title": {
    "email_hits": 0,
    "phone_hits": 2
  }
}
PII redaction pass done.


In [None]:
# ---- event features for analytics ----
# Calendar parts of the start date plus boolean convenience flags.
events_proc = events_df.copy().assign(
    event_year=lambda d: d["event_start"].dt.year.astype("Int64"),
    event_month=lambda d: d["event_start"].dt.month.astype("Int64"),
    has_tracks=lambda d: (d["tracks_count"].fillna(0) > 0).astype(bool),
    has_reg_window=lambda d: (d["reg_window_days"].notna()).astype(bool),
    has_deadline=lambda d: (d["reg_deadline_dt"].notna()).astype(bool),
    is_online_flag=lambda d: d["format"].fillna("").str.contains("remote", case=False),
)


def _attended_total(flags):
    """Sum attendance flags, counting missing flags as 0."""
    return int(pd.Series(flags).fillna(0).astype(int).sum())


def _accepted_total(decisions):
    """Count submissions whose decision is exactly "accepted"."""
    return int((pd.Series(decisions) == "accepted").sum())


# ---- event KPI table (for the information system's reports) ----
regs_kpi = registrations_df.groupby("event_id", dropna=False).agg(
    reg_count=pd.NamedAgg(column="registration_id", aggfunc="count"),
    attended_count=pd.NamedAgg(column="attended", aggfunc=_attended_total),
    avg_feedback=pd.NamedAgg(column="feedback_score", aggfunc="mean"),
).reset_index()

subs_kpi = submissions_df.groupby("event_id", dropna=False).agg(
    submissions=pd.NamedAgg(column="submission_id", aggfunc="count"),
    accepted=pd.NamedAgg(column="decision", aggfunc=_accepted_total),
).reset_index()

# Left joins keep every event, including those without registrations or submissions.
event_kpi = events_proc.merge(regs_kpi, on="event_id", how="left")
event_kpi = event_kpi.merge(subs_kpi, on="event_id", how="left")

# Events missing from either aggregate get zero counts.
for counter in ("reg_count", "attended_count", "submissions", "accepted"):
    event_kpi[counter] = event_kpi[counter].fillna(0).astype("Int64")

# Rates: a zero denominator is turned into a missing value, so the rate is
# missing rather than a division by zero.
nonzero_regs = event_kpi["reg_count"].replace({0: np.nan})
event_kpi["attendance_rate"] = (event_kpi["attended_count"] / nonzero_regs).astype("Float64")
nonzero_subs = event_kpi["submissions"].replace({0: np.nan})
event_kpi["accept_rate"] = (event_kpi["accepted"] / nonzero_subs).astype("Float64")

# Focus-university events first, then by registration count, largest first.
display(event_kpi.sort_values(["is_focus_university","reg_count"], ascending=[False, False]).head(12))

Unnamed: 0,event_id,source,lang,url_main,url_menu,title,subtitle,status,city,venue,...,has_reg_window,has_deadline,is_online_flag,reg_count,attended_count,avg_feedback,submissions,accepted,attendance_rate,accept_rate
713,9448,lomonosov-msu.ru,rus,https://lomonosov-msu.ru/rus/event/9448/,https://lomonosov-msu.ru/rus/event/9448/menu/,Исследование процессов интеграции ESG-практик ...,,Проводится,"Москва, Россия",МУ имени С. Ю. Витте,...,False,True,False,160,90,4.066667,42,25,0.5625,0.595238
124,10043,lomonosov-msu.ru,rus,https://lomonosov-msu.ru/rus/event/10043/,https://lomonosov-msu.ru/rus/event/10043/menu/,IV Всероссийская научно-практическая конференция,,Проводится,"Москва, Россия",МУ имени С. Ю. Витте,...,False,True,False,158,106,4.0,34,22,0.670886,0.647059
43,10124,lomonosov-msu.ru,rus,https://lomonosov-msu.ru/rus/event/10124/,https://lomonosov-msu.ru/rus/event/10124/menu/,Виттевские чтения - 2026,,Проводится,"Москва, Россия",МУ имени С. Ю. Витте,...,False,True,False,147,93,4.032258,37,25,0.632653,0.675676
480,9682,lomonosov-msu.ru,rus,https://lomonosov-msu.ru/rus/event/9682/,https://lomonosov-msu.ru/rus/event/9682/menu/,Межвузовская олимпиада по теме «Математика и м...,,Проводится,"Москва, Россия",МУ имени С. Ю. Витте,...,False,True,False,114,72,4.041667,38,31,0.631579,0.815789
651,9510,lomonosov-msu.ru,rus,https://lomonosov-msu.ru/rus/event/9510/,https://lomonosov-msu.ru/rus/event/9510/menu/,XXI Малышевские чтения,,Проводится,"Москва, Россия",МУ имени С. Ю. Витте,...,False,True,False,97,59,3.830508,33,25,0.608247,0.757576
613,9549,lomonosov-msu.ru,rus,https://lomonosov-msu.ru/rus/event/9549/,https://lomonosov-msu.ru/rus/event/9549/menu/,Современные аспекты гуманизации и цифровизации...,,Проводится,"Москва, Россия",МУ имени С. Ю. Витте,...,False,True,False,77,56,4.125,13,8,0.727273,0.615385
24,10143,lomonosov-msu.ru,rus,https://lomonosov-msu.ru/rus/event/10143/,https://lomonosov-msu.ru/rus/event/10143/menu/,Современные аспекты гуманизации и цифровизации...,,Проводится,"Москва, Россия",МУ имени С. Ю. Витте,...,False,True,False,70,45,3.866667,17,13,0.642857,0.764706
96,10071,lomonosov-msu.ru,rus,https://lomonosov-msu.ru/rus/event/10071/,https://lomonosov-msu.ru/rus/event/10071/menu/,DIGITAL2026,,Проводится,"Москва, Россия",МУ имени С. Ю. Витте,...,False,True,False,58,41,4.02439,16,6,0.706897,0.375
336,9827,lomonosov-msu.ru,rus,https://lomonosov-msu.ru/rus/event/9827/,https://lomonosov-msu.ru/rus/event/9827/menu/,VII Летняя школа по Канту,,Проводится,"Калининград, Россия",БФУ им. И. Канта,...,False,True,False,57,34,3.617647,19,14,0.596491,0.736842
465,9697,lomonosov-msu.ru,rus,https://lomonosov-msu.ru/rus/event/9697/,https://lomonosov-msu.ru/rus/event/9697/menu/,XXV МЕЖДУНАРОДНЫЙ КОНГРЕСС МОЛОДОЙ НАУКИ «ВИТТ...,,Проводится,"Москва, Россия",МУ имени С. Ю. Витте,...,False,True,False,42,21,4.142857,11,8,0.5,0.727273


In [None]:
def save_df(df: pd.DataFrame, name: str):
    """Write ``df`` into PROCESSED_DIR as ``<name>.csv`` and ``<name>.parquet``.

    The CSV is written as UTF-8 with a BOM (``utf-8-sig``); neither file
    includes the index. Returns the two paths as strings, CSV first.
    """
    csv_path, parquet_path = (
        PROCESSED_DIR / f"{name}.csv",
        PROCESSED_DIR / f"{name}.parquet",
    )
    df.to_csv(csv_path, index=False, encoding="utf-8-sig")
    df.to_parquet(parquet_path, index=False)
    return (str(csv_path), str(parquet_path))

# Write every table as CSV + Parquet; `paths` collects the written file paths
# in the order csv, parquet per table.
paths = []
for table, table_name in [
    (events_proc, "events"),
    (tracks_df, "tracks"),
    (pages_df, "event_pages"),
    (links_df, "links"),
    (participants_df, "participants"),
    (registrations_df, "registrations"),
    (submissions_df, "submissions"),
    (reviews_df, "reviews"),
    (event_kpi, "event_kpi"),
]:
    paths += list(save_df(table, table_name))

# dataset card
# Parser version: first non-missing value of the column, or "unknown" when the
# column is absent or entirely missing.
if "parser_version" in events_df.columns and events_df["parser_version"].dropna().shape[0]:
    parser_version = events_df["parser_version"].dropna().iloc[0]
else:
    parser_version = "unknown"
# Timestamp.utcnow() is deprecated in recent pandas; now(tz="UTC") yields the
# same timezone-aware UTC timestamp.
build_ts = pd.Timestamp.now(tz="UTC").strftime('%Y-%m-%d %H:%M:%S UTC')
(PROCESSED_DIR / "dataset_card.md").write_text(
    f"# Science Events Dataset — processed build\n\n"
    f"- build_ts: {build_ts}\n"
    f"- parser_version: {parser_version}\n\n"
    f"## Tables\n"
    f"- events, tracks, event_pages, links\n"
    f"- participants, registrations, submissions, reviews\n"
    f"- event_kpi (витрина метрик по мероприятиям)\n\n"
    f"## Integrity report\n"
    f"```json\n{json.dumps(rep, ensure_ascii=False, indent=2)}\n```\n",
    encoding="utf-8"
)

# zip
# Only regular files are archived, in sorted order, with paths relative to
# PROCESSED_DIR.
zip_path = Path("/content/science_events_dataset_processed.zip")
with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as z:
    for p in sorted(PROCESSED_DIR.rglob("*")):
        if p.is_file():
            z.write(p, arcname=p.relative_to(PROCESSED_DIR))

print("Saved processed dataset to:", PROCESSED_DIR)
print("ZIP:", zip_path)

# Download in Colab. Outside Colab the module is unavailable; the files are
# already on disk, so report that instead of failing after all the work is done.
try:
    from google.colab import files
except ImportError:
    print("google.colab is not available; skipping browser download.")
else:
    files.download(str(zip_path))

Saved processed dataset to: /content/science_events_dataset_processed
ZIP: /content/science_events_dataset_processed.zip


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>