# Извлечение фактов из научной статьи с помощью spaCy

Этот notebook демонстрирует извлечение фактов (established knowledge) из научной статьи `2510.04749v1.pdf` с помощью библиотеки spaCy.

**Цель:** Получить список фактов из секций Abstract, Introduction и Background статьи.

## 1. Установка и импорты

In [None]:
# Install dependencies (run once; uncomment the lines below to execute)
# !pip install spacy PyMuPDF
# !python -m spacy download en_core_web_sm

In [None]:
import sys
from itertools import count

import spacy
from pathlib import Path

# Put the project root (the parent of the current working directory) at the
# front of sys.path so that the `src` package imported below can be resolved.
# This must run before the `from src...` imports.
project_root = Path.cwd().parent
sys.path.insert(0, str(project_root))

from src.parsers import PDFParser
from src.models import Entity, EntityType

## 2. Загрузка spaCy модели и парсинг PDF

In [2]:
# Load the small English spaCy pipeline; `nlp` is reused by the cells below
print("Loading spaCy model...")
nlp = spacy.load("en_core_web_sm")
print("✓ Model loaded")

# Location of the article PDF inside the project tree
pdf_path = project_root.joinpath("docs", "articles", "2510_04749v1", "2510.04749v1.pdf")
print(f"\nPDF path: {pdf_path}")
print(f"File exists: {pdf_path.exists()}")

Loading spaCy model...
✓ Model loaded

PDF path: /Users/ivanartemov/PycharmProjects/AAIAA/docs/articles/2510_04749v1/2510.04749v1.pdf
File exists: True


In [12]:
# Parse the PDF with the project's parser and print a short summary
print("Parsing PDF...")
parser = PDFParser()
parsed_doc = parser.parse(str(pdf_path))

elapsed = parsed_doc.parse_time
print(f"\n✓ Parsing completed in {elapsed:.2f}s")
print(f"  Pages: {parsed_doc.page_count}")
print(f"  Words: {parsed_doc.word_count}")
detected_sections = list(parsed_doc.sections.keys())
print(f"  Sections detected: {detected_sections}")

Parsing PDF...

✓ Parsing completed in 0.03s
  Pages: 10
  Words: 3673
  Sections detected: []
ParsedDocument(text='LLM-Based Information Extraction to Support Scientific Literature Research and Publication Workflows ⋆ Samy Ateia1 , Udo Kruschwitz1 , Melanie Scholz2, Agnes Koschmider2 , and Moayad Almohaishi2 1 University of Regensburg, Universitätsstraße 31, 93053 Regensburg, Germany {udo.kruschwitz,samy.ateia}@ur.de 2 University of Bayreuth, Universitätsstraße 30, 95447 Bayreuth, Germany {melanie.scholz,agnes.koschmider,moayad.almohaishi}@uni-bayreuth.de Abstract. The increasing volume of scholarly publications requires ad- vanced tools for efficient knowledge discovery and management. This paper introduces ongoing work on a system using Large Language Mod- els (LLMs) for the semantic extraction of key concepts from scientific documents. Our research, conducted within the German National Re- search Data Infrastructure for and with Computer Science (NFDIx CS) project, seeks to support

## 3. Извлечение фактов с помощью spaCy

Факты обычно находятся в секциях Introduction и Background. Используем spaCy для:
- Определения границ предложений
- Pattern matching для утверждений
- NER для научных терминов

In [None]:
import re

# Regex sources that mark a sentence as a fact (a statement from the literature)
fact_patterns = [
    # Citation markers
    r'\[\d+[,\d\s-]*\]',  # numeric: [1], [1,2], [1-3]
    r'\(\w+\s+et\s+al\.?[,\s]*\d{4}\)',  # author et al.: (Smith et al., 2020)
    r'\(\w+[,\s]+\d{4}\)',  # author-year: (Smith, 2020)

    # Phrases that signal established knowledge
    r'\b(previous studies|prior research|previous work|earlier studies|literature shows)\b',
    r'\b(it is known|it has been shown|it is established|it is well-known|it is recognized)\b',
    r'\b(studies have shown|research has shown|evidence suggests|findings indicate)\b',
    r'\b(has been demonstrated|has been reported|has been observed|has been found)\b',
]

# Regex sources that mark a sentence as a hypothesis/conclusion, not a fact
hypothesis_indicators = [
    r'\b(we hypothesize|we propose|we suggest|we predict|we expect|we anticipate)\b',
    r'\b(our study|this study|present study|current study)\b',
    r'\b(aim|objective|goal)\b',
]


def _compile_ignoring_case(sources):
    """Compile each regex source with re.IGNORECASE, keeping the input order."""
    return [re.compile(source, flags=re.IGNORECASE) for source in sources]


compiled_fact_patterns = _compile_ignoring_case(fact_patterns)
compiled_hypothesis_patterns = _compile_ignoring_case(hypothesis_indicators)

In [19]:
# Exploratory pass: run the whole parsed text through spaCy and, for every
# sentence, print its index, the label of each named entity found in it, and
# the sentence text.
doc = nlp(parsed_doc.text)

# enumerate() supplies the running index. A hand-maintained `count` variable
# would shadow `itertools.count`, which is imported at the top of the notebook.
for sent_index, sent in enumerate(doc.sents):
    print(sent_index)
    for ent in sent.ents:
        print(ent.label_)
    print(sent.text)

0
ORG
ORG
ORG
PERSON
ORG
ORG
CARDINAL
DATE
GPE
GPE
CARDINAL
GPE
GPE
LLM-Based Information Extraction to Support Scientific Literature Research and Publication Workflows ⋆ Samy Ateia1 , Udo Kruschwitz1 , Melanie Scholz2, Agnes Koschmider2 , and Moayad Almohaishi2 1 University of Regensburg, Universitätsstraße 31, 93053 Regensburg, Germany {udo.kruschwitz,samy.ateia}@ur.de 2 University of Bayreuth, Universitätsstraße 30, 95447 Bayreuth, Germany {melanie.scholz,agnes.koschmider,moayad.almohaishi}@uni-bayreuth.de Abstract.
1
The increasing volume of scholarly publications requires ad- vanced tools for efficient knowledge discovery and management.
2
PERSON
This paper introduces ongoing work on a system using Large Language Mod- els (LLMs) for the semantic extraction of key concepts from scientific documents.
3
NORP
ORG
ORG
ORG
Our research, conducted within the German National Re- search Data Infrastructure for and with Computer Science (NFDIx CS) project, seeks to support FAIR (Findable, A

In [None]:
def extract_facts_from_text(text, section_name):
    """Run spaCy over *text* and return the list of extracted facts.

    The text is split into sentences by the module-level ``nlp`` pipeline and
    sentences with fewer than five whitespace-separated words are skipped.

    NOTE(review): the loop never adds anything to the result, so the returned
    list is always empty, and ``section_name`` is not used. This looks like an
    unfinished stub; confirm the intended fact-building logic.

    Args:
        text: Text to analyse; passed to ``nlp``.
        section_name: Name of the section the text came from (currently unused).

    Returns:
        The list of collected facts (always empty in the current implementation).
    """
    collected = []
    next_entity_id = 0  # assigned but not read anywhere in this function
    for sentence in nlp(text).sents:
        stripped = sentence.text.strip()
        word_total = len(stripped.split())
        if word_total >= 5:
            # Long enough to be a candidate; nothing is done with it yet.
            pass
    return collected

In [None]:
# Extract facts from the Abstract, Introduction and Background sections
print("Extracting facts...\n")

all_facts = []

# Sections searched for facts, in processing order
fact_sections = ['abstract', 'introduction', 'background']
missing_sections = []

for section_name in fact_sections:
    if section_name not in parsed_doc.sections:
        # Remember the skipped section so it can be reported below.
        missing_sections.append(section_name)
        continue

    print(f"Processing section: {section_name.upper()}")
    section_text = parsed_doc.sections[section_name]

    facts = extract_facts_from_text(section_text, section_name)
    all_facts.extend(facts)

    print(f"  Found {len(facts)} facts\n")

# Report skipped sections so that an empty result is not mistaken for
# "the article contains no facts".
if missing_sections:
    print(f"⚠ Sections not found in parsed document: {missing_sections}")

print(f"\n✓ Total facts extracted: {len(all_facts)}")

## 4. Список извлеченных фактов

In [7]:
# Order the facts from most to least confident (sorts `all_facts` in place)
all_facts.sort(key=lambda item: item.confidence, reverse=True)

banner = "=" * 100
print(banner)
print("ИЗВЛЕЧЕННЫЕ ФАКТЫ")
print(banner)
print()

# One numbered entry per fact: header line, indented text, blank separator
for position, fact in enumerate(all_facts, start=1):
    header = f"[{position}] Confidence: {fact.confidence:.2f} | Section: {fact.source_section}"
    print(header)
    print(f"    {fact.text}")
    print()

ИЗВЛЕЧЕННЫЕ ФАКТЫ



## 5. Экспорт фактов (опционально)

In [8]:
# Plain list of facts: keep only the text of each fact
facts_list = [item.text for item in all_facts]

# Heading followed by one blank line, then one bullet per fact
print(f"Extracted {len(facts_list)} facts:", end="\n\n")
for fact_text in facts_list:
    print(f"• {fact_text}")

Extracted 0 facts:



In [None]:
# Save the extracted facts to JSON (optional)
import json

# Single source of truth for the paper identifier: it is used both inside the
# payload and in the output file name.
paper_id = "2510.04749v1"

output_data = {
    "paper_id": paper_id,
    "facts_count": len(all_facts),
    # Each fact is serialised through its own to_dict() method
    "facts": [fact.to_dict() for fact in all_facts],
}

output_path = project_root / "results" / f"spacy_facts_{paper_id}.json"
# parents=True also creates missing intermediate directories instead of
# raising FileNotFoundError; exist_ok=True tolerates an existing directory.
output_path.parent.mkdir(parents=True, exist_ok=True)

# ensure_ascii=False writes non-ASCII characters as-is into the UTF-8 file
with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(output_data, f, indent=2, ensure_ascii=False)

print(f"✓ Facts saved to: {output_path}")