In [1]:
"""citation module"""

from dataclasses import dataclass
from typing import Tuple


@dataclass
class Quote:
    """Represents a citation / quote"""

    position_in_text: Tuple[int, int]
    citation: str
    subject: str  # der Bundesrat Berset
    citation_verb: str  # sagte, meinte, erklärte

    def __init__(
        self,
        position_in_text: int,
        citation: str,
        subject: str,
        citation_verb: str,
    ):
        self.position_in_text = (
            position_in_text + 1,
            position_in_text + 1 + len(citation),
        )
        self.citation = citation
        self.subject = subject
        self.citation_verb = citation_verb


In [2]:
"""
This module can be used to extract citations from a string.
Currently only syntactic quotes included
"""

import os
from functools import reduce
from operator import iconcat
from typing import List, Optional, Callable, Tuple, TypeVar, Iterator
from nltk import Tree  # type: ignore
from spacy import load
from spacy.tokens import Doc, Token
from spacy.language import Language


class _NotFoundError(BaseException):
    pass


__NLP: Optional[Language] = None  # cached spacy parser instance


def __get_nlp(model: str = "de_core_news_lg") -> Language:
    # pylint: disable=global-statement
    global __NLP
    # pylint: enable=global-statement
    if __NLP is None:
        __NLP = load(model)
    return __NLP


def __get_roots(string: str) -> Iterator[Token]:
    return (sent.root for sent in __get_nlp()(string).sents)


__QUOTATION_VERBS: Optional[List[str]] = None


def __get_quotation_verbs() -> List[str]:
    verbs = "sagen meinen erzählen versichern erklären betonen mitteilen ankündigen begrüssen twittern zitieren teilen bringen berichten schreiben verraten bestätigen dementieren rufen aufrufen empfehlen stellen feststellen fassen zusammenfassen behaupten verweisen bezeichnen argumentieren kontern kündigen nennen rechnen erwidern fragen werfen"
    return verbs.split(" ")


def __get_hilfsverben() -> List[str]:
    return ["sein", "haben"]


# Breadth First Search
def __get_nearest_tokens_by_condition(
    node: Token, condition: Callable[[Token], bool]
) -> List[Token]:
    def get_nearest_by(
        node: Token, condition: Callable[[Token], bool], depth: int
    ) -> List[Tuple[Token, int]]:
        # check current node
        if condition(node):
            return [(node, depth)]

        # Recursion step (flatten result)
        return __get_flattened_list(
            [get_nearest_by(n, condition, depth + 1) for n in node.children]
        )

    results = get_nearest_by(node, condition, 0)
    if len(results) < 1:
        return []
    min_depth = min(results, key=lambda t: t[1])[1]

    return list(map(lambda t: t[0], filter(lambda t: t[1] == min_depth, results)))


T = TypeVar("T")


def __get_flattened_list(list_of_lists: List[List[T]]) -> List[T]:
    return reduce(iconcat, list_of_lists, [])


def __get_flattened_tree(node: Token) -> List[Token]:
    def get_flattened_tree_(node: Token) -> List[Token]:
        return reduce(
            iconcat, [get_flattened_tree_(child) for child in node.children], [node]
        )

    return sorted(get_flattened_tree_(node), key=lambda x: x.i)


def __get_text_from_tree(node: Token) -> str:
    return "".join(map(lambda x: x.text_with_ws, __get_flattened_tree(node))).strip()


def __get_subject_node(node: Token) -> Token:
    condition = lambda n: n.dep_ == "sb"
    result = __get_nearest_tokens_by_condition(node, condition)
    if len(result) < 1:
        raise _NotFoundError()
    return result[0]


def __get_quote_node(root: Token) -> Token:
    result = []

    # Perfekt, Plusquamperfekt, Futur
    if root.lemma_ in __get_hilfsverben():
        print("Perfekt, Plusquamperfekt, Futur")
        condition = (
            lambda n: n.head.lemma_ in __get_quotation_verbs() and n.dep_ == "oc"
        )
        result = __get_flattened_list(
            [__get_nearest_tokens_by_condition(c, condition) for c in root.children]
        )

    # Präsens, Präteritum
    if root.lemma_ in __get_quotation_verbs():
        print("räsens, Präteritum")
        result = [x for x in root.children if x.dep_ == "oc"]

    if len(result) < 1:
        raise _NotFoundError()

    print(result[0])
    return result[0]


def __get_subject(node: Token) -> str:
    return __get_text_from_tree(__get_subject_node(node))


def __get_quote(node: Token) -> str:
    return __get_text_from_tree(__get_quote_node(node))


def __get_syntactic_quote(node: Token) -> Optional[Quote]:
    try:
        position_in_text = node.idx
        subject = __get_subject(node)
        citation = __get_quote(node)
        citation_verb = __get_quote_node(node).head.text

        return Quote(
            position_in_text,
            citation,
            subject,
            citation_verb,
        )
    except _NotFoundError:
        return None


def print_nltk_tree(text: str) -> None:
    """Print a nltk parse tree for the given text"""

    def tok_format(tok: Token) -> str:
        return "_".join([tok.orth_, tok.dep_])

    def to_nltk_tree_(node: Token) -> Tree:
        if node.n_lefts + node.n_rights > 0:
            return Tree(
                tok_format(node), [to_nltk_tree_(child) for child in node.children]
            )
        return tok_format(node)

    doc = __get_nlp()(text)
    _ = [to_nltk_tree_(sent.root).pretty_print() for sent in doc.sents]


def get_syntactic_quotes(string: str) -> List[Quote]:
    """Get all syntactic quotes from a given string"""
    roots = __get_roots(string)
    quotes = map(__get_syntactic_quote, roots)
    return list(filter(lambda x: x is not None, quotes))  # type: ignore


In [3]:
import spacy
from nltk import Tree
from spacy.tokens.token import Token
from typing import Callable, Optional
from spacy.lang.de.examples import sentences 

nlp = spacy.load("de_core_news_lg")
doc = nlp(sentences[0])

def to_nltk_tree(sentence: str, tok_format: Optional[Callable[Token, str]] = None):
    def tok_format_(tok: Token):
        return "_".join([tok.orth_, tok.dep_])
    
    if tok_format is None:
        tok_format = tok_format_
        
    def to_nltk_tree_(node: Token):
        if node.n_lefts + node.n_rights > 0:
            return Tree(tok_format(node), [to_nltk_tree_(child) for child in node.children])
        else:
            return tok_format(node)
        
    doc = nlp(sentence)
    
    ret_val = []
    for sent in doc.sents:
        nltk_tree = to_nltk_tree_(sent.root)
        if isinstance(nltk_tree,str):
            ret_val.append(nltk_tree)
        else:
            ret_val.append(nltk_tree.pretty_print())
    return ret_val

In [4]:
sentence = "Dort habe er in einer Wohnung auch einen Grossteil seiner Sachen, unter anderem Familienerbstücke, wie das Inventar aus dem Restaurant Rossberg, wie der «Landbote» schreibt."

In [5]:
to_nltk_tree(sentence)

                                                                            habe_ROOT                                                                                                          
    ____________________________________________________________________________|__________________                                                                                             
   |      |      |        |                                                                   Grossteil_oa                                                                                     
   |      |      |        |          ______________________________________________________________|_____________________                                                                       
   |      |      |        |         |       |        |        |                                                   Familienerbstück                                                             
   |      |      |        |         | 

[None]

In [6]:
get_syntactic_quotes(sentence)

Perfekt, Plusquamperfekt, Futur


[]

In [7]:
article = """
Der Bundesrat schafft ein neues Staatssekretariat für Sicherheit im Verteidigungsdepartement (VBS). Es erarbeitet und koordiniert ab dem 1. Januar 2024 eine gesamtheitliche Sicherheitspolitik. Das Cybersicherheits-Zentrum wechselt vom Finanzdepartement ins VBS.
Das Departement für Verteidigung, Bevölkerungsschutz und Sport (VBS) wird bis Ende Jahr die rechtlichen Grundlagen erarbeiten, wie der Bundesrat am Mittwoch mitteilte. Anstoss zu dem neuen Staatssekretariat gab der Krieg in der Ukraine.
"""

get_syntactic_quotes(article)

[]

In [8]:
sentence = "Das Departement für Verteidigung, Bevölkerungsschutz und Sport (VBS) wird bis Ende Jahr die rechtlichen Grundlagen erarbeiten, wie der Bundesrat am Mittwoch mitteilte."
get_syntactic_quotes(sentence)
to_nltk_tree(sentence)

                                                                                wird_ROOT                                                                            
    ________________________________________________________________________________|________________________________________                                         
   |                                    Departement_sb                                                                       |                                       
   |       ___________________________________|____________________                                                          |                                        
   |      |                für_mnr                                 |                                                         |                                       
   |      |                   |                                    |                                                         |                                        
 

[None]