In [1]:
import spacy
from spacy.lang.de.examples import sentences 

nlp = spacy.load("de_core_news_lg")

In [6]:
dependencies = "ROOT, ac, adc, ag, ams, app, avc, cc, cd, cj, cm, cp, cvc, da, dep, dm, ep, ju, mnr, mo, ng, nk, nmc, oa, oc, og, op, par, pd, pg, ph, pm, pnc, punct, rc, re, rs, sb, sbp, svp, uc, vo".split(", ")
for d in dependencies:
    print(f"{d},{spacy.explain(d)}")

ROOT,root
ac,adpositional case marker
adc,adjective component
ag,genitive attribute
ams,measure argument of adjective
app,apposition
avc,adverbial phrase component
cc,coordinating conjunction
cd,coordinating conjunction
cj,conjunct
cm,comparative conjunction
cp,complementizer
cvc,collocational verb construction
da,dative
dep,unclassified dependent
dm,discourse marker
ep,expletive es
ju,junctor
mnr,postnominal modifier
mo,modifier
ng,negation
nk,noun kernel element
nmc,numerical component
oa,accusative object
oc,clausal object
og,genitive object
op,prepositional object
par,parenthetical element
pd,predicate
pg,phrasal genitive
ph,placeholder
pm,morphological particle
pnc,proper noun component
punct,punctuation
rc,relative clause
re,repeated element
rs,reported speech
sb,subject
sbp,passivized subject (PP)
svp,separable verb prefix
uc,unit component
vo,vocative


In [2]:
import spacy
from spacy import displacy

def print_all(doc):
    print("text", "lemma_", "pos_", "tag_", "dep_", "shape_", "is_alpha", "is_stop")
    for token in doc:
        print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
                token.shape_, token.is_alpha, token.is_stop)

def print_sentence(sentence):
    doc = nlp(sentence)

    print(doc.text)
    print()
    print_all(doc)

def print_dependency_diagram(sentence):
    doc = nlp(sentence)
    displacy.render(doc, style="dep", options={"compact":True})

In [211]:
import spacy
from nltk import Tree
from spacy.tokens.token import Token
from typing import Callable, Optional

def to_nltk_tree(sentence: str, tok_format: Optional[Callable[Token, str]] = None):
    def tok_format_(tok: Token):
        return "_".join([tok.orth_, tok.dep_])
    
    if tok_format is None:
        tok_format = tok_format_
        
    def to_nltk_tree_(node: Token):
        if node.n_lefts + node.n_rights > 0:
            return Tree(tok_format(node), [to_nltk_tree_(child) for child in node.children])
        else:
            return tok_format(node)
        
    doc = nlp(sentence)
    return [to_nltk_tree_(sent.root).pretty_print() for sent in doc.sents]

In [212]:
sentence = 'Der Palast hat bestätigt, dass Prinz Harry (38) zur Krönung von König Charles III. (74) und dessen Ehefrau, Königin Camilla (75), nach London kommt.'
to_nltk_tree(sentence, lambda x: f"{x.text}_{x.lemma_}_{x.dep_}")

                                                                                                     hat_haben_ROOT                                                                                                                                                               
     ______________________________________________________________________________________________________|________________                                                                                                                                                       
    |             |                                                                                                  bestätigt_bestät                                                                                                                                             
    |             |                                                                                                      igen_oc                                              

[None]

In [213]:
def get_roots(string: str):
    return list([sent.root for sent in nlp(string).sents])
roots = get_roots(sentence)

In [214]:
quotation_verbs_raw = 'sagen meinen erzählen versichern erklären betonen mitteilen ankündigen bestätigen'
quotation_verbs_doc = nlp(quotation_verbs_raw)
quotation_verbs = list(map( lambda t: t.lemma_, quotation_verbs_doc))

In [215]:
from functools import reduce
from operator import iconcat
def flatten_list(list_of_lists: List[List]) -> List:
    return reduce(iconcat, list_of_lists, [])

def flatten_tree(node: Token):
    def flatten_tree_(node: Token):
        return reduce(iconcat, [flatten_tree_(child) for child in node.children], [node])
    return sorted(flatten_tree_(node), key = lambda x: x.i)
                  
flattened = flatten_tree(roots[0])

In [216]:
from functools import reduce
from operator import iconcat
from typing import List, Optional

# This does not work so well yet, find a way to do breadth first instead of depth first
def get_nearest_tokens_by_condition(node: Token, condition: Callable[Token, bool]) -> List[Token]:
    # check current node
    if condition(node):
        return [node]
    
    # Recursion step (flatten result)
    return flatten_list([get_nearest_tokens_by_condition(n, condition) for n in node.children])


In [217]:
from functools import reduce
from operator import iconcat
from typing import List, Optional

# Breadth First Search
def get_nearest_tokens_by_condition(node: Token, condition: Callable[Token, bool]) -> List[Token]:
    def get_nearest_by(node: Token, condition: Callable[Token, bool], depth: int) -> List[Tuple[Token, int]]:
        # check current node
        if condition(node):
            return [(node, depth)]

        # Recursion step (flatten result)
        return flatten_list([get_nearest_by(n, condition, depth + 1) for n in node.children])
    
    results = get_nearest_by(node, condition, 0)
    if len(results) < 1:
        return []
    min_depth = min(results, key = lambda t: t[1])[1]
    print(results)
    print(min_depth)
    
    return list(map(lambda t: t[0], filter(lambda t: t[1] == min_depth, results)))


In [218]:
def get_text_from_tree(node: Token):
    return "".join(map(lambda x: x.text_with_ws, flatten_tree(node))).strip()

In [219]:
from typing import Union

def get_subject_node(node: Token) -> Union[Token, None]:
    condition = lambda n: n.dep_ == 'sb'
    return get_nearest_tokens_by_condition(node, condition)[0]

def get_quote_node(node: Token) -> Union[Token, None]:
    def condition(node: Token) -> bool:
        
        def is_quote_node_in_praesens_or_praeteritum(node: Token) -> bool:
            return node.dep_ == 'oc' and node.head.lemma_ in quotation_verbs
        
        def is_quote_node_in_perfekt_plusquamperfekt_or_futur(node_Token) -> bool:
            return node.dep_ == 'oc' and node.lemma_ in quotation_verbs and node.head.lemma_ in hilfsverben
        
        without_hilfsverb = is_quote_node_in_praesens_or_praeteritum(node)
        with_hilfsverb = is_quote_node_in_perfekt_plusquamperfekt_or_futur(node)
        
        print(f"{node}_{without_hilfsverb}_{with_hilfsverb}")
        
        return  without_hilfsverb or with_hilfsverb
    return get_nearest_tokens_by_condition(node, condition)[0]

def get_subject(node: Token) -> Union[str, None]:
    return get_text_from_tree(get_subject_node(node))

def get_quote(node: Token) -> Union[str, None]:
    return get_text_from_tree(get_quote_node(node))


In [220]:
hilfsverben = ['sein', 'haben']

In [221]:
from typing import Union, Tuple

# Perfekt, Plusquamperfekt oder Futur
def get_quote_node_with_verb(node:Token) -> Union[Tuple[Token, Token], None]:
    condition = lambda n: n.lemma_ in hilfsverben and n.dep_ in ['ROOT', 'oc']
    quotation_nodes = get_nearest_tokens_by_condition(node, condition)
    print(quotation_nodes)
    quotation_node = quotation_nodes[0]
    
    if quotation_node is None:
        return None
    
    verb = get_quote_node(quotation_node)
    if verb is None:
        return None
    
    return (quotation_node, verb)

In [222]:
roots[0]

hat

In [223]:
get_quote_node_with_verb(roots[0])

[(hat, 0)]
0
[hat]
hat_False_False
Palast_False_False
Der_False_False
bestätigt_False_True
._False_False
[(bestätigt, 1)]
1


(hat, bestätigt)

In [224]:
sentence = "Macron sagte zu Xi, »Die Aggression hat der Stabilität einen Schlag versetzt«."
roots = get_roots(sentence)

In [225]:
get_quote_node_with_verb(roots[0])

[(hat, 1)]
1
[hat]
hat_True_False
[(hat, 0)]
0


(hat, hat)

In [226]:
to_nltk_tree(sentence)

                                sagte_ROOT                                              
     _______________________________|_______________________                             
    |        |       |      |                             hat_oc                        
    |        |       |      |        _______________________|__________                  
    |        |       |      |       |            |                versetzt_oc           
    |        |       |      |       |            |           __________|___________      
    |        |       |    zu_mo     |      Aggression_sb    |    Stabilität_da Schlag_oa
    |        |       |      |       |            |          |          |           |     
Macron_sb ,_punct ._punct Xi_nk  »_punct       Die_nk    «_punct     der_nk     einen_nk



[None]

In [227]:
def get_quote_nodes(sentence: str) -> List[Token]:
    def get_quote_node_(root: Token) -> Token:
        if root.lemma_ in hilfsverben:
            condition = lambda n: n.head.lemma_ in quotation_verbs and n.dep_ == 'oc'
            quotation_verb_nodes = flatten_list([get_nearest_tokens_by_condition(c, condition) for c in root.children])
            if len(quotation_verb_nodes) < 1:
                return None
            return quotation_verb_nodes[0]

        if root.lemma_ in quotation_verbs:
            quotation_subtrees = [x for x in root.children if x.dep_ == 'oc']
            if len(quotation_subtrees) < 1:
                return None
            return quotation_subtrees[0]

        return None
    roots = get_roots(sentence)
    return list(map(lambda r: get_quote_node_(r), roots))

In [228]:
get_quote_nodes("Macron sagte zu Xi, »Die Aggression hat der Stabilität einen Schlag versetzt«.")

[hat]

In [229]:
get_quote_nodes('Der Palast hat bestätigt, dass Prinz Harry (38) zur Krönung von König Charles III. (74) und dessen Ehefrau, Königin Camilla (75), nach London kommt.')

[(kommt, 1)]
1


[kommt]