## RE using Stanza dependency parser

In [None]:
pip install stanza

Collecting stanza
  Downloading stanza-1.3.0-py3-none-any.whl (432 kB)
[?25l[K     |▊                               | 10 kB 31.7 MB/s eta 0:00:01[K     |█▌                              | 20 kB 8.4 MB/s eta 0:00:01[K     |██▎                             | 30 kB 7.9 MB/s eta 0:00:01[K     |███                             | 40 kB 7.4 MB/s eta 0:00:01[K     |███▉                            | 51 kB 4.0 MB/s eta 0:00:01[K     |████▌                           | 61 kB 4.2 MB/s eta 0:00:01[K     |█████▎                          | 71 kB 4.5 MB/s eta 0:00:01[K     |██████                          | 81 kB 5.0 MB/s eta 0:00:01[K     |██████▉                         | 92 kB 3.8 MB/s eta 0:00:01[K     |███████▋                        | 102 kB 4.1 MB/s eta 0:00:01[K     |████████▍                       | 112 kB 4.1 MB/s eta 0:00:01[K     |█████████                       | 122 kB 4.1 MB/s eta 0:00:01[K     |█████████▉                      | 133 kB 4.1 MB/s eta 0:00:01[K    

In [None]:
import stanza
stanza.download('en')       # This downloads the English models for the neural pipeline
nlp = stanza.Pipeline('en') # This sets up a default neural pipeline in English

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.3.0.json:   0%|   …

2021-11-16 14:32:19 INFO: Downloading default packages for language: en (English)...
2021-11-16 14:32:20 INFO: File exists: /root/stanza_resources/en/default.zip.
2021-11-16 14:32:31 INFO: Finished downloading models and saved to /root/stanza_resources.
2021-11-16 14:32:31 INFO: Loading these models for language: en (English):
| Processor    | Package   |
----------------------------
| tokenize     | combined  |
| pos          | combined  |
| lemma        | combined  |
| depparse     | combined  |
| sentiment    | sstplus   |
| constituency | wsj       |
| ner          | ontonotes |

2021-11-16 14:32:31 INFO: Use device: gpu
2021-11-16 14:32:31 INFO: Loading: tokenize
2021-11-16 14:33:31 INFO: Loading: pos
2021-11-16 14:33:31 INFO: Loading: lemma
2021-11-16 14:33:31 INFO: Loading: depparse
2021-11-16 14:33:31 INFO: Loading: sentiment
2021-11-16 14:33:32 INFO: Loading: constituency
2021-11-16 14:33:33 INFO: Loading: ner
2021-11-16 14:33:33 INFO: Done loading processors!


In [None]:
sentence = "Barack Obama was born in Hawaii. He was elected president in 2008."
#sentence = "In the summer of 2014 several discussions took place between the Muslim community of the municipality of Adigeni and the local government authorities on the status of an old building in the village of Mokhe (“the disputed building”), asserted by the former to be an ancient mosque."
doc = nlp(sentence)
# doc.sentences[0].print_dependencies()

In [None]:
# ner_tags = ["PERS", "PERS", "O", "O", "O", "LOC", "O","O","O","O","O","O","O","O"]

for sent in doc.sentences:
  for word in sent.words:
    # if (ner_tags[word.id] == "PERS" && )
    print(word.id, word.text, word.head, sent.words[word.head-1].text, word.deprel)
    # print(word.text, sent.words[sent.words[word.head-1].head-1].text, sent.words[word.head-1].text)

1 Barack 4 born nsubj:pass
2 Obama 1 Barack flat
3 was 4 born aux:pass
4 born 0 . root
5 in 6 Hawaii case
6 Hawaii 4 born obl
7 . 4 born punct
1 He 3 elected nsubj:pass
2 was 3 elected aux:pass
3 elected 0 . root
4 president 3 elected xcomp
5 in 6 2008 case
6 2008 3 elected obl
7 . 3 elected punct


In [None]:
def appendChunk(org, new):
    return org + ' ' + new

def isRelationCandidate(token):
    deps = ["root", "adj", "attr", "agent"]
    return any(subs in token.deprel for subs in deps)

def isConstructionCandidate(token):
    deps = ["compound", "prep", "conj", "mod", "obl"]
    return any(subs in token.deprel for subs in deps)

def processSubjectObjectPairs(tokens):
    subject = ''
    obj = ''
    relation = ''
    subjectConstruction = ''
    objectConstruction = ''
    for token in tokens:
        #printToken(token)
        if "punct" in token.deprel:
            continue
        if isRelationCandidate(token):
            relation = appendChunk(relation, token.text)
        if isConstructionCandidate(token):
            if subjectConstruction:
              subjectConstruction = appendChunk(subjectConstruction, token.text)
            if objectConstruction:
              objectConstruction = appendChunk(objectConstruction, token.text)
        if "subj" in token.deprel:
            subject = appendChunk(subject, token.text)
            subject = appendChunk(subjectConstruction, subject)
            subjectConstruction = ''
        if "obj" in token.deprel:
            obj = appendChunk(obj, token.text)
            obj = appendChunk(objectConstruction, obj)
            objectConstruction = ''

    if obj == '':
      obj = appendChunk(objectConstruction, obj)
    if subject == '':
      subject = appendChunk(subjectConstruction, subject)

    print(subject.strip(), ",", relation.strip(), ",", obj.strip())
    return (subject.strip(), relation.strip(), obj.strip())

In [None]:
## Could be improved by adding coreference and anaphore resolution (coref)
for sent in doc.sentences:
  processSubjectObjectPairs(sent.words)

Barack , born , 
He , elected , 


In [None]:
## Sensitive to exceptions
## https://universaldependencies.org/u/dep/ (Look at Hawaï)

## SpaCy RE

In [None]:
import spacy
from spacy.lang.en import English
import networkx as nx
import matplotlib.pyplot as plt
import os
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"

def getSentences(text):
    nlp = English()
    nlp.add_pipe(nlp.create_pipe('sentencizer'))
    #nlp.add_pipe('sentencizer')
    document = nlp(text)
    #print([type(sent.text) for sent in document.sents])
    return [sent.text.strip() for sent in document.sents]

def printToken(token):
    print(token.text, "->", token.dep_)

def appendChunk(original, chunk):
    return original + ' ' + chunk

def isRelationCandidate(token):
    deps = ["ROOT", "adj", "attr", "agent", "amod"]
    return any(subs in token.dep_ for subs in deps)

def isConstructionCandidate(token):
    deps = ["compound", "prep", "conj", "mod"]
    return any(subs in token.dep_ for subs in deps)

def processSubjectObjectPairs(tokens):
    subject = ''
    object = ''
    relation = ''
    subjectConstruction = ''
    objectConstruction = ''
    for token in tokens:
        #printToken(token)
        if "punct" in token.dep_:
            continue
        if isRelationCandidate(token):
            relation = appendChunk(relation, token.lemma_)
        if isConstructionCandidate(token):
            if subjectConstruction:
                subjectConstruction = appendChunk(subjectConstruction, token.text)
            if objectConstruction:
                objectConstruction = appendChunk(objectConstruction, token.text)
        if "subj" in token.dep_:
            subject = appendChunk(subject, token.text)
            subject = appendChunk(subjectConstruction, subject)
            subjectConstruction = ''
        if "obj" in token.dep_:
            object = appendChunk(object, token.text)
            object = appendChunk(objectConstruction, object)
            objectConstruction = ''

    print (subject.strip(), ",", relation.strip(), ",", object.strip())
    return (subject.strip(), relation.strip(), object.strip())

def processSentence(sentence):
    tokens = nlp_model(sentence)
    return processSubjectObjectPairs(tokens)


text = "The Court reiterates that by virtue of the essential function the press fulfils in a democracy, Article 10 of the Convention affords journalists protection, subject to the proviso that they act in good faith in order to provide accurate and reliable information in accordance with the tenets of responsible journalism (see, among other authorities, Pentikäinen v. Finland [GC], no. 11882/10, § 90, ECHR 2015). In considering the “duties and responsibilities” of a journalist, the potential impact of the medium concerned is an important factor and it is commonly acknowledged that the audiovisual media have often a much more immediate and powerful effect than the print media. The audiovisual media have means of conveying through images meanings which the print media are not able to impart. At the same time, the methods of objective and balanced reporting may vary considerably, depending among other things on the media in question. It is not for this Court, nor for the national courts for that matter, to substitute their own views for those of the press as to what technique of reporting should be adopted by journalists. In this context the Court reiterates that Article 10 protects not only the substance of the ideas and information expressed, but also the form in which they are conveyed (see Jersild, cited above, §§ 31). The punishment of a journalist for assisting in the dissemination of statements made by another person in an interview would seriously hamper the contribution of the press to discussion of matters of public interest and should not be envisaged unless there are particularly strong reasons for doing so (ibid., § 35, and Thoma, cited above, § 62). A general requirement for journalists systematically and formally to distance themselves from the content of a quotation that might insult or provoke others or damage their reputation is not reconcilable with the press’s role of providing information on current events, opinions and ideas (see Thoma, cited above, § 64)."
# text = "On 11 February 2014 the Broadcasting Council issued a new decision in which it again concluded that the applicant company had breached the Broadcasting and Retransmission Act and fined it EUR 500. It held that the applicant company’s freedom of expression was to be restricted on the grounds of the ban on promoting drug use provided for in section 19(1)e) of the Broadcasting and Retransmission Act, which pursued the legitimate aim of protecting public order. That ban reflected the public interest in not publishing information which amounted to a positive assessment of drug use. Given the objective (strict) liability nature of the administrative offence, what was decisive in the case at hand was not whether the applicant company had aimed to promote drug use, but whether the programme, in the light of its content and the manner of processing the information, had had a promotional character. In the Broadcasting Council’s opinion, such was the case since X.’s comments had disseminated the idea that marijuana had a positive influence; the journalist’s comments had downplayed and justified them as being common, which went beyond a simple statement of views and beyond reproducing information that had already been publicly available. In that way, the applicant company had significantly interfered with the legitimate interests in protecting public order, health and morals, while the lowest possible fine had restricted its freedom of expression to a very little extent, which had made the interference fully proportionate."

sentences = getSentences(text)
nlp_model = spacy.load('en_core_web_sm')

triples = []
print (text)
for sentence in sentences:
    triples.append(processSentence(sentence))

The Court reiterates that by virtue of the essential function the press fulfils in a democracy, Article 10 of the Convention affords journalists protection, subject to the proviso that they act in good faith in order to provide accurate and reliable information in accordance with the tenets of responsible journalism (see, among other authorities, Pentikäinen v. Finland [GC], no. 11882/10, § 90, ECHR 2015). In considering the “duties and responsibilities” of a journalist, the potential impact of the medium concerned is an important factor and it is commonly acknowledged that the audiovisual media have often a much more immediate and powerful effect than the print media. The audiovisual media have means of conveying through images meanings which the print media are not able to impart. At the same time, the methods of objective and balanced reporting may vary considerably, depending among other things on the media in question. It is not for this Court, nor for the national courts for that

## Disadvantages of current approach

* It doesn't always work, because it's rule based, especially on sentences that are written differently (e.g. legal documents).
* Only takes semantics limited into account
* Only one relation per sentence -> Long sentences messes up

## NLTK

In [None]:
import nltk
import re
from nltk.sem import extract_rels, rtuple
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [None]:
## Tag the sentence
sample = "Barack Obama is born in Hawai"

sentences = nltk.sent_tokenize(sample)
tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]

# Maybe try to train averaged_perceptron_tagger (CRF, used by nltk.pos_tag),
# to increase quality

print(tagged_sentences)

[[('Barack', 'NNP'), ('Obama', 'NNP'), ('is', 'VBZ'), ('born', 'VBN'), ('in', 'IN'), ('Hawai', 'NNP')]]


In [None]:
# Extract relations
X = re.compile(r'.*\bin\b(?!\b.+ing)')

ents = ["LOCATION", "ORGANIZATION", "PERSON", "DURATION", "DATE",
        "CARDINAL", "PERCENT", "MONEY", "MEASURE"]

for sub_ent in ents:
  for obj_ent in ents:
    for i, sent in enumerate(tagged_sentences):
      sent = nltk.ne_chunk(sent)
      for rel in extract_rels(sub_ent, obj_ent, sent, corpus='ace', pattern=X):
        print(nltk.sem.rtuple(rel))

In [None]:
# Didn't find any relation in the sentence
print(sent)

(S
  (PERSON Barack/NNP)
  (ORGANIZATION Obama/NNP)
  is/VBZ
  born/VBN
  in/IN
  (GPE Hawai/NNP))
