In [1]:
import regex
from gatenlp import Document
from gatenlp.pam.pampac import *
from gatenlp.lib_stanza import AnnStanza



In [38]:
# Example text used by the tests in this notebook (two sentences, several person names).
text = """Barack Obama was the 44th president of the US and he followed George W. Bush and
  was followed by Donald Trump. Before Bush, Bill Clinton was president."""
# Create the English Stanza annotator and run it over a fresh document built from the text.
annotator = AnnStanza(lang="en")
doc = annotator(Document(text))
doc

2021-09-21 10:08:03,874|INFO|stanza|Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | combined  |
| pos       | combined  |
| lemma     | combined  |
| depparse  | combined  |
| sentiment | sstplus   |
| ner       | ontonotes |

2021-09-21 10:08:03,875|INFO|stanza|Use device: cpu
2021-09-21 10:08:03,876|INFO|stanza|Loading: tokenize
2021-09-21 10:08:03,881|INFO|stanza|Loading: pos
2021-09-21 10:08:04,118|INFO|stanza|Loading: lemma
2021-09-21 10:08:04,150|INFO|stanza|Loading: depparse
2021-09-21 10:08:04,506|INFO|stanza|Loading: sentiment
2021-09-21 10:08:04,908|INFO|stanza|Loading: ner
2021-09-21 10:08:05,508|INFO|stanza|Done loading processors!


In [3]:
annset = doc.annset()  # set on which to match (the default annotation set of `doc`)
def print_match(success, context, location):
    """Rule action: print the match and record every result in the output set.

    Prints all results of `success` (requested with ``matchtype="all"``) and
    the `location` passed in, then calls ``context.outset.add`` once per
    result with the result's span and the label ``Result<index>``.
    """
    all_results = success.result(matchtype="all")
    print("Results:", all_results)
    print("Location:", location)
    target_set = context.outset
    for position, match_result in enumerate(all_results):
        label = "Result" + str(position)
        target_set.add(match_result.span, label)
def outset(name):
    """Return the annotation set `name` of the notebook-global `doc`, emptied first."""
    target = doc.annset(name)
    # Start from an empty set so re-running a test cell does not accumulate annotations.
    target.clear()
    return target

In [4]:
# Show the sequence of annotations in the matching set: index in iteration
# order, start offset, annotation type and the document text covered.
for idx, ann in enumerate(annset):
    print(f"{idx}: {ann.start} / {ann.type} / {doc[ann]}")

0: 0 / Token / Barack
1: 0 / Sentence / Barack Obama was the 44th president of the US and he followed George W. Bush and
  was followed by Donald Trump.
2: 0 / PERSON / Barack Obama
3: 7 / Token / Obama
4: 13 / Token / was
5: 17 / Token / the
6: 21 / Token / 44th
7: 21 / ORDINAL / 44th
8: 26 / Token / president
9: 36 / Token / of
10: 39 / Token / the
11: 43 / Token / US
12: 43 / GPE / US
13: 46 / Token / and
14: 50 / Token / he
15: 53 / Token / followed
16: 62 / Token / George
17: 62 / PERSON / George W. Bush
18: 69 / Token / W.
19: 72 / Token / Bush
20: 77 / Token / and
21: 83 / Token / was
22: 87 / Token / followed
23: 96 / Token / by
24: 99 / Token / Donald
25: 99 / PERSON / Donald Trump
26: 106 / Token / Trump
27: 111 / Token / .
28: 113 / Token / Before
29: 113 / Sentence / Before Bush, Bill Clinton was president.
30: 120 / Token / Bush
31: 120 / PERSON / Bush
32: 124 / Token / ,
33: 126 / Token / Bill
34: 126 / PERSON / Bill Clinton
35: 131 / Token / Clinton
36: 139 / Token / was

In [5]:
# Test1: two tokens with upos PROPN in sequence
# outset() already fetches and empties the target set, so no separate clear() is needed.
pattern1 = Seq(
    AnnAt(type="Token", features=dict(upos="PROPN")),
    AnnAt(type="Token", features=dict(upos="PROPN")),
)
# The added annotation type is named after the pattern that produced it
# (the same convention as the pattern9 / pattern10 tests).
Pampac(Rule(pattern1, AddAnn(type="pattern1"))).run(doc, annset, outset=outset("Test01"))
doc

In [6]:
# Test2: one to five tokens with upos PROPN in sequence
# outset() already fetches and empties the target set, so no separate clear() is needed.
pattern2 = N(
    AnnAt(type="Token", features=dict(upos="PROPN")),
    min=1,
    max=5,
)
Pampac(Rule(pattern2, print_match)).run(doc, annset, outset=outset("Test02"))
doc

Results: [Result(loc=Location(12,4),span=Span(0,12),matches=[])]
Location: Location(0,0)
Results: [Result(loc=Location(45,12),span=Span(43,45),matches=[])]
Location: Location(40,11)
Results: [Result(loc=Location(76,20),span=Span(62,76),matches=[])]
Location: Location(54,16)
Results: [Result(loc=Location(111,27),span=Span(99,111),matches=[])]
Location: Location(97,24)
Results: [Result(loc=Location(124,31),span=Span(120,124),matches=[])]
Location: Location(114,30)
Results: [Result(loc=Location(138,36),span=Span(126,138),matches=[])]
Location: Location(125,33)


In [7]:
# Test3: the text "Barack ", then a Token with upos PROPN, then the text " was"
# outset() already fetches and empties the target set, so no separate clear() is needed.
pattern3 = Seq(
    Text("Barack "),
    AnnAt(type="Token", features=dict(upos="PROPN")),
    Text(" was"),
)
Pampac(Rule(pattern3, print_match)).run(doc, annset, outset=outset("Test03"))
doc

Results: [Result(loc=Location(16,5),span=Span(0,16),matches=[])]
Location: Location(0,0)


In [8]:
from gatenlp.pam.matcher import *
from gatenlp.features import Features

In [9]:
# Constraint built from the values "this", "that" and 25, with matchcase=False.
c1 = isIn("this", "that", 25, matchcase=False)
# Feature matcher that applies the constraint to the feature named f1.
fm1 = FeatureMatcher(f1=c1)

In [10]:
# Apply the matcher to features whose f1 value differs from "that" only by case.
fm1(Features(f1="That"))

True

## Test `within` on annotation patterns (`Ann`) and on text patterns (`Text`)

In [11]:
# Test4: a Token annotation constrained with .within(type="PERSON")
# outset() already fetches and empties the target set, so no separate clear() is needed.
pattern4 = Ann(type="Token").within(type="PERSON")
Pampac(Rule(pattern4, print_match)).run(doc, annset, outset=outset("Test04"))
doc

Results: [Result(loc=Location(6,1),span=Span(0,6),matches=[])]
Location: Location(0,0)
Results: [Result(loc=Location(12,4),span=Span(7,12),matches=[])]
Location: Location(6,3)
Results: [Result(loc=Location(68,17),span=Span(62,68),matches=[])]
Location: Location(54,16)
Results: [Result(loc=Location(71,19),span=Span(69,71),matches=[])]
Location: Location(68,18)
Results: [Result(loc=Location(76,20),span=Span(72,76),matches=[])]
Location: Location(71,19)
Results: [Result(loc=Location(105,25),span=Span(99,105),matches=[])]
Location: Location(97,24)
Results: [Result(loc=Location(111,27),span=Span(106,111),matches=[])]
Location: Location(105,26)
Results: [Result(loc=Location(124,31),span=Span(120,124),matches=[])]
Location: Location(114,30)
Results: [Result(loc=Location(130,34),span=Span(126,130),matches=[])]
Location: Location(125,33)
Results: [Result(loc=Location(138,36),span=Span(131,138),matches=[])]
Location: Location(130,35)


In [12]:
# Test5: text matching the regular expression Barack|Donald, constrained with .within(type="PERSON")
# `regex` is imported in the first cell; outset() fetches and empties the target set.
pattern5 = Text(regex.compile(r"Barack|Donald")).within(type="PERSON")
Pampac(Rule(pattern5, print_match)).run(doc, annset, outset=outset("Test05"))
doc

Results: [Result(loc=Location(6,3),span=Span(0,6),matches=[])]
Location: Location(0,0)
Results: [Result(loc=Location(105,26),span=Span(99,105),matches=[])]
Location: Location(99,24)


### Test within from separate set



In [13]:
# Copy only the PERSON annotations of the default set into the emptied set "Other".
other_set = doc.annset("Other")
other_set.clear()
other_set.add_anns(doc.annset().with_type("PERSON"))
doc

In [14]:
# Test6: like Test5, but the PERSON annotations for .within() come from the separate set "Other"
# `regex` is imported in the first cell; outset() fetches and empties the target set.
pattern6 = Text(regex.compile(r"Barack|Donald")).within(type="PERSON", annset=doc.annset("Other"))
Pampac(Rule(pattern6, print_match)).run(doc, annset, outset=outset("Test06"))
doc

Results: [Result(loc=Location(6,3),span=Span(0,6),matches=[])]
Location: Location(0,0)
Results: [Result(loc=Location(105,26),span=Span(99,105),matches=[])]
Location: Location(99,24)


### Test adding to a new set

In [15]:
# Test7: AddAnn is given its own annotation set ("Test07b") while the run uses "Test07a" as outset
# `regex` is imported in the first cell. "Test07a" is emptied by outset(); "Test07b" is not
# passed through outset(), so it is cleared explicitly here.
doc.annset("Test07b").clear()
pattern7 = Text(regex.compile(r"Barack|Donald")).within(type="PERSON", annset=doc.annset("Other"))
Pampac(Rule(pattern7, AddAnn(type="NewAnn", annset=doc.annset("Test07b")))).run(
    doc, annset, outset=outset("Test07a")
)
doc

In [16]:
# Test8a: the Text pattern is named "match1" and AddAnn refers to that name
# `regex` is imported in the first cell. outset("Test08a") empties the set when the
# arguments of run() are evaluated, i.e. before any matching happens.
pattern8a = Text(regex.compile(r"Barack|Donald"), name="match1").within(type="PERSON", annset=doc.annset("Other"))
Pampac(Rule(
    pattern8a,
    AddAnn(type="NewAnn", name="match1", annset=doc.annset("Test08a")),
)).run(doc, annset, outset=outset("Test08a"))
doc

In [17]:
# Test8b: like Test8a, and AddAnn additionally sets the feature "thetext" from GetText()
# `regex` is imported in the first cell. outset("Test08b") empties the set when the
# arguments of run() are evaluated, i.e. before any matching happens.
pattern8b = Text(regex.compile(r"Barack|Donald"), name="match1").within(type="PERSON", annset=doc.annset("Other"))
Pampac(Rule(
    pattern8b,
    AddAnn(
        type="NewAnn",
        name="match1",
        features=dict(thetext=GetText()),
        annset=doc.annset("Test08b"),
    ),
)).run(doc, annset, outset=outset("Test08b"))
doc

## Test running on containing annotations

In [18]:
from gatenlp import AnnotationSet
# Test9: two PROPN tokens in sequence, with the run given `containing_anns`
# outset() already fetches and empties the target set, so no separate clear() is needed.
pattern9 = Seq(
    AnnAt(type="Token", features=dict(upos="PROPN")),
    AnnAt(type="Token", features=dict(upos="PROPN")),
)
containing = doc.annset().with_type("Sentence")
# just use the first Sentence annotation, wrapped in an annotation set of its own
first = AnnotationSet.create_from(list(containing)[0])
Pampac(Rule(pattern9, AddAnn(type="pattern9"))).run(doc, annset, outset=outset("Test09"), containing_anns=first)
doc

## Test PampacAnnotator

In [19]:
from gatenlp.pam.pampac import PampacAnnotator
# Test10: the same two-PROPN-token pattern, run through a PampacAnnotator.
# The set "Test10" is named as the annotator's third argument, so it is emptied here by hand.
doc.annset("Test10").clear()
pattern10 = Seq(
    AnnAt(type="Token", features=dict(upos="PROPN")),
    AnnAt(type="Token", features=dict(upos="PROPN")),
)
pmp = Pampac(Rule(pattern10, AddAnn(type="pattern10")))
# presumably [("", "Token")] selects type "Token" from the default set — confirm against the API docs
annt = PampacAnnotator(pmp, [("", "Token")], "Test10")
annt(doc)
doc

## Test complex text-based patterns, regex on features

In [20]:
# see https://github.com/GateNLP/python-gatenlp/discussions/132#discussion-3572882
# test1: find all Token annotations whose upos feature matches the regular expression PR.+
import re

test11_set = doc.annset("Test11")
test11_set.clear()

regex1 = re.compile(r"PR.+")
# The compiled pattern is passed as the constraint for the upos feature.
p1 = Ann("Token", features=dict(upos=regex1), name="p1")
r1 = Rule(p1, AddAnn(name="p1", type="r1"))
Pampac(r1).run(doc, doc.annset().with_type("Token"), outset=test11_set)
doc

In [21]:
# second question
# regex, Document and the gatenlp.pam.pampac names are already imported in the first
# cell of this notebook; only the spaCy-specific imports are new here.
import spacy
from gatenlp.lib_spacy import apply_spacy

nlp = spacy.load("en_core_web_sm")
doc2 = Document("The wheel of the car was found near the restaurant of the gas station of my friend. Nobody knows how the wheel arrived there. ")
# Annotate with spaCy; the annotations are requested in the set "Original markups".
doc2 = apply_spacy(nlp, doc2, setname="Original markups")
doc2

In [22]:
# NOTE: this rebinds the name `outset` to an annotation set; it no longer refers to the
# helper function outset(name) defined near the top of the notebook, so cells that call
# the helper cannot be re-run after this one without re-running its definition.
outset = doc2.annset("Out")
outset.clear()

# NounChunk, then a Token with lemma "of", then another NounChunk; the whole
# sequence is named "chunk" and AddAnn refers to that name.
r5 = Rule(
    Seq(
        AnnAt("NounChunk"),
        AnnAt("Token", features=dict(lemma="of")),
        AnnAt("NounChunk"),
        name="chunk",
    ),
    AddAnn(name="chunk", type="extendedChunk"),
)

annSet = doc2.annset("Original markups").with_type(["Token", "NounChunk"])
Pampac(r5, skip="one").run(doc2, annSet, outset=outset)
doc2

In [23]:
# Print type and covered text of every annotation in the output set "Out" of doc2.
for ann in outset:
    print(f"{ann.type}: {doc2[ann]}")

extendedChunk: The wheel of the car
extendedChunk: the restaurant of the gas station
extendedChunk: the gas station of my friend


In [24]:
# Print type and covered text of every annotation in the set the rule was run on.
for ann in annSet:
    print(f"{ann.type}: {doc2[ann]}")

Token: The
NounChunk: The wheel
Token: wheel
Token: of
Token: the
NounChunk: the car
Token: car
Token: was
Token: found
Token: near
Token: the
NounChunk: the restaurant
Token: restaurant
Token: of
Token: the
NounChunk: the gas station
Token: gas
Token: station
Token: of
Token: my
NounChunk: my friend
Token: friend
Token: .
Token: Nobody
NounChunk: Nobody
Token: knows
Token: how
Token: the
NounChunk: the wheel
Token: wheel
Token: arrived
Token: there
Token: .


## Test RemoveAnn



In [25]:
# Display `doc` as the cell output before the RemoveAnn test changes it.
doc

In [26]:
# Copy all annotations of the default set into the emptied set "Test12".
test12_set = doc.annset("Test12")
test12_set.clear()
test12_set.add_anns(doc.annset())

In [27]:
# Find all Tokens with upos="PROPN" among the Tokens of "Test12"; the RemoveAnn
# action is given the name of the match and the set "Test12".
remove_propn = Rule(
    AnnAt("Token", features=dict(upos="PROPN"), name="match"),
    RemoveAnn("match", doc.annset("Test12")),
)
Pampac(remove_propn).run(doc, doc.annset("Test12").with_type("Token"))
doc

## Match multiple AnnAt

In [28]:
def make_doc():
    """Build a small test document with several annotations sharing the same span.

    Returns a new document with the text "Some test document" whose default
    annotation set received eleven annotations, added in the order listed below.
    """
    document = Document("Some test document")
    # (start, end, type), in insertion order; the list index is the running number 0..10
    specs = [
        (0, 2, "Ann"),
        (0, 2, "Ann"),
        (0, 2, "Token"),
        (2, 4, "Ann"),
        (2, 4, "Ann"),
        (4, 6, "Ann"),
        (4, 6, "Ann"),
        (4, 6, "Person"),
        (6, 8, "Ann"),
        (6, 8, "Ann"),
        (8, 10, "XXXX"),
    ]
    for start, end, anntype in specs:
        document.annset().add(start, end, anntype)
    return document
doc3 = make_doc()
# List start offset and type of every annotation in the default set of doc3.
for ann in doc3.annset():
    print(ann.start, ann.type)
doc3

0 Ann
0 Ann
0 Token
2 Ann
2 Ann
4 Ann
4 Ann
4 Person
6 Ann
6 Ann
8 XXXX


In [29]:
def log(succ, context=None, location=None):
    """Rule action that prints every result of a match together with its matches.

    `succ` is iterated; for each result its location and span are printed,
    followed by one indented line per entry of the result's `matches`.
    `context` and `location` are accepted but not used.
    """
    print("RESULT:")
    for idx, res in enumerate(succ):
        print(f"  {idx}: loc={res.location}, span={res.span}:")
        matches = res.matches
        for m in matches:
            print(f"    {m}")
# Match all "Ann" annotations at a location (matchtype="all"), log the results and pass
# the named match to RemoveAnn on the default set of doc3.
# NOTE(review): make_doc() creates an annotation of type "XXXX" but the filter below lists
# "XXX"; presumably that annotation is therefore not in the set being matched — confirm
# whether this is intended.
remove_all_ann = Rule(
    AnnAt("Ann", name="match", matchtype="all"),
    Actions(log, RemoveAnn(name="match", annset=doc3.annset())),
)
Pampac(remove_all_ann).run(doc3, doc3.annset().with_type("Ann", "XXX", "Person", "Token"))


RESULT:
  0: loc=Location(2,1), span=Span(0,2):
    {'span': Span(0,2), 'location': Location(0,0), 'ann': Annotation(0,2,Ann,features=Features({}),id=0), 'name': 'match'}
  1: loc=Location(2,2), span=Span(0,2):
    {'span': Span(0,2), 'location': Location(0,1), 'ann': Annotation(0,2,Ann,features=Features({}),id=1), 'name': 'match'}
RESULT:
  0: loc=Location(4,4), span=Span(2,4):
    {'span': Span(2,4), 'location': Location(2,3), 'ann': Annotation(2,4,Ann,features=Features({}),id=3), 'name': 'match'}
  1: loc=Location(4,5), span=Span(2,4):
    {'span': Span(2,4), 'location': Location(2,4), 'ann': Annotation(2,4,Ann,features=Features({}),id=4), 'name': 'match'}
RESULT:
  0: loc=Location(6,6), span=Span(4,6):
    {'span': Span(4,6), 'location': Location(4,5), 'ann': Annotation(4,6,Ann,features=Features({}),id=5), 'name': 'match'}
  1: loc=Location(6,7), span=Span(4,6):
    {'span': Span(4,6), 'location': Location(4,6), 'ann': Annotation(4,6,Ann,features=Features({}),id=6), 'name': 'match'

[(0, [[None, None]]),
 (2, [[None, None]]),
 (4, [[None, None]]),
 (6, [[None, None]])]

In [30]:
# Display doc3 as the cell output after the RemoveAnn run above.
doc3

## Check >> operator


In [31]:
# Display `doc` as the cell output before the `>>` operator tests.
doc

In [35]:
# Find PERSON followed by Token followed by PERSON: with >> we only get the last match!
# The name `outset` is bound to the (emptied) set "Test13" for this cell.
outset = doc.annset("Test13")
outset.clear()
person_token_person = AnnAt("PERSON") >> AnnAt("Token") >> AnnAt("PERSON")
Pampac(Rule(person_token_person, AddAnn(type="MATCH"))).run(
    doc,
    doc.annset().with_type("PERSON", "Token"),
    outset=outset,
)
doc

In [37]:
# Find PERSON followed by Token followed by PERSON: with Seq we should get the whole sequence
# FIXME (open question from the author): "!!!! BUG: get only last???"
# The name `outset` is bound to the (emptied) set "Test14" for this cell.
outset = doc.annset("Test14")
outset.clear()
person_token_person_seq = Seq(
    AnnAt("PERSON"),
    AnnAt("Token"),
    AnnAt("PERSON"),
    name="match",
)
Pampac(Rule(person_token_person_seq, AddAnn(name="match", type="MATCH"))).run(
    doc,
    doc.annset().with_type("PERSON", "Token"),
    outset=outset,
)
doc