In [1]:
# Notes for configuring the presentation:
# reveal/rise config is in the notebook metadata (see Edit - Edit Notebook Metadata, "rise" section)
# possible themes: https://revealjs.com/themes/
# - for us: simple, white
# possible transitions: https://revealjs.com/transitions/
# - for us: none, slide, zoom
# More: see https://github.com/damianavila/RISE/blob/master/doc/customize.md#how-to-customize
# Documentation: https://rise.readthedocs.io/en/latest/

## <center>An Overview of<br>Python GateNLP<br></center>

* Online slides: https://gatenlp.github.io/python-gatenlp/overview/gatenlp-overview.slides.html
* Slides License: [CC BY-NC-SA 3.0](https://creativecommons.org/licenses/by-nc-sa/3.0/)

# Python GateNLP: Aims

* NLP framework written in pure Python. 
* Similar concepts as Java GATE: documents, document features, annotation sets, annotations, ...
* "pythonic" API, try to make basic things very simple (e.g. loading/saving of documents)
* if possible, fix some of the problems/idiosyncrasies of Java GATE
* Does NOT try to be a full multilingual NLP processing package, rather COMBINE:
  * Use existing tools and solutions: Spacy, Stanford Stanza, NLTK, ...
  * Add own tools and improvements where needed
  * Concentrate on what others do NOT provide


# Python GateNLP: status

* Current release: 1.0.6
* Published as PyPi package
  * 400 - 1400 downloads / month (see [PePy](https://pepy.tech/project/gatenlp))
  * 40.000 downloads total
* All 1.0.x: get community feedback:
  * how to improve API, abstractions, conventions, find bugs
  * what is most important to still get added?
* Planned 1.1.x releases and onwards: stable API

# Python GateNLP: Info and Feedback

* Documentation: https://gatenlp.github.io/python-gatenlp/
* Sources: https://github.com/GateNLP/python-gatenlp
* Report a bug, request a feature with issue tracker: https://github.com/GateNLP/python-gatenlp/issues
* Discuss, ask: 
  * discussions forum at https://github.com/GateNLP/python-gatenlp/discussions
  * GATE mailing list https://groups.io/g/gate-users

# Python GateNLP: Main Concepts (1)

* [**Document**](https://gatenlp.github.io/python-gatenlp/documents): 
  * has text
  * has named [**AnnotationSets**](https://gatenlp.github.io/python-gatenlp/annotationsets), each with  [**Annotations**](https://gatenlp.github.io/python-gatenlp/annotations), over **Spans** 
  * documents and annotation sets can have **Features** (similar to `dict`)
* **Annotators** process a document, return a document or a (possibly empty) list of documents: 
  * [**Gazetteers**](https://gatenlp.github.io/python-gatenlp/gazetteers) (StringGazetteer, [**StringRegexAnnotator**](https://gatenlp.github.io/python-gatenlp/stringregex), TokenGazetteer), 
  * [**Tokenizers**](https://gatenlp.github.io/python-gatenlp/tokenizers), 
  * [**PAMPAC**](https://gatenlp.github.io/python-gatenlp/pampac) (annotation pattern matching rules), 
  * [**Spacy annotator**](https://gatenlp.github.io/python-gatenlp/lib_spacy), [**Stanza Annotator**](https://gatenlp.github.io/python-gatenlp/lib_stanza)
  * [**GateWorkerAnnotator**](https://gatenlp.github.io/python-gatenlp/gateworker): arbitrary GATE pipelines
  * [**Client Annotators**](https://gatenlp.github.io/python-gatenlp/client_annotators): ELG, IBM, Google, TagMe, TextRazor


# Python GateNLP: Main Concepts (2)

* Collections of documents:  
  * [**Corpus**](https://gatenlp.github.io/python-gatenlp/corpora): direct access 
  * [**DocumentSource**](https://gatenlp.github.io/python-gatenlp/corpora): sequential reading 
  * [**DocumentDestination**](https://gatenlp.github.io/python-gatenlp/corpora): sequential writing
* Pipeline: process documents, pipe through several annotators, process results
* Load/Save/Import: support various formats (GATE XML, BDOC, HTML, CONLL)
* Visualization: show document as HTML, in notebooks

# Documents (1)

In [4]:
from gatenlp import Document

# Build a small in-memory document and attach one document-level feature.
doc1 = Document("This is a small test document")
doc1.features["Feature1"] = "Some feature value"

# Add one annotation of type "Ann" from offset 0 to 4 to the named set "Set1",
# carrying two features of its own.
set1 = doc1.annset("Set1")
set1.add(0, 4, "Ann", {"feat1": 2, "feat2": "some string"})

# Last expression of the cell: the notebook displays the document.
doc1

# Documents (2)

In [5]:
# Load a document directly from a URL; fmt="html" requests the HTML loader.
doc2 = Document.load("https://en.m.wikipedia.org/wiki/Gate_(disambiguation)", fmt="html")

# Show the document with only the "Original markups" annotation set selected.
# row1_style looks like CSS for the viewer's first row — TODO confirm against the show() docs.
doc2.show(row1_style="height:20em;min-height:3em;", annsets=["Original markups"])

# AnnotationSets, Annotations

In [6]:
# set1 is the set that is part of the document: it is fetched by name from doc2
set1 = doc2.annset("Original markups")
# Print the number of annotations, the `immutable` attribute and the result of `isdetached()`
print("Set has", len(set1), "annotations, immutable=", set1.immutable, "detached=", set1.isdetached())

Set has 464 annotations, immutable= False detached= False


In [8]:
# subset1 is an immutable, "detached" set that only contains the annotations
# of the requested types.
subset1 = set1.with_type("h1", "head", "form") 
print("Set has", len(subset1), "annotations, immutable=", subset1.immutable, "detached=", subset1.isdetached())

# Annotations have a default order: starting offset, annotation id (= order of addition)
# -> zero length annotations have a well-defined order
# Iterating over the set yields the annotations in that order; one is printed per line.
print("Annotations:", "\n".join(str(x) for x in list(subset1)))

Set has 3 annotations, immutable= True detached= True
Annotations: Annotation(1,35,head,features=Features({}),id=2)
Annotation(120,120,form,features=Features({'action': '/w/index.php', 'method': 'get'}),id=76)
Annotation(127,149,h1,features=Features({'id': 'firstHeading', 'class': 'firstHeading mw-first-heading'}),id=88)


# AnnotationSet operations

In [6]:
# Empty the set "Set1" of doc1, then add four annotations of three different types.
annset = doc1.annset("Set1")
annset.clear()
annset.add(0, 4, "AnnType1")
annset.add(5, 7, "AnnType1", {"a": 1, "b": 2})
annset.add(0, 20, "AnnType2")
annset.add(22, 22, "AnnType3")  # start == end: a zero-length annotation
doc1

## Annotation Span Relations

* Annotations can overlap arbitrarily
* Annotation API has methods to check **how** they relate to each other
  * overlap, within, covering, before, after, rightoverlapping, startingat, endingwith, coextensive ...
* Annotation API implements ordering by start offset and annotation id

![Annotation Relations](data/ann-relations-cropped.png)

* Ann1 overlaps with all others, covers all but Ann2 and Ann4 
* Ann5 is directly before Ann3, is before Ann6
* Ann10 starts at Ann1, Ann12 ends with Ann1, Ann3 and Ann9 are coextensive 

## Annotation Relations

![Annotation Relations](ann-relations-cropped.png)

Let's load and view an example document to demonstrate this:

In [13]:
# `display` and `HTML` come from the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML

# Load the example document that contains the annotations shown in the figure.
doc3 = Document.load("ann-relations.bdocjs")
# Make sure the html ann viewer is smaller: limit its height and reduce the font size.
doc3.show(row1_style="height:10em;min-height:3em;font-size: 75% !important;")

## Annotation/Span Relations API 

![Annotation Relations](data/ann-relations-cropped.png)

In [8]:
from gatenlp import Span

In [9]:
# make a variable for each annotation type
# (the lower-cased type name becomes the variable name, e.g. type "Ann1" -> `ann1`)
for anntype in list(doc3.annset("set1").type_names):
    # vars() at the top level of a cell is the module namespace, so the assignment
    # defines a global variable. for_idx(0) presumably picks the first annotation of
    # that type — TODO confirm.
    vars()[anntype.lower()] = doc3.annset("set1").with_type(anntype).for_idx(0)
# Each line below prints the result of one relation check between two annotations.
print("Ann2 isoverlapping Ann1:", ann2.isoverlapping(ann1))
print("Ann2 isbefore Ann3:", ann2.isbefore(ann3))
print("Ann3 isafter Ann2:", ann3.isafter(ann2))
print("Ann1 iscovering Ann5:", ann1.iscovering(ann5))
print("Ann3 iscoextensive Ann9:", ann3.iscoextensive(ann9))
print("Ann6 iswithin Ann1:", ann6.iswithin(ann1))
print("Ann4 isrightoverlapping Ann1:", ann4.isrightoverlapping(ann1))
# NOTE(review): the label mentions "ann2" (and has an unbalanced parenthesis), but the call
# compares two literal spans, Span(0,3) and Span(2,5) — confirm which one was intended.
print("Span(0,3).isoverlapping(ann2)):", Span(0,3).isoverlapping(Span(2,5)))

Ann2 isoverlapping Ann1: True
Ann2 isbefore Ann3: True
Ann3 isafter Ann2: True
Ann1 iscovering Ann5: True
Ann3 iscoextensive Ann9: True
Ann6 iswithin Ann1: True
Ann4 isrightoverlapping Ann1: True
Span(0,3).isoverlapping(ann2)): True


## AnnotationSet: retrieve by relation

* get all annotations that overlap/are before/start at/... an annotation/span/annotation set
* returns a new annotation set
* returned set is **detached**: not part of the document, changes to the set do not affect the document 
* returned set is initially **immutable**: set cannot be changed, but can be made mutable
* annotations **are mutable** and still the same as in the set!
* possible to "detach" annotations by (deep)copying them

In [10]:
set1 = doc3.annset("set1") # "attached" set
# Each query returns the matching annotations; only their types are printed.
print("Within Ann1: ", [a.type for a in set1.within(ann1)])
# The same query is run once with an annotation and once with only its span as argument.
print("Coextensive with Ann3:", [a.type for a in set1.coextensive(ann3)])
print("Coextensive with span of Ann3:", [a.type for a in set1.coextensive(ann3.span)])

Within Ann1:  ['Ann10', 'Ann5', 'Ann3', 'Ann7', 'Ann9', 'Ann11', 'Ann6', 'Ann8', 'Ann12']
Coextensive with Ann3: ['Ann9']
Coextensive with span of Ann3: ['Ann3', 'Ann9']


# Document: loading/saving

Supported formats:
* plain text
* bdocjs, bdocym, bdocmp: load/save JSON, YAML, MsgPack (aliasing: only bdocym)
* GATE xml: load (but only basic data types, no aliasing)
* HTML: load and create annotations for HTML entities
* plain text: load / save
* tweet: load v1 format, WIP!
* pickle: load/save
* html-ann-viewer: save HTML stand-alone file or HTML section for notebooks

## Document: view sets/types

Use: `doc.show(annsets=["set1", ("set2", "type1"), ("set3", ["type1", "type2"])])`

In [11]:
doc2.show(annsets=[("Original markups", ["h1","h2","a","li"])])

## Exchange Documents with Java GATE

* Python GateNLP can read Java GATE XML format
* GATE plugin [Format_Bdoc](https://gatenlp.github.io/gateplugin-Format_Bdoc/) provides support for loading/saving formats bdocjs, bdocym and bdocmp in Java GATE
* Offsets differ between GATE and GateNLP: 
  * Java: offsets refer to UTF-16 encoding, possibly a *surrogate pair* of UTF-16 characters
  * Python: offsets refer to Unicode code points 
  * bdocjs/bdocym/bdocmp automatically convert the offsets on either side
  * field `offset_type` is either `p` or `j` 

# Corpus

* a list-like collection of a fixed number of documents which can be retrieved and stored by index:<br>
  get: `doc = corpus[2]` set: `corpus[3] = doc`
* on retrieval, the index gets stored in a document feature
* implements `store(doc)` to save a document to the index stored in the document feature
  * !! Important for batched pipelining !!
* some implementations: `append(doc)` to add a new document to the corpus
* some implementations: store/retrieve `None`
  * on retrieval: `None` indicates absence of document
  * on storing: `None` indicates that document should get removed or should not get updated


## DirFilesCorpus

* all (recursive) files in a directory with some specific extension
* specify some specific format or infer from file extension
* stores the relative file path as a document feature


In [12]:
from gatenlp.corpora import DirFilesCorpus

# Creating the corpus only collects the matching file names from the directory.
corp1 = DirFilesCorpus("dir1")
print("Number of documents:", len(corp1))

# Indexing is what actually reads the document from the directory.
doc1 = corp1[2]
print("Text for idx=2:", doc1.text)
print("Features for idx=2:", doc1.features)

# Add one annotation spanning the whole text to the default annotation set.
doc1.annset().add(0, len(doc1.text), "Document", {"what": "test document"})

# Write the document back to its file; `corp1[2] = doc1` could have been used instead.
corp1.store(doc1)


Number of documents: 4
Text for idx=2: This is another document for testing which mentions John Smith.
Features for idx=2: Features({'gate.SourceURL': 'created from String', '__idx_140022178505224': 2})


## Other Corpus Classes

* `ListCorpus`: wrap Python list-like collection
* `NumberedDirFilesCorpus`: create a directory tree where the path represents digits of a large number
  * e.g. `000/002/341.bdoc` for element number 2341 of 600000000 total
* `EveryNthCorpus`: wrap a corpus and access only every nth elements starting at k
  * e.g.: get elements 3, 7, 11, 15 from a corpus with 17 elements (n=4, k=3)
  * useful for processing files in a DirFilesCorpus with multiple processes
* `ShuffledCorpus`: random re-ordering of the elements in the wrapped corpus
* `CachedCorpus`: store retrieved elements from a (slow) base corpus in a (fast) cache corpus
* Still work in progress

## Source, Destination

* Document Source: something that can be iterated over to get one Document after the other
  * unknown size
  * a Corpus may also function as a Source
* Document Destination: something that has `append(doc)` to add Document instances
  * unknown final size
  * also has `close()` to end writing
  * may implement the `with documentdestination as dest:` pattern
  * an appendable Corpus may also function as a Destination


## Source, Destination examples

* `BdocjsLinesFileSource/Destination`: one line of bdocjs serialization per document
* `TsvFileSource`: one column in a TSV file contains the text, other columns can be stored in features
* `PandasDfSource`: similar to TSV source, but for a Pandas data frame
* `ConllUFileSource`: corpus from a (large) ConllU File
* work in progress

## Conll-U Source

* Read in one of the many multilingual corpora from https://universaldependencies.org/
* create documents from k sentences, paragraphs, or CoNLL documents
* use original text hints or space hints, if available
* Example: first few lines of `ar-ud-train.conllu`

In [13]:
from gatenlp.corpora.conll import ConllUFileSource

# Iterate over the source once and keep every document it yields in a list,
# so that individual documents can be accessed by index afterwards.
src = ConllUFileSource("ar-tiny.conllu", group_by="doc", n=1)
corp = [conll_doc for conll_doc in src]
print(len(corp))

3


## Conll-U Source

In [14]:
corp[0].show(doc_style="direction: rtl; font-size: 1.5em; line-height: 1.5;")

# Annotators, Executors

* Annotator: a *callable* that accepts a document to process and either:
  * returns a single document (most common situation); not necessarily same instance
  * returns None: something went wrong or the document should get filtered
  * returns a list of zero to n documents: filter, error, split documents
  * standard methods for handling over-a-corpus results
  * `pipe` method for batched processing
* Pipeline: a special annotator that recursively runs other annotators in sequence
* Executor: a class that runs an annotator
  * on a corpus
  * on a source and optional destination
  * takes care of handling None, lists of returned documents

 



# Spacy Annotator

* Use a [SpaCy](https://spacy.io/) pipeline to annotate a document
* convert spacy tokens, entities etc into Annotations, convert token attributes into annotation features

In [15]:
import spacy
from gatenlp.lib_spacy import AnnSpacy
print("Spacy version:",spacy.__version__)

# Load a spaCy pipeline and wrap it in a GateNLP annotator that uses the set name "Spacy".
nlp = spacy.load("en_core_web_sm")
annotator = AnnSpacy(pipeline=nlp, outsetname="Spacy")
# NOTE: this rebinds doc3, which earlier held the annotation-relations example document.
doc3 = Document.load("document-testing.txt")
doc3.annset("Spacy").clear()   # avoid annotation duplication when running several times
# The annotator is called like a function and its return value is kept as the document.
doc3 = annotator(doc3)

Spacy version: 3.1.2


## Spacy Annotator

In [16]:
doc3

# Stanza Annotator

* Use a [Stanza](https://stanfordnlp.github.io/stanza/) pipeline to annotate a document
* convert stanza tokens, entities etc into Annotations, convert token attributes into annotation features

In [17]:
import stanza
print("Stanza version:",stanza.__version__)
from gatenlp.lib_stanza import AnnStanza

# Create a Stanza pipeline and wrap it in a GateNLP annotator that uses the set name "Stanza".
nlpstanza = stanza.Pipeline(logging_level="ERROR")
annotatorstanza = AnnStanza(pipeline=nlpstanza, outsetname="Stanza")
doc3.annset("Stanza").clear()   # avoid annotation duplication when running several times
# doc3 (already processed by the spaCy annotator) is the document that gets the "Stanza" set.
doc3 = annotatorstanza(doc3)


Stanza version: 1.3.0


## Stanza Annotator

In [18]:
doc3.show(annsets=["Stanza"])

# Gazetteers/Regexp Matchers

* Look up tokens/words/strings/phrases/regexps
* TokenGazetteer: list of token sequences, match against annotation sequence
* StringGazetteer: list of strings, match against text string
* StringRegexAnnotator: List of regex rules, match against text string

# StringGazetteer


In [16]:
from gatenlp.processing.gazetteer import StringGazetteer

doc4 = Document.load("document2.txt")

# First list: (string, features) pairs; every entry carries an "nr" feature.
gazlist1 = [
    ("Barack Obama", {"nr": 44}),
    ("Obama", {"nr": 44}),
    ("Donald Trump", {"nr": 45}),
    ("Trump", {"nr": 45}),
    ("George W. Bush", {"nr": [41, 43]}),
    ("George H. W. Bush", {"nr": 41}),
    ("Bush", {"nr": [41, 43]}),
    ("Bill Clinton", {"nr": 42}),
    ("Clinton", {"nr": 42}),
]
# Second list: plain strings without per-entry features.
gazlist2 = [
    "United States",
    "US",
    "United Kingdom",
    "UK",
    "Austria",
    "South Korea",
    "대한민국",
    "Iran",
    "جمهوری اسلامی ایران",
    "ایران",
]

# Create the gazetteer from the first list, then append the second list under another
# list type and with a feature shared by the whole list.
gaz1 = StringGazetteer(source=gazlist1, source_fmt="gazlist", list_type="President")
gaz1.append(
    source=gazlist2,
    source_fmt="gazlist",
    list_type="Country",
    list_features={"l": "list2"},
)

# Run the gazetteer on the document and display the result.
doc4 = gaz1(doc4)
doc4.show()

# StringRegexAnnotator

In [None]:
# TODO: simple date example

# Token Gazetteer

TODO: short overview, no code

# PAMPAC

* PAttern Matching through PArser Combinators
* A pattern language for matching annotations and text
* DSL (domain specific language) implemented as Python classes
* Purpose: similar to Java GATE JAPE / JAPE Plus, but:
  * does not need a separate language
  * can match annotation sequences, text, regular expressions
  * can perform arbitrary actions
  * can use text, features from previous matches for later matches
  * can match first, longest, all ...



## PAMPAC Parsers

* `Ann(...)`: match next Annotation with given properties
* `AnnAt(...)`: match Annotation with given properties at next offset
* `N(p, min=k, max=l)`:  k to l repetitions of p
* `Seq(p1, p2, p3)`: sequence of p1, p2, p3
* `Or(p1,p2,..)`: match if any of p1, p2, .. match
* `And(p1, p2,..)`: match if all of p1, p2, .. match
* `p.within(anndesc)`: match if p matches within .. also notwithin, overlapping, notoverlapping ...
* More .. (Find, Filter, ...)

## PAMPAC - Example 1

Let's create a rule that annotates any Token which is within a PERSON or ORG annotation:

In [None]:
from gatenlp.pam.pampac import Ann, AnnAt, Rule, Pampac, AddAnn, N, Seq, Or
from gatenlp.pam.matcher import FeatureMatcher, ifnot

r1 = Rule(
    # first the pattern: a Token that is within an ORG or within a PERSON annotation
    Or ( Ann("Token", name="tok").within("ORG"),
         Ann("Token", name="tok").within("PERSON")
       ),
    # then the action for the pattern: add a "PersOrOrg" annotation for the match named "tok"
    AddAnn(name="tok", anntype="PersOrOrg")
)
# get the annotations we want to use for matching
# Use doc3: it is the document the Stanza annotator was run on. doc2 (the loaded HTML page)
# never received a "Stanza" set, so matching against it could not find anything.
anns2match = doc3.annset("Stanza").with_type(["Token", "PERSON", "ORG"])

# The new annotations go into their own set of the same document; the set is cleared
# first so that re-running the cell does not duplicate annotations.
outset = doc3.annset("Pampac1")
outset.clear()
# Create the Pampac instance from the single rule and run it on the annotations, also specify output set
# The run method returns the list of offsets and the action return values where the rule matches in the doc
Pampac(r1).run(doc3, anns2match, outset=outset)
len(outset)

## PAMPAC - Example 1



In [None]:
# Display doc3: it is the document that carries the "Stanza" annotations the PAMPAC rule
# matches on (doc2 is the HTML page and has no "Stanza" set).
doc3

## PAMPAC - Example 2

Create a rule that annotates any Sequence of two or more Token annotations which have a "upos" tag of "PROPN", separated by at most one other arbitrary token:

In [None]:
from gatenlp.pam.pampac import Ann, AnnAt, Rule, Pampac, AddAnn, N, Seq
from gatenlp.pam.matcher import FeatureMatcher, ifnot

# Matches annotations whose "upos" feature has the value "PROPN"
feat = FeatureMatcher(upos="PROPN")
r1 = Rule(
    # first the pattern: a PROPN token, followed by 1 to 99 repetitions of
    # (at most one non-PROPN token, then a PROPN token); the whole sequence is named "seq1"
    Seq( Ann("Token", features=feat),
         N( Seq( N(Ann("Token", features=ifnot(feat)), min=0, max=1),
                 Ann("Token", features=feat)),
           min=1, max=99),
         name="seq1"
       ),
    # then the action for the pattern: add a "PROPNSEQ" annotation for the match named "seq1"
    AddAnn(name="seq1", anntype="PROPNSEQ")
)
# get the annotations we want to use for matching
# Use doc3: it is the document the Stanza annotator was run on. doc2 (the loaded HTML page)
# never received a "Stanza" set, so matching against it could not find anything.
anns2match = doc3.annset("Stanza").with_type("Token")

# The new annotations go into their own set of the same document; the set is cleared
# first so that re-running the cell does not duplicate annotations.
outset = doc3.annset("Pampac2")
outset.clear()
# Create the Pampac instance from the single rule and run it on the annotations, also specify output set
# The run method returns the list of offsets and the action return values where the rule matches in the doc
Pampac(r1).run(doc3, anns2match, outset=outset)
len(outset)

## PAMPAC - Example 2

Result: found 8 matches and added annotations for them:

In [None]:
# Display doc3: it is the document that carries the "Stanza" annotations the PAMPAC rule
# matches on (doc2 is the HTML page and has no "Stanza" set).
doc3

# GATE Worker

* Allows running the Java GATE process from Python
* API for exchanging documents and performing frequent Java GATE tasks from Python
* [Py4J](https://www.py4j.org/) API to run ANY Java from Python
* Python connects to a Java process, communicates over sockets
  * Option 1: Start GATE GUI, load PythonWorkerLr, then connect a GateNLP GateWorker to it
  * Option 2: Start Java GATE worker using the `gatenlp-gate-worker` command
  * Option 3: directly start the Java GATE worker when creating the GateNLP GateWorker instance

Let's try Option 3 first: `GATE_HOME` environment variable must be set, or must know GATE installation directory

## GATE Worker

In [None]:
from gatenlp.gateworker import GateWorker

# Create the worker with default settings (Option 3 above: the Java GATE worker
# is started when the GateWorker instance is created).
gs = GateWorker()
# if GATE_HOME not set use gs = GateWorker(gatehome="/where/Gate/is/Installed")
# if java is not on the PATH use gs = GateWorker(java="/path/to/the/java/binary")

In [None]:
# Create a GATE document on the Java GATE side and return a handle
gdoc1 = gs.createDocument("An example document mentioning Barack Obama and New York")
# Can call Java API methods on that handle and get/convert the result:
# here the document's class, its name and the names of its annotation sets are printed
print(gdoc1.getClass())
print(gdoc1.getName())
print(gdoc1.getAnnotationSetNames())

## GATE Worker

In [None]:
# Let's load the prepared ANNIE pipeline on the Java side and process the GATE document with it
# First load the ANNIE plugin (Maven group, artifact, version), then the pipeline file from it.
gs.loadMavenPlugin("uk.ac.gate.plugins", "annie", "9.0")
gpipe = gs.loadPipelineFromPlugin("uk.ac.gate.plugins", "annie", "/resources/ANNIE_with_defaults.gapp")
# The pipeline is run on a corpus, so put the document into a new corpus and assign it.
gcorp = gs.newCorpus()
gcorp.add(gdoc1)
gpipe.setCorpus(gcorp)
gpipe.execute()

## GATE Worker

So far, everything happened on the Java side, use a GateWorker API method to convert the document into a Python GateNLP document:

In [None]:
pdoc1 = gs.gdoc2pdoc(gdoc1)
pdoc1

## GateWorker

* Stopping: the GateWorker (Java process) can get stopped using `gs.close()`
* Will also automatically stop when the Python process ends

In [None]:
gs.close()

# GateWorker Annotator

An annotator to process Python GateNLP documents with a Java GATE pipeline

In [None]:
from gatenlp.gateworker import GateWorkerAnnotator
# Specify a prepared GATE pipeline file to get loaded into Java GATE
# optionally add the gatehome=... kw argument
# optionally specify port using port=23445 or similar (this example passes port=25444)
gs_app = GateWorkerAnnotator(pipeline="data/annie.xgapp", port=25444)

## GateWorkerAnnotator

Example, running on a directory corpus:

In [None]:
# SerialCorpusExecutor is not imported by any earlier cell; import it here so the
# cell also runs after "Restart Kernel -> Run All".
from gatenlp.processing.executor import SerialCorpusExecutor

# Run the GateWorkerAnnotator over every document of the directory corpus.
dircorpus = DirFilesCorpus("data/dir1", sort=True)
exe = SerialCorpusExecutor(annotator=gs_app, corpus=dircorpus)
exe()
# Close the annotator once the whole corpus has been processed.
gs_app.close()

# Read one processed document back from the corpus and show its features and content.
tmpdoc = dircorpus[2]
print(tmpdoc.features)
tmpdoc

In [None]:
# NOT SHOWN: clean the corpus we have just processed
# AnnotatorFunction and SerialCorpusExecutor are not imported by any earlier cell;
# import them here so the cell also runs after "Restart Kernel -> Run All".
from gatenlp.processing.annotator import AnnotatorFunction
from gatenlp.processing.executor import SerialCorpusExecutor


def clearanns(doc):
    """Return a completely new document that has only the text of `doc`.

    Instead of removing the annotations from `doc`, a fresh `Document` is created
    from `doc.text`; nothing else is copied from the original.
    """
    return Document(doc.text)


# Wrap the function as an annotator and run it over the whole corpus.
annclear = AnnotatorFunction(clearanns)
execlear = SerialCorpusExecutor(annotator=annclear, corpus=dircorpus)
execlear()
# Display one document of the corpus to check the result.
dircorpus[1]