# debug-gazetteer-token
## Debug/test token gazetteer 

In [1]:
import sys, os
from gatenlp.processing.gazetteer import TokenGazetteer
from gatenlp import Document
from gatenlp.gateworker import GateWorker

## Basic Usage

In [3]:
# Try loading the GATE ANNIE gazetteer.
# This needs ./gazetteer to be a symbolic link to (or a copy of) the ANNIE
# src/main/resources/resources/gazetteer directory.
# The def file is read with ":" as the separator; no explicit tokenizer is
# passed for the list entries (source_tokenizer=None).
gaz1 = TokenGazetteer(source="gazetteer/lists.def", source_sep=":", source_tokenizer=None)

In [4]:
gaz1.nodes["Drax"]

Node(is_match=True,data=[{}],listidx=[9],nodes=None)

In [5]:
gaz1.get(["South", "Acton"])


[{'majorType': 'location', 'minorType': 'city'}]

In [6]:
["South", "Acton"] in gaz1

True

In [7]:
gaz1.get("Drax")

[{'majorType': 'location', 'minorType': 'city'}]

In [8]:
gaz1[["South", "Acton"]]

[{'majorType': 'location', 'minorType': 'city'}]

In [9]:
len(gaz1)

80766

In [10]:
len(gaz1.nodes)

66729

In [11]:
gaz1[["Tupac", "Shakur"]]

[{'majorType': 'person_full', 'minorType': 'music'}]

In [12]:
gaz1[["Barack", "Obama"]]

[{'gender': 'male', 'majorType': 'person_full'}]

## Test using the ANNIE tokenizer for the gazetteer list

In [None]:
SYNTAXERROR TO PREVENT REST FROM RUNNING BY DEFAULT

In [None]:
# Small test document: besides plain words it contains strings with attached
# punctuation (Apple#, A.B.C., Apple/, @User) that the gazetteer entries
# defined further down refer to.
text = """
Some text for testing.
Includes words like Apple, Apple#, A.B.C., Apple/ and @User
"""
doc = Document(text)

In [None]:
doc

In [None]:
# Run ANNIE on that document
# Only start a GateWorker when "gs" is not defined yet, so that re-running
# this cell reuses the existing worker.
if "gs" not in vars():
    gs = GateWorker()
# Load the ANNIE plugin (version 9.0) and the default ANNIE pipeline shipped in it.
gs.loadMavenPlugin("uk.ac.gate.plugins", "annie", "9.0")
gpipe = gs.loadPipelineFromPlugin("uk.ac.gate.plugins", "annie", "/resources/ANNIE_with_defaults.gapp")
# Convert doc for the worker (pdoc2gdoc), put it into a fresh corpus and run
# the pipeline over that corpus.
gdoc = gs.pdoc2gdoc(doc)
gcorp = gs.newCorpus()
gcorp.add(gdoc)
gpipe.setCorpus(gcorp)
gpipe.execute()
# Convert the processed document back (gdoc2pdoc) and display it.
anniedoc = gs.gdoc2pdoc(gdoc)
anniedoc

In [None]:
# Entries for the test gazetteer list; several of them carry punctuation
# (#, ., @) and one contains a space.
gazlist = [
    "Apple",
    "Apple#",
    "A.B.C.",
    "@User",
    "text for",
]
# Create temporary GATE-style gazetteer files: a .def file with a single
# entry that refers to one .lst file holding the entries above.
dfile = "/tmp/debug-gazetteer-token-def1.def"
lfile = "/tmp/debug-gazetteer-token-lst1.lst"
# The def entry refers to the list file by its base name; derive it from
# lfile so the two cannot drift apart. The encoding is given explicitly so
# the files do not depend on the platform default.
with open(dfile, "wt", encoding="utf-8") as outfp:
    print(f"{os.path.basename(lfile)}:major:minor:lang", file=outfp)
with open(lfile, "wt", encoding="utf-8") as outfp:
    # one gazetteer entry per line
    outfp.writelines(f"{entry}\n" for entry in gazlist)

In [None]:
# Create a token gazetteer from the def file, using the default tokenizer
# NOTE(review): the keyword names used here (fmt=, outset=) differ from the
# ones used for gaz1 in the first cells (source=, source_sep=) — confirm
# which TokenGazetteer signature the installed gatenlp version expects.
tok1 = TokenGazetteer(dfile, fmt="gate-def", outset="tok1")
# Apply to anniedoc and display the document
tok1(anniedoc)
anniedoc

In [None]:
# Manually create a pre-tokenized gazetteer list: every entry is a tuple of
# the token sequence and a dict (empty for all entries here).
# NOTE(review): "text for" is kept as ONE token containing a space, unlike
# the other entries, which are split into separate tokens — confirm that
# this is intended.
gazlist_tok1 = [
    (tokens, {})
    for tokens in (
        ["Apple"],
        ["Apple", "#"],
        ["A", ".", "B", ".", "C", "."],
        ["@", "User"],
        ["text for"],
    )
]

In [None]:
# Create a token gazetteer from the manually tokenized list (gazlist_tok1)
tok2 = TokenGazetteer(gazlist_tok1, fmt="gazlist", outset="tok2")
# Apply to anniedoc and display the document
tok2(anniedoc)
anniedoc

In [None]:
# now try to implement code that makes it possible to use the ANNIE tokenizer for tokenizing
# the GATE format gazetteer list(s)

from gatenlp.processing.annotator import Annotator

class AnnieTokenizer(Annotator):
    """Annotator that tokenizes a document by running a GATE tokeniser PR.

    The tokeniser runs inside a dedicated SerialAnalyserController on the
    GateWorker side. The "Token" annotations it produces are copied back
    into the default annotation set of the document passed to the call.
    """

    def __init__(self, gateworker, tokeniserPR):
        """Set up a controller and corpus for running the tokeniser.

        Args:
            gateworker: the GateWorker used to create the controller and
                corpus and to convert documents in both directions.
            tokeniserPR: the tokeniser processing resource to run; it is
                the only PR added to the controller.
        """
        self._gw = gateworker
        self._tok = tokeniserPR
        self._ctrl = gateworker.jvm.gate.Factory.createResource(
            "gate.creole.SerialAnalyserController"
        )
        self._ctrl.add(tokeniserPR)
        # One corpus is created up front and reused for every call.
        self._corpus = gateworker.newCorpus()
        self._ctrl.setCorpus(self._corpus)

    def __call__(self, doc):
        """Tokenize doc and return the same document object.

        Args:
            doc: the document to tokenize.

        Returns:
            The document passed in, with the "Token" annotations produced
            by the tokeniser added to its default annotation set.
        """
        gdoc = self._gw.pdoc2gdoc(doc)
        self._corpus.add(gdoc)
        try:
            self._ctrl.execute()
        finally:
            # The corpus is shared between calls: always take the document
            # out again, even when execution fails, so that a failed call
            # does not leave it behind to be processed again by later calls.
            self._corpus.remove(gdoc)
        tmpdoc = self._gw.gdoc2pdoc(gdoc)
        # make sure we return the SAME document: copy the Token annotations
        # from the converted copy into the document we were given
        outset = doc.annset()
        for ann in tmpdoc.annset().with_type("Token"):
            outset.add_ann(ann)
        return doc
        
# NOTE(review): assumes the tokeniser is the PR at index 1 of the loaded
# ANNIE pipeline — confirm against gpipe.getPRs()
annietok = AnnieTokenizer(gs, gpipe.getPRs()[1])

In [None]:
# Quick check of annietok on a tiny document: print the default annotation
# set after tokenizing, then display the document.
tmpdoc1 = Document("A.B.C. and Apple and Apple#")
tmpdoc1 = annietok(tmpdoc1)
print(tmpdoc1.annset())
tmpdoc1

In [None]:
# Create a token gazetteer from the def file, using the ANNIE tokenizer
tok3 = TokenGazetteer(dfile, fmt="gate-def", outset="tok3", tokenizer=annietok)
# tok3 is applied to anniedoc in the last cell, after the GateWorker has been closed


In [None]:
# should be able to close the gateworker now
gs.close()

In [None]:
# Apply tok3 to anniedoc now that the GateWorker has been closed, and
# display the result
anniedoc2 = tok3(anniedoc)
anniedoc2