Aim: To implement entity identification for pronoun resolution.

In [1]:
# Install stanza; note that the prefix "!" is not needed if you are running in a terminal
!pip install stanza

# Import stanza
import stanza

Collecting stanza
[?25l  Downloading https://files.pythonhosted.org/packages/50/ae/a70a58ce6b4e2daad538688806ee0f238dbe601954582a74ea57cde6c532/stanza-1.2-py3-none-any.whl (282kB)
[K     |█▏                              | 10kB 13.9MB/s eta 0:00:01[K     |██▎                             | 20kB 20.0MB/s eta 0:00:01[K     |███▌                            | 30kB 21.1MB/s eta 0:00:01[K     |████▋                           | 40kB 14.3MB/s eta 0:00:01[K     |█████▉                          | 51kB 5.2MB/s eta 0:00:01[K     |███████                         | 61kB 5.7MB/s eta 0:00:01[K     |████████▏                       | 71kB 6.2MB/s eta 0:00:01[K     |█████████▎                      | 81kB 6.4MB/s eta 0:00:01[K     |██████████▌                     | 92kB 6.9MB/s eta 0:00:01[K     |███████████▋                    | 102kB 7.2MB/s eta 0:00:01[K     |████████████▉                   | 112kB 7.2MB/s eta 0:00:01[K     |██████████████                  | 122kB 7.2MB/s eta 0:0

In [2]:
import os

# Download the Stanford CoreNLP package with Stanza's installation command.
# The archive is large and takes several minutes to fetch, so the download is
# skipped when the target directory already holds files. Re-running this cell is
# then cheap regardless of how the installed Stanza version treats an existing
# directory.
corenlp_dir = './corenlp'
if not (os.path.isdir(corenlp_dir) and os.listdir(corenlp_dir)):
    stanza.install_corenlp(dir=corenlp_dir)

# Set the CORENLP_HOME environment variable to point to the installation location
os.environ["CORENLP_HOME"] = corenlp_dir

2021-05-06 04:56:26 INFO: Installing CoreNLP package into ./corenlp...
Downloading http://nlp.stanford.edu/software/stanford-corenlp-latest.zip: 100%|██████████| 504M/504M [02:38<00:00, 3.19MB/s]


In [3]:
# Import client module
from stanza.server import CoreNLPClient

In [9]:
import time

# Build a CoreNLPClient that talks to a local server on port 9001, gives the
# server 4GB of memory, and requests a basic annotator pipeline.
client = CoreNLPClient(
    annotators=['tokenize', 'ssplit', 'pos', 'lemma', 'ner'],
    memory='4G',
    endpoint='http://localhost:9001',
    be_quiet=True,
)
print(client)

# Launch the background server explicitly, then pause briefly.
# This step is optional in practice: by default the server is started when the
# first annotation is performed.
client.start()
time.sleep(10)

2021-05-06 05:04:02 INFO: Writing properties to tmp file: corenlp_server-d9ad1c6d9c5a458a.props
2021-05-06 05:04:02 INFO: Starting server with command: java -Xmx4G -cp ./corenlp/* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9001 -timeout 60000 -threads 5 -maxCharLength 100000 -quiet True -serverProperties corenlp_server-d9ad1c6d9c5a458a.props -annotators tokenize,ssplit,pos,lemma,ner -preload -outputFormat serialized


<stanza.server.client.CoreNLPClient object at 0x7f6bbc6d7950>


In [10]:
# Sample passage: one named person followed by two sentences that start with "He"
text = (
    "Albert Einstein was a German-born theoretical physicist. "
    "He developed the theory of relativity. "
    "He is a great man."
)

# Send the passage to the CoreNLP server and keep the annotated result
document = client.annotate(text)
# print(document)  # uncomment to dump the full annotation

In [11]:
# Print a tab-separated table with one row per token (word, lemma, POS tag,
# NER tag), with a "[Sentence N]" line ahead of each sentence's rows.
row_format = "{:12s}\t{:12s}\t{:6s}\t{}"
print(row_format.format("Word", "Lemma", "POS", "NER"))

for sentence_number, sentence in enumerate(document.sentence, start=1):
    print("[Sentence {}]".format(sentence_number))
    for token in sentence.token:
        print(row_format.format(token.word, token.lemma, token.pos, token.ner))
    print("")

Word        	Lemma       	POS   	NER
[Sentence 1]
Albert      	Albert      	NNP   	PERSON
Einstein    	Einstein    	NNP   	PERSON
was         	be          	VBD   	O
a           	a           	DT    	O
German      	german      	JJ    	NATIONALITY
-           	-           	HYPH  	O
born        	bear        	VBN   	O
theoretical 	theoretical 	JJ    	TITLE
physicist   	physicist   	NN    	TITLE
.           	.           	.     	O

[Sentence 2]
He          	he          	PRP   	O
developed   	develop     	VBD   	O
the         	the         	DT    	O
theory      	theory      	NN    	O
of          	of          	IN    	O
relativity  	relativity  	NN    	O
.           	.           	.     	O

[Sentence 3]
He          	he          	PRP   	O
is          	be          	VBZ   	O
a           	a           	DT    	O
great       	great       	JJ    	O
man         	man         	NN    	O
.           	.           	.     	O



In [12]:
# List every detected entity mention, sentence by sentence, next to its entity type
mention_format = "{:30s}\t{}"
print(mention_format.format("Mention", "Type"))

for sentence in document.sentence:
    for mention in sentence.mentions:
        print(mention_format.format(mention.entityMentionText, mention.entityType))


Mention                       	Type
Albert Einstein               	PERSON
German                        	NATIONALITY
theoretical physicist         	TITLE
He                            	PERSON
He                            	PERSON


## Exercise

In [13]:
def print_token_table(doc):
    """Print word, lemma, POS tag and NER tag for every token, grouped by sentence.

    Args:
        doc: Annotated document as returned by ``client.annotate``; its
            ``sentence`` entries are iterated and each one's ``token`` list is read.
    """
    row_format = "{:12s}\t{:12s}\t{:6s}\t{}"
    print(row_format.format("Word", "Lemma", "POS", "NER"))

    for sentence_number, sentence in enumerate(doc.sentence, start=1):
        print("[Sentence {}]".format(sentence_number))
        for token in sentence.token:
            print(row_format.format(token.word, token.lemma, token.pos, token.ner))
        print("")


def print_entity_mentions(doc):
    """Print the text and entity type of every entity mention in the document.

    Args:
        doc: Annotated document as returned by ``client.annotate``; the
            ``mentions`` list of each entry in ``sentence`` is read.
    """
    mention_format = "{:30s}\t{}"
    print(mention_format.format("Mention", "Type"))

    for sentence in doc.sentence:
        for mention in sentence.mentions:
            print(mention_format.format(mention.entityMentionText, mention.entityType))


# Annotate the exercise text
text = "Rafael Nadal is a Spanish professional tennis player. He is called the king of the clay. He is ranked world no. 2."
document = client.annotate(text)

# Show the per-token table first, then the detected entity mentions
print_token_table(document)
print_entity_mentions(document)


Word        	Lemma       	POS   	NER
[Sentence 1]
Rafael      	Rafael      	NNP   	PERSON
Nadal       	Nadal       	NNP   	PERSON
is          	be          	VBZ   	O
a           	a           	DT    	O
Spanish     	spanish     	JJ    	NATIONALITY
professional	professional	JJ    	O
tennis      	tennis      	NN    	TITLE
player      	player      	NN    	TITLE
.           	.           	.     	O

[Sentence 2]
He          	he          	PRP   	O
is          	be          	VBZ   	O
called      	call        	VBN   	O
the         	the         	DT    	O
king        	king        	NN    	TITLE
of          	of          	IN    	O
the         	the         	DT    	O
clay        	clay        	NN    	O
.           	.           	.     	O

[Sentence 3]
He          	he          	PRP   	O
is          	be          	VBZ   	O
ranked      	rank        	VBN   	O
world       	world       	NN    	O
no.         	no.         	NN    	O
2           	2           	CD    	NUMBER
.           	.           	.     	O

Mention  

In [14]:
# Shut down the background CoreNLP server
client.stop()

time.sleep(10)
!ps -o pid,cmd | grep java

    236 /bin/bash -c ps -o pid,cmd | grep java
    238 grep java
