# SPANCAT

In [1]:
import spacy
from spacy import displacy
nlp = spacy.load("en_core_web_sm")
spancat = nlp.add_pipe("spancat")


from spacy.tokens import Doc
from spacy.training import Example

from spacy.vocab import Vocab
vocab = Vocab(strings=["Appy", "some", "sunscreen"])

def get_examples():
    predicted = Doc(vocab, words=["Apply", "some", "sunscreen"])
    token_ref = ["Apply", "some", "sun", "screen"]
    tags_ref = ["VERB", "DET", "NOUN", "NOUN"]
    example = Example.from_dict(predicted, {"words": token_ref, "tags": tags_ref})
    yield example

spancat.add_label("SPANCAT")
spancat.initialize(get_examples, nlp=nlp)

text = "South Korea along with Taiwan and Japan is a leading innovator in advanced 4 nanometer ARM semiconductors with companies like Samsung Electronics and SK Hynix."
doc = nlp(text)

### Initialise `doc.spans["sc"]`

In [2]:
from spacy.tokens import Span
doc.spans["sc"] = [] 

### Recursive Procedure to extract non-duplicate spans

In [3]:
# pass after removing colons
def find_spans(doc):
    for i in range(0, len(doc)-1):
        print("i=", i)
        for j in range(i+1, len(doc)):
            print("j=", j)
            tdoc = nlp(doc[i:j].text)
            print("tdoc = ", tdoc)
            for ent in tdoc.ents:
                print("ent=",  ent.text)
                label = ent.label_
                s = ent.start + i
                t = ent.end + i
                print("s=", s)
                print("t=", t)
                print(ent.text,"=" ,label)
                span = Span(doc, s, t, label)
                print("span = ",span)
                if span not in doc.spans["sc"]:
                    doc.spans["sc"].append(span)
                    print("added")
                    print("doc.spans=", doc.spans["sc"])
            print("---------------------")

In [4]:
find_spans(doc)

i= 0
j= 1
tdoc =  South
---------------------
j= 2
tdoc =  South Korea
ent= South Korea
s= 0
t= 2
South Korea = GPE
span =  South Korea
added
doc.spans= [South Korea]
---------------------
j= 3
tdoc =  South Korea along
ent= South Korea
s= 0
t= 2
South Korea = GPE
span =  South Korea
---------------------
j= 4
tdoc =  South Korea along with
ent= South Korea
s= 0
t= 2
South Korea = GPE
span =  South Korea
---------------------
j= 5
tdoc =  South Korea along with Taiwan
ent= South Korea
s= 0
t= 2
South Korea = GPE
span =  South Korea
ent= Taiwan
s= 4
t= 5
Taiwan = GPE
span =  Taiwan
added
doc.spans= [South Korea, Taiwan]
---------------------
j= 6
tdoc =  South Korea along with Taiwan and
ent= South Korea
s= 0
t= 2
South Korea = GPE
span =  South Korea
ent= Taiwan
s= 4
t= 5
Taiwan = GPE
span =  Taiwan
---------------------
j= 7
tdoc =  South Korea along with Taiwan and Japan
ent= South Korea
s= 0
t= 2
South Korea = GPE
span =  South Korea
ent= Taiwan
s= 4
t= 5
Taiwan = GPE
span =  Taiwan

In [5]:
print(doc.spans["sc"])

[South Korea, Taiwan, Japan, 4, Samsung, Samsung Electronics, SK, SK Hynix, Korea, Hynix]


In [6]:
displacy.serve(doc, style="span")




Using the 'span' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


## Roungh Work

In [None]:
for token in doc:
    print(token)

In [None]:
for token in doc:
    print(token.ent_type_)

In [None]:
for token in doc:
    print(token.ent_type)

In [None]:
print(doc[0:3])

In [None]:
tdoc = doc[3:6]
print(tdoc)

In [None]:
tdoc1 = nlp(doc[2+1:5].text)
tdoc2 = doc[2:5-1]
print(tdoc1)
print(tdoc2)

In [None]:
for ent in tdoc1.ents:
    print(ent)
    word_start1 = ent.start
    word_end1 = ent.end
    print(word_start1)

    index_start1 = ent.start_char
    index_end1 = ent.end_char
    print(index_start1)

    label1 = ent.label_

    span = Span(doc, word_start1+word_start + 1, word_end1+word_start + 1, label1)
    doc.spans["sc"].append(span)

for ent in tdoc2.ents:
    print(ent)

In [None]:
print(doc.spans["sc"])

In [None]:
for span in doc.spans["sc"]:
    print(len(span))

In [None]:
displacy.serve(doc, style="span")

In [None]:
doc = nlp("Bank of")
for ent in doc.ents:
    print(ent.text)