### Reference: https://spacy.io/

In [None]:
!pip install spacy
!python -m spacy download en_core_web_lg

In [1]:
!pip install spacy-langdetect

Collecting spacy-langdetect
  Downloading spacy_langdetect-0.1.2-py3-none-any.whl (5.0 kB)
Collecting langdetect==1.0.7
  Downloading langdetect-1.0.7.zip (998 kB)
[K     |████████████████████████████████| 998 kB 10.1 MB/s eta 0:00:01
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25ldone
[?25h  Created wheel for langdetect: filename=langdetect-1.0.7-py3-none-any.whl size=993461 sha256=40df5c84f8ea481044ffc800ea5548e55446005cb32b2fc3558ec8ce0ede734c
  Stored in directory: /Users/andrewong/Library/Caches/pip/wheels/c1/6d/ab/bf9ecd1ab14dd236da586dfd0d4b008e2e803e571cf2229c26
Successfully built langdetect
Installing collected packages: langdetect, spacy-langdetect
Successfully installed langdetect-1.0.7 spacy-langdetect-0.1.2


In [1]:
# Instructor's version is 3.3.0; print the locally installed version so that
# differences in model output can be traced back to a version mismatch.
import spacy
print(spacy.__version__)

3.3.1


In [2]:
# Load the en_core_web_lg pipeline downloaded in the setup cell; `nlp` is reused by the cells below.
nlp = spacy.load("en_core_web_lg")



### Tokenization https://spacy.io/usage/linguistic-features#tokenization

In [3]:
# Run the pipeline on a short sentence and list the text of each token.
doc = nlp("I am flying to Manila")
print([token.text for token in doc])

['I', 'am', 'flying', 'to', 'Manila']


In [4]:
# Same tokens as above, printed one per line.
for token in doc:
    print(token.text)

I
am
flying
to
Manila


### Lemmatization https://spacy.io/usage/linguistic-features#lemmatization

In [5]:
# Print each token followed by its lemma.
doc = nlp("this product integrates both libraries for downloading and applying patches")
for token in doc:
    print(f"{token.text} {token.lemma_}")

this this
product product
integrates integrate
both both
libraries library
for for
downloading download
and and
applying apply
patches patch


### POS tagging https://spacy.io/usage/linguistic-features#pos-tagging

In [6]:
# For every token, print its text, its pos_ label and its tag_ label.
doc = nlp('I have flown to Cebu. Now I am flying to Manila.')
for token in doc:
    print(f"{token.text} {token.pos_} {token.tag_}")

I PRON PRP
have VERB VBP
flown NOUN NN
to ADP IN
Cebu PROPN NNP
. PUNCT .
Now ADV RB
I PRON PRP
am AUX VBP
flying VERB VBG
to ADP IN
Manila PROPN NNP
. PUNCT .


In [7]:
# spacy.explain returns a human-readable description of a label
# (e.g. PRON, NNP) such as those printed in the previous cell.
spacy.explain("NNP")

'noun, proper singular'

### Segmentation

In [8]:
# Split the text into sentences and print the tokens of each sentence as a list.
doc = nlp('I have flown to Cebu. Now I am flying to Manila.')
for sent in doc.sents:
    # Iterating a sentence span yields its tokens in order.
    print(list(sent))

[I, have, flown, to, Cebu, .]
[Now, I, am, flying, to, Manila, .]


### Retokenization

In [9]:
# Tokens before retokenization: multi-word names are still split into separate tokens.
doc = nlp('The Golden State Bridge is an iconic landmark in San Francisco')
list(doc)

[The, Golden, State, Bridge, is, an, iconic, landmark, in, San, Francisco]

In [10]:
# Merge the two multi-word names into single tokens. Both spans are taken from the
# not-yet-modified doc and do not overlap, so they can be queued in one context.
# Note: this mutates `doc` in place; re-running the cell without re-creating `doc`
# would merge further tokens.
with doc.retokenize() as retokenizer:
    retokenizer.merge(doc[1:4])
    retokenizer.merge(doc[-2:])

In [11]:
# Tokens after the merges.
list(doc)

[The, Golden State Bridge, is, an, iconic, landmark, in, San Francisco]

### Syntactic Parsing

In [12]:
# For every token, print its text, POS label, dependency label and the
# description spacy.explain gives for that dependency label.
doc = nlp('I want a green apple,')
for token in doc:
    print(f"{token.text} {token.pos_} {token.dep_} {spacy.explain(token.dep_)}")

I PRON nsubj nominal subject
want VERB ROOT root
a DET det determiner
green ADJ amod adjectival modifier
apple NOUN dobj direct object
, PUNCT punct punctuation


### Visualizations

In [13]:
from spacy import displacy
# displacy.serve starts a local web server for the dependency visualization
# (the recorded output shows it serving on port 5000 until it was shut down).
# NOTE(review): the cell presumably blocks until the server is stopped — confirm;
# displacy.render, used in the next cell, draws the figure inside the notebook instead.
displacy.serve(doc, style='dep')




Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


In [17]:
# Render the dependency parse of `doc` with displaCy's 'dep' style.
displacy.render(doc, style='dep')

In [18]:
# Answer was different from instruction. Recommended to upgrade spacy
# `display`/`HTML` are imported from IPython.display, the public location
# (importing them from IPython.core.display is deprecated).
from IPython.display import display, HTML

doc = nlp('I want to fly to Manila.')

from spacy import displacy
# Inside Jupyter, displacy.render() displays the markup itself and returns None,
# which would leave nothing for display(HTML(...)) to show. jupyter=False makes it
# return the HTML string instead, so the entity view is rendered exactly once below.
html = displacy.render(doc, style="ent", page=True, jupyter=False)

display(HTML(html))

<IPython.core.display.HTML object>

In [19]:
# Look up the description of the PERSON label.
spacy.explain('PERSON')

'People, including fictional'

### Similarity

In [None]:
# Document used by the similarity cells that follow.
doc = nlp('I want a green apple.')

In [None]:
# NOTE(review): unfilled exercise placeholder — replace the dashes with the start and
# end token indices of the span to inspect; the cell is not valid Python as written.
doc[----------:--------]

In [None]:
# NOTE(review): unfilled exercise placeholder — supply the span's start and end token
# indices to compare the whole document with that span; not valid Python as written.
doc.similarity(doc[----------:--------])

In [None]:
# Similarity score between two single-word documents.
nlp('apple').similarity(nlp('banana'))

In [None]:
# Same comparison for a second pair of single-word documents.
nlp('lovelife').similarity(nlp('forever'))

In [None]:
# NOTE(review): unfilled exercise placeholder — replace the dashes with the attribute or
# method to inspect on the 'apple' document; the cell is not valid Python as written.
nlp('apple').---------------------

### Language Detection: https://spacy.io/universe/project/spacy-langdetect

In [1]:
import spacy
from spacy.language import Language
from spacy_langdetect import LanguageDetector

@Language.factory("language_detector")
def get_lang_detector(nlp, name):
    """Factory for the "language_detector" component; returns a new LanguageDetector."""
    return LanguageDetector()


# Load the small English pipeline and append the detector as its last component.
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe('language_detector', last=True)

<spacy_langdetect.spacy_langdetect.LanguageDetector at 0x17b96722ee0>

In [2]:
text = 'This is an english text.'
doc = nlp(text)

# doc._.language is filled in by the language_detector pipe added above; in the
# recorded run it holds a dict with the detected language code and a score.
print(doc._.language)

{'language': 'en', 'score': 0.9999967603029023}


In [5]:
# Same check with a non-English word appended; the recorded run still reported
# 'en', but with a lower score than for the pure-English sentence.
text = 'This is an english text mabuhay'
doc = nlp(text)

print(doc._.language)

{'language': 'en', 'score': 0.7142835126723377}


In [6]:
# A phrase with no English words; the recorded run reported language code 'tl'.
text = 'magandang gabi!'
doc = nlp(text)

print(doc._.language)

{'language': 'tl', 'score': 0.9999954822984114}
