### Reference: https://spacy.io/

In [None]:
!pip install spacy
!python -m spacy download en_core_web_lg

In [6]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.3.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.3.0/en_core_web_sm-3.3.0-py3-none-any.whl (12.8 MB)
     |████████████████████████████████| 12.8 MB 13.3 MB/s
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.3.0
✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


In [25]:
!pip install spacy-langdetect --upgrade



In [1]:
# Instructor's version is 3.3.0
# Print the installed spaCy version so it can be compared with the one above.
import spacy
print(spacy.__version__)

3.3.1


In [2]:
# Load the large English pipeline; the cells below call `nlp(...)` on it.
nlp = spacy.load("en_core_web_lg")



### Tokenization https://spacy.io/usage/linguistic-features#tokenization

In [3]:
# Running the pipeline on a string yields a Doc; iterating it yields tokens.
doc = nlp("I am flying to Manila")
token_texts = [token.text for token in doc]
print(token_texts)

['I', 'am', 'flying', 'to', 'Manila']


In [4]:
# Same tokens as above, printed one per line.
for token in doc:
    print(token.text)

I
am
flying
to
Manila


### Lemmatization https://spacy.io/usage/linguistic-features#lemmatization

In [5]:
# Show each token next to its lemma (base form), e.g. "libraries" -> "library".
doc = nlp("this product integrates both libraries for downloading and applying patches")
for tok in doc:
    print(f"{tok.text} {tok.lemma_}")

this this
product product
integrates integrate
both both
libraries library
for for
downloading download
and and
applying apply
patches patch


### POS tagging https://spacy.io/usage/linguistic-features#pos-tagging

In [6]:
# Print each token with its coarse part-of-speech label (pos_) and its
# fine-grained tag (tag_).
# NOTE(review): the recorded output labels "flown" as NOUN/NN and "have" as
# VERB, which looks like a model mis-tag of the perfect tense — worth
# mentioning to readers rather than taking at face value.
doc = nlp('I have flown to Cebu. Now I am flying to Manila.')
for tok in doc:
    print(f"{tok.text} {tok.pos_} {tok.tag_}")

I PRON PRP
have VERB VBP
flown NOUN NN
to ADP IN
Cebu PROPN NNP
. PUNCT .
Now ADV RB
I PRON PRP
am AUX VBP
flying VERB VBG
to ADP IN
Manila PROPN NNP
. PUNCT .


In [7]:
# Explains the meaning of the terms (e.g. PRON, NNP).
# spacy.explain returns a short human-readable description of a label string.
spacy.explain("NNP")

'noun, proper singular'

### Sentence Segmentation https://spacy.io/usage/linguistic-features#sbd

In [8]:
# doc.sents yields one span per detected sentence; print each sentence's tokens.
doc = nlp('I have flown to Cebu. Now I am flying to Manila.')
for sentence in doc.sents:
    print(list(sentence))

[I, have, flown, to, Cebu, .]
[Now, I, am, flying, to, Manila, .]


### Retokenization

In [9]:
# Before merging: every word, including each part of a multi-word name, is its
# own token.
doc = nlp('The Golden State Bridge is an iconic landmark in San Francisco')
list(doc)

[The, Golden, State, Bridge, is, an, iconic, landmark, in, San, Francisco]

In [10]:
# Merge the multi-word names into single tokens. Both spans are taken from the
# not-yet-modified doc and do not overlap, so they can share one retokenize
# context; the merges are applied when the `with` block exits.
# NOTE: this mutates `doc` in place using positional slices, so the cell is not
# idempotent — re-running it without re-creating `doc` merges different tokens.
with doc.retokenize() as retokenizer:
    bridge_span = doc[1:4]
    city_span = doc[-2:]
    retokenizer.merge(bridge_span)
    retokenizer.merge(city_span)

In [11]:
# After merging: the multi-word names now appear as single tokens.
list(doc)

[The, Golden State Bridge, is, an, iconic, landmark, in, San Francisco]

### Syntactic Parsing

In [12]:
# Print each token with its part of speech, its dependency label, and a
# human-readable explanation of that label.
# NOTE(review): the sentence ends with a comma, which looks like a typo for a
# period; it is left unchanged so the recorded output below stays accurate.
doc = nlp('I want a green apple,')
for tok in doc:
    dep_label = tok.dep_
    print(tok.text, tok.pos_, dep_label, spacy.explain(dep_label))

I PRON nsubj nominal subject
want VERB ROOT root
a DET det determiner
green ADJ amod adjectival modifier
apple NOUN dobj direct object
, PUNCT punct punctuation


### Visualizations

In [13]:
from spacy import displacy
# Starts a local web server for the dependency visualisation (the recorded
# output shows it serving on port 5000 and then shutting down).
# NOTE(review): this presumably blocks the kernel until the server is
# interrupted, which would stall "Run All" — confirm, and prefer
# displacy.render (next cell) for in-notebook display.
displacy.serve(doc, style='dep')




Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


In [14]:
# Render the dependency parse of `doc` without starting a web server.
displacy.render(doc, style='dep')

In [15]:
# The rendering differed from the course material; upgrading spaCy is recommended.
# `display` and `HTML` are imported from the public IPython.display module;
# importing `display` from IPython.core.display is deprecated.
from IPython.display import HTML, display
from spacy import displacy

doc = nlp('I want to fly to Manila.')

# jupyter=False makes render() return the markup as a string. When left to
# auto-detect inside a notebook, render() displays the result itself and
# returns None, so HTML(html) below would wrap nothing.
html = displacy.render(doc, style="ent", page=True, jupyter=False)

display(HTML(html))

<IPython.core.display.HTML object>

In [16]:
# Look up the description of an entity label.
spacy.explain('PERSON')

'People, including fictional'

### Similarity

In [17]:
# Doc used by the similarity examples in the next cells.
doc = nlp('I want a green apple.')

In [18]:
# Slicing a Doc by token index gives the span "a green apple".
doc[2:5]

a green apple

In [19]:
# Similarity score between the whole sentence and its "a green apple" span.
doc.similarity(doc[2:5])

0.6503791216866719

In [20]:
# Similarity between two single-word documents.
nlp('apple').similarity(nlp('banana'))

0.5842525615808574

In [21]:
# A less related pair, for comparison with the apple/banana score above.
nlp('lovelife').similarity(nlp('forever'))

0.38831077789515944

In [22]:
# Show the vector behind the similarity scores.
# NOTE(review): this prints the entire array; slicing (e.g. `.vector[:10]`) or
# showing `.vector.shape` would keep the output short.
nlp('apple').vector

array([-0.29168665,  0.33883744, -0.8272711 ,  0.01344246,  0.38658643,
       -0.43554235, -0.16280729, -0.57587934, -0.94883066, -1.1764417 ,
       -0.69975144, -0.11214721, -0.21175031,  0.6304336 , -0.29004937,
       -0.39811704, -0.13970251,  0.0075793 , -0.87072265,  0.9699889 ,
       -0.09873042,  0.47996315,  0.00969672,  0.40159252,  0.55970776,
       -0.024492  ,  0.07643387,  0.7981604 ,  0.29940873, -0.63179433,
        0.3660159 , -0.78419524,  1.8358154 , -0.05675983,  0.033584  ,
       -0.74939   ,  0.05875957,  1.8598862 ,  0.10377809,  0.96772027,
       -0.42822474,  0.00797323, -0.37591213,  0.8209616 , -0.20133084,
       -0.18540785,  1.6793395 , -0.90393966, -0.871858  , -1.1379825 ,
       -1.1211238 ,  0.35565192,  1.2745702 ,  0.5057031 ,  0.6050507 ,
        0.64611554, -0.71222675,  0.8028663 , -1.0640801 ,  0.26709023,
       -0.1024441 ,  0.21893689, -0.9620294 , -0.19878814,  0.9914344 ,
        0.4231488 , -1.1635175 , -0.24937162, -0.951578  ,  0.11

### Language Detection: https://spacy.io/universe/project/spacy-langdetect

In [7]:
import spacy
from spacy.language import Language
from spacy_langdetect import LanguageDetector

@Language.factory("language_detector")
def get_lang_detector(nlp, name):
    """Build the component registered under the name ``language_detector``.

    Both arguments are accepted to fit the factory signature and are unused.
    """
    return LanguageDetector()


# Note: this rebinds `nlp` from the en_core_web_lg pipeline loaded earlier to
# the small pipeline, with the language detector appended as the last component.
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe('language_detector', last=True)

<spacy_langdetect.spacy_langdetect.LanguageDetector at 0x162053160>

In [8]:
# Plain English sentence: the detector stores its result on the custom
# `doc._.language` attribute as a dict with 'language' and 'score' keys.
text = 'This is an english text.'
doc = nlp(text)

print(doc._.language)

{'language': 'en', 'score': 0.9999975702314919}


In [9]:
# English sentence with one extra word ('mabuhay') appended; the recorded
# result is still 'en'.
text = 'This is an english text mabuhay'
doc = nlp(text)

print(doc._.language)

{'language': 'en', 'score': 0.9999972396329464}


In [10]:
# Non-English phrase; the recorded result is 'tl'.
text = 'magandang gabi!'
doc = nlp(text)

print(doc._.language)

{'language': 'tl', 'score': 0.9999973110756293}
