https://spacy.io/usage/models

In [1]:
pip install spacy

Collecting Spacy
  Downloading spacy-3.0.1-cp38-cp38-win_amd64.whl (11.8 MB)
Collecting murmurhash<1.1.0,>=0.28.0
  Downloading murmurhash-1.0.5-cp38-cp38-win_amd64.whl (21 kB)
Collecting blis<0.8.0,>=0.4.0
  Downloading blis-0.7.4-cp38-cp38-win_amd64.whl (6.5 MB)
Collecting typer<0.4.0,>=0.3.0
  Downloading typer-0.3.2-py3-none-any.whl (21 kB)
Collecting spacy-legacy<3.1.0,>=3.0.0
  Downloading spacy_legacy-3.0.1-py2.py3-none-any.whl (7.0 kB)
Collecting srsly<3.0.0,>=2.4.0
  Downloading srsly-2.4.0-cp38-cp38-win_amd64.whl (451 kB)
Collecting catalogue<2.1.0,>=2.0.1
  Downloading catalogue-2.0.1-py3-none-any.whl (9.6 kB)
Collecting cymem<2.1.0,>=2.0.2
  Downloading cymem-2.0.5-cp38-cp38-win_amd64.whl (36 kB)
Collecting pathy
  Downloading pathy-0.3.5-py3-none-any.whl (34 kB)
Collecting pydantic<1.8.0,>=1.7.1
  Downloading pydantic-1.7.3-cp38-cp38-win_amd64.whl (1.8 MB)
Collecting thinc<8.1.0,>=8.0.0
  Downloading thinc-8.0.1-cp38-cp38-win_amd64.whl (1.0 MB)
Collecting preshed<3.1.0,>=3

In [3]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.0.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0-py3-none-any.whl (13.7 MB)
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.0.0
[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


In [1]:
# Load the small English pipeline; `nlp` is the callable used by the cells below
import spacy
nlp = spacy.load(name="en_core_web_sm")

In [8]:
# Two-sentence sample passage (kept verbatim, on a single logical line).
text = (
    "The Republican president is being challenged by Democratic Party nominee Joe Biden,"
    " who is best known as Barack Obama’s vice-president but has been in US politics since the 1970s."
    " As election day approaches, pollingcompanies will be trying to gauge the mood of the nation"
    " by asking voters which candidate they prefer."
)
# Short fragment (the opening clause of `text`) used for the token-level demos.
text1 = "The Republican president is being challenged by Democratic Party nominee Joe Biden,"

In [13]:
# Run the pipeline on the full passage and show the text together with the
# type of the object that `nlp(...)` returns.
doc = nlp(text)
print(f"{doc} {type(doc)}")

The Republican president is being challenged by Democratic Party nominee Joe Biden, who is best known as Barack Obama’s vice-president but has been in US politics since the 1970s. As election day approaches, pollingcompanies will be trying to gauge the mood of the nation by asking voters which candidate they prefer. <class 'spacy.tokens.doc.Doc'>


In [16]:
# Sentence tokenizer: parse the passage and collect the items yielded by
# `.sents` into a list so they can be printed and counted.
sentence = list(nlp(text).sents)

# show the sentences, then how many were found
print(sentence)
print(len(sentence))

[The Republican president is being challenged by Democratic Party nominee Joe Biden, who is best known as Barack Obama’s vice-president but has been in US politics since the 1970s., As election day approaches, pollingcompanies will be trying to gauge the mood of the nation by asking voters which candidate they prefer.]
2


In [21]:
# Word tokenizer: iterating over the parsed fragment yields one token at a time
word = nlp(text1)

# tokenize: print each token on its own line
for token in word:
    print(token.text)

The
Republican
president
is
being
challenged
by
Democratic
Party
nominee
Joe
Biden
,


In [24]:
# Inspect the English stop-word set shipped with spaCy and its size.
# The set is imported explicitly: reaching it through the attribute chain
# `spacy.lang.en.stop_words` only works if that submodule has already been
# imported, which a bare `import spacy` is not guaranteed to do.
from spacy.lang.en.stop_words import STOP_WORDS

stopwords = STOP_WORDS
print(stopwords, len(stopwords))

{'since', 'did', 'whither', 'other', 'of', 'above', 'such', 'out', 'nevertheless', 'more', 'front', 'who', 'too', 'hence', 'any', 'therein', 'been', 'per', 'how', 'yours', 'around', 'else', 'was', 'however', 'ever', 'side', 'beyond', 'really', 'may', 'quite', 'before', 'hers', 'among', 'name', 'show', 'becoming', 'besides', 'that', '‘d', 'please', 'almost', 'fifteen', 'whole', 'sometime', 'becomes', 'neither', 'upon', '‘s', 'no', 'once', 'made', 'part', 'had', 'whenever', 'every', 'should', 'few', 'myself', 'another', 'whereby', 'down', 'whereas', 'thus', 'why', "'ll", 'is', 'using', 'after', 'could', 'something', 'nor', 'to', 'over', 'though', 'still', 'everyone', 'somehow', '‘ll', '’ll', 'than', 'also', 'within', 'here', 'this', 'but', 'former', 'ten', 'rather', 'anyway', 'not', 'even', 'throughout', 'again', '‘re', 'and', 'under', 'seems', 'take', 'regarding', 'beside', 'most', 'when', 'sixty', 'his', 'her', 'now', 'become', 'afterwards', 'for', 'further', 'nothing', 'amongst', 'get

In [25]:
# Stop-word detection: list the tokens of the short fragment that spaCy
# flags as stop words (nothing is removed here, they are only printed).
stop = nlp(text1)

for token in stop:
    if not token.is_stop:
        continue  # skip content words
    print(token.text)

The
is
being
by


In [47]:
# Stop-word removal: print only the tokens that are NOT stop words.
# Equivalent list form: [token for token in stop if not token.is_stop]
for token in stop:
    if token.is_stop:
        continue  # drop stop words
    print(token.text)

Republican
president
challenged
Democratic
Party
nominee
Joe
Biden
,


In [37]:
# Punctuation detection: print the tokens flagged as punctuation
stop = nlp(text1)

# walk the tokens, keeping only those whose is_punct flag is set
for token in filter(lambda tok: tok.is_punct, stop):
    print(token.text)

,


In [45]:
# Keep the text of every token that is neither a stop word nor punctuation,
# then show the list, its type, and the words re-joined with single spaces.
a = [tok.text for tok in stop if not (tok.is_stop or tok.is_punct)]
print(a, type(a))
print(" ".join(a))

['Republican', 'president', 'challenged', 'Democratic', 'Party', 'nominee', 'Joe', 'Biden'] <class 'list'>
Republican president challenged Democratic Party nominee Joe Biden


In [54]:
# Lemmatization: show each token next to its lemma (`token.lemma_`).
# Note: this is lemmatization, not stemming.
doc = nlp(text1)

for token in doc:
    print(token.text, "-->", token.lemma_)

The --> the
Republican --> republican
president --> president
is --> be
being --> be
challenged --> challenge
by --> by
Democratic --> Democratic
Party --> Party
nominee --> nominee
Joe --> Joe
Biden --> Biden
, --> ,


In [56]:
# Tag: show each token next to its `tag_` label
for token in doc:
    print("%s --> %s" % (token, token.tag_))

The --> DT
Republican --> JJ
president --> NN
is --> VBZ
being --> VBG
challenged --> VBN
by --> IN
Democratic --> NNP
Party --> NNP
nominee --> NN
Joe --> NNP
Biden --> NNP
, --> ,


In [57]:
# POS: show each token next to its `pos_` label
for token in doc:
    print("{} --> {}".format(token, token.pos_))

The --> DET
Republican --> ADJ
president --> NOUN
is --> AUX
being --> AUX
challenged --> VERB
by --> ADP
Democratic --> PROPN
Party --> PROPN
nominee --> NOUN
Joe --> PROPN
Biden --> PROPN
, --> PUNCT


In [58]:
# Token, its `pos_` label, and a human-readable gloss from spacy.explain().
# Note that the gloss is looked up for `tag_`, not for `pos_`.
for token in doc:
    print(token.text, "-->", token.pos_, spacy.explain(token.tag_))

The --> DET determiner
Republican --> ADJ adjective
president --> NOUN noun, singular or mass
is --> AUX verb, 3rd person singular present
being --> AUX verb, gerund or present participle
challenged --> VERB verb, past participle
by --> ADP conjunction, subordinating or preposition
Democratic --> PROPN noun, proper singular
Party --> PROPN noun, proper singular
nominee --> NOUN noun, singular or mass
Joe --> PROPN noun, proper singular
Biden --> PROPN noun, proper singular
, --> PUNCT punctuation mark, comma


In [59]:
# Dependency-parse visualization: draw `doc` inline with displaCy's 'dep' style
from spacy import displacy

displacy.render(
    doc,
    style="dep",
    jupyter=True,
    options={"distance": 100},
)

In [63]:
# NER: parse the full passage and list each entity in `doc.ents` with its label
doc = nlp(text)

for ent in doc.ents:
    print(ent.text, "-->", ent.label_)

Republican --> NORP
Democratic Party --> ORG
Joe Biden --> PERSON
Barack Obama’s --> PERSON
US --> GPE
the 1970s --> DATE
election day --> DATE


In [64]:
# Named-entity visualization: highlight the entity spans of `doc` inline.
# The `distance` option was dropped here: it is a setting of the dependency
# ('dep') visualizer and is not among the options of the 'ent' style.
displacy.render(doc, style='ent', jupyter=True)

In [65]:
!python -m spacy download en_core_web_md

Collecting en-core-web-md==3.0.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.0.0/en_core_web_md-3.0.0-py3-none-any.whl (47.1 MB)
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-3.0.0
[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_md')


In [66]:
# Rebind `nlp` to the medium English pipeline; every cell below uses this one
nlp = spacy.load(name="en_core_web_md")

In [68]:
# Check, token by token, whether a word vector is available (`has_vector`)
doc = nlp(text1)

for token in doc:
    print(token.text, "-->", token.has_vector)

The --> True
Republican --> True
president --> True
is --> True
being --> True
challenged --> True
by --> True
Democratic --> True
Party --> True
nominee --> True
Joe --> True
Biden --> True
, --> True


In [69]:
# Print the vector norm of each token; the last two inputs are made-up words
doc = nlp('hello world asabd sgjiekg')

for token in doc:
    print(token.text, "-->", token.vector_norm)

hello --> 5.586428165435791
world --> 5.974550724029541
asabd --> 0.0
sgjiekg --> 0.0


In [73]:
# Similarity: score how alike two single-word documents are
doc1, doc2 = nlp('excellent'), nlp('good')
score = doc1.similarity(doc2)
print(score)

0.7774078298027361


In [53]:
# List every attribute and method available on a Token.
# A bare `dir(doc)` used to open this cell, but a notebook only displays the
# value of a cell's last expression, so that result was silently discarded;
# the no-op statement has been removed.
doc = nlp('hello')
for token in doc:
    print(dir(token))

['_', '__bytes__', '__class__', '__delattr__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__len__', '__lt__', '__ne__', '__new__', '__pyx_vtable__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__unicode__', 'ancestors', 'check_flag', 'children', 'cluster', 'conjuncts', 'dep', 'dep_', 'doc', 'ent_id', 'ent_id_', 'ent_iob', 'ent_iob_', 'ent_kb_id', 'ent_kb_id_', 'ent_type', 'ent_type_', 'get_extension', 'has_dep', 'has_extension', 'has_head', 'has_morph', 'has_vector', 'head', 'i', 'idx', 'iob_strings', 'is_alpha', 'is_ancestor', 'is_ascii', 'is_bracket', 'is_currency', 'is_digit', 'is_left_punct', 'is_lower', 'is_oov', 'is_punct', 'is_quote', 'is_right_punct', 'is_sent_end', 'is_sent_start', 'is_space', 'is_stop', 'is_title', 'is_upper', 'lang', 'lang_', 'left_edge', 'lefts', 'lemma', 'lemma_', 'lex', 'lex_id', 'like_email', 'like

In [None]:
# Boolean lexical flags exposed on a Token (names as listed by dir(token)).
# Written as bare names these lines raise NameError when the cell is run, and
# `is_uppercase` is not a Token attribute -- the flag is called `is_upper`.
# Here they are looked up on each token of the current `doc` instead.
token_flags = ("is_ascii", "is_digit", "is_lower", "is_upper", "is_alpha", "like_email", "like_url")
for token in doc:
    print(token.text, {flag: getattr(token, flag) for flag in token_flags})