# spaCy

----

edited by: Line Abele

last update: Feb 29, 2024

----

#### Install spaCy

In [1]:
%%bash
pip install spacy



#### Download models or corpora

In [2]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.8/12.8 MB 50.6 MB/s eta 0:00:00
✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')
⚠ Restart to reload dependencies
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


#### Define the model

In [3]:
import spacy
# Load the `en_core_web_sm` pipeline downloaded above. `nlp` is a notebook-wide
# name: later cells call it to turn a string into a processed document.
nlp = spacy.load("en_core_web_sm")

#### Process the texts

In [4]:
# Reload the same pipeline with the 'ner' component disabled; this replaces
# the `nlp` object created in the previous cell.
nlp = spacy.load("en_core_web_sm", disable=['ner'])


def normalize(text, remove_stopwords, remove_punctuation, pipeline=None):
    """Lower-case ``text``, run it through a spaCy pipeline and return its lemmas.

    Args:
        text: Raw input string.
        remove_stopwords: When truthy, tokens whose ``is_stop`` flag is set
            are dropped.
        remove_punctuation: When truthy, tokens whose ``is_punct`` flag is set
            are dropped.
        pipeline: Callable that turns a string into an iterable of tokens
            (e.g. a loaded spaCy ``Language`` object). Defaults to the
            notebook-wide ``nlp``. Pass it explicitly to stay independent of
            later cells that rebind ``nlp`` to a model for another language.

    Returns:
        list[str]: Whitespace-stripped lemmas in token order. Lemmas that are
        empty after stripping are omitted.
    """
    if pipeline is None:
        # Resolved at call time, so existing three-argument calls behave
        # exactly as before.
        pipeline = nlp

    doc = pipeline(text.lower())
    lemmatized = []
    for token in doc:
        if remove_stopwords and token.is_stop:
            continue
        if remove_punctuation and token.is_punct:
            continue
        lemma = token.lemma_.strip()
        if lemma:
            lemmatized.append(lemma)
    return lemmatized



In [5]:
# Sample tweet with a retweet marker, a user mention, an emoticon and a hashtag.
tweet = "RT @lOR42wsOEFcv3f: I fall too fast, crash too hard, forgive too easily and care too much... :( #amiright"
# Lemmatize it with both stop words and punctuation filtered out.
normalize(tweet, remove_stopwords=True, remove_punctuation=True)

['rt',
 '@lor42wsoefcv3f',
 'fall',
 'fast',
 'crash',
 'hard',
 'forgive',
 'easily',
 'care',
 'amiright']

In [6]:
# Process the raw (not lower-cased) tweet and print, for every token, its text,
# lemma, pos_ and tag_ labels, dependency label, shape, and the is_alpha /
# is_stop flags, space-separated on one line.
text = nlp(tweet)
for token in text:
    annotations = (
        token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
        token.shape_, token.is_alpha, token.is_stop,
    )
    print(*annotations)

RT RT PROPN NNP compound XX True False
@lOR42wsOEFcv3f @lOR42wsOEFcv3f PROPN NNP ROOT @xXXddxxXXXxxdx False False
: : PUNCT : punct : False False
I I PRON PRP nsubj X True True
fall fall VERB VBP ccomp xxxx True False
too too ADV RB advmod xxx True True
fast fast ADV RB advmod xxxx True False
, , PUNCT , punct , False False
crash crash VERB VBP conj xxxx True False
too too ADV RB advmod xxx True True
hard hard ADV RB advmod xxxx True False
, , PUNCT , punct , False False
forgive forgive VERB VB ROOT xxxx True False
too too ADV RB advmod xxx True True
easily easily ADV RB advmod xxxx True False
and and CCONJ CC cc xxx True True
care care VERB VB conj xxxx True False
too too ADV RB advmod xxx True True
much much ADV RB advmod xxxx True True
... ... PUNCT : punct ... False False
:( :( PUNCT NFP punct :( False False
# # NOUN NNS nsubj # False False
amiright amiright NOUN NN ROOT xxxx True False


In [7]:
from spacy.matcher import Matcher


# A hashtag is modelled as the literal '#' token immediately followed by one
# token made up of ASCII characters.
hashtag_pattern = [{'ORTH': '#'}, {'IS_ASCII': True}]

matcher = Matcher(nlp.vocab)
matcher.add('HASHTAG', [hashtag_pattern])

doc = nlp('This is a #sentence. Here is another #hashtag. #The #End.')
matches = matcher(doc)
# Every match is a (match_id, start, end) triple; slicing the doc with the
# token offsets yields the matched span.
hashtags = [doc[start:end] for _, start, end in matches]

# Show how the text was tokenized: '#' is split off as its own token.
print([t.text for t in doc])

['This', 'is', 'a', '#', 'sentence', '.', 'Here', 'is', 'another', '#', 'hashtag', '.', '#', 'The', '#', 'End', '.']


In [None]:
# IPython introspection: a trailing `??` displays the object's documentation
# (and its source, when available). Interactive aid only; produces no result.
Matcher??

In [8]:
# Print each hashtag span collected by the matcher cell, one per line.
for span in hashtags:
    print(span)

#sentence
#hashtag
#The
#End


### Romanian example

In [9]:
!python -m spacy download ro_core_news_sm

Collecting ro-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/ro_core_news_sm-3.7.0/ro_core_news_sm-3.7.0-py3-none-any.whl (12.9 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.9/12.9 MB 29.2 MB/s eta 0:00:00
Installing collected packages: ro-core-news-sm
Successfully installed ro-core-news-sm-3.7.0
✔ Download and installation successful
You can now load the package via spacy.load('ro_core_news_sm')
⚠ Restart to reload dependencies
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [10]:
from spacy.lang.ro.examples import sentences

# NOTE: this rebinds the notebook-wide `nlp` to the Romanian pipeline. Any
# earlier cell that is re-run after this one (including `normalize`, which
# calls the global `nlp`) will use the Romanian model.
nlp = spacy.load("ro_core_news_sm")
# Process the first example sentence imported from spacy.lang.ro.examples.
doc = nlp(sentences[0])
print(doc.text)
# One line per token: surface text, pos_ label, dependency label.
for token in doc:
    print(token.text, token.pos_, token.dep_)

Apple plănuiește să cumpere o companie britanică pentru un miliard de dolari
Apple PROPN nsubj
plănuiește AUX ROOT
să PART mark
cumpere AUX ccomp
o DET det
companie NOUN obj
britanică ADJ amod
pentru ADP case
un DET det
miliard NUM obl
de ADP case
dolari NOUN nmod


In [11]:
# Process a free-form Romanian text with the currently loaded pipeline and
# print each token's text, lemma, pos_ label and dependency label.
doc = nlp("Aceeași problemă în aceleași vremuri... pisici! mai facem și abrogare? Din cauza activității și birourilor care se accesează")
print(doc.text)
for token in doc:
    row = (token.text, token.lemma_, token.pos_, token.dep_)
    print(*row)

Aceeași problemă în aceleași vremuri... pisici! mai facem și abrogare? Din cauza activității și birourilor care se accesează
Aceeași Aceeași DET det
problemă problemă NOUN ROOT
în în ADP case
aceleași același DET det
vremuri vremuri NOUN obl
... ... PUNCT punct
pisici pisică ADJ amod
! ! PUNCT punct
mai mai ADV advmod
facem face VERB ROOT
și și CCONJ cc
abrogare abrogar NOUN conj
? ? PUNCT punct
Din din ADP case
cauza cauză NOUN fixed
activității activitate NOUN ROOT
și și CCONJ cc
birourilor birou NOUN conj
care care PRON nsubj:pass
se sine PRON expl:pass
accesează accesa VERB acl


### German example

In [12]:
!python -m spacy download de_core_news_sm

Collecting de-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.7.0/de_core_news_sm-3.7.0-py3-none-any.whl (14.6 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 14.6/14.6 MB 71.5 MB/s eta 0:00:00
Installing collected packages: de-core-news-sm
Successfully installed de-core-news-sm-3.7.0
✔ Download and installation successful
You can now load the package via spacy.load('de_core_news_sm')
⚠ Restart to reload dependencies
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [13]:
from spacy.lang.de.examples import sentences

# NOTE: this rebinds the notebook-wide `nlp` to the German pipeline. Any
# earlier cell that is re-run after this one (including `normalize`, which
# calls the global `nlp`) will use the German model.
nlp = spacy.load("de_core_news_sm")
# Process the first example sentence imported from spacy.lang.de.examples.
doc = nlp(sentences[0])
print(doc.text)
# One line per token: surface text, pos_ label, dependency label.
for token in doc:
    print(token.text, token.pos_, token.dep_)

Die ganze Stadt ist ein Startup: Shenzhen ist das Silicon Valley für Hardware-Firmen
Die DET nk
ganze ADJ nk
Stadt NOUN sb
ist AUX ROOT
ein DET nk
Startup NOUN pd
: PUNCT punct
Shenzhen NOUN sb
ist AUX cj
das DET nk
Silicon PROPN pnc
Valley PROPN sb
für ADP mnr
Hardware-Firmen NOUN nk


In [15]:
# Process a free-form German text with the currently loaded pipeline and
# print each token's text, lemma, pos_ label and dependency label.
doc = nlp("Das gleiche Problem zur gleichen Zeit... Katzen! tun wir auch aufheben? Wegen der Aktivität und der Ämter, die auf die")
print(doc.text)
for token in doc:
    row = (token.text, token.lemma_, token.pos_, token.dep_)
    print(*row)

Das gleiche Problem zur gleichen Zeit... Katzen! tun wir auch aufheben? Wegen der Aktivität und der Ämter, die auf die
Das der DET nk
gleiche gleich ADJ nk
Problem Problem NOUN ROOT
zur zu ADP mnr
gleichen gleich ADJ nk
Zeit Zeit NOUN nk
... -- PUNCT punct
Katzen Katze NOUN da
! -- PUNCT punct
tun tun VERB ROOT
wir wir PRON sb
auch auch ADV mo
aufheben aufheben VERB oc
? -- PUNCT punct
Wegen wegen ADP mo
der der DET nk
Aktivität Aktivität NOUN nk
und und CCONJ cd
der der DET nk
Ämter Amt NOUN cj
, -- PUNCT punct
die der PRON ROOT
auf auf ADP mnr
die der DET nk
