In [1]:
#!pip install -U -q --verbose spacy
#import spacy

In [2]:
# Import the English language class
import spacy
from spacy.lang.en import English


# Create the nlp object, which
#   * contains the processing pipeline
#   * includes language-specific rules for tokenization etc.
nlp = English()

# ** The Doc object

# Created by processing a string of text with the nlp object
#doc = nlp("Hello world!")
doc = nlp("I like tree kangaroos and narwhals.")

# ** The Token object
# Iterate over tokens in a Doc
for token in doc:
    print(token.text)

I
like
tree
kangaroos
and
narwhals
.


In [3]:
# Index into the Doc to get a single Token
token = doc[0]

# Get the token text via the .text attribute
print(token.text)

I


In [4]:
# ** The Span object

# A slice from the Doc is a Span object.  It's only a view of the Doc and doesn't contain any data itself.
span = doc[1:4]

# Get the span text via the .text attribute
print(span.text)

like tree kangaroos


In [4]:
!python -m spacy download nl_core_news_sm

2022-11-08 14:00:24.049200: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /sw/arch/Debian10/EB_production/2021/software/Qhull/2020.2-GCCcore-10.3.0/lib:/sw/arch/Debian10/EB_production/2021/software/SciPy-bundle/2021.05-foss-2021a/lib/python3.9/site-packages/numpy/core/lib:/sw/arch/Debian10/EB_production/2021/software/PyTorch/1.10.0-foss-2021a-CUDA-11.3.1/lib/python3.9/site-packages/torch/lib:/sw/arch/Debian10/EB_production/2021/software/magma/2.6.1-foss-2021a-CUDA-11.3.1/lib:/sw/arch/Debian10/EB_production/2021/software/FFmpeg/4.3.2-GCCcore-10.3.0/lib:/sw/arch/Debian10/EB_production/2021/software/FriBidi/1.0.10-GCCcore-10.3.0/lib:/sw/arch/Debian10/EB_production/2021/software/x265/3.5-GCCcore-10.3.0/lib:/sw/arch/Debian10/EB_production/2021/software/LAME/3.100-GCCcore-10.3.0/lib:/sw/arch/Debian10/EB_production/2021/software/x2

Defaulting to user installation because normal site-packages is not writeable
Collecting nl-core-news-sm==3.4.0
  Downloading https://github.com/explosion/spacy-models/releases/download/nl_core_news_sm-3.4.0/nl_core_news_sm-3.4.0-py3-none-any.whl (12.8 MB)
[2K     [90m---------------------------------------[0m [32m12.8/12.8 MB[0m [31m123.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[38;5;2m[+] Download and installation successful[0m
You can now load the package via spacy.load('nl_core_news_sm')


In [5]:
# LEXICAL ATTRIBUTES

import spacy
nlp = spacy.load("en_core_web_sm")
#nlp = spacy.load("nl_core_news_sm")
#doc = nlp("I like tree kangaroos and narwhals.")
doc = nlp("De kat krapt de krullen van de trap")


print('Index:      ', [token.i for token in doc])
print('Text:       ', [token.text for token in doc])
print('like_num:   ', [token.like_num for token in doc])
print('base word:  ', [token.lemma_ for token in doc])
print('shape:      ', [token.shape_ for token in doc])
print('dependence: ', [token.dep_ for token in doc])
print('tag:        ', [token.tag_ for token in doc])
print('is_alpha:   ', [token.is_alpha for token in doc])
print('is_punct:   ', [token.is_punct for token in doc])
print('stop word:  ', [token.is_stop for token in doc])



Index:       [0, 1, 2, 3, 4, 5, 6, 7]
Text:        ['De', 'kat', 'krapt', 'de', 'krullen', 'van', 'de', 'trap']
like_num:    [False, False, False, False, False, False, False, False]
base word:   ['De', 'kat', 'krapt', 'de', 'krullen', 'van', 'de', 'trap']
shape:       ['Xx', 'xxx', 'xxxx', 'xx', 'xxxx', 'xxx', 'xx', 'xxxx']
dependence:  ['compound', 'compound', 'compound', 'nmod', 'compound', 'compound', 'compound', 'ROOT']
tag:         ['NNP', 'NNP', 'NNP', 'NNP', 'NNP', 'NNP', 'NNP', 'NNP']
is_alpha:    [True, True, True, True, True, True, True, True]
is_punct:    [False, False, False, False, False, False, False, False]
stop word:   [False, False, False, False, False, False, False, False]


In [6]:
import spacy
nlp = spacy.load("en_core_web_sm")
doc = nlp("I like tree kangaroos and narwhals.")

for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_, token.shape_, token.is_alpha, token.is_stop)

I I PRON PRP nsubj X True True
like like VERB VBP ROOT xxxx True False
tree tree NOUN NN compound xxxx True False
kangaroos kangaroo NOUN NNS dobj xxxx True False
and and CCONJ CC cc xxx True True
narwhals narwhal NOUN NNS conj xxxx True False
. . PUNCT . punct . False False
