In [26]:
import spacy
import tqdm as tqdm

In [27]:
# Load the small English spaCy pipeline.
nlp = spacy.load("en_core_web_sm")
text = "Mary, don't slap the green witch"
# Lowercase the sentence first, then let spaCy split it into tokens.
lowered_doc = nlp(text.lower())
print(list(map(str, lowered_doc)))

['mary', ',', 'do', "n't", 'slap', 'the', 'green', 'witch']


In [28]:
from nltk.tokenize import TweetTokenizer

In [29]:
# A tweet-like string containing a hashtag, a mention and an emoticon.
tweet = u"Snow White and Seven Degrees #MakeAMovieCold@midnight:-)"
tokenizer = TweetTokenizer()
# Lowercase before tokenizing, then show the resulting tokens.
tweet_tokens = tokenizer.tokenize(tweet.lower())
print(tweet_tokens)

['snow', 'white', 'and', 'seven', 'degrees', '#makeamoviecold', '@midnight', ':-)']


#### Генерация n-грамм на основе текста

In [30]:
def _default_nlp():
    """Return the spaCy ``en_core_web_sm`` pipeline, loading it only once."""
    # Cache the pipeline on the function object so repeated n_grams() calls
    # do not re-read the model from disk every time.
    if not hasattr(_default_nlp, "pipeline"):
        _default_nlp.pipeline = spacy.load("en_core_web_sm")
    return _default_nlp.pipeline


def n_grams(text, n, tokenizer=None):
    """Split ``text`` into lowercase tokens and return every run of ``n`` tokens.

    Args:
        text: The input string. It is lowercased before tokenization.
        n: Window size (number of tokens per n-gram). Must be at least 1.
        tokenizer: Optional callable that takes a string and returns an
            iterable of tokens. When omitted, the spaCy ``en_core_web_sm``
            pipeline is used (loaded once and reused across calls).

    Returns:
        A list of n-grams, each a list of ``n`` token strings, in the order
        they occur in the text. The list is empty when the text has fewer
        than ``n`` tokens.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n!r}")
    if tokenizer is None:
        tokenizer = _default_nlp()
    tokens = [str(token) for token in tokenizer(text.lower())]
    # A sliding window of width n has len(tokens) - n + 1 start positions;
    # range() is empty when that count is not positive.
    return [tokens[start:start + n] for start in range(len(tokens) - n + 1)]

In [31]:
# Build every 3-token window (trigram) over the example sentence.
text = "Mary, don't slap the green witch"
trigrams = n_grams(text, n=3)
print(trigrams)

[['mary', ',', 'do'], [',', 'do', "n't"], ['do', "n't", 'slap'], ["n't", 'slap', 'the'], ['slap', 'the', 'green'], ['the', 'green', 'witch']]


#### Лемматизация

In [32]:
# Reuses the `nlp` pipeline loaded in the first tokenization cell instead of
# reloading the model from disk.
doc = nlp("he was running late")
# Print each token next to its lemma (dictionary form).
for token in doc:
    print(f"{token} -> {token.lemma_}")

he -> he
was -> be
running -> run
late -> late


#### Маркирование частей речи

In [33]:
# Reuses the `nlp` pipeline loaded in the first tokenization cell instead of
# reloading the model from disk.
# NOTE(review): "slaped" looks like a typo for "slapped"; left unchanged so
# the recorded cell output still matches.
doc = nlp("Mary slaped the green witch.")
# Print each token next to its part-of-speech tag.
for token in doc:
    print(f"{token} -> {token.pos_}")

Mary -> PROPN
slaped -> VERB
the -> DET
green -> ADJ
witch -> NOUN
. -> PUNCT


#### Chunking (noun phrase extraction)

In [34]:
# Reuses the `nlp` pipeline loaded in the first tokenization cell instead of
# reloading the model from disk.
# NOTE(review): "slaped" looks like a typo for "slapped"; left unchanged so
# the recorded cell output still matches.
doc = nlp("Mary slaped the green witch.")
# Print each noun chunk found in the sentence together with its label.
for chunk in doc.noun_chunks:
    print(f"{chunk} -> {chunk.label_}")

Mary -> NP
the green witch -> NP


#### Named entity recognition

In [46]:
# Reuses the `nlp` pipeline loaded in the first tokenization cell instead of
# reloading the model from disk.
# NOTE(review): "slaped" looks like a typo for "slapped"; left unchanged so
# the recorded cell output still matches.
doc = nlp("Mary slaped the green witch Kate in Alaska.")
# Print each detected entity span together with its entity label.
for ent in doc.ents:
    print(f"{ent} -> {ent.label_}")

Mary -> PERSON
Kate -> PERSON
Alaska -> GPE
