In [1]:
import spacy
from spacy.lang.en import English
# Загрузка английской NLP-модели
nlp = spacy.load('en_core_web_sm')

ModuleNotFoundError: No module named 'spacy'

In [26]:
text = """
London is the capital and most populous city of England and 
the United Kingdom.  Standing on the River Thames in the south east 
of the island of Great Britain, London has been a major settlement 
for two millennia. It was founded by the Romans, who named it Londinium.
"""

In [27]:
# Парсинг текста с помощью spaCy. Эта команда запускает целый конвейер
doc = nlp(text)

In [28]:
# например, распечатать все обнаруженные именованные сущности
for entity in doc.ents:
    print(f"{entity.text} ({entity.label_})")

London (GPE)
England (GPE)
the United Kingdom (GPE)
south east (LOC)
Great Britain (GPE)
London (GPE)
two millennia (DATE)
Romans (NORP)
Londinium (ORG)


In [29]:
# Если токен является именем, заменяем его словом "REDACTED" 
def replace_name_with_placeholder(token):
    if token.ent_iob != 0 and token.ent_type_ == "PERSON":
        return "[REDACTED] "
    else:
        return token.text

# Проверка всех сущностей
# Loop through all the entities in a document and check if they are names
def scrub(text):
    doc = nlp(text)
    with doc.retokenize() as retokenizer:
        for ent in doc.ents:
            retokenizer.merge(ent)
    tokens = map(replace_name_with_placeholder, doc)
    return "".join(tokens)

s = """
In 1950, Alan Turing published his famous article "Computing Machinery and Intelligence". 
In 1957, Noam Chomsky’s 
 Syntactic Structures revolutionized Linguistics with 'universal grammar', a rule based system of 
 syntactic structures.
 """

print(scrub(s))


In1950,[REDACTED] publishedhisfamousarticle"Computing MachineryandIntelligence".
In1957,[REDACTED] 
 SyntacticStructuresrevolutionized[REDACTED] with'universalgrammar',arulebasedsystemof
 syntacticstructures.
 


In [30]:
from spacy.lang.ru import Russian
nlp = Russian()
doc = nlp("Съешь ещё этих мягких французских булок, да выпей чаю.")

In [31]:
token = doc[0]
print(token.text)

span = doc[3:6]
print(span.text)




Съешь
мягких французских булок


In [32]:
print("is_alpha:    ", [token.is_alpha for token in doc])
print("is_punct:    ", [token.is_punct for token in doc])
print("like_num:    ", [token.like_num for token in doc])

is_alpha:     [True, True, True, True, True, True, False, True, True, True, False]
is_punct:     [False, False, False, False, False, False, True, False, False, False, True]
like_num:     [False, False, False, False, False, False, False, False, False, False, False]


In [33]:
for token in doc:
    if token.i+1 < len(doc):
        next_token = doc[token.i+1]
        if next_token.text == ".":
            print(token.text)

чаю


In [34]:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("New Apple MacBook set launch tomorrow")

for token in doc:
    token_text = token.text
    token_pos = token.pos_
    token_dep = token.dep_
    token_head = token.head.text
    print(f"{token_text:<12}{token_pos:<10}"
          f"{token_dep:<10}{token_head:<12}")

New         PROPN     compound  MacBook     
Apple       PROPN     compound  MacBook     
MacBook     PROPN     nsubj     set         
set         VERB      ROOT      set         
launch      NOUN      dobj      set         
tomorrow    NOUN      npadvmod  set         


In [35]:
from spacy import displacy
displacy.render(doc, style='dep', jupyter=True)

In [36]:
print(spacy.explain("aux"))
print(spacy.explain("PROPN"))

auxiliary
proper noun


In [37]:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("I saw a movie yesterday")
print(' '.join([token.lemma_ for token in doc]))

'-PRON- see a movie yesterday'

I see a movie yesterday


'-PRON- see a movie yesterday'

In [38]:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup for 1$ billion")
for ent in doc.ents:
    print(ent.text, ent.label_)


Apple ORG
U.K. GPE
1$ billion MONEY


In [39]:
print(spacy.explain("GPE"))

Countries, cities, states


In [40]:
nlp = spacy.load("en_core_web_sm")
doc1 = nlp("I like burgers")
doc2 = nlp("I like pizza")
print(doc1.similarity(doc2))

0.8278116583824158


  print(doc1.similarity(doc2))


In [41]:
# отсюда сама лекция

In [42]:
import spacy

# Загружаем языковую модель
nlp = spacy.load("en_core_web_sm")

# Входной текст
text = "spaCy is an amazing tool for natural language processing."

# Применяем токенизацию
doc = nlp(text)

# Выводим токены (слова и пунктуацию) из текста
for token in doc:
    print(token.text)

spaCy
is
an
amazing
tool
for
natural
language
processing
.


In [43]:
import spacy

# Загружаем языковую модель
nlp = spacy.load("en_core_web_sm")

# Входной текст с несколькими предложениями
text = "SpaCy is fast. It's also efficient."

# Применяем разбиение на предложения
doc = nlp(text)

# Выводим предложения из текста
for sentence in doc.sents:
    print(sentence.text)

SpaCy is fast.
It's also efficient.


In [44]:
import spacy

nlp = spacy.load("en_core_web_sm")

text = "running dogs are happily barking"

doc = nlp(text)

for token in doc:
    print(token.text, token.lemma_)

running run
dogs dog
are be
happily happily
barking bark
