In [1]:
# Required modules: spacy and textacy
# Also run "python -m spacy download en_core_web_lg" to fetch the model
import spacy

In [2]:
# Load the English NLP model
nlp = spacy.load('en_core_web_lg')

# Text to analyze
text = """London is the capital and most populous city of England and 
the United Kingdom.  Standing on the River Thames in the south east 
of the island of Great Britain, London has been a major settlement 
for two millennia. It was founded by the Romans, who named it Londinium. 
"""

# Parse the text with spaCy. This single call runs the whole processing pipeline
doc = nlp(text)

In [3]:
# The 'doc' variable now holds the processed version of the text,
# and we can do whatever we like with it.
# For example, print every named entity that was detected, as "text (LABEL)"
for entity in doc.ents:
    print(f"{entity.text} ({entity.label_})")

London (GPE)
England (GPE)
the United Kingdom (GPE)
the River Thames (FAC)
Great Britain (GPE)
London (GPE)
two millennia (DATE)
Romans (NORP)
Londinium (PERSON)


# ТИП	        ОПИСАНИЕ
PERSON	    - Люди, в том числе вымышленные.

NORP	    - Национальности или религиозные или политические группы.

FAC	        - Здания, аэропорты, автомагистрали, мосты и т. д.

ORG	        - Компании, агентства, учреждения и т. д.

GPE	        - Страны, города, штаты.

LOC	        - Локации, не относящиеся к GPE: горные хребты, водоёмы.

PRODUCT 	- Предметы, транспортные средства, продукты питания и т. д. (но не услуги).

EVENT	    - Именованные ураганы, сражения, войны, спортивные мероприятия и т. д.

WORK_OF_ART	- Названия книг, песен и т. д.

LAW	        - Именованные документы, ставшие законами.

LANGUAGE	- Любой названный язык.

DATE	    - Абсолютные или относительные даты или периоды.

TIME	    - Промежутки времени короче суток.

PERCENT	    - Процент, в том числе "%".

MONEY	    - Денежные суммы, включая единицу измерения.

QUANTITY	- Измерения по весу или расстоянию.

ORDINAL	    - «первый», «второй» и т. д.

CARDINAL	- Числа, не подпадающие под другой тип.

<h3>Real Example</h3>

In [11]:
# If the token is a person's name, replace it with the "[REDACTED]" placeholder
def replace_name_with_placeholder(token):
    """Return the text to emit for ``token``, hiding person names.

    Args:
        token: A spaCy token (or any object exposing ``ent_iob``,
            ``ent_type_``, ``whitespace_`` and ``text_with_ws``).

    Returns:
        ``"[REDACTED]"`` followed by the token's own trailing whitespace when
        the token is tagged as part of a ``PERSON`` entity; otherwise the
        token's original text including its trailing whitespace.
    """
    if token.ent_iob != 0 and token.ent_type_ == "PERSON":
        # Reuse the token's own trailing whitespace instead of a hard-coded
        # space, so a name followed by punctuation ("Turing,") does not turn
        # into "[REDACTED] ,".
        return "[REDACTED]" + token.whitespace_
    # Token.string was removed in spaCy v3; text_with_ws is the supported
    # attribute carrying the same value (text plus trailing whitespace).
    return token.text_with_ws
 
# Walk over all entities and redact person names
def scrub(text):
    """Return ``text`` with every detected person name replaced by a placeholder.

    The text is parsed with the module-level ``nlp`` pipeline, each named
    entity is merged into a single token, and every token is then passed
    through ``replace_name_with_placeholder``.

    Args:
        text: The raw string to scrub.

    Returns:
        The re-assembled string, with person-name tokens replaced.
    """
    doc = nlp(text)
    # Span.merge() was removed in spaCy v3; the retokenizer context manager is
    # the supported way to collapse each entity span into one token.
    with doc.retokenize() as retokenizer:
        for ent in doc.ents:
            retokenizer.merge(ent)
    return "".join(replace_name_with_placeholder(token) for token in doc)
 
# Longer sample passage; defined here but not passed to scrub() in this cell
s = """
In 1950, Alan Turing published his famous article "Computing Machinery and Intelligence". In 1957, Noam Chomsky’s 
Syntactic Structures revolutionized Linguistics with 'universal grammar', a rule based system of syntactic structures.
"""

# Short sample sentence that is scrubbed below
ss = """
Nina told to Nikita that he need to clean his computer room
"""
# Print the sentence after running it through scrub()
print(scrub(ss))

[REDACTED] told to [REDACTED] that he need to clean his computer room



In [12]:
import textacy.extract
 
# Load the English NLP model
nlp = spacy.load('en_core_web_lg')

# Text to analyze
text = """London is the capital and most populous city of England and  the United Kingdom.  
Standing on the River Thames in the south east of the island of Great Britain, 
London has been a major settlement  for two millennia.  It was founded by the Romans, 
who named it Londinium.
"""

# Run the pipeline
doc = nlp(text)

# Extract semi-structured statements about London.
# entity/cue are passed by keyword: newer textacy releases make them
# keyword-only (a positional "London" raises TypeError there), and cue="be"
# spells out the default that older releases applied implicitly.
statements = textacy.extract.semistructured_statements(doc, entity="London", cue="be")

# Print the results
print("Here are the things I know about London:")

for statement in statements:
    subject, verb, fact = statement
    # Newer textacy yields the fragment as a list of tokens rather than a
    # Span; join it back into text so the output reads the same either way.
    if isinstance(fact, (list, tuple)):
        fact = "".join(token.text_with_ws for token in fact)
    print(f" - {fact}")

Here are the things I know about London:
 - the capital and most populous city of England and  the United Kingdom.  

 - a major settlement  for two millennia.  


In [14]:
import spacy
import textacy.extract
 
# Load the English NLP model
nlp = spacy.load('en_core_web_lg')
 
# Text to analyze (a multi-paragraph passage about London, footnote markers included)
text = """London (/ˈlʌndən/ (About this soundlisten) LUN-dən) is the capital and largest city of both England and the United Kingdom.[9][10] Standing on the River Thames in the south-east of England, at the head of its 50-mile (80 km) estuary leading to the North Sea, London has been a major settlement for two millennia. Londinium was founded by the Romans.[11] The City of London, London's ancient core − an area of just 1.12 square miles (2.9 km2) and colloquially known as the Square Mile − retains boundaries that follow closely its medieval limits.[12][13][14][15][16][note 1] The City of Westminster is also an Inner London borough holding city status. Greater London is governed by the Mayor of London and the London Assembly.[17][note 2][18]

London is considered to be one of the world's most important global cities[19][20][21] and has been termed the world's most powerful,[22] most desirable,[23] most influential,[24] most visited,[25] most expensive,[26][27] innovative,[28] sustainable,[29] most investment friendly,[30] most popular for work,[31] and the most vegetarian friendly[32] city in the world. London exerts a considerable impact upon the arts, commerce, education, entertainment, fashion, finance, healthcare, media, professional services, research and development, tourism and transportation.[33][34] London ranks 26 out of 300 major cities for economic performance.[35] It is one of the largest financial centres[36] and has either the fifth or sixth largest metropolitan area GDP.[note 3][37][38][39][40][41] It is the most-visited city as measured by international arrivals[42] and has the busiest city airport system as measured by passenger traffic.[43] It is the leading investment destination,[44][45][46][47] hosting more international retailers[48][49] and ultra high-net-worth individuals[50][51] than any other city. London's universities form the largest concentration of higher education institutes in Europe.[52] In 2012, London became the first city to have hosted three modern Summer Olympic Games.[53]

London has a diverse range of people and cultures, and more than 300 languages are spoken in the region.[54] Its estimated mid-2016 municipal population (corresponding to Greater London) was 8,787,892,[4] the most populous of any city in the European Union[55] and accounting for 13.4% of the UK population.[56] London's urban area is the second most populous in the EU, after Paris, with 9,787,426 inhabitants at the 2011 census.[57] The population within the London commuter belt is the most populous in the EU with 14,040,163 inhabitants in 2016.[note 4][3][58] London was the world's most populous city from c. 1831 to 1925.[59]

London contains four World Heritage Sites: the Tower of London; Kew Gardens; the site comprising the Palace of Westminster, Westminster Abbey, and St Margaret's Church; and the historic settlement in Greenwich where the Royal Observatory, Greenwich defines the Prime Meridian, 0° longitude, and Greenwich Mean Time.[60] Other landmarks include Buckingham Palace, the London Eye, Piccadilly Circus, St Paul's Cathedral, Tower Bridge, Trafalgar Square and The Shard. London has numerous museums, galleries, libraries and sporting events. These include the British Museum, National Gallery, Natural History Museum, Tate Modern, British Library and West End theatres.[61] The London Underground is the oldest underground railway network in the world."""
 
# Run the pipeline over the text
doc = nlp(text)
 
# Pull out the noun chunks; min_freq=3 is forwarded to textacy as the
# minimum-frequency filter
frequent_chunks = textacy.extract.noun_chunks(doc, min_freq=3)

# Convert each chunk to a lowercase string, collapsing duplicates
noun_chunks = {str(chunk).lower() for chunk in frequent_chunks}

# Print every chunk that consists of two or more words
for noun_chunk in noun_chunks:
    if " " in noun_chunk:
        print(noun_chunk)