In [1]:
import wikipedia

In [2]:
ny = wikipedia.page("New York City")

In [3]:
ny.content[:1000] # limit output just for jupyter's sake

"The City of New York, often called New York City or simply New York, is the most populous city in the United States. With an estimated 2016 population of 8,537,673 distributed over a land area of about 302.6 square miles (784 km2), New York City is also the most densely populated major city in the United States. Located at the southern tip of the state of New York, the city is the center of the New York metropolitan area, one of the most populous urban agglomerations in the world with an estimated 23.7 million residents as of 2016. A global power city, New York City has been described as the cultural, financial, and media capital of the world, and exerts a significant impact upon commerce, entertainment, research, technology, education, politics, and sports. The city's fast pace defines the term New York minute. Home to the headquarters of the United Nations, New York is an important center for international diplomacy.\nSituated on one of the world's largest natural harbors, New York 

In [4]:
import spacy

In [5]:
nlp = spacy.load('en')

In [6]:
nynlp = nlp(ny.content[:10000])

In [7]:
for chunk in nynlp.noun_chunks:
    print(chunk.text, "/", chunk.root.text, "/", chunk.root.dep_, "/", chunk.root.head.text)

The City / City / nsubj / is
New York / York / pobj / of
the most populous city / city / attr / is
the United States / States / pobj / in
an estimated 2016 population / population / pobj / With
a land area / area / pobj / over
about 302.6 square miles / miles / pobj / of
784 km2 / km2 / appos / miles
New York City / City / nsubj / is
the most densely populated major city / city / attr / is
the United States / States / pobj / in
the southern tip / tip / pobj / at
the state / state / pobj / of
New York / York / pobj / of
the city / city / nsubj / is
the center / center / attr / is
the New York metropolitan area / area / pobj / of
the most populous urban agglomerations / agglomerations / pobj / of
the world / world / pobj / in
an estimated 23.7 million residents / residents / pobj / with
A global power city / city / nsubjpass / described
New York City / City / appos / city
the cultural, financial, and media capital / capital / pobj / as
the world / world / pobj / of
a significant impact /

In [8]:
for ent in nynlp.ents:
    print(ent.text, "/", ent.start_char, "/", ent.end_char, "/", ent.label_)

The City of New York / 0 / 20 / GPE
New York City / 35 / 48 / GPE
New York / 59 / 67 / GPE
the United States / 98 / 115 / GPE
2016 / 135 / 139 / DATE
8,537,673 / 154 / 163 / CARDINAL
about 302.6 square miles / 196 / 220 / QUANTITY
784 / 222 / 225 / CARDINAL
New York City / 232 / 245 / GPE
the United States / 295 / 312 / GPE
New York / 358 / 366 / GPE
New York / 398 / 406 / GPE
one / 426 / 429 / CARDINAL
an estimated 23.7 million / 490 / 515 / CARDINAL
2016 / 532 / 536 / DATE
New York City / 559 / 572 / GPE
New York / 807 / 815 / GPE
the United Nations / 852 / 870 / ORG
New York / 872 / 880 / GPE

 / 932 / 933 / GPE
one / 945 / 948 / CARDINAL
New York City / 989 / 1002 / GPE
five / 1015 / 1019 / CARDINAL
New York State / 1068 / 1082 / GPE
five / 1088 / 1092 / CARDINAL
Brooklyn / 1104 / 1112 / GPE
Queens / 1114 / 1120 / GPE
Manhattan / 1122 / 1131 / GPE
Bronx / 1137 / 1142 / GPE
Staten Island – were / 1148 / 1168 / PERSON
1898 / 1204 / 1208 / DATE
the United States / 1301 / 1318 / GPE
as

In [9]:
for token in nlp(u'The Statue of Liberty greeted millions of immigrants who came to the Americas by ship in the late 19th and early 20th centuries and is a world symbol of the United States and its ideals of liberty and peace.'):
    print(token.text, "/", token.pos_, "/", token.tag_, "/", token.is_alpha, "/", token.is_stop)

The / DET / DT / True / False
Statue / PROPN / NNP / True / False
of / ADP / IN / True / True
Liberty / PROPN / NNP / True / False
greeted / VERB / VBD / True / False
millions / NOUN / NNS / True / False
of / ADP / IN / True / True
immigrants / NOUN / NNS / True / False
who / NOUN / WP / True / True
came / VERB / VBD / True / False
to / ADP / IN / True / True
the / DET / DT / True / True
Americas / PROPN / NNPS / True / False
by / ADP / IN / True / True
ship / NOUN / NN / True / False
in / ADP / IN / True / True
the / DET / DT / True / True
late / ADJ / JJ / True / False
19th / ADJ / JJ / False / False
and / CCONJ / CC / True / True
early / ADJ / JJ / True / False
20th / ADJ / JJ / False / False
centuries / NOUN / NNS / True / False
and / CCONJ / CC / True / True
is / VERB / VBZ / True / True
a / DET / DT / True / True
world / NOUN / NN / True / False
symbol / NOUN / NN / True / False
of / ADP / IN / True / True
the / DET / DT / True / True
United / PROPN / NNP / True / False
States / 

In [10]:
from spacy.matcher import Matcher

In [11]:
from spacy import displacy

In [12]:
matcher = Matcher(nlp.vocab)
matched_sents = [] # collect data of matched sentences to be visualized
matched_phrases = [] # just the matched phrases without the sentence

def collect_sents(matcher, doc, i, matches):
    match_id, start, end = matches[i]
    span = doc[start : end] # matched span
    sent = span.sent # sentence containing matched span
    # append mock entity for match in displaCy style to matched_sents
    # get the match span by ofsetting the start and end of the span with the
    # start and end of the sentence in the doc
    match_ents = [{'start': span.start_char - sent.start_char, 
                   'end': span.end_char - sent.start_char,
                   'label': 'MATCH'}]
    matched_sents.append({'text': sent.text, 'ents': match_ents})
    matched_phrases.append(span.text)

In [13]:
pattern = [{'POS': 'NOUN'}, {'POS': 'NOUN'}]

In [14]:
matcher.add('nounphrase', collect_sents, pattern) # add pattern

In [15]:
matches = matcher(nynlp)

In [16]:
matches

[(8429085606845659861, 36, 38),
 (8429085606845659861, 110, 112),
 (8429085606845659861, 126, 128),
 (8429085606845659861, 371, 373),
 (8429085606845659861, 482, 484),
 (8429085606845659861, 565, 567),
 (8429085606845659861, 618, 620),
 (8429085606845659861, 629, 631),
 (8429085606845659861, 696, 698),
 (8429085606845659861, 700, 702),
 (8429085606845659861, 714, 716),
 (8429085606845659861, 742, 744),
 (8429085606845659861, 765, 767),
 (8429085606845659861, 915, 917),
 (8429085606845659861, 928, 930),
 (8429085606845659861, 944, 946),
 (8429085606845659861, 960, 962),
 (8429085606845659861, 1152, 1154),
 (8429085606845659861, 1313, 1315),
 (8429085606845659861, 1492, 1494),
 (8429085606845659861, 1493, 1495),
 (8429085606845659861, 1601, 1603),
 (8429085606845659861, 1622, 1624),
 (8429085606845659861, 1664, 1666),
 (8429085606845659861, 1721, 1723),
 (8429085606845659861, 1813, 1815),
 (8429085606845659861, 1924, 1926)]

In [17]:
matched_sents

[{'ents': [{'end': 75, 'label': 'MATCH', 'start': 66}],
  'text': 'With an estimated 2016 population of 8,537,673 distributed over a land area of about 302.6 square miles (784 km2), New York City is also the most densely populated major city in the United States.'},
 {'ents': [{'end': 19, 'label': 'MATCH', 'start': 9}],
  'text': 'A global power city, New York City has been described as the cultural, financial, and media capital of the world, and exerts a significant impact upon commerce, entertainment, research, technology, education, politics, and sports.'},
 {'ents': [{'end': 99, 'label': 'MATCH', 'start': 86}],
  'text': 'A global power city, New York City has been described as the cultural, financial, and media capital of the world, and exerts a significant impact upon commerce, entertainment, research, technology, education, politics, and sports.'},
 {'ents': [{'end': 50, 'label': 'MATCH', 'start': 38}],
  'text': 'New York City traces its origins to a trading post founded by col

In [18]:
displacy.render(matched_sents, style='ent', manual=True, jupyter=True)

In [19]:
#patterns = [[{'POS': 'ADJ', 'IS_ALPHA': True, 'IS_STOP': False, 'OP': '*'},
#             {'POS': 'NOUN', 'IS_ALPHA': True, 'IS_STOP': False, 'OP': '+'}],
#            [{'POS': 'NOUN', 'IS_ALPHA': True, 'IS_STOP': False},
#             {'POS': 'ADP', 'IS_ALPHA': True},
#             {'POS': 'NOUN', 'IS_ALPHA': True, 'IS_STOP': False, 'OP': '+'}]
#           ]
patterns = [[{'POS': 'NOUN', 'IS_ALPHA': True, 'IS_STOP': False, 'OP': '+'}]]
matcher = Matcher(nlp.vocab)
matched_sents = []
matched_phrases = []
for pattern in patterns:
    matcher.add('keyword', collect_sents, pattern)

In [20]:
matches = matcher(nynlp)

In [21]:
sorted(matched_phrases)

['Chinatowns',
 'Crossroads',
 'Etymology',
 'GDP',
 'Gomes',
 'Harbour',
 'History',
 'Home',
 'Street',
 'agglomerations',
 'amounts',
 'area',
 'area',
 'area',
 'area',
 'area',
 'area',
 'area',
 'area',
 'attractions',
 'authority',
 'autonomy',
 'band',
 'beads',
 'bedrock',
 'belief',
 'bloodshed',
 'boroughs',
 'boroughs',
 'bridges',
 'brother',
 'brother',
 'capital',
 'capital',
 'capitalization',
 'captain',
 'center',
 'center',
 'center',
 'center',
 'centuries',
 'century',
 'citadel',
 'city',
 'city',
 'city',
 'city',
 'city',
 'city',
 'city',
 'city',
 'city',
 'city',
 'city',
 'city',
 'city',
 'city',
 'city',
 'city',
 'city',
 'city',
 'city',
 'city',
 'coast',
 'colleges',
 'colonial',
 'colonists',
 'colonists',
 'colony',
 'colony',
 'colony',
 'commerce',
 'concentration',
 'construction',
 'control',
 'control',
 'country',
 'country',
 'county',
 'creativity',
 'crown',
 'day',
 'day',
 'day exploration',
 'depth',
 'descent',
 'diplomacy',
 'districts'

In [22]:
from collections import Counter
keywords = dict(Counter(matched_phrases))

In [23]:
keywords

{'Chinatowns': 1,
 'Crossroads': 1,
 'Etymology': 1,
 'GDP': 1,
 'Gomes': 1,
 'Harbour': 1,
 'History': 1,
 'Home': 1,
 'Street': 1,
 'agglomerations': 1,
 'amounts': 1,
 'area': 8,
 'attractions': 1,
 'authority': 1,
 'autonomy': 1,
 'band': 1,
 'beads': 1,
 'bedrock': 1,
 'belief': 1,
 'bloodshed': 1,
 'boroughs': 2,
 'bridges': 1,
 'brother': 2,
 'capital': 2,
 'capitalization': 1,
 'captain': 1,
 'center': 4,
 'centuries': 1,
 'century': 1,
 'citadel': 1,
 'city': 20,
 'coast': 1,
 'colleges': 1,
 'colonial': 1,
 'colonists': 2,
 'colony': 3,
 'commerce': 1,
 'concentration': 1,
 'construction': 1,
 'control': 2,
 'country': 2,
 'county': 1,
 'creativity': 1,
 'crown': 1,
 'day': 2,
 'day exploration': 1,
 'depth': 1,
 'descent': 1,
 'diplomacy': 1,
 'districts': 1,
 'diversity': 1,
 'east coast': 1,
 'edge': 1,
 'education': 1,
 'effort': 1,
 'entertainment': 1,
 'entertainment industry': 1,
 'entrepreneurship': 1,
 'era': 1,
 'estate market': 1,
 'exchanges': 1,
 'expedition': 2,

In [24]:
import math
import re
keywords_cvalues = {}
for keyword in sorted(keywords.keys()):
    parent_terms = list(filter(lambda t: t != keyword and re.match('\\b%s\\b' % keyword, t), keywords.keys()))
    keywords_cvalues[keyword] = keywords[keyword]
    print("TERM:", keyword, "PARENT TERMS:", parent_terms)
    for pt in parent_terms:
        keywords_cvalues[keyword] -= float(keywords[pt])/float(len(parent_terms))
    keywords_cvalues[keyword] *= math.log(len(keyword.split()), 2)

TERM: Chinatowns PARENT TERMS: []
TERM: Crossroads PARENT TERMS: []
TERM: Etymology PARENT TERMS: []
TERM: GDP PARENT TERMS: []
TERM: Gomes PARENT TERMS: []
TERM: Harbour PARENT TERMS: []
TERM: History PARENT TERMS: []
TERM: Home PARENT TERMS: []
TERM: Street PARENT TERMS: []
TERM: agglomerations PARENT TERMS: []
TERM: amounts PARENT TERMS: []
TERM: area PARENT TERMS: []
TERM: attractions PARENT TERMS: []
TERM: authority PARENT TERMS: []
TERM: autonomy PARENT TERMS: []
TERM: band PARENT TERMS: []
TERM: beads PARENT TERMS: []
TERM: bedrock PARENT TERMS: []
TERM: belief PARENT TERMS: []
TERM: bloodshed PARENT TERMS: []
TERM: boroughs PARENT TERMS: []
TERM: bridges PARENT TERMS: []
TERM: brother PARENT TERMS: []
TERM: capital PARENT TERMS: []
TERM: capitalization PARENT TERMS: []
TERM: captain PARENT TERMS: []
TERM: center PARENT TERMS: []
TERM: centuries PARENT TERMS: []
TERM: century PARENT TERMS: []
TERM: citadel PARENT TERMS: []
TERM: city PARENT TERMS: []
TERM: coast PARENT TERMS: []

In [25]:
keywords_cvalues

{'Chinatowns': 0.0,
 'Crossroads': 0.0,
 'Etymology': 0.0,
 'GDP': 0.0,
 'Gomes': 0.0,
 'Harbour': 0.0,
 'History': 0.0,
 'Home': 0.0,
 'Street': 0.0,
 'agglomerations': 0.0,
 'amounts': 0.0,
 'area': 0.0,
 'attractions': 0.0,
 'authority': 0.0,
 'autonomy': 0.0,
 'band': 0.0,
 'beads': 0.0,
 'bedrock': 0.0,
 'belief': 0.0,
 'bloodshed': 0.0,
 'boroughs': 0.0,
 'bridges': 0.0,
 'brother': 0.0,
 'capital': 0.0,
 'capitalization': 0.0,
 'captain': 0.0,
 'center': 0.0,
 'centuries': 0.0,
 'century': 0.0,
 'citadel': 0.0,
 'city': 0.0,
 'coast': 0.0,
 'colleges': 0.0,
 'colonial': 0.0,
 'colonists': 0.0,
 'colony': 0.0,
 'commerce': 0.0,
 'concentration': 0.0,
 'construction': 0.0,
 'control': 0.0,
 'country': 0.0,
 'county': 0.0,
 'creativity': 0.0,
 'crown': 0.0,
 'day': 0.0,
 'day exploration': 1.0,
 'depth': 0.0,
 'descent': 0.0,
 'diplomacy': 0.0,
 'districts': 0.0,
 'diversity': 0.0,
 'east coast': 1.0,
 'edge': 0.0,
 'education': 0.0,
 'effort': 0.0,
 'entertainment': 0.0,
 'enterta

In [26]:
best_keywords = []
for keyword in sorted(keywords_cvalues, key=keywords_cvalues.get, reverse=True)[:10]:
    best_keywords.append([keyword, keywords_cvalues[keyword]])

In [27]:
best_keywords

[['ice sheet', 3.0],
 ['fur trade', 2.0],
 ['day exploration', 1.0],
 ['east coast', 1.0],
 ['entertainment industry', 1.0],
 ['estate market', 1.0],
 ['fledgling city', 1.0],
 ['fur trading', 1.0],
 ['geologic foundation', 1.0],
 ['glass beads', 1.0]]

In [46]:
import wikipedia
import spacy
from spacy.matcher import Matcher
import math
import re
from collections import Counter

nlp = spacy.load('en', disable=['parser', 'ner', 'textcat'])

matched_phrases = []
def collect_sents(matcher, doc, i, matches):
    match_id, start, end = matches[i]
    span = doc[start : end]
    matched_phrases.append(span.lemma_)
    
patterns = [[{'POS': 'NOUN', 'IS_ALPHA': True, 'IS_STOP': False, 'OP': '+'}]]
matcher = Matcher(nlp.vocab)
for pattern in patterns:
    matcher.add('keyword', collect_sents, pattern)

def extract_keywords_wikipedia(pagename, num_keywords):
    global matched_phrases
    page = wikipedia.page(pagename)
    pagenlp = nlp(page.content)
    matched_phrases = []
    matches = matcher(pagenlp)
    keywords = dict(Counter(matched_phrases).most_common(100))
    keywords_cvalues = {}
    for keyword in sorted(keywords.keys()):
        parent_terms = list(filter(lambda t: t != keyword and re.match('\\b%s\\b' % keyword, t), keywords.keys()))
        keywords_cvalues[keyword] = keywords[keyword]
        for pt in parent_terms:
            keywords_cvalues[keyword] -= float(keywords[pt])/float(len(parent_terms))
        keywords_cvalues[keyword] *= 1 + math.log(len(keyword.split()), 2)
    best_keywords = []
    for keyword in sorted(keywords_cvalues, key=keywords_cvalues.get, reverse=True)[:num_keywords]:
        best_keywords.append([keyword, keywords_cvalues[keyword]])
    return best_keywords

In [47]:
extract_keywords_wikipedia("New York City", 10)

[['city', 209.0],
 ['world', 69.0],
 ['area', 49.0],
 ['population', 46.0],
 ['borough', 40.0],
 ['system', 40.0],
 ['center', 23.0],
 ['year', 22.0],
 ['park', 21.0],
 ['home', 19.0]]

In [48]:
extract_keywords_wikipedia("Python (programming language)", 10)

[['language', 58.0],
 ['statement', 27.0],
 ['expression', 26.0],
 ['programming language', 26.0],
 ['object', 23.0],
 ['type', 18.0],
 ['code', 17.0],
 ['implementation', 17.0],
 ['division', 16.0],
 ['operator', 16.0]]

In [49]:
extract_keywords_wikipedia("Artificial intelligence", 10)

[['intelligence', 103.0],
 ['problem', 67.0],
 ['machine', 52.0],
 ['network', 42.0],
 ['system', 39.0],
 ['research', 38.0],
 ['agent', 34.0],
 ['researcher', 29.0],
 ['ai', 28.0],
 ['learning', 28.0]]

In [50]:
extract_keywords_wikipedia("Computer science", 10)

[['computer science', 106.0],
 ['science', 47.0],
 ['computation', 26.0],
 ['system', 24.0],
 ['computer', 20.166666666666664],
 ['field', 20.0],
 ['study', 20.0],
 ['theory', 20.0],
 ['discipline', 18.0],
 ['software engineering', 18.0]]