In [20]:
import json

import spacy
from spacy.matcher import Matcher
# Matcher extracts data in a different way from EntityRuler:
# it stores the match label in the vocab, where it can be looked up as a lexeme.
# EntityRuler is used when the extracted data belongs to a particular category;
# Matcher extracts data that need not belong to any particular category.

In [21]:
# Load the en_core_web_sm pipeline; `nlp` builds the Docs and supplies the vocab for the Matcher in the cells below.
nlp = spacy.load("en_core_web_sm")

In [22]:
# A one-token pattern: any token that looks like an email address.
pattern = [{"LIKE_EMAIL": True}]

# Register the pattern on a fresh Matcher under the label "EMAIL_ADDRESS".
matcher = Matcher(nlp.vocab)
matcher.add("EMAIL_ADDRESS", [pattern])

In [23]:
# Run the pipeline on a sample sentence, then apply the matcher to the resulting Doc.
# NOTE(review): the sample looks like a real personal address — consider a placeholder
# such as user@example.com before sharing the notebook.
doc = nlp("This is an email address : deyk905@gmail.com")
matches = matcher(doc)

In [24]:
# Each match is a (match_id, start_token, end_token) tuple; the end index is exclusive.
print(matches)

[(16571425990740197027, 6, 7)]


In [25]:
# The first element of a match is the hashed label; look it up in the vocab
# to recover the label string.
match_id = matches[0][0]
print(nlp.vocab[match_id].text)

EMAIL_ADDRESS


In [26]:
# Read the article text. The encoding is given explicitly because the file
# contains non-ASCII characters (e.g. an en dash) that come out garbled when
# the platform default encoding is not UTF-8.
with open("wiki_mlk.txt", "r", encoding="utf-8") as f:
    text = f.read()

In [27]:
# Show the loaded article text to check that it was read as expected.
print(text)

Martin Luther King Jr. (born Michael King Jr.; January 15, 1929 â€“ April 4, 1968) was an American Baptist minister and activist who became the most visible spokesman and leader in the American civil rights movement from 1955 until his assassination in 1968. King advanced civil rights through nonviolence and civil disobedience, inspired by his Christian beliefs and the nonviolent activism of Mahatma Gandhi. He was the son of early civil rights activist and minister Martin Luther King Sr.

King participated in and led marches for blacks' right to vote, desegregation, labor rights, and other basic civil rights. King led the 1955 Montgomery bus boycott and later became the first president of the Southern Christian Leadership Conference (SCLC). As president of the SCLC, he led the unsuccessful Albany Movement in Albany, Georgia, and helped organize some of the nonviolent 1963 protests in Birmingham, Alabama. King helped organize the 1963 March on Washington, where he delivered his famous "

In [28]:
# Goal: extract every proper noun from the article text.
# NOTE(review): the same model is already loaded into `nlp` in an earlier cell,
# so this reload looks redundant — confirm and drop it if so.
nlp = spacy.load("en_core_web_sm")

In [29]:
# One token whose part-of-speech tag is PROPN (proper noun).
pattern = [{"POS": "PROPN"}]

matcher = Matcher(nlp.vocab)
matcher.add("PROPER_NOUN", [pattern])

doc = nlp(text)
matches = matcher(doc)

print(len(matches))
# Show the first ten hits as: match tuple, matched span.
for match in matches[:10]:
    start, end = match[1], match[2]
    print(match, doc[start:end])

102
(451313080118390996, 0, 1) Martin
(451313080118390996, 1, 2) Luther
(451313080118390996, 2, 3) King
(451313080118390996, 3, 4) Jr.
(451313080118390996, 6, 7) Michael
(451313080118390996, 7, 8) King
(451313080118390996, 8, 9) Jr.
(451313080118390996, 10, 11) January
(451313080118390996, 16, 17) April
(451313080118390996, 23, 24) American


In [30]:
# Multi-word proper nouns: "OP": "+" lets the pattern cover one or more
# consecutive PROPN tokens. Without a greedy setting every overlapping
# sub-span is reported as its own match.
pattern = [{"POS": "PROPN", "OP": "+"}]

matcher = Matcher(nlp.vocab)
matcher.add("PROPER_NOUN", [pattern])

doc = nlp(text)
matches = matcher(doc)

print(len(matches))
for match in matches[:10]:
    start, end = match[1], match[2]
    print(match, doc[start:end])

172
(451313080118390996, 0, 1) Martin
(451313080118390996, 0, 2) Martin Luther
(451313080118390996, 1, 2) Luther
(451313080118390996, 0, 3) Martin Luther King
(451313080118390996, 1, 3) Luther King
(451313080118390996, 2, 3) King
(451313080118390996, 0, 4) Martin Luther King Jr.
(451313080118390996, 1, 4) Luther King Jr.
(451313080118390996, 2, 4) King Jr.
(451313080118390996, 3, 4) Jr.


In [31]:
# Multi-word proper nouns, keeping only the longest of any overlapping matches.
pattern = [{"POS": "PROPN", "OP": "+"}]

matcher = Matcher(nlp.vocab)
# greedy="LONGEST" filters overlapping matches down to the longest span.
matcher.add("PROPER_NOUN", [pattern], greedy="LONGEST")

doc = nlp(text)
matches = matcher(doc)

print(len(matches))
# The matches are not in document order here; the next cell sorts them.
for match in matches[:10]:
    start, end = match[1], match[2]
    print(match, doc[start:end])

61
(451313080118390996, 470, 475) Martin Luther King Jr. Day
(451313080118390996, 537, 542) Martin Luther King Jr. Memorial
(451313080118390996, 0, 4) Martin Luther King Jr.
(451313080118390996, 84, 88) Martin Luther King Sr
(451313080118390996, 129, 133) Southern Christian Leadership Conference
(451313080118390996, 248, 252) Director J. Edgar Hoover
(451313080118390996, 6, 9) Michael King Jr.
(451313080118390996, 326, 329) Nobel Peace Prize
(451313080118390996, 423, 426) James Earl Ray
(451313080118390996, 464, 467) Congressional Gold Medal


In [32]:
# Multi-word proper nouns (longest match only), listed in order of appearance.
pattern = [{"POS": "PROPN", "OP": "+"}]

matcher = Matcher(nlp.vocab)
matcher.add("PROPER_NOUN", [pattern], greedy="LONGEST")

doc = nlp(text)
# Sort by start-token index so the matches follow the order of the text.
matches = sorted(matcher(doc), key=lambda m: m[1])

print(len(matches))
for match in matches[:10]:
    print(match, doc[match[1]:match[2]])

61
(451313080118390996, 0, 4) Martin Luther King Jr.
(451313080118390996, 6, 9) Michael King Jr.
(451313080118390996, 10, 11) January
(451313080118390996, 16, 17) April
(451313080118390996, 23, 25) American Baptist
(451313080118390996, 50, 51) King
(451313080118390996, 70, 72) Mahatma Gandhi
(451313080118390996, 84, 88) Martin Luther King Sr
(451313080118390996, 90, 91) King
(451313080118390996, 114, 115) King


In [33]:
# Multi-token pattern: a run of proper nouns immediately followed by a verb.
pattern = [
    {"POS": "PROPN", "OP": "+"},  # one or more proper nouns
    {"POS": "VERB"},              # then a single verb
]

matcher = Matcher(nlp.vocab)
# greedy="LONGEST" keeps only the longest of any overlapping matches.
matcher.add("PROPER_NOUNS", [pattern], greedy="LONGEST")

doc = nlp(text)
# Sort by start-token index so the matches follow the order of the text.
matches = sorted(matcher(doc), key=lambda m: m[1])

print(len(matches))
for match in matches[:10]:
    print(match, doc[match[1]:match[2]])

6
(3232560085755078826, 50, 52) King advanced
(3232560085755078826, 90, 92) King participated
(3232560085755078826, 114, 116) King led
(3232560085755078826, 248, 253) Director J. Edgar Hoover considered
(3232560085755078826, 323, 325) King won
(3232560085755078826, 486, 489) United States beginning


In [34]:
# Load the Alice corpus. JSON text is UTF-8 by specification, so decode it
# explicitly instead of relying on the platform default encoding.
# (`json` is imported in the notebook's top import cell.)
with open("alice.json", "r", encoding="utf-8") as f:
    data = json.load(f)

In [35]:
# Pull a single passage out of the nested JSON structure.
# NOTE(review): what the [0][2][0] index path stands for is not visible here —
# confirm against the layout of alice.json.
first_entry = data[0]
text = first_entry[2][0]
print(text)

Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do: once or twice she had peeped into the book her sister was reading, but it had no pictures or conversations in it, `and what is the use of a book,' thought Alice `without pictures or conversation?'


In [36]:
# The passage opens quotations with a backtick and closes them with an apostrophe.
# Normalise both to an apostrophe so a single {"ORTH": "'"} token pattern
# can match either end of a quotation.
text = text.replace("`", "'")
print(text)

Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do: once or twice she had peeped into the book her sister was reading, but it had no pictures or conversations in it, 'and what is the use of a book,' thought Alice 'without pictures or conversation?'


In [43]:
# Goal: extract quoted text together with its speaker, in the shape
#   'quote' <speech verb> <speaker> 'quote'
speak_lemmas = ["think", "say"]

quote_mark = {"ORTH": "'"}
quoted_span = [
    quote_mark,
    # IS_ALPHA: token text consists of alphabetic characters (one or more tokens).
    {"IS_ALPHA": True, "OP": "+"},
    # IS_PUNCT: token is punctuation (zero or more, e.g. a trailing comma or "?").
    {"IS_PUNCT": True, "OP": "*"},
    quote_mark,
]
# After the closing quote comes a verb whose lemma is one of speak_lemmas ...
speech_verb = {"POS": "VERB", "LEMMA": {"IN": speak_lemmas}}
# ... followed by one or more proper nouns naming the speaker.
speaker = {"POS": "PROPN", "OP": "+"}

pattern = quoted_span + [speech_verb, speaker] + quoted_span

matcher = Matcher(nlp.vocab)
# NOTE(review): the "PROPER_NOUNS" label looks carried over from the earlier
# proper-noun cells; a quote-specific label would be clearer, but renaming it
# changes the match_id printed below.
# greedy="LONGEST" keeps only the longest of any overlapping matches.
matcher.add("PROPER_NOUNS", [pattern], greedy="LONGEST")

doc = nlp(text)
# Sort by start-token index so the matches follow the order of the text.
matches = sorted(matcher(doc), key=lambda m: m[1])

print(len(matches))
for match in matches[:10]:
    print(match, doc[match[1]:match[2]])

1
(3232560085755078826, 47, 67) 'and what is the use of a book,' thought Alice 'without pictures or conversation?'
