In [1]:
import spacy
from spacy import displacy # for visualization
nlp = spacy.load('en_core_web_lg')



In [2]:
spacy.__version__

'2.3.2'

In [3]:
import textacy

#### Redacting Names with Named Entity Recognition

In [4]:
text = "Madam Pomfrey, the nurse, was kept busy by a sudden spate of colds among the staff and students. Her Pepperup potion worked instantly, though it left the drinker smoking at the ears for several hours afterward. Ginny Weasley, who had been looking pale, was bullied into taking some by Percy."

In [5]:
# Parse the text with SpaCy. This runs the entire NLP pipeline
doc = nlp(text)

In [6]:
for entity in doc.ents:
    print(f"{entity.text} ({entity.label_})")

Pomfrey (PERSON)
Pepperup (PERSON)
several hours (TIME)
Ginny Weasley (PERSON)
Percy (PERSON)


In [7]:
doc.ents

(Pomfrey, Pepperup, several hours, Ginny Weasley, Percy)

In [8]:
entity.label, entity.label_

(380, 'PERSON')

In [9]:
spacy.explain('GPE')

'Countries, cities, states'

In [10]:
def redact_names(text):
    doc = nlp(text)
    redacted_sentence = []
    for token in doc:
        if token.ent_type_ == "PERSON":
            redacted_sentence.append("[REDACTED]")
        else:
            redacted_sentence.append(token.string)
    return "".join(redacted_sentence)

In [11]:
redact_names(text)

'Madam [REDACTED], the nurse, was kept busy by a sudden spate of colds among the staff and students. Her [REDACTED]potion worked instantly, though it left the drinker smoking at the ears for several hours afterward. [REDACTED][REDACTED], who had been looking pale, was bullied into taking some by [REDACTED].'

#### Entity Types

In [12]:
def explain_text_entities(text):
    doc = nlp(text)
    for ent in doc.ents:
        print(f'{ent}, Label: {ent.label_}, {spacy.explain(ent.label_)}')

In [13]:
explain_text_entities('Tesla has gained 20% market share in the months since')

Tesla, Label: ORG, Companies, agencies, institutions, etc.
20%, Label: PERCENT, Percentage, including "%"
the months, Label: DATE, Absolute or relative dates or periods


In [14]:
explain_text_entities('Taj Mahal built by Mughal Emperor Shah Jahan stands tall on the banks of Yamuna in modern day Agra, India')

Taj Mahal, Label: ORG, Companies, agencies, institutions, etc.
Mughal, Label: NORP, Nationalities or religious or political groups
Shah Jahan, Label: PERSON, People, including fictional
Yamuna, Label: ORG, Companies, agencies, institutions, etc.
Agra, Label: GPE, Countries, cities, states
India, Label: GPE, Countries, cities, states


In [15]:
explain_text_entities('Ashoka was a great Indian king')

Ashoka, Label: ORG, Companies, agencies, institutions, etc.
Indian, Label: NORP, Nationalities or religious or political groups


In [16]:
explain_text_entities('The Ashoka University sponsors the Young India Fellowship')

Ashoka University, Label: ORG, Companies, agencies, institutions, etc.


## Automatic Question Generation

In [17]:
example_text = 'Bansoori is an Indian classical instrument. Tom plays Bansoori and Guitar.'

In [18]:
doc = nlp(example_text)

In [20]:
for idx, sentence in enumerate(doc.sents):
    for noun in sentence.noun_chunks:
        print(f'sentence{idx+1}', noun)

sentence1 Bansoori
sentence1 an Indian classical instrument
sentence2 Tom
sentence2 Bansoori
sentence2 Guitar


In [21]:
for token in doc:
    print(token, token.pos_, token.tag_)

Bansoori PROPN NNP
is AUX VBZ
an DET DT
Indian ADJ JJ
classical ADJ JJ
instrument NOUN NN
. PUNCT .
Tom PROPN NNP
plays VERB VBZ
Bansoori PROPN NNP
and CCONJ CC
Guitar PROPN NNP
. PUNCT .


### Creating a Ruleset

In [22]:
ruleset = [
    {
        'id': 1, 
        'req_tags': ['NNP', 'VBZ', 'NN'],
    }, 
    {
        'id': 2, 
        'req_tags': ['NNP', 'VBZ'],
    }
    ]

In [23]:
print(ruleset)

[{'id': 1, 'req_tags': ['NNP', 'VBZ', 'NN']}, {'id': 2, 'req_tags': ['NNP', 'VBZ']}]


In [25]:
def get_pos_tag(doc, tag):
    return [tok for tok in doc if tok.tag_ == tag]

In [26]:
def sent_to_ques(sent:str)->str:
    """
    Return a question string corresponding to a sentence string using a set of pre-written rules
    """
    doc = nlp(sent)
    pos_tags = [token.tag_ for token in doc]
    for idx, rule in enumerate(ruleset):
        if rule['id'] == 1:
            if all(key in pos_tags for key in rule['req_tags']): 
                print(f"Rule id {rule['id']} matched for sentence: {sent}")
                NNP = get_pos_tag(doc, "NNP")
                NNP = str(NNP[0])
                VBZ = get_pos_tag(doc, "VBZ")
                VBZ = str(VBZ[0])
                ques = f'What {VBZ} {NNP}?'
                return(ques)
        if rule['id'] == 2:
            if all(key in pos_tags for key in rule['req_tags']): #'NNP', 'VBZ' in sentence.
                print(f"Rule id {rule['id']} matched for sentence: {sent}")
                NNP = get_pos_tag(doc, "NNP")
                NNP = str(NNP[0])
                VBZ = get_pos_tag(doc, "VBZ")
                VBZ = str(VBZ[0].lemma_)
                ques = f'What does {NNP} {VBZ}?'
                return(ques)

In [27]:
for sent in doc.sents:
    print(f"The generated quietion is : {sent_to_ques(str(sent))}")

Rule id 1 matched for sentence: Bansoori is an Indian classical instrument.
The generated quietion is : What is Bansoori?
Rule id 2 matched for sentence: Tom plays Bansoori and Guitar.
The generated quietion is : What does Tom play?
