### Load Packages

In [1]:
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('maxent_ne_chunker')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
stop_words = set(stopwords.words('english'))
nltk.download("words")

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


True

### Prepare Data

In [2]:
txt = "Heavy rain continues to lash parts of western Maharashtra, Raigad and Thane districts with more than 60 barrages submerged in Kolhapur district"
tokenized = sent_tokenize(txt)
print(tokenized)

['Heavy rain continues to lash parts of western Maharashtra, Raigad and Thane districts with more than 60 barrages submerged in Kolhapur district']


### String Tokenization And Parts of Speech Tagging

In [3]:
# Passing sentence by sentence in the for loop
for i in tokenized:
    # Tokenizing the sentence
    wordsList = nltk.word_tokenize(i)
    # Removing stop words like is, on , and, ...
    wordsList = [w for w in wordsList if not w in stop_words]
    # Task1 Call the method that does Parts of Speech Tagging from nltk library and the pass the wordsList
    # tagged = ...
    # Your code here
    
    tagged = nltk.pos_tag(wordsList)
    
    print(tagged)

[('Heavy', 'NNP'), ('rain', 'NN'), ('continues', 'VBZ'), ('lash', 'JJ'), ('parts', 'NNS'), ('western', 'JJ'), ('Maharashtra', 'NNP'), (',', ','), ('Raigad', 'NNP'), ('Thane', 'NNP'), ('districts', 'NNS'), ('60', 'CD'), ('barrages', 'NNS'), ('submerged', 'VBN'), ('Kolhapur', 'NNP'), ('district', 'NN')]


In [4]:
wordsList

['Heavy',
 'rain',
 'continues',
 'lash',
 'parts',
 'western',
 'Maharashtra',
 ',',
 'Raigad',
 'Thane',
 'districts',
 '60',
 'barrages',
 'submerged',
 'Kolhapur',
 'district']

Representations:

CC 	coordinating conjunction
CD 	cardinal digit
DT 	determiner
EX 	existential there
FW 	foreign word
IN 	preposition/subordinating conjunction
JJ 	This NLTK POS Tag is an adjective (large)
JJR 	adjective, comparative (larger)
JJS 	adjective, superlative (largest)
LS 	list market
MD 	modal (could, will)
NN 	noun, singular (cat, tree)
NNS 	noun plural (desks)
NNP 	proper noun, singular (sarah)
NNPS 	proper noun, plural (indians or americans)
PDT 	predeterminer (all, both, half)
POS 	possessive ending (parent\ 's)
PRP 	personal pronoun (hers, herself, him,himself)
PRP$ 	possessive pronoun (her, his, mine, my, our )
RB 	adverb (occasionally, swiftly)
RBR 	adverb, comparative (greater)
RBS 	adverb, superlative (biggest)
RP 	particle (about)
TO 	infinite marker (to)
UH 	interjection (goodbye)
VB 	verb (ask)
VBG 	verb gerund (judging)
VBD 	verb past tense (pleaded)
VBN 	verb past participle (reunified)
VBP 	verb, present tense not 3rd person singular(wrap)
VBZ 	verb, present tense with 3rd person singular (bases)
WDT 	wh-determiner (that, what)
WP 	wh- pronoun (who)
WRB 	wh- adverb (how) 


### Named Entity Recognition

**Named Entity Recognition** (NER) is a standard NLP problem which involves spotting named entities (people, places, organizations etc.) from a chunk of text, and classifying them into a predefined set of categories.

Refer: https://www.nltk.org/book/ch07.html

In [5]:
# Sample sentence

sentence = "Heavy rain continues to lash parts of western Maharashtra districts with more than 60 barrages submerged in Kolhapur district."

# Sending sentence by sentence inside the loop
for sent in nltk.sent_tokenize(sentence):
    # We will form chunks based on the tokenized words and pos tagging
    for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent))):
        # The chunks will hold a attribute label if it has identified that the chunk is a Named Entity
        if hasattr(chunk, 'label'):
            print(chunk.label(), ' '.join(c[0] for c in chunk))

GPE Heavy
GPE Kolhapur


***NE Type 	Examples***
ORGANIZATION 	Georgia-Pacific Corp., 
PERSON 	Eddy Bonte, President Obama
LOCATION 	Murray River, Mount Everest
DATE 	June, 2008-06-29
TIME 	two fifty a m, 1:30 p.m.
MONEY 	175 million Canadian Dollars, 
PERCENT 	twenty pct, 18.75 %
FACILITY 	Washington Monument, Stonehenge
GPE 	South East Asia, Midlothian

In [9]:
# Task 2 
# List a few named entities that you can think of.
# Check if those entities are being caputered by nltk library by passing a sentence containing that named entitiy in the above code.
sentence = "Sensex and Nifty were set to open in green as Nifty futures on the Singapore Exchange traded 100.5 points, or 0.64 per cent higher at 15,848. Benchmark indices snapped their four-session losing run on Friday led by gains in index majors Reliance Industries, ICICI Bank and HDFC. Sensex ended 166 points higher at 52,484. Nifty advanced 42.20 points at 15,722."
for sent in nltk.sent_tokenize(sentence):
    # We will form chunks based on the tokenized words and pos tagging
    for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent))):
        # The chunks will hold a attribute label if it has identified that the chunk is a Named Entity
        if hasattr(chunk, 'label'):
            print(chunk.label(), ' '.join(c[0] for c in chunk))

GPE Sensex
PERSON Nifty
GPE Nifty
ORGANIZATION Singapore Exchange
GPE Benchmark
ORGANIZATION Reliance Industries
ORGANIZATION ICICI Bank
ORGANIZATION HDFC
GPE Sensex
PERSON Nifty


In [7]:
# Example for other Parsers

In [11]:
from nltk.parse.stanford import StanfordDependencyParser

# Download the parser from https://nlp.stanford.edu/software/stanford-parser-4.2.0.zip
# Extract the folder and link stanford-parser.jar
path_jar = "/content/sample_data/stanford-parser.jar"

# Link stanford-parser-4.2.0-models.jar
path_models_jar = "/content/sample_data/stanford-parser-4.2.0-models.jar"

dep_parser = StanfordDependencyParser(path_to_jar = path_jar, path_to_models_jar = path_models_jar)

result = dep_parser.raw_parse("Heavy rain continues to lash parts of western Maharashtra, Raigad and Thane districts with more than 60 barrages submerged in Kolhapur district")
dependency = result.__next__()

#Print the results of the parser
print(list(dependency.triples()))

[(('continues', 'VBZ'), 'nsubj', ('rain', 'NN')), (('rain', 'NN'), 'amod', ('Heavy', 'JJ')), (('continues', 'VBZ'), 'xcomp', ('lash', 'VB')), (('lash', 'VB'), 'mark', ('to', 'TO')), (('lash', 'VB'), 'obj', ('parts', 'NNS')), (('parts', 'NNS'), 'nmod', ('districts', 'NNS')), (('districts', 'NNS'), 'case', ('of', 'IN')), (('districts', 'NNS'), 'amod', ('western', 'JJ')), (('districts', 'NNS'), 'compound', ('Maharashtra', 'NNP')), (('Maharashtra', 'NNP'), 'conj', ('Raigad', 'NNP')), (('Maharashtra', 'NNP'), 'conj', ('Thane', 'NNP')), (('Thane', 'NNP'), 'cc', ('and', 'CC')), (('lash', 'VB'), 'obl', ('barrages', 'NNS')), (('barrages', 'NNS'), 'case', ('with', 'IN')), (('barrages', 'NNS'), 'nummod', ('60', 'CD')), (('60', 'CD'), 'advmod', ('more', 'JJR')), (('more', 'JJR'), 'fixed', ('than', 'IN')), (('barrages', 'NNS'), 'acl', ('submerged', 'VBN')), (('submerged', 'VBN'), 'obl', ('district', 'NN')), (('district', 'NN'), 'case', ('in', 'IN')), (('district', 'NN'), 'compound', ('Kolhapur', 'N

In [12]:
sentence = "Sensex and Nifty were set to open in green as Nifty futures on the Singapore Exchange traded 100.5 points, or 0.64 per cent higher at 15,848. Benchmark indices snapped their four-session losing run on Friday led by gains in index majors Reliance Industries, ICICI Bank and HDFC. Sensex ended 166 points higher at 52,484. Nifty advanced 42.20 points at 15,722."
result = dep_parser.raw_parse(sentence)
dependency = result.__next__()

#Print the results of the parser
print(list(dependency.triples()))

[(('set', 'VBN'), 'nsubj:pass', ('Sensex', 'NNP')), (('Sensex', 'NNP'), 'conj', ('Nifty', 'NNP')), (('Nifty', 'NNP'), 'cc', ('and', 'CC')), (('set', 'VBN'), 'aux:pass', ('were', 'VBD')), (('set', 'VBN'), 'xcomp', ('open', 'VB')), (('open', 'VB'), 'mark', ('to', 'TO')), (('open', 'VB'), 'obl', ('futures', 'NNS')), (('futures', 'NNS'), 'case', ('in', 'IN')), (('futures', 'NNS'), 'amod', ('green', 'JJ')), (('green', 'JJ'), 'obl', ('Nifty', 'NNP')), (('Nifty', 'NNP'), 'case', ('as', 'IN')), (('open', 'VB'), 'obl', ('Exchange', 'NNP')), (('Exchange', 'NNP'), 'case', ('on', 'IN')), (('Exchange', 'NNP'), 'det', ('the', 'DT')), (('Exchange', 'NNP'), 'compound', ('Singapore', 'NNP')), (('Exchange', 'NNP'), 'acl', ('traded', 'VBN')), (('traded', 'VBN'), 'obj', ('points', 'NNS')), (('points', 'NNS'), 'nummod', ('100.5', 'CD')), (('set', 'VBN'), 'conj', ('snapped', 'VBD')), (('snapped', 'VBD'), 'cc', ('or', 'CC')), (('snapped', 'VBD'), 'obl', ('15,848', 'CD')), (('15,848', 'CD'), 'advmod', ('highe