In [1]:
# Parts of Speech: Tagging words in a corpus
# POS Tagging with NLTK: Tag words as Noun, Verb etc.
# NOTE(review): the tagger resource fetched below is 'averaged_perceptron_tagger',
# a trained perceptron model, so the NLTK tagging in this notebook is presumably
# statistical rather than purely rules-based - confirm against the NLTK docs.
import nltk
# Tokenizer data (unzips to tokenizers/punkt)
nltk.download('punkt')
# POS tagger model (unzips to taggers/averaged_perceptron_tagger)
nltk.download('averaged_perceptron_tagger')
# Tag set help text (unzips to help/tagsets); kept as the last expression, so its
# return value is what the cell displays
nltk.download('tagsets')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package tagsets to /root/nltk_data...
[nltk_data]   Unzipping help/tagsets.zip.


True

In [2]:
# Tokenise and Tag Words
sentence = 'i enjoy the piano'
# Split the sentence into word tokens
tokens = nltk.word_tokenize(sentence)
# Pair every token with a POS tag, giving a list of (token, tag) tuples
tags = nltk.pos_tag(tokens)
# NOTE: in the recorded output the lowercase 'i' comes back as NN (noun), not as a pronoun.
# The extra '\n' argument leaves a blank line before the tag help printed below.
print(tags,'\n')

# Print the definition and example words for the NN tag
nltk.help.upenn_tagset('NN')

[('i', 'NN'), ('enjoy', 'VBP'), ('the', 'DT'), ('piano', 'NN')] 

NN: noun, common, singular or mass
    common-carrier cabbage knuckle-duster Casino afghan shed thermostat
    investment slide humour falloff slick wind hyena override subhumanity
    machinist ...


In [3]:
# Stochastic POS Tagging: Use anything other than rules-based methods.

# Unigram/Word Frequency Approach: Tag a word based on the probability that a
# similar word has that tag. E.g. Beaut would be tagged as ADJ if Beautiful is in the dataset
# N-Gram: N is how many words are considered when matching term with word in corpus
# Unigram approach sets n=1
# Hidden Markov Model combines both

In [4]:
# Stochastic POS Tagging with Spacy
import spacy
from spacy import displacy

# Load the small English pipeline (pre-trained on English language examples, blogs etc.)
nlp = spacy.load('en_core_web_sm')

# Calling the pipeline on the text gives a doc whose tokens carry their tags
doc = nlp(u'and so i said i will play the piano at home tonight')

# One line per token: its text, its pos_ value and its tag_ value
for token in doc:
    row = (token.text, token.pos_, token.tag_)
    print(*row)

print('\n')
# Tip: spacy.explain('DET') returns a short description of a tag or label

# Draw the dependency parse inline in the notebook
displacy.render(doc, style="dep", jupyter=True)

and CCONJ CC
so ADV RB
i PRON PRP
said VERB VBD
i PRON PRP
will VERB MD
play VERB VB
the DET DT
piano NOUN NN
at ADP IN
home NOUN NN
tonight NOUN NN




In [5]:
# Chunking: takes words and their tags as input and checks whether they can be
# combined into larger phrases, e.g. "United" + "Kingdom" -> "United Kingdom".
# The 5 major chunk types: Noun Phrase (head word is a noun), Verb Phrase,
# Adjective Phrase, Adverb Phrase and Prepositional Phrase.

# Chunking with NLTK
# NOTE(review): no pyplot name is used in this cell; the wildcard import is kept
# only in case a later cell relies on it.
from matplotlib.pyplot import *

# Grammar: a noun phrase is an optional determiner (DT), any number of
# adjectives (JJ), then a singular noun (NN)
rule = r'Noun Phrase: {<DT>?<JJ>*<NN>}'
sentence = 'a nice bird flew away across the horizon'

# Tokenise, tag, then parse the tagged tokens with the grammar above
chunk_tokens = nltk.word_tokenize(sentence)
tagged = nltk.pos_tag(chunk_tokens)
chunkParser = nltk.RegexpParser(rule)
chunked = chunkParser.parse(tagged)

# Print the tree, then the caption, each on its own line
print(chunked, 'This gives us phrase tree', sep='\n')
# chunked.draw() was not working in this environment
# (presumably it needs a GUI window - TODO confirm)

(S
  (Noun Phrase a/DT nice/JJ bird/NN)
  flew/VBD
  away/RB
  across/IN
  (Noun Phrase the/DT horizon/NN))
This gives us phrase tree


In [6]:
# Chunking with Spacy
doc = nlp('a nice bird flew away across the horizon')

# For every noun chunk print: the chunk text, its root token's text, and the
# root token's dependency label
for chunk in doc.noun_chunks:
    root_token = chunk.root
    print(chunk.text, root_token.text, root_token.dep_)

a nice bird bird nsubj
the horizon horizon pobj


In [7]:
# Chinking can be performed after chunking: instead of describing what to keep,
# a chink pattern describes what to remove from a chunk, which drops the noise.
# E.g. if you only want nouns, chunk everything and chink out the rest. See below:

# Grammar line 1 chunks every run of tokens; line 2 (inverted braces) removes
# verbs, conjunctions, adverbs, adjectives, prepositions, determiners and "to".
rule = r'''Chink:{<.*>+}
       }<VB.?|CC|RB|JJ|IN|DT|TO>+{'''
sentence = 'a nice bird flew away across the horizon'

# Tokenise, tag, then parse the tagged tokens with the grammar above
chink_tokens = nltk.word_tokenize(sentence)
tagged = nltk.pos_tag(chink_tokens)
chinkParser = nltk.RegexpParser(rule)
chinked = chinkParser.parse(tagged)

# What survives the chinking is still labelled with the rule name, "Chink"
print(chinked)

(S
  a/DT
  nice/JJ
  (Chink bird/NN)
  flew/VBD
  away/RB
  across/IN
  the/DT
  (Chink horizon/NN))


In [16]:
# Named Entity Recognition (NER): identify + extract named entities from corpora
# and assign each one to a category.
# Most approaches are supervised learning; the input is tokens with POS tags.
# Rule-Based: Same as Rule-Based POS
# Stochastic: Uses statistics to name/recognise entities (Max Entropy/Hidden Markov)

# NLTK example
# Fetch the resources this example downloads, in the same order as before
for resource in ('maxent_ne_chunker', 'words'):
    nltk.download(resource)

# NOTE: 'visitied' is misspelled in the sample text; it is left as-is so the
# recorded output below still matches.
sentence = 'Dave visitied London after taking an EasyJet flight from Paris'

# Tokenise and Tag
ner_tokens = nltk.word_tokenize(sentence)
tags = nltk.pos_tag(ner_tokens)

# NER using NLTK ne_chunk; with binary=True every entity in the output carries
# the single label NE
ner = nltk.ne_chunk(tags, binary=True)
print(ner)

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!
(S
  (NE Dave/NNP)
  visitied/VBD
  (NE London/NNP)
  after/IN
  taking/VBG
  an/DT
  (NE EasyJet/NNP)
  flight/NN
  from/IN
  (NE Paris/NNP))


In [27]:
# NER with Spacy
def print_entities(parsed):
    """Print the text and label of every entity found in the parsed doc, one per line."""
    for entity in parsed.ents:
        print(entity.text, entity.label_)


# NOTE: 'visitied' is misspelled in the sample text; it is left as-is so the
# recorded output below still matches.
doc = nlp(u'Shubjangi visitied London after taking an EasyJet flight from Paris')
print_entities(doc)
print('\n')

# Try adding surname for Shubjangi: in the recorded run the label changes from
# ORG to PERSON once the surname is present
doc = nlp(u'Shubjangi Hora visitied London after taking an EasyJet flight from Paris')
print_entities(doc)

print('\n')
# Last expression, so the description of the GPE label is shown as the cell output
spacy.explain('GPE')


Shubjangi ORG
London GPE
EasyJet ORG
Paris GPE


Shubjangi Hora PERSON
London GPE
EasyJet ORG
Paris GPE




'Countries, cities, states'