# INIT

In [30]:
%matplotlib inline
import nltk
import re

In [32]:
# Download only the NLTK data this notebook relies on. Calling nltk.download()
# with no arguments launches the interactive downloader, which stalls an
# unattended "Restart & Run All".
# NOTE(review): these are the resource ids of the classic NLTK data index;
# recent NLTK releases may expect differently named variants (e.g. "punkt_tab")
# -- confirm against the installed version.
requiredNltkResources = [
    "punkt",                        # models for sent_tokenize / word_tokenize
    "averaged_perceptron_tagger",   # model behind nltk.pos_tag
    "maxent_ne_chunker",            # model behind nltk.ne_chunk
    "words",                        # word list required by the NE chunker
    "ieer",                         # corpus used in the relation-extraction cell
]
# Build the full list first so every resource is attempted even if one fails;
# the cell then displays True only when all downloads succeeded.
all([nltk.download(resource, quiet=True) for resource in requiredNltkResources])

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [35]:
# Sample input for the tokenization cells: three short sentences, written one
# per line and joined by implicit string-literal concatenation.
text = (
    "This is for display. "
    "It is a demo. "
    "A demo of NLTK."
)

# Tokenization

In [36]:
# Split the sample text into a list of sentence strings.
sentences = nltk.sent_tokenize(text)
# Bare last expression: the notebook displays the resulting list.
sentences

['This is for display.', 'It is a demo.', 'A demo of NLTK.']

In [37]:
# Tokenize the whole text at once into a flat list of tokens; as the displayed
# result shows, the sentence-final periods come out as separate tokens.
words = nltk.word_tokenize(text)
words

['This',
 'is',
 'for',
 'display',
 '.',
 'It',
 'is',
 'a',
 'demo',
 '.',
 'A',
 'demo',
 'of',
 'NLTK',
 '.']

In [38]:
# Tokenize sentence by sentence so the tokens stay grouped: one inner list of
# tokens for each entry of `sentences`.
wordsInSentences = list(map(nltk.word_tokenize, sentences))
wordsInSentences

[['This', 'is', 'for', 'display', '.'],
 ['It', 'is', 'a', 'demo', '.'],
 ['A', 'demo', 'of', 'NLTK', '.']]

# Stemming

In [39]:
# Run three inflected forms of the same word through the Porter stemmer to
# show that they collapse to one stem.
demoWords = ["displayed", "display", "displays"]
pst = nltk.stem.PorterStemmer()
stemmedWords = list(map(pst.stem, demoWords))
stemmedWords

['display', 'display', 'display']

# Tagging

In [40]:
# Part-of-speech tag the flat token list; the displayed result is a list of
# (token, tag) pairs.
taggedWords = nltk.pos_tag(words)
taggedWords

[('This', 'DT'),
 ('is', 'VBZ'),
 ('for', 'IN'),
 ('display', 'NN'),
 ('.', '.'),
 ('It', 'PRP'),
 ('is', 'VBZ'),
 ('a', 'DT'),
 ('demo', 'NN'),
 ('.', '.'),
 ('A', 'DT'),
 ('demo', 'NN'),
 ('of', 'IN'),
 ('NLTK', 'NNP'),
 ('.', '.')]

In [41]:
# Legend for the part-of-speech tags that appear in the tagging output.
# Stored as a dict keyed by tag: a tag such as "." is not a valid Python
# identifier, so the legend cannot be expressed as one variable per tag.
tagDescriptions = {
    "DT": "Determiner",
    "VBZ": "verb, 3rd person sing. present takes",
    "IN": "preposition/subordinating conjunction",
    "NN": "noun, singular",
    ".": "Sentence-final punctuation",
    "PRP": "personal pronoun:  I, he, she",
    "NNP": "proper noun, singular",
    "JJ": "adjective",
}
# Display the legend so it can be read alongside the tagged output.
tagDescriptions

In [42]:
# Tag every tokenized sentence in a single call. pos_tag_sents is NLTK's batch
# counterpart of pos_tag; using it also avoids borrowing the name `words`
# (the flat token list built earlier) as a loop variable.
# Result: one list of (token, tag) pairs per sentence.
taggedSentences = nltk.pos_tag_sents(wordsInSentences)
taggedSentences

[[('This', 'DT'), ('is', 'VBZ'), ('for', 'IN'), ('display', 'NN'), ('.', '.')],
 [('It', 'PRP'), ('is', 'VBZ'), ('a', 'DT'), ('demo', 'NN'), ('.', '.')],
 [('A', 'DT'), ('demo', 'NN'), ('of', 'IN'), ('NLTK', 'NNP'), ('.', '.')]]

# Chunking

In [44]:
# Chunk grammar: a noun phrase (NP) is an optional determiner (DT), followed by
# any number of adjectives (JJ), followed by a singular noun (NN).
regex = "NP: { <DT >? <JJ >* <NN >}"
parser = nltk.RegexpParser(regex)
# Parse each POS-tagged sentence into a tree with the NP chunks grouped.
chunkedSentences = list(map(parser.parse, taggedSentences))
chunkedSentences

[Tree('S', [('This', 'DT'), ('is', 'VBZ'), ('for', 'IN'), Tree('NP', [('display', 'NN')]), ('.', '.')]),
 Tree('S', [('It', 'PRP'), ('is', 'VBZ'), Tree('NP', [('a', 'DT'), ('demo', 'NN')]), ('.', '.')]),
 Tree('S', [Tree('NP', [('A', 'DT'), ('demo', 'NN')]), ('of', 'IN'), ('NLTK', 'NNP'), ('.', '.')])]

In [45]:
# Draw the chunk tree of the second sentence ("It is a demo.").
# NOTE(review): Tree.draw() presumably opens a separate GUI window rather than
# rendering inline -- confirm it works under "Restart & Run All" / on a
# headless kernel.
chunkedSentences[1].draw()

# Named Entity Recognition

In [22]:
# Run NLTK's named-entity chunker over each POS-tagged sentence. With
# binary=True the entity found in the displayed trees is labelled simply 'NE'.
namedEntities = [
    nltk.ne_chunk(taggedSentence, binary=True)
    for taggedSentence in taggedSentences
]
namedEntities

[Tree('S', [('This', 'DT'), ('is', 'VBZ'), ('for', 'IN'), ('display', 'NN'), ('.', '.')]),
 Tree('S', [('It', 'PRP'), ('is', 'VBZ'), ('a', 'DT'), ('demo', 'NN'), ('.', '.')]),
 Tree('S', [('A', 'DT'), ('demo', 'NN'), ('of', 'IN'), Tree('NE', [('NLTK', 'NNP')])])]

In [29]:
# Draw the named-entity tree of the third sentence ("A demo of NLTK.").
# NOTE(review): Tree.draw() presumably opens a separate GUI window rather than
# rendering inline -- confirm it works under "Restart & Run All" / on a
# headless kernel.
namedEntities[2].draw()

#  Relation Extraction

In [28]:
# Pattern handed to extract_rels below: any text ending in the whole word
# "in", unless that "in" is followed by further text containing "ing"
# (the negative lookahead).
IN = re.compile(r'.*\bin\b(?!\b.+ing)')
# Walk the parsed documents of the IEER corpus file 'NYT_19980315' and print
# each ORG -> LOC relation that extract_rels yields for this pattern.
for doc in nltk.corpus.ieer.parsed_docs('NYT_19980315'):
    for rel in nltk.sem.extract_rels('ORG', 'LOC', doc,
                                      corpus='ieer', pattern = IN):
        # rtuple formats the relation as the one-line text shown in the output.
        print(nltk.sem.rtuple(rel))

[ORG: 'WHYY'] 'in' [LOC: 'Philadelphia']
[ORG: 'McGlashan &AMP; Sarrail'] 'firm in' [LOC: 'San Mateo']
[ORG: 'Freedom Forum'] 'in' [LOC: 'Arlington']
[ORG: 'Brookings Institution'] ', the research group in' [LOC: 'Washington']
[ORG: 'Idealab'] ', a self-described business incubator based in' [LOC: 'Los Angeles']
[ORG: 'Open Text'] ', based in' [LOC: 'Waterloo']
[ORG: 'WGBH'] 'in' [LOC: 'Boston']
[ORG: 'Bastille Opera'] 'in' [LOC: 'Paris']
[ORG: 'Omnicom'] 'in' [LOC: 'New York']
[ORG: 'DDB Needham'] 'in' [LOC: 'New York']
[ORG: 'Kaplan Thaler Group'] 'in' [LOC: 'New York']
[ORG: 'BBDO South'] 'in' [LOC: 'Atlanta']
[ORG: 'Georgia-Pacific'] 'in' [LOC: 'Atlanta']
