In [49]:
from pprint import pprint

## Tokenizing Text and Wordnet basics 

- Practice based on Jacob Perkins, *Python 3 Text Processing with NLTK*

In [6]:
import nltk.data
from nltk.tokenize import sent_tokenize

para = "Given an array A of distinct integers sorted in ascending order, return the smallest index i that satisfies A[i] == i.  Return -1 if no such i exists."

# Convenience route: one call splits the paragraph into sentences.
print(sent_tokenize(para))

# When many texts have to be split, it is more efficient to load the
# tokenizer object a single time and reuse its tokenize() method.
# NOTE(review): this pickle path depends on the installed NLTK data layout;
# confirm it still exists for the NLTK version in use.
tokenizer = nltk.data.load('tokenizers/punkt/PY3/english.pickle')
print(tokenizer.tokenize(para))

['Given an array A of distinct integers sorted in ascending order, return the smallest index i that satisfies A[i] == i.', 'Return -1 if no such i exists.']
['Given an array A of distinct integers sorted in ascending order, return the smallest index i that satisfies A[i] == i.', 'Return -1 if no such i exists.']


In [4]:
from nltk.tokenize import TreebankWordTokenizer, word_tokenize

sentence = 'Hello World.'

# Convenience function ...
print(word_tokenize(sentence))

# ... and the explicit tokenizer object; both yield the same tokens for this input.
tokenizer = TreebankWordTokenizer()
print(tokenizer.tokenize(sentence))

['Hello', 'World', '.']
['Hello', 'World', '.']


## WordPunctTokenizer

In [10]:
from nltk.tokenize import WordPunctTokenizer
# WordPunctTokenizer emits punctuation as separate tokens, so the apostrophe
# in "Can't" ends up as a token of its own.
tokenizer = WordPunctTokenizer()
tokenizer.tokenize("Can't is a contraction.")

['Can', "'", 't', 'is', 'a', 'contraction', '.']

In [11]:
# Same sentence through word_tokenize: the contraction is split into 'Ca' and "n't".
word_tokenize("Can't is a contraction.")

['Ca', "n't", 'is', 'a', 'contraction', '.']

## PunktWordTokenizer

There is no `PunktWordTokenizer`.
Punkt provides only a sentence-level tokenizer: `PunktSentenceTokenizer`.

In [15]:
from nltk.tokenize import PunktSentenceTokenizer
# Built without any training text; it works at sentence level, so the single
# sentence below comes back as one item rather than as words.
tokenizer = PunktSentenceTokenizer()
tokenizer.tokenize("Can't is a contraction.")

["Can't is a contraction."]

## Tokenizing sentences using regular expressions

In [17]:
from nltk.tokenize import RegexpTokenizer

# Match runs of word characters and apostrophes, keeping contractions whole.
# The pattern is a raw string: in a normal string literal "\w" is an invalid
# escape sequence, which newer Python versions warn about.
tokenizer = RegexpTokenizer(r"[\w']+")
tokenizer.tokenize("Can't is a contraction.")

["Can't", 'is', 'a', 'contraction']

### Simple whitespace tokenizer

In [19]:
# With gaps=True the pattern describes the separators (runs of whitespace)
# rather than the tokens themselves.
# Raw string: '\s' in a normal string literal is an invalid escape sequence.
tokenizer = RegexpTokenizer(r'\s+', gaps=True)
tokenizer.tokenize("Can't is a contraction.")

["Can't", 'is', 'a', 'contraction.']

## Training a sentence tokenizer

In [39]:
from nltk.corpus import webtext
from nltk.tokenize import PunktSentenceTokenizer

text = webtext.raw('overheard.txt')

# Tokenizer built from the corpus text itself ...
sent_tokenizer = PunktSentenceTokenizer(text)
sents1 = sent_tokenizer.tokenize(text)

# ... versus the stock sent_tokenize, for comparison.
sents2 = sent_tokenize(text)

# Print the first sentence and sentence 678 from each result.
for label, sents in (
    ("Trained sentence tokenizer", sents1),
    ("\nOriginal sentence tokenizer", sents2),
):
    print(label)
    print(sents[0])
    print(sents[678])

Trained sentence tokenizer
White guy: So, do you have any plans for this evening?
Girl: But you already have a Big Mac...

Original sentence tokenizer
White guy: So, do you have any plans for this evening?
Girl: But you already have a Big Mac...
Hobo: Oh, this is all theatrical.


## Filtering stopwords in a tokenized sentence

* The stopwords corpus is an instance of `nltk.corpus.reader.WordListCorpusReader`.

In [40]:
from nltk.corpus import stopwords

# A set gives fast membership checks for the filter below.
english_stops = set(stopwords.words('english'))

# `tokenizer` is not created in this cell: it is whichever tokenizer an
# earlier cell last bound to that name.
words = tokenizer.tokenize("Can't is a contraction.")

# Keep only the tokens that are not English stopwords (comparison is case-sensitive).
[token for token in words if token not in english_stops]

["Can't", 'contraction.']

In [41]:
# List the file ids (one per language) available in the stopwords corpus.
stopwords.fileids()

['arabic',
 'azerbaijani',
 'danish',
 'dutch',
 'english',
 'finnish',
 'french',
 'german',
 'greek',
 'hungarian',
 'indonesian',
 'italian',
 'kazakh',
 'nepali',
 'norwegian',
 'portuguese',
 'romanian',
 'russian',
 'spanish',
 'swedish',
 'turkish']

## Looking up Synsets for a word in WordNet

WordNet is a lexical database for the English language.

In [43]:
from nltk.corpus import wordnet

# Work with the first synset listed for 'cookbook'; later cells reuse `syn`.
syn = wordnet.synsets('cookbook')[0]

# Show its identifier and its gloss, one per line.
print(syn.name(), syn.definition(), sep='\n')

cookbook.n.01
a book of recipes and cooking directions


In [46]:
# Example sentences attached to the first synset of 'cooking'.
wordnet.synsets('cooking')[0].examples()

['cooking can be a great art',
 'people are needed who have experience in cookery',
 'he left the preparation of meals to his wife']

### Working with hypernyms

Synsets are organized in a structure similar to that of an inheritance tree.

In [47]:
# Hypernyms: the more general synset(s) directly above `syn` in the tree.
syn.hypernyms()

[Synset('reference_book.n.01')]

In [54]:
# Go up to the first hypernym, then list everything directly beneath it;
# that list includes `syn` itself.
parent = syn.hypernyms()[0]
pprint(parent.hyponyms())

[Synset('annual.n.02'),
 Synset('atlas.n.02'),
 Synset('cookbook.n.01'),
 Synset('directory.n.01'),
 Synset('encyclopedia.n.01'),
 Synset('handbook.n.01'),
 Synset('instruction_book.n.01'),
 Synset('source_book.n.01'),
 Synset('wordbook.n.01')]


In [55]:
# Topmost ancestor(s) reached by following hypernyms upward from `syn`.
syn.root_hypernyms()

[Synset('entity.n.01')]

In [57]:
# Every chain of synsets leading from a root down to `syn` (a list of lists).
syn.hypernym_paths()

[[Synset('entity.n.01'),
  Synset('physical_entity.n.01'),
  Synset('object.n.01'),
  Synset('whole.n.02'),
  Synset('artifact.n.01'),
  Synset('creation.n.02'),
  Synset('product.n.02'),
  Synset('work.n.02'),
  Synset('publication.n.01'),
  Synset('book.n.01'),
  Synset('reference_book.n.01'),
  Synset('cookbook.n.01')]]

### Part of speech (POS)

In [59]:
# Part-of-speech tag of the synset.
syn.pos()

'n'

In [60]:
# Number of synsets for 'great' when no part of speech is specified.
len(wordnet.synsets('great'))

7

In [67]:
# 'great' has 1 noun synset and 6 adjective synsets.
for pos_tag in ('n', 'a'):
    print(len(wordnet.synsets('great', pos=pos_tag)))

# The adjective synsets report their own tag as 's', not 'a'
# (presumably WordNet's "satellite adjective" tag; confirm in the NLTK docs).
[synset.pos() for synset in wordnet.synsets('great')]

1
6


['n', 's', 's', 's', 's', 's', 's']

In [70]:
# 'fun' has 4 noun synsets; query noun, adjective, adverb and verb in turn
# to see how many synsets each part of speech has.
for pos_tag in ('n', 'a', 'r', 'v'):
    print(len(wordnet.synsets('fun', pos=pos_tag)))

# Tag reported by each synset of 'fun'.
[synset.pos() for synset in wordnet.synsets('fun')]

4
0
0
0


['n', 'n', 'n', 'n']

## Looking up lemmas and synonyms in WordNet

- A lemma (in linguistics) is the canonical form or morphological form of a word.
- Some lemmas also have antonyms.

In [77]:
from nltk.corpus import wordnet

# Lemmas of the first 'cookbook' synset: print how many there are, then each name.
syn = wordnet.synsets('cookbook')[0]
lemmas = syn.lemmas()
print(len(lemmas))

for lemma in lemmas:
    print(lemma.name())

# Gather the lemma names of every synset of 'book' as candidate synonyms.
# The comprehension uses its own loop names, so the notebook-wide `syn`
# (the cookbook synset that other cells read) is not overwritten here.
synonyms = [
    book_lemma.name()
    for book_syn in wordnet.synsets('book')
    for book_lemma in book_syn.lemmas()
]

# Total number of names collected vs. number of distinct names.
print(len(synonyms), len(set(synonyms)))

2
cookbook
cookery_book
38 25


In [78]:
# Show the collected names in full, duplicates included.
synonyms

['book',
 'book',
 'volume',
 'record',
 'record_book',
 'book',
 'script',
 'book',
 'playscript',
 'ledger',
 'leger',
 'account_book',
 'book_of_account',
 'book',
 'book',
 'book',
 'rule_book',
 'Koran',
 'Quran',
 "al-Qur'an",
 'Book',
 'Bible',
 'Christian_Bible',
 'Book',
 'Good_Book',
 'Holy_Scripture',
 'Holy_Writ',
 'Scripture',
 'Word_of_God',
 'Word',
 'book',
 'book',
 'book',
 'reserve',
 'hold',
 'book',
 'book',
 'book']

In [83]:
# Noun sense of 'good': show its gloss, then look up an antonym.
gn2 = wordnet.synset('good.n.02')
print(gn2.definition())

# Antonyms hang off lemmas, not synsets, so go through the first lemma.
first_lemma = gn2.lemmas()[0]
evil = first_lemma.antonyms()[0]
print(evil.name(), evil.synset().definition(), sep='\n')

moral excellence or admirableness
evil
the quality of being morally wrong in principle or practice


In [84]:
# Adjective sense of 'good': show its gloss, then look up an antonym.
ga1 = wordnet.synset('good.a.01')
print(ga1.definition())

# Antonyms hang off lemmas, not synsets, so go through the first lemma.
first_lemma = ga1.lemmas()[0]
bad = first_lemma.antonyms()[0]
print(bad.name(), bad.synset().definition(), sep='\n')

having desirable or positive qualities especially those suitable for a thing specified
bad
having undesirable or negative qualities


## Calculating WordNet Synset similarity

- Using wup_similarity method to calculate the similarity for nouns
 - Wu-Palmer Similarity is a scoring method based on how similar the word senses are and where the Synsets occur relative to each other in the hypernym tree.
- Can also use wup_similarity method to calculate the similarity for verbs
- Two other similarity comparisons:
 - Path similarity
 - Leacock-Chodorow (LCH) similarity

In [86]:
from nltk.corpus import wordnet

# Two closely related noun synsets; later cells reuse `cb` and `ib`.
cb, ib = map(wordnet.synset, ('cookbook.n.01', 'instruction_book.n.01'))

# Wu-Palmer similarity between the two.
cb.wup_similarity(ib)

0.9166666666666666

In [93]:
# A noun from a different part of the tree, for contrast: its score against
# `cb` comes out much lower than cookbook vs. instruction_book.
dog = wordnet.synset('dog.n.01')
dog.wup_similarity(cb)

0.38095238095238093

In [94]:
# wup_similarity also works on verb synsets.
cook, bake = map(wordnet.synset, ('cook.v.01', 'bake.v.02'))
cook.wup_similarity(bake)

0.6666666666666666

In [97]:
# For each alternative measure (path, then LCH), score cookbook against
# instruction_book and then against dog.
for score in (cb.path_similarity, cb.lch_similarity):
    for other in (ib, dog):
        print(score(other))

0.3333333333333333
0.07142857142857142
2.538973871058276
0.9985288301111273


## Discovering word collocations

Collocations are two or more words that tend to appear together frequently, such as "United States".

- scoring functions
 - likelihood_ratio()
 - raw_freq() (check NgramAssocMeasures in the nltk.metrics)
- scoring ngrams
 - nbest(score_fn, number)
 - above_score(score_fn, min_score): return all ngrams with scores that are at least min_score
 - score_ngrams(score_fn): return a list with tuple pairs of (ngram, score)

### How it works

BigramCollocationFinder constructs two frequency distributions: one for each word, and another for bigrams. A frequency distribution, or FreqDist in NLTK, is basically an enhanced Python dictionary where the keys are what's being counted, and the values are the counts.

In [101]:
from nltk.collocations import BigramCollocationFinder
from nltk.corpus import stopwords, webtext
from nltk.metrics import BigramAssocMeasures

# Scoring function shared by both rankings below.
scorer = BigramAssocMeasures.likelihood_ratio

# Naive approach: rank bigrams over every lower-cased token of the script.
words = [token.lower() for token in webtext.words('grail.txt')]
bcf = BigramCollocationFinder.from_words(words)
print(bcf.nbest(scorer, 4))

# Refinement: add a word filter to remove punctuation and stopwords.
stopset = set(stopwords.words('english'))


def filter_stops(w):
    """Return True for a word to drop: under 3 characters long or an English stopword."""
    return len(w) < 3 or w in stopset


bcf.apply_word_filter(filter_stops)
print(bcf.nbest(scorer, 10))

[("'", 's'), ('arthur', ':'), ('#', '1'), ("'", 't')]
[('black', 'knight'), ('clop', 'clop'), ('head', 'knight'), ('mumble', 'mumble'), ('squeak', 'squeak'), ('saw', 'saw'), ('holy', 'grail'), ('run', 'away'), ('french', 'guard'), ('cartoon', 'character')]
