In [2]:
import nltk

In [2]:
nltk.corpus.gutenberg.fileids()

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [3]:
emma = nltk.corpus.gutenberg.words('austen-emma.txt')

In [5]:
len(emma)

192427

In [6]:
emma = nltk.Text(nltk.corpus.gutenberg.words('austen-emma.txt'))

In [8]:
emma.concordance('girl')

Displaying 25 of 47 matches:
t : she is a civil , pretty - spoken girl ; I have a great opinion of her . Wh
lcome request : for Miss Smith was a girl of seventeen , whom Emma knew very w
ere with her . She was a very pretty girl , and her beauty happened to be of a
 very unfit to be the intimates of a girl who wanted only a little more knowle
e ; but the humble , grateful little girl went off with highly gratified feeli
 that he thought Harriet a beautiful girl , which she trusted , with such freq
 could not dispense with :-- but the girl who could be gratified by a Robert M
r Emma , that there should be such a girl in Highbury for her to associate wit
h of mind , or tend at all to make a girl adapt herself rationally to the vari
. You have cured her of her school - girl ' s giggle ; she really does you cre
r believed her . What is the foolish girl about ?" " Oh ! to be sure ," cried 
. Emma , your infatuation about that girl blinds you . What are Harriet Smith 
ommon school . She is n

In [9]:
emma.similar('girl')

man woman manner little friend situation day time visit look question
word person blush moment crowd home world house family


In [10]:
emma.similar('boy')

friend so man morning change visit distance while doubt sort letter
farther blush john emma jane i and home had


In [11]:
emma.collocations()

Frank Churchill; Miss Woodhouse; Miss Bates; Jane Fairfax; Miss
Fairfax; every thing; young man; every body; great deal; dare say;
John Knightley; Maple Grove; Miss Smith; Miss Taylor; Robert Martin;
Colonel Campbell; Box Hill; said Emma; Harriet Smith; William Larkins


In [15]:
from nltk.corpus import gutenberg

In [16]:
emma = gutenberg.words('austen-emma.txt')

In [17]:
raw = gutenberg.raw("burgess-busterbrown.txt")

In [19]:
raw[:20]

'[The Adventures of B'

In [20]:
word = gutenberg.words("burgess-busterbrown.txt")

In [21]:
word

['[', 'The', 'Adventures', 'of', 'Buster', 'Bear', ...]

In [22]:
sents = gutenberg.sents("burgess-busterbrown.txt")
sents

[['[', 'The', 'Adventures', 'of', 'Buster', 'Bear', 'by', 'Thornton', 'W', '.', 'Burgess', '1920', ']'], ['I'], ...]

# 2   Conditional Frequency Distributions

In [23]:
from nltk.corpus import brown

In [24]:
cfd = nltk.ConditionalFreqDist(
    (genre, word)
    for genre in brown.categories()
    for word in brown.words(categories=genre)
)

In [25]:
genre_word = [
    (genre, word)
    for genre in ['news', 'romance']
    for word in brown.words(categories=genre)
]

In [31]:
brown.categories()

['adventure',
 'belles_lettres',
 'editorial',
 'fiction',
 'government',
 'hobbies',
 'humor',
 'learned',
 'lore',
 'mystery',
 'news',
 'religion',
 'reviews',
 'romance',
 'science_fiction']

In [32]:
len(brown.words())

1161192

In [35]:
len(brown.words(categories='romance'))

70022

In [36]:
genre_word[:10]

[('news', 'The'),
 ('news', 'Fulton'),
 ('news', 'County'),
 ('news', 'Grand'),
 ('news', 'Jury'),
 ('news', 'said'),
 ('news', 'Friday'),
 ('news', 'an'),
 ('news', 'investigation'),
 ('news', 'of')]

In [37]:
cond_dict = nltk.ConditionalFreqDist(genre_word)

In [40]:
cond_dict['news']

FreqDist({'the': 5580, ',': 5188, '.': 4030, 'of': 2849, 'and': 2146, 'to': 2116, 'a': 1993, 'in': 1893, 'for': 943, 'The': 806, ...})

## 2.3   Plotting and Tabulating Distributions

In [41]:
from nltk.corpus import inaugural

In [42]:
cfd = nltk.ConditionalFreqDist(
    (target, fileid[:4])
    for fileid in inaugural.fileids()
    for w in inaugural.words(fileid)
    for target in ['america', 'citizen']
    if w.lower().startswith(target)
)

In [44]:
cfd['america']

FreqDist({'2017': 35, '1993': 33, '1997': 31, '2005': 30, '1921': 24, '1973': 23, '1985': 21, '2001': 20, '2013': 19, '1981': 16, ...})

In [48]:
cfd.tabulate(conditions=['america', 'citizen'], samples=range(10), cumulative=True)

        0 1 2 3 4 5 6 7 8 9 
america 0 0 0 0 0 0 0 0 0 0 
citizen 0 0 0 0 0 0 0 0 0 0 


You may have noticed that the multi-line expressions we have been using with conditional frequency distributions look like list comprehensions, but without the brackets. In general, when we use a list comprehension as a parameter to a function, like set([w.lower() for w in t]), we are permitted to omit the square brackets and just write: set(w.lower() for w in t).

In [51]:
def plural(word):
    if word.endswith('y'):
        return word[:-1] + 'ies'
    elif word[-1] in 'xs' or word[-2:] in ['ch', 'sh']:
        return word + 'es'
    elif word.endswith('an'):
        return word[:-2] + 'en'
    else:
        return word + 's'

In [52]:
plural('man')

'men'

In [54]:
plural('match')

'matches'

In [55]:
plural('fairy')

'fairies'

# 4   Lexical Resources

## 4.1   Wordlist Corpora

In [58]:
def unusual_words(text):
    text_vocab = set(w.lower() for w in text if w.isalpha())
    eng_vocab = set(w.lower() for w in nltk.corpus.words.words())
    unusual = text_vocab - eng_vocab
    return sorted(unusual)

In [59]:
unusual_words(nltk.corpus.gutenberg.words('austen-sense.txt'))

['abbeyland',
 'abhorred',
 'abilities',
 'abounded',
 'abridgement',
 'abused',
 'abuses',
 'accents',
 'accepting',
 'accommodations',
 'accompanied',
 'accounted',
 'accounts',
 'accustomary',
 'aches',
 'acknowledging',
 'acknowledgment',
 'acknowledgments',
 'acquaintances',
 'acquiesced',
 'acquitted',
 'acquitting',
 'acted',
 'actions',
 'adapted',
 'adding',
 'additions',
 'addressed',
 'addresses',
 'addressing',
 'adhering',
 'adieus',
 'adjusting',
 'administering',
 'admirers',
 'admires',
 'admitting',
 'adorned',
 'advances',
 'advantages',
 'affairs',
 'affections',
 'affects',
 'affixed',
 'afflictions',
 'afforded',
 'affording',
 'ages',
 'agitated',
 'agonies',
 'ailments',
 'aimed',
 'alarms',
 'alienated',
 'alighted',
 'alleged',
 'allenham',
 'allowances',
 'allowed',
 'allowing',
 'alluded',
 'alterations',
 'altered',
 'altering',
 'amended',
 'amounted',
 'amusements',
 'ankles',
 'annamaria',
 'annexed',
 'announced',
 'announcing',
 'annuities',
 'annum',
 

In [60]:
from nltk.corpus import stopwords

In [62]:
STOPWORDS = stopwords.words('english')

In [65]:
def content_fraction(text):
    content = [w.lower() for w in text if w not in STOPWORDS]
    return len(content) / len(text)

In [66]:
content_fraction(nltk.corpus.reuters.words())

0.7605545002298215

In [67]:
names = nltk.corpus.names

In [68]:
names.fileids()

['female.txt', 'male.txt']

In [69]:
male_names = names.words('male.txt')

In [70]:
female_names = names.words('female.txt')

In [71]:
[w for w in male_names if w in female_names]

['Abbey',
 'Abbie',
 'Abby',
 'Addie',
 'Adrian',
 'Adrien',
 'Ajay',
 'Alex',
 'Alexis',
 'Alfie',
 'Ali',
 'Alix',
 'Allie',
 'Allyn',
 'Andie',
 'Andrea',
 'Andy',
 'Angel',
 'Angie',
 'Ariel',
 'Ashley',
 'Aubrey',
 'Augustine',
 'Austin',
 'Averil',
 'Barrie',
 'Barry',
 'Beau',
 'Bennie',
 'Benny',
 'Bernie',
 'Bert',
 'Bertie',
 'Bill',
 'Billie',
 'Billy',
 'Blair',
 'Blake',
 'Bo',
 'Bobbie',
 'Bobby',
 'Brandy',
 'Brett',
 'Britt',
 'Brook',
 'Brooke',
 'Brooks',
 'Bryn',
 'Cal',
 'Cam',
 'Cammy',
 'Carey',
 'Carlie',
 'Carlin',
 'Carmine',
 'Carroll',
 'Cary',
 'Caryl',
 'Casey',
 'Cass',
 'Cat',
 'Cecil',
 'Chad',
 'Chris',
 'Chrissy',
 'Christian',
 'Christie',
 'Christy',
 'Clair',
 'Claire',
 'Clare',
 'Claude',
 'Clem',
 'Clemmie',
 'Cody',
 'Connie',
 'Constantine',
 'Corey',
 'Corrie',
 'Cory',
 'Courtney',
 'Cris',
 'Daffy',
 'Dale',
 'Dallas',
 'Dana',
 'Dani',
 'Daniel',
 'Dannie',
 'Danny',
 'Darby',
 'Darcy',
 'Darryl',
 'Daryl',
 'Deane',
 'Del',
 'Dell',
 'Deme

In [None]:
cfd = nltk.ConditionalFreqDist(
    (fileid, name[-1])
    
)

## 4.2   A Pronouncing Dictionary

In [3]:
entries = nltk.corpus.cmudict.entries()

In [4]:
for entry in entries[42371:42379]:
    print(entry)

('fir', ['F', 'ER1'])
('fire', ['F', 'AY1', 'ER0'])
('fire', ['F', 'AY1', 'R'])
('firearm', ['F', 'AY1', 'ER0', 'AA2', 'R', 'M'])
('firearm', ['F', 'AY1', 'R', 'AA2', 'R', 'M'])
('firearms', ['F', 'AY1', 'ER0', 'AA2', 'R', 'M', 'Z'])
('firearms', ['F', 'AY1', 'R', 'AA2', 'R', 'M', 'Z'])
('fireball', ['F', 'AY1', 'ER0', 'B', 'AO2', 'L'])


In [10]:
syllable = ['N', 'IH0', 'K', 'S']
[(word, pron) for word, pron in entries if pron[-4:] == syllable]

[("atlantic's", ['AH0', 'T', 'L', 'AE1', 'N', 'IH0', 'K', 'S']),
 ('audiotronics',
  ['AO2', 'D', 'IY0', 'OW0', 'T', 'R', 'AA1', 'N', 'IH0', 'K', 'S']),
 ('avionics', ['EY2', 'V', 'IY0', 'AA1', 'N', 'IH0', 'K', 'S']),
 ('beatniks', ['B', 'IY1', 'T', 'N', 'IH0', 'K', 'S']),
 ('calisthenics',
  ['K', 'AE2', 'L', 'AH0', 'S', 'TH', 'EH1', 'N', 'IH0', 'K', 'S']),
 ('centronics', ['S', 'EH2', 'N', 'T', 'R', 'AA1', 'N', 'IH0', 'K', 'S']),
 ('chamonix', ['CH', 'AE1', 'M', 'AH0', 'N', 'IH0', 'K', 'S']),
 ('chetniks', ['CH', 'EH1', 'T', 'N', 'IH0', 'K', 'S']),
 ("clinic's", ['K', 'L', 'IH1', 'N', 'IH0', 'K', 'S']),
 ('clinics', ['K', 'L', 'IH1', 'N', 'IH0', 'K', 'S']),
 ('conics', ['K', 'AA1', 'N', 'IH0', 'K', 'S']),
 ('conics', ['K', 'OW1', 'N', 'IH0', 'K', 'S']),
 ('cryogenics', ['K', 'R', 'AY1', 'AH0', 'JH', 'EH2', 'N', 'IH0', 'K', 'S']),
 ('cynics', ['S', 'IH1', 'N', 'IH0', 'K', 'S']),
 ('diasonics', ['D', 'AY2', 'AH0', 'S', 'AA1', 'N', 'IH0', 'K', 'S']),
 ("dominic's", ['D', 'AA1', 'M', 'AH

In [11]:
def stress(pron):
    return [char for phone in pron for char in phone if char.isdigit()]

In [12]:
[w for w, pron in entries if stress(pron) == ['0', '1', '0', '2', '0']]

['abbreviated',
 'abbreviated',
 'abbreviating',
 'accelerated',
 'accelerating',
 'accelerator',
 'accelerators',
 'accentuated',
 'accentuating',
 'accommodated',
 'accommodating',
 'accommodative',
 'accumulated',
 'accumulating',
 'accumulative',
 'accumulator',
 'accumulators',
 'accusatory',
 'adenovirus',
 'adjudicated',
 'adjudicating',
 'administrating',
 'administrative',
 'administrator',
 "administrators'",
 "administrator's",
 'administrators',
 'adulterated',
 'adventurism',
 'adventurism',
 'affiliated',
 'affiliated',
 "affiliated's",
 'affiliating',
 'alleviated',
 'alleviated',
 'alleviating',
 'alliteration',
 'alliterative',
 'amalgamated',
 "amalgamated's",
 'amalgamating',
 'ameliorated',
 'ameridata',
 'amoxicillin',
 'anachronism',
 'anachronisms',
 'annihilated',
 'annihilating',
 'antagonism',
 'antagonisms',
 'antagonizing',
 'anticipated',
 'anticipated',
 'anticipating',
 'apologizes',
 'apologizing',
 'apothecary',
 'appreciated',
 'appreciating',
 'apprec

## 4.3   Comparative Wordlists

In [13]:
from nltk.corpus import swadesh

In [14]:
en2fr = swadesh.entries(['en', 'fr'])

In [16]:
type(en2fr)

list

In [20]:
en2fr

[('I', 'je'),
 ('you (singular), thou', 'tu, vous'),
 ('he', 'il'),
 ('we', 'nous'),
 ('you (plural)', 'vous'),
 ('they', 'ils, elles'),
 ('this', 'ceci'),
 ('that', 'cela'),
 ('here', 'ici'),
 ('there', 'là'),
 ('who', 'qui'),
 ('what', 'quoi'),
 ('where', 'où'),
 ('when', 'quand'),
 ('how', 'comment'),
 ('not', 'ne...pas'),
 ('all', 'tout'),
 ('many', 'plusieurs'),
 ('some', 'quelques'),
 ('few', 'peu'),
 ('other', 'autre'),
 ('one', 'un'),
 ('two', 'deux'),
 ('three', 'trois'),
 ('four', 'quatre'),
 ('five', 'cinq'),
 ('big', 'grand'),
 ('long', 'long'),
 ('wide', 'large'),
 ('thick', 'épais'),
 ('heavy', 'lourd'),
 ('small', 'petit'),
 ('short', 'court'),
 ('narrow', 'étroit'),
 ('thin', 'mince'),
 ('woman', 'femme'),
 ('man (adult male)', 'homme'),
 ('man (human being)', 'homme'),
 ('child', 'enfant'),
 ('wife', 'femme, épouse'),
 ('husband', 'mari, époux'),
 ('mother', 'mère'),
 ('father', 'père'),
 ('animal', 'animal'),
 ('fish', 'poisson'),
 ('bird', 'oiseau'),
 ('dog', 'chien'

In [17]:
en2fr_dict = dict(en2fr)

In [21]:
en2fr_dict['what']

'quoi'

In [22]:
languages = ['en', 'de', 'nl', 'es', 'fr', 'pt', 'la']

In [30]:
[swadesh.entries([i])[0] for i in languages]

[('I',), ('ich',), ('ik',), ('yo',), ('je',), ('eu',), ('ego',)]

# 5   WordNet

WordNet is a semantically-oriented dictionary of English, similar to a traditional thesaurus but with a richer structure. NLTK includes the English WordNet, with 155,287 words and 117,659 synonym sets. We'll begin by looking at synonyms and how they are accessed in WordNet.

## 5.1   Senses and Synonyms

In [42]:
from nltk.corpus import wordnet as wn 

In [43]:
wn.synsets('motorcar')

[Synset('car.n.01')]

In [52]:
wn.synsets('mobile')

[Synset('mobile.n.01'),
 Synset('mobile.n.02'),
 Synset('mobile.n.03'),
 Synset('mobile.s.01'),
 Synset('mobile.a.02'),
 Synset('mobile.s.03'),
 Synset('mobile.s.04'),
 Synset('fluid.s.05')]

In [61]:
wn.synset('mobile.n.02').lemmas()

[Lemma('mobile.n.02.Mobile')]

In [44]:
wn.synset('car.n.01').lemma_names()

['car', 'auto', 'automobile', 'machine', 'motorcar']

In [45]:
wn.synset('car.n.01').definition()

'a motor vehicle with four wheels; usually propelled by an internal combustion engine'

In [48]:
wn.synset('car.n.01').examples()

['he needs a car to get to work']

In [50]:
wn.synset('car.n.01').lemmas()

[Lemma('car.n.01.car'),
 Lemma('car.n.01.auto'),
 Lemma('car.n.01.automobile'),
 Lemma('car.n.01.machine'),
 Lemma('car.n.01.motorcar')]

In [63]:
wn.lemma('car.n.01.car').synset()

Synset('car.n.01')

In [64]:
wn.lemma('car.n.01.car').name()

'car'

In [68]:
wn.synsets('mobile')

[Synset('mobile.n.01'),
 Synset('mobile.n.02'),
 Synset('mobile.n.03'),
 Synset('mobile.s.01'),
 Synset('mobile.a.02'),
 Synset('mobile.s.03'),
 Synset('mobile.s.04'),
 Synset('fluid.s.05')]

In [69]:
[s.lemma_names() for s in wn.synsets('mobile')]

[['Mobile', 'Mobile_River'],
 ['Mobile'],
 ['mobile'],
 ['mobile', 'nomadic', 'peregrine', 'roving', 'wandering'],
 ['mobile'],
 ['mobile'],
 ['mobile'],
 ['fluid', 'mobile']]