# Chapter 3 Processing Raw Text

In [1]:
import nltk, re, pprint

## 3.1 Accessing Text from the Web and from Disk

### Electronic Books

In [60]:
from urllib import request
url = "http://www.gutenberg.org/files/2554/2554-0.txt"
raw = str(request.urlopen(url).read())
raw[:75]

"b'\\xef\\xbb\\xbfThe Project Gutenberg EBook of Crime and Punishment, by Fyodo"

In [48]:
type(raw)

str

In [49]:
len(raw)

1358498

In [50]:
tokens = nltk.word_tokenize(raw)

In [51]:
type(tokens)

list

In [52]:
len(tokens)

225597

In [53]:
tokens[:10]

["b'\\xef\\xbb\\xbfThe",
 'Project',
 'Gutenberg',
 'EBook',
 'of',
 'Crime',
 'and',
 'Punishment',
 ',',
 'by']

In [33]:
text = nltk.Text(tokens)

In [54]:
type(text)

nltk.text.Text

In [55]:
print(text[1020:1060])

['I', 'CHAPTER', 'I', 'On', 'an', 'exceptionally', 'hot', 'evening', 'early', 'in', 'July', 'a', 'young', 'man', 'came', 'out', 'of', 'the', 'garret', 'in', 'which', 'he', 'lodged', 'in', 'S.', 'Place', 'and', 'walked', 'slowly', ',', 'as', 'though', 'in', 'hesitation', ',', 'towards', 'K.', 'bridge', '.', 'He']


In [61]:
raw.find("PART I")

5866

In [62]:
raw.rfind("End of Project Gutenberg's Crime")

-1

In [63]:
raw = raw[5866:]

In [64]:
raw.find("PART I")

0

### Dealing with HTML

In [65]:
url = "http://news.bbc.co.uk/2/hi/health/2284783.stm"

In [68]:
html = str(request.urlopen(url).read())

In [69]:
html[:60]

'b\'<!doctype html public "-//W3C//DTD HTML 4.0 Transitional//'

In [72]:
from bs4 import BeautifulSoup

In [73]:
raw = BeautifulSoup(html).get_text() # not implemented

In [77]:
tokens = nltk.word_tokenize(raw)

In [78]:
print(tokens[:10])

["b'\\r\\n\\r\\n\\r\\nBBC", 'NEWS', '|', 'Health', '|', 'Blondes', "\\'to", 'die', 'out', 'in']


In [79]:
tokens = tokens[96:399]

In [80]:
text = nltk.Text(tokens)

In [81]:
text.concordance('gene')

Displaying 5 of 5 matches:
hey say too few people now carry the gene for blondes to last beyond the next 
blonde hair is caused by a recessive gene . \r\n\r\nIn order for a child to ha
 have blonde hair , it must have the gene on both sides of the family in the g
ere is a disadvantage of having that gene or by chance . They don\'t disappear
des would disappear is if having the gene was a disadvantage and I do not thin


### Processing RSS Feeds

In [84]:
import feedparser

In [85]:
llog = feedparser.parse("http://languagelog.ldc.upenn.edu/nll/?feed=atom")

In [86]:
llog['feed']['title']

'Language Log'

In [87]:
len(llog.entries)

13

In [88]:
post = llog.entries[2]

In [89]:
post.title

'Super Bowl Dialectology'

In [90]:
content = post.content[0].value

In [91]:
content[:70]

"<p>One of today's Super Bowl commercials features Boston r-lessness:</"

In [96]:
print(nltk.word_tokenize(BeautifulSoup(content).get_text())[:10])

['One', 'of', 'today', "'s", 'Super', 'Bowl', 'commercials', 'features', 'Boston', 'r-lessness']


In [97]:
print(nltk.word_tokenize(BeautifulSoup(llog.entries[2].content[0].value).get_text())[:10])

['One', 'of', 'today', "'s", 'Super', 'Bowl', 'commercials', 'features', 'Boston', 'r-lessness']


### Capturing User Input

In [99]:
s = input("Enter some text: ")

Enter some text: On an exceptionally hot evening early in July


In [100]:
print("You typed", len(nltk.word_tokenize(s)), "words.")

You typed 8 words.


## 3.3 Text Processing with Unicode

### Extracting Encoded Text from Files

In [101]:
path = nltk.data.find('corpora/unicode_samples/polish-lat2.txt')

In [102]:
import codecs

In [103]:
f = codecs.open(path, encoding='latin2')

In [104]:
for line in f:
    line = line.strip()
    print(line.encode('unicode_escape'))

b'Pruska Biblioteka Pa\\u0144stwowa. Jej dawne zbiory znane pod nazw\\u0105'
b'"Berlinka" to skarb kultury i sztuki niemieckiej. Przewiezione przez'
b'Niemc\\xf3w pod koniec II wojny \\u015bwiatowej na Dolny \\u015al\\u0105sk, zosta\\u0142y'
b'odnalezione po 1945 r. na terytorium Polski. Trafi\\u0142y do Biblioteki'
b'Jagiello\\u0144skiej w Krakowie, obejmuj\\u0105 ponad 500 tys. zabytkowych'
b'archiwali\\xf3w, m.in. manuskrypty Goethego, Mozarta, Beethovena, Bacha.'


In [105]:
ord('a')

97

In [106]:
a = u'\u0061'

In [107]:
a

'a'

In [108]:
print(a)

a


In [109]:
nacute = u'\u0144'

In [110]:
nacute

'ń'

In [111]:
nacute_utf = nacute.encode('utf8')

In [112]:
print(repr(nacute_utf))

b'\xc5\x84'


In [113]:
import unicodedata

In [114]:
lines = codecs.open(path, encoding='latin2').readlines()

In [115]:
line = lines[2]

In [116]:
print(line.encode('unicode_escape'))

b'Niemc\\xf3w pod koniec II wojny \\u015bwiatowej na Dolny \\u015al\\u0105sk, zosta\\u0142y\\n'


In [117]:
for c in line:
    if ord(c) > 127:
        print('%r U+%04x %s' % (c.encode('utf8'), ord(c), unicodedata.name(c)))

b'\xc3\xb3' U+00f3 LATIN SMALL LETTER O WITH ACUTE
b'\xc5\x9b' U+015b LATIN SMALL LETTER S WITH ACUTE
b'\xc5\x9a' U+015a LATIN CAPITAL LETTER S WITH ACUTE
b'\xc4\x85' U+0105 LATIN SMALL LETTER A WITH OGONEK
b'\xc5\x82' U+0142 LATIN SMALL LETTER L WITH STROKE


In [119]:
line.find(u'zosta\u0142y')

54

In [120]:
line = line.lower()

In [121]:
print(line.encode('unicode_escape'))

b'niemc\\xf3w pod koniec ii wojny \\u015bwiatowej na dolny \\u015bl\\u0105sk, zosta\\u0142y\\n'


In [122]:
import re

In [123]:
m = re.search(u'\u015b\w*', line)

In [124]:
m.group()

'światowej'

In [126]:
print(nltk.word_tokenize(line))

['niemców', 'pod', 'koniec', 'ii', 'wojny', 'światowej', 'na', 'dolny', 'śląsk', ',', 'zostały']


## 3.5 Useful Applications of Regular Expressions

### Extracting Word Pieces

In [127]:
word = 'supercalifragilisticexpialidocious'

In [128]:
re.findall(r'[aeiou]', word)

['u',
 'e',
 'a',
 'i',
 'a',
 'i',
 'i',
 'i',
 'e',
 'i',
 'a',
 'i',
 'o',
 'i',
 'o',
 'u']

In [129]:
len(re.findall(r'[aeiou]', word))

16

In [130]:
wsj = sorted(set(nltk.corpus.treebank.words()))

In [131]:
fd = nltk.FreqDist(vs for word in wsj
                   for vs in re.findall(r'[aeiou]{2,}', word))

In [132]:
fd.items()

dict_items([('ea', 476), ('oi', 65), ('ou', 329), ('io', 549), ('ee', 217), ('ie', 331), ('ui', 95), ('ua', 109), ('ai', 261), ('ue', 105), ('ia', 253), ('ei', 86), ('iai', 1), ('oo', 174), ('au', 106), ('eau', 10), ('oa', 59), ('oei', 1), ('oe', 15), ('eo', 39), ('uu', 1), ('eu', 18), ('iu', 14), ('aii', 1), ('aiia', 1), ('ae', 11), ('aa', 3), ('oui', 6), ('ieu', 3), ('ao', 6), ('iou', 27), ('uee', 4), ('eou', 5), ('aia', 1), ('uie', 3), ('iao', 1), ('eei', 2), ('uo', 8), ('uou', 5), ('eea', 1), ('ueui', 1), ('ioa', 1), ('ooi', 1)])

## Doing More with Word Pieces

In [133]:
regexp = r'^[AEIOUaeiou]+|[AEIOUaeiou]+$|[^AEIOUaeiou]'

In [134]:
def compress(word):
    pieces = re.findall(regexp, word)
    return(''.join(pieces))

In [135]:
english_udhr = nltk.corpus.udhr.words('English-Latin1')

In [136]:
print(nltk.tokenwrap(compress(w) for w in english_udhr[:75]))

Unvrsl Dclrtn of Hmn Rghts Prmble Whrs rcgntn of the inhrnt dgnty and
of the eql and inlnble rghts of all mmbrs of the hmn fmly is the fndtn
of frdm , jstce and pce in the wrld , Whrs dsrgrd and cntmpt fr hmn
rghts hve rsltd in brbrs acts whch hve outrgd the cnscnce of mnknd ,
and the advnt of a wrld in whch hmn bngs shll enjy frdm of spch and


In [137]:
rotokas_words = nltk.corpus.toolbox.words('rotokas.dic')

In [138]:
cvs = [cv for w in rotokas_words for cv in re.findall(r'[ptksvr][aeiou]', w)]

In [139]:
cfd = nltk.ConditionalFreqDist(cvs)

In [140]:
cfd.tabulate()

    a   e   i   o   u 
k 418 148  94 420 173 
p  83  31 105  34  51 
r 187  63  84  89  79 
s   0   0 100   2   1 
t  47   8   0 148  37 
v  93  27 105  48  49 


In [141]:
cv_word_pairs = [(cv, w) for w in rotokas_words
                 for cv in re.findall(r'[ptksvr][aeiou]', w)]

In [142]:
cv_index = nltk.Index(cv_word_pairs)

In [143]:
cv_index['su']

['kasuari']

In [144]:
cv_index['po']

['kaapo',
 'kaapopato',
 'kaipori',
 'kaiporipie',
 'kaiporivira',
 'kapo',
 'kapoa',
 'kapokao',
 'kapokapo',
 'kapokapo',
 'kapokapoa',
 'kapokapoa',
 'kapokapora',
 'kapokapora',
 'kapokaporo',
 'kapokaporo',
 'kapokari',
 'kapokarito',
 'kapokoa',
 'kapoo',
 'kapooto',
 'kapoovira',
 'kapopaa',
 'kaporo',
 'kaporo',
 'kaporopa',
 'kaporoto',
 'kapoto',
 'karokaropo',
 'karopo',
 'kepo',
 'kepoi',
 'keposi',
 'kepoto']

### Finding Word Stems

In [146]:
def stem(word):
    regexp = r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)?$'
    stem, suffix = re.findall(regexp, word)[0]
    return stem

In [147]:
raw = """DENNIS: Listen, strange women lying in ponds distributing swords
is no basis for a system of government. Supreme executive power derives from
a mandate from the masses, not from some farcical aquatic ceremony."""

In [148]:
tokens = nltk.word_tokenize(raw)

In [150]:
print([stem(t) for t in tokens])

['DENNIS', ':', 'Listen', ',', 'strange', 'women', 'ly', 'in', 'pond', 'distribut', 'sword', 'i', 'no', 'basi', 'for', 'a', 'system', 'of', 'govern', '.', 'Supreme', 'execut', 'power', 'deriv', 'from', 'a', 'mandate', 'from', 'the', 'mass', ',', 'not', 'from', 'some', 'farcical', 'aquatic', 'ceremony', '.']


## Searching Tokenized Text

In [151]:
from nltk.corpus import gutenberg, nps_chat

In [152]:
moby = nltk.Text(gutenberg.words('melville-moby_dick.txt'))

In [153]:
moby.findall(r"<a> (<.*>) <man>")

monied; nervous; dangerous; white; white; white; pious; queer; good;
mature; white; Cape; great; wise; wise; butterless; white; fiendish;
pale; furious; better; certain; complete; dismasted; younger; brave;
brave; brave; brave


In [154]:
chat = nltk.Text(nps_chat.words())

In [155]:
chat.findall(r"<.*> <.*> <bro>")

you rule bro; telling you bro; u twizted bro


In [156]:
chat.findall(r"<l.*>{3,}")

lol lol lol; lmao lol lol; lol lol lol; la la la la la; la la la; la
la la; lovely lol lol love; lol lol lol.; la la la; la la la


In [157]:
from nltk.corpus import brown

In [158]:
hobbies_learned = nltk.Text(brown.words(categories=['hobbies', 'learned']))

In [161]:
hobbies_learned.findall(r"<\w*> <and> <other> <\w*s>")

speed and other activities; water and other liquids; tomb and other
landmarks; Statues and other monuments; pearls and other jewels;
charts and other items; roads and other features; figures and other
objects; military and other areas; demands and other factors;
abstracts and other compilations; iron and other metals


## 3.6 Normalizing Text

In [162]:
raw = """DENNIS: Listen, strange women lying in ponds distributing swords
is no basis for a system of government. Supreme executive power derives from
a mandate from the masses, not from some farcical aquatic ceremony."""

In [163]:
tokens = nltk.word_tokenize(raw)

In [164]:
porter = nltk.PorterStemmer()

In [165]:
lancaster = nltk.LancasterStemmer()

In [167]:
print([porter.stem(t) for t in tokens])

['denni', ':', 'listen', ',', 'strang', 'women', 'lie', 'in', 'pond', 'distribut', 'sword', 'is', 'no', 'basi', 'for', 'a', 'system', 'of', 'govern', '.', 'suprem', 'execut', 'power', 'deriv', 'from', 'a', 'mandat', 'from', 'the', 'mass', ',', 'not', 'from', 'some', 'farcic', 'aquat', 'ceremoni', '.']


In [168]:
print([lancaster.stem(t) for t in tokens])

['den', ':', 'list', ',', 'strange', 'wom', 'lying', 'in', 'pond', 'distribut', 'sword', 'is', 'no', 'bas', 'for', 'a', 'system', 'of', 'govern', '.', 'suprem', 'execut', 'pow', 'der', 'from', 'a', 'mand', 'from', 'the', 'mass', ',', 'not', 'from', 'som', 'farc', 'aqu', 'ceremony', '.']


In [169]:
wnl = nltk.WordNetLemmatizer()

In [170]:
print([wnl.lemmatize(t) for t in tokens])

['DENNIS', ':', 'Listen', ',', 'strange', 'woman', 'lying', 'in', 'pond', 'distributing', 'sword', 'is', 'no', 'basis', 'for', 'a', 'system', 'of', 'government', '.', 'Supreme', 'executive', 'power', 'derives', 'from', 'a', 'mandate', 'from', 'the', 'mass', ',', 'not', 'from', 'some', 'farcical', 'aquatic', 'ceremony', '.']


## 3.7 Regular Expressions for Tokenizing Text

### Simple Approaches to Tokenization

In [171]:
raw = """'When I'M a Duchess,' she said to herself, (not in a very hopeful tone
though), 'I won't have any pepper in my kitchen AT ALL. Soup does very
well without--Maybe it's always pepper that makes people hot-tempered,'..."""

In [173]:
print(re.split(r' ', raw))

["'When", "I'M", 'a', "Duchess,'", 'she', 'said', 'to', 'herself,', '(not', 'in', 'a', 'very', 'hopeful', 'tone\nthough),', "'I", "won't", 'have', 'any', 'pepper', 'in', 'my', 'kitchen', 'AT', 'ALL.', 'Soup', 'does', 'very\nwell', 'without--Maybe', "it's", 'always', 'pepper', 'that', 'makes', 'people', "hot-tempered,'..."]


In [174]:
print(re.split(r'[ \t\n]+', raw))

["'When", "I'M", 'a', "Duchess,'", 'she', 'said', 'to', 'herself,', '(not', 'in', 'a', 'very', 'hopeful', 'tone', 'though),', "'I", "won't", 'have', 'any', 'pepper', 'in', 'my', 'kitchen', 'AT', 'ALL.', 'Soup', 'does', 'very', 'well', 'without--Maybe', "it's", 'always', 'pepper', 'that', 'makes', 'people', "hot-tempered,'..."]


In [175]:
print(re.split(r'\W+', raw))

['', 'When', 'I', 'M', 'a', 'Duchess', 'she', 'said', 'to', 'herself', 'not', 'in', 'a', 'very', 'hopeful', 'tone', 'though', 'I', 'won', 't', 'have', 'any', 'pepper', 'in', 'my', 'kitchen', 'AT', 'ALL', 'Soup', 'does', 'very', 'well', 'without', 'Maybe', 'it', 's', 'always', 'pepper', 'that', 'makes', 'people', 'hot', 'tempered', '']


In [176]:
print(re.findall(r'\w+|\S\w*', raw))

["'When", 'I', "'M", 'a', 'Duchess', ',', "'", 'she', 'said', 'to', 'herself', ',', '(not', 'in', 'a', 'very', 'hopeful', 'tone', 'though', ')', ',', "'I", 'won', "'t", 'have', 'any', 'pepper', 'in', 'my', 'kitchen', 'AT', 'ALL', '.', 'Soup', 'does', 'very', 'well', 'without', '-', '-Maybe', 'it', "'s", 'always', 'pepper', 'that', 'makes', 'people', 'hot', '-tempered', ',', "'", '.', '.', '.']


In [177]:
print(re.findall(r"\w+(?:[-']\w+)*|'|[-.(]+|\S\w*", raw))

["'", 'When', "I'M", 'a', 'Duchess', ',', "'", 'she', 'said', 'to', 'herself', ',', '(', 'not', 'in', 'a', 'very', 'hopeful', 'tone', 'though', ')', ',', "'", 'I', "won't", 'have', 'any', 'pepper', 'in', 'my', 'kitchen', 'AT', 'ALL', '.', 'Soup', 'does', 'very', 'well', 'without', '--', 'Maybe', "it's", 'always', 'pepper', 'that', 'makes', 'people', 'hot-tempered', ',', "'", '...']


## 3.8 Segmentation

### Sentence Segmentation

In [179]:
len(nltk.corpus.brown.words()) / len(nltk.corpus.brown.sents())

20.250994070456922

In [180]:
sent_tokenizer=nltk.data.load('tokenizers/punkt/english.pickle')

In [181]:
text = nltk.corpus.gutenberg.raw('chesterton-thursday.txt')

In [182]:
sents = sent_tokenizer.tokenize(text)

In [183]:
pprint.pprint(sents[171:181])

['In the wild events which were to follow this girl had no\n'
 'part at all; he never saw her again until all his tale was over.',
 'And yet, in some indescribable way, she kept recurring like a\n'
 'motive in music through all his mad adventures afterwards, and the\n'
 'glory of her strange hair ran like a red thread through those dark\n'
 'and ill-drawn tapestries of the night.',
 'For what followed was so\nimprobable, that it might well have been a dream.',
 'When Syme went out into the starlit street, he found it for the\n'
 'moment empty.',
 'Then he realised (in some odd way) that the silence\n'
 'was rather a living silence than a dead one.',
 'Directly outside the\n'
 'door stood a street lamp, whose gleam gilded the leaves of the tree\n'
 'that bent out over the fence behind him.',
 'About a foot from the\n'
 'lamp-post stood a figure almost as rigid and motionless as the\n'
 'lamp-post itself.',
 'The tall hat and long frock coat were black; the\n'
 'face, in an abrupt shadow

### Word Segmentation

In [184]:
def segment(text, segs):
    words = []
    last = 0
    for i in range(len(segs)):
        if segs[i] == '1':
            words.append(text[last:i+1])
            last = i+1
    words.append(text[last:])
    return words

In [185]:
text = "doyouseethekittyseethedoggydoyoulikethekittylikethedoggy"

In [186]:
seg1 = "0000000000000001000000000010000000000000000100000000000"

In [187]:
seg2 = "0100100100100001001001000010100100010010000100010010000"

In [188]:
segment(text, seg1)

['doyouseethekitty', 'seethedoggy', 'doyoulikethekitty', 'likethedoggy']

In [189]:
segment(text, seg2)

['do',
 'you',
 'see',
 'the',
 'kitty',
 'see',
 'the',
 'doggy',
 'do',
 'you',
 'like',
 'the',
 'kitty',
 'like',
 'the',
 'doggy']

In [190]:
def evaluate(text, segs):
    words = segment(text, segs)
    text_size = len(words)
    lexicon_size = len(' '.join(list(set(words))))
    return text_size + lexicon_size

In [191]:
text = "doyouseethekittyseethedoggydoyoulikethekittylikethedoggy"

In [192]:
seg1 = "0000000000000001000000000010000000000000000100000000000"

In [193]:
seg2 = "0100100100100001001001000010100100010010000100010010000"

In [194]:
seg3 = "0000100100000011001000000110000100010000001100010000001"

In [195]:
segment(text, seg3)

['doyou',
 'see',
 'thekitt',
 'y',
 'see',
 'thedogg',
 'y',
 'doyou',
 'like',
 'thekitt',
 'y',
 'like',
 'thedogg',
 'y']

In [196]:
evaluate(text, seg3)

46

In [197]:
evaluate(text, seg2)

47

In [198]:
evaluate(text, seg1)

63

In [199]:
from random import randint

In [200]:
def flip(segs, pos):
    return segs[:pos] + str(1-int(segs[pos])) + segs[pos+1:]

In [201]:
def flip_n(segs, n):
    for i in range(n):
        segs = flip(segs, randint(0,len(segs)-1))
    return segs

In [202]:
def anneal(text, segs, iterations, cooling_rate):
    temperature = float(len(segs))
    while temperature > 0.5:
        best_segs, best = segs, evaluate(text, segs)
        for i in range(iterations):
            guess = flip_n(segs, int(round(temperature)))
            score = evaluate(text, guess)
            if score < best:
                best, best_segs = score, guess
        score, segs = best, best_segs
        temperature = temperature / cooling_rate
        print(evaluate(text, segs), segment(text, segs))
    print()
    return segs

In [203]:
text = "doyouseethekittyseethedoggydoyoulikethekittylikethedoggy"

In [204]:
seg1 = "0000000000000001000000000010000000000000000100000000000"

In [207]:
anneal(text, seg1, 5000, 1.2)

63 ['doyouseethekitty', 'seethedoggy', 'doyoulikethekitty', 'likethedoggy']
63 ['doyouseethekitty', 'seethedoggy', 'doyoulikethekitty', 'likethedoggy']
63 ['doyouseethekitty', 'seethedoggy', 'doyoulikethekitty', 'likethedoggy']
63 ['doyouseethekitty', 'seethedoggy', 'doyoulikethekitty', 'likethedoggy']
63 ['doyouseethekitty', 'seethedoggy', 'doyoulikethekitty', 'likethedoggy']
63 ['doyouseethekitty', 'seethedoggy', 'doyoulikethekitty', 'likethedoggy']
63 ['doyouseethekitty', 'seethedoggy', 'doyoulikethekitty', 'likethedoggy']
59 ['doyouseethekitty', 'se', 'ethedoggy', 'doyoulik', 'ethekitty', 'lik', 'ethedoggy']
59 ['doyouseethekitty', 'se', 'ethedoggy', 'doyoulik', 'ethekitty', 'lik', 'ethedoggy']
54 ['doyouse', 'etheki', 'tty', 'se', 'ethedoggy', 'doyoulik', 'etheki', 'tty', 'lik', 'ethedoggy']
54 ['doyouse', 'etheki', 'tty', 'se', 'ethedoggy', 'doyoulik', 'etheki', 'tty', 'lik', 'ethedoggy']
54 ['doyouse', 'etheki', 'tty', 'se', 'ethedoggy', 'doyoulik', 'etheki', 'tty', 'lik', 'ethe

'0000101000000001010000000010000100100000000100100000000'

## 3.9 Formatting: From Lists to Strings

In [222]:
# simple python recap