# Processing Raw Text

In [2]:
import nltk,re

In [10]:
from urllib.request import urlopen
url = "https://en.wikipedia.org/wiki/Text_messaging"
# Read the page inside a context manager so the HTTP response (and its
# underlying socket) is closed as soon as the body has been downloaded.
with urlopen(url) as response:
    raw = response.read()
# urlopen(...).read() yields undecoded bytes, not str.
type(raw)

bytes

In [11]:
# Size of the downloaded page, in bytes (raw is a bytes object).
len(raw)

421881

In [12]:
# Peek at the first 50 bytes of the downloaded page.
raw[:50]

b'<!DOCTYPE html>\n<html class="client-nojs" lang="en'

In [26]:
# Split a sample sentence into word and punctuation tokens, then show the
# container type followed by the tokens themselves.
tokens = nltk.word_tokenize('mohammad loves NLP course, united states')
print(type(tokens), tokens, sep='\n')

<class 'list'>
['mohammad', 'loves', 'NLP', 'course', ',', 'united', 'states']


In [27]:
# Wrap the token list in an nltk.Text object and show its type and summary.
text = nltk.Text(tokens)
print(type(text), text, sep='\n')

<class 'nltk.text.Text'>
<Text: mohammad loves NLP course , united states...>


In [30]:
# Slicing an nltk.Text gives back a plain list of tokens.
text[:3]

['mohammad', 'loves', 'NLP']

### Dealing with HTML

In [37]:
url = "http://news.bbc.co.uk/2/hi/health/2284783.stm"
# Close the HTTP response deterministically once the body has been read.
with urlopen(url) as response:
    html = response.read()
# Show the first 60 bytes of the raw HTML.
html[:60]

b'<!doctype html public "-//W3C//DTD HTML 4.0 Transitional//EN'

### Reading Local Files

In [40]:
# Read the whole file inside a context manager so the handle is closed
# even if the read raises; `f` stays bound (closed) afterwards.
with open('sample.txt') as f:
    raw = f.read()

In [41]:
# Display the text that was just read from sample.txt.
print(raw)

Mohammad lives in UAE and want to be a greatest data scientist inshaAllah.


### Regular Expressions for Detecting Word Patterns

In [48]:
import re
# Keep only the entries of the English word corpus that are entirely lowercase.
wordlist = list(filter(lambda w: w.islower(), nltk.corpus.words.words('en')))

In [49]:
# First five entries of the lowercase word list.
wordlist[:5]

['a', 'aa', 'aal', 'aalii', 'aam']

In [55]:
# First five words ending in "ed". re.search is the right tool for a yes/no
# test (it stops at the first match) and matches the other cells' usage.
[w for w in wordlist if re.search(r'ed$', w)][:5]

['abaissed', 'abandoned', 'abased', 'abashed', 'abatised']

In [60]:
# Count the words that end in "i" (each True contributes 1 to the sum).
sum(re.search('i$', w) is not None for w in wordlist)

964

In [62]:
# Four-letter words built from one letter of each set: ghi, mno, jkl, def.
list(filter(lambda w: re.search('^[ghi][mno][jlk][def]$', w), wordlist))

['gold', 'golf', 'hold', 'hole']

In [63]:
# Sorted vocabulary of the NPS chat corpus; set() accepts the word sequence
# directly, so no identity generator is needed.
chat_words = sorted(set(nltk.corpus.nps_chat.words()))
# Stretched spellings of "mine": one or more of each letter, in order.
[w for w in chat_words if re.search('^m+i+n+e+$', w)]

['miiiiiiiiiiiiinnnnnnnnnnneeeeeeeeee',
 'miiiiiinnnnnnnnnneeeeeeee',
 'mine',
 'mmmmmmmmiiiiiiiiinnnnnnnnneeeeeeee']

In [66]:
 wsj = sorted(set(nltk.corpus.treebank.words()))
[w for w in wsj if re.search('^[0-9]+\.[0-9]+$', w)][:10]

['0.0085', '0.05', '0.1', '0.16', '0.2', '0.25', '0.28', '0.3', '0.4', '0.5']

In [67]:
# Uppercase letters followed by a literal dollar sign. Raw string so "\$" is
# passed to the regex engine rather than treated as an invalid string escape.
[w for w in wsj if re.search(r'^[A-Z]+\$$', w)]

['C$', 'US$']

In [70]:
# First ten tokens made of exactly four digits.
list(filter(lambda w: re.search('^[0-9]{4}$', w), wsj))[:10]

['1614',
 '1637',
 '1787',
 '1901',
 '1903',
 '1917',
 '1925',
 '1929',
 '1933',
 '1934']

In [72]:
# First five tokens of the form <digits>-<3 to 5 lowercase letters>.
list(filter(lambda w: re.search('^[0-9]+-[a-z]{3,5}$', w), wsj))[:5]

['10-day', '10-lap', '10-year', '100-share', '12-point']

In [73]:
# Three hyphen-separated lowercase parts: 5+ letters, 2-3 letters, up to 6 letters.
list(filter(lambda w: re.search('^[a-z]{5,}-[a-z]{2,3}-[a-z]{,6}$', w), wsj))

['black-and-white',
 'bread-and-butter',
 'father-in-law',
 'machine-gun-toting',
 'savings-and-loan']

In [75]:
# First five tokens ending in "ed" or "ing" (the pasted ">>>" prompt is removed
# so the cell is valid Python outside IPython as well).
[w for w in wsj if re.search('(ed|ing)$', w)][:5]

['62%-owned', 'Absorbed', 'According', 'Adopting', 'Advanced']

In [83]:
word = 'supercalifragilisticexpialidocious'
# Collect every vowel occurrence once, then show the matches and their count.
vowels = re.findall(r'[aeiou]', word)
print(vowels)
print(len(vowels))

['u', 'e', 'a', 'i', 'a', 'i', 'i', 'i', 'e', 'i', 'a', 'i', 'o', 'i', 'o', 'u']
16


In [89]:
rotokas_words = nltk.corpus.toolbox.words('rotokas.dic')

# Pair every consonant-vowel sequence found in a word with that word, then
# index the words by sequence and list those containing "po".
cv_word_pairs = []
for w in rotokas_words:
    for cv in re.findall(r'[ptksvr][aeiou]', w):
        cv_word_pairs.append((cv, w))
cv_index = nltk.Index(cv_word_pairs)
print(cv_index['po'])

['kaapo', 'kaapopato', 'kaipori', 'kaiporipie', 'kaiporivira', 'kapo', 'kapoa', 'kapokao', 'kapokapo', 'kapokapo', 'kapokapoa', 'kapokapoa', 'kapokapora', 'kapokapora', 'kapokaporo', 'kapokaporo', 'kapokari', 'kapokarito', 'kapokoa', 'kapoo', 'kapooto', 'kapoovira', 'kapopaa', 'kaporo', 'kaporo', 'kaporopa', 'kaporoto', 'kapoto', 'karokaropo', 'karopo', 'kepo', 'kepoi', 'keposi', 'kepoto']


In [91]:
# Split a word into (stem, suffix). The non-greedy (.*?) keeps the stem as
# short as possible, so the longest matching suffix alternative wins
# ('es' rather than just 's' for 'processes').
re.findall(r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processes')

[('process', 'es')]

### Normalizing Text

In [94]:
# Sample passage used by the stemming and lemmatization cells below.
# Note: this rebinds `raw`, replacing the text read from sample.txt.
raw = """DENNIS: Listen, strange women lying in ponds distributing swords
is no basis for a system of government. Supreme executive power derives from
a mandate from the masses, not from """
tokens = nltk.word_tokenize(raw)
print(tokens)

['DENNIS', ':', 'Listen', ',', 'strange', 'women', 'lying', 'in', 'ponds', 'distributing', 'swords', 'is', 'no', 'basis', 'for', 'a', 'system', 'of', 'government', '.', 'Supreme', 'executive', 'power', 'derives', 'from', 'a', 'mandate', 'from', 'the', 'masses', ',', 'not', 'from']


In [96]:
# Two off-the-shelf NLTK stemmers, compared in the following cells.
porter = nltk.PorterStemmer()
lancaster = nltk.LancasterStemmer()

In [98]:
# Apply the Porter stemmer to every token of the passage.
stem = list(map(porter.stem, tokens))
print(stem)

['denni', ':', 'listen', ',', 'strang', 'women', 'lie', 'in', 'pond', 'distribut', 'sword', 'is', 'no', 'basi', 'for', 'a', 'system', 'of', 'govern', '.', 'suprem', 'execut', 'power', 'deriv', 'from', 'a', 'mandat', 'from', 'the', 'mass', ',', 'not', 'from']


In [99]:
# Apply the Lancaster stemmer to every token (rebinds `stem`).
stem = list(map(lancaster.stem, tokens))
print(stem)

['den', ':', 'list', ',', 'strange', 'wom', 'lying', 'in', 'pond', 'distribut', 'sword', 'is', 'no', 'bas', 'for', 'a', 'system', 'of', 'govern', '.', 'suprem', 'execut', 'pow', 'der', 'from', 'a', 'mand', 'from', 'the', 'mass', ',', 'not', 'from']


#### Lemmatization

In [100]:
# The WordNet lemmatizer removes affixes only if the resulting word is in its dictionary.

In [102]:
# Lemmatize every token with WordNet, using the lemmatizer's default settings.
wnl = nltk.WordNetLemmatizer()
lem = list(map(wnl.lemmatize, tokens))
print(lem)

['DENNIS', ':', 'Listen', ',', 'strange', 'woman', 'lying', 'in', 'pond', 'distributing', 'sword', 'is', 'no', 'basis', 'for', 'a', 'system', 'of', 'government', '.', 'Supreme', 'executive', 'power', 'derives', 'from', 'a', 'mandate', 'from', 'the', 'mass', ',', 'not', 'from']


### Writing Results to a File

In [103]:
# The handle is intentionally left open: later cells append to it and close it.
output_file = open('output.txt', 'w')
words = set(nltk.corpus.genesis.words('english-kjv.txt'))
# Write one word per line in sorted order. The generator keeps its loop
# variable local, so the notebook-level `word` from the vowel example is not
# silently overwritten.
output_file.writelines(entry + "\n" for entry in sorted(words))

In [105]:
# Show the vocabulary size and append it to the output file; the cell's
# displayed value is the number of characters write() reports.
word_count = len(words)
print(word_count)
output_file.write(str(word_count) + '\n')

2789


5

In [106]:
# Flush buffered writes and release the handle to output.txt.
output_file.close()