#### Installing nltk

In [1]:
import nltk
#nltk.download()

#### Extracting text from website and Tokenizing

In [2]:
import urllib.request
from bs4 import BeautifulSoup 

response = urllib.request.urlopen("http://en.wikipedia.org/wiki/Natural_language_processing")
html = response.read()
soup = BeautifulSoup(html,"html5lib") 
text = soup.get_text(strip=True) 
tokens = [t for t in text.split()] 
#print(tokens)

#### Tokenize Text Using nltk

In [3]:
#nltk.download('punkt')
from nltk.tokenize import sent_tokenize 

tokens = """Time is endless in thy hands, my lord. 
There is none to count thy minutes. 

Days and nights pass and ages bloom and fade like flowers. 
Thou knowest how to wait. 

Thy centuries follow each other perfecting a small wild flower. 

We have no time to lose, 
and having no time we must scramble for a chance. 
We are too poor to be late. 

And thus it is that time goes by 
while I give it to every querulous man who claims it, 
and thine altar is empty of all offerings to the last. 

At the end of the day I hasten in fear lest thy gate be shut; 
but I find that yet there is time.""" 

print(sent_tokenize(tokens, "english"))

['Time is endless in thy hands, my lord.', 'There is none to count thy minutes.', 'Days and nights pass and ages bloom and fade like flowers.', 'Thou knowest how to wait.', 'Thy centuries follow each other perfecting a small wild flower.', 'We have no time to lose, \nand having no time we must scramble for a chance.', 'We are too poor to be late.', 'And thus it is that time goes by \nwhile I give it to every querulous man who claims it, \nand thine altar is empty of all offerings to the last.', 'At the end of the day I hasten in fear lest thy gate be shut; \nbut I find that yet there is time.']


In [4]:
from nltk.tokenize import word_tokenize

print(word_tokenize(tokens,"english"))

['Time', 'is', 'endless', 'in', 'thy', 'hands', ',', 'my', 'lord', '.', 'There', 'is', 'none', 'to', 'count', 'thy', 'minutes', '.', 'Days', 'and', 'nights', 'pass', 'and', 'ages', 'bloom', 'and', 'fade', 'like', 'flowers', '.', 'Thou', 'knowest', 'how', 'to', 'wait', '.', 'Thy', 'centuries', 'follow', 'each', 'other', 'perfecting', 'a', 'small', 'wild', 'flower', '.', 'We', 'have', 'no', 'time', 'to', 'lose', ',', 'and', 'having', 'no', 'time', 'we', 'must', 'scramble', 'for', 'a', 'chance', '.', 'We', 'are', 'too', 'poor', 'to', 'be', 'late', '.', 'And', 'thus', 'it', 'is', 'that', 'time', 'goes', 'by', 'while', 'I', 'give', 'it', 'to', 'every', 'querulous', 'man', 'who', 'claims', 'it', ',', 'and', 'thine', 'altar', 'is', 'empty', 'of', 'all', 'offerings', 'to', 'the', 'last', '.', 'At', 'the', 'end', 'of', 'the', 'day', 'I', 'hasten', 'in', 'fear', 'lest', 'thy', 'gate', 'be', 'shut', ';', 'but', 'I', 'find', 'that', 'yet', 'there', 'is', 'time', '.']


#### Remove Punctuation

In [5]:
words = [word for word in word_tokenize(tokens) if word.isalpha()]

#### Word Frequency Count

In [6]:
import nltk

freq = nltk.FreqDist(words) 
'''
for key,val in freq.items(): 
    print (str(key) + ': ' + str(val))
'''

"\nfor key,val in freq.items(): \n    print (str(key) + ': ' + str(val))\n"

In [7]:
freq.items()

dict_items([('Time', 1), ('is', 5), ('endless', 1), ('in', 2), ('thy', 3), ('hands', 1), ('my', 1), ('lord', 1), ('There', 1), ('none', 1), ('to', 6), ('count', 1), ('minutes', 1), ('Days', 1), ('and', 5), ('nights', 1), ('pass', 1), ('ages', 1), ('bloom', 1), ('fade', 1), ('like', 1), ('flowers', 1), ('Thou', 1), ('knowest', 1), ('how', 1), ('wait', 1), ('Thy', 1), ('centuries', 1), ('follow', 1), ('each', 1), ('other', 1), ('perfecting', 1), ('a', 2), ('small', 1), ('wild', 1), ('flower', 1), ('We', 2), ('have', 1), ('no', 2), ('time', 4), ('lose', 1), ('having', 1), ('we', 1), ('must', 1), ('scramble', 1), ('for', 1), ('chance', 1), ('are', 1), ('too', 1), ('poor', 1), ('be', 2), ('late', 1), ('And', 1), ('thus', 1), ('it', 3), ('that', 2), ('goes', 1), ('by', 1), ('while', 1), ('I', 3), ('give', 1), ('every', 1), ('querulous', 1), ('man', 1), ('who', 1), ('claims', 1), ('thine', 1), ('altar', 1), ('empty', 1), ('of', 2), ('all', 1), ('offerings', 1), ('the', 3), ('last', 1), ('At',

#### Removing Stop Words

In [8]:
#nltk.download('stopwords')
from nltk.corpus import stopwords

tokens = list(freq.keys())

for token in tokens:
    if token in stopwords.words('english'):
        tokens.remove(token)

freq = nltk.FreqDist(tokens) 

for key,val in freq.items(): 
    print (str(key) + ':' + str(val))

Time:1
endless:1
thy:1
hands:1
lord:1
There:1
none:1
count:1
minutes:1
Days:1
nights:1
pass:1
ages:1
bloom:1
fade:1
like:1
flowers:1
Thou:1
knowest:1
wait:1
Thy:1
centuries:1
follow:1
other:1
perfecting:1
small:1
wild:1
flower:1
We:1
no:1
time:1
lose:1
we:1
must:1
scramble:1
chance:1
too:1
poor:1
late:1
And:1
thus:1
that:1
goes:1
while:1
I:1
give:1
every:1
querulous:1
man:1
claims:1
thine:1
altar:1
empty:1
all:1
offerings:1
last:1
At:1
end:1
day:1
hasten:1
fear:1
lest:1
gate:1
shut:1
find:1
yet:1


In [9]:
#import matplotlib.pyplot as plt

freq.plot(20,cumulative=False)

<matplotlib.figure.Figure at 0x2c31a526f60>

#### Get Synonyms

In [10]:
#nltk.download('wordnet')
from nltk.corpus import wordnet

syn = wordnet.synsets("happy")
print(syn[0].definition())
print(syn[0].examples())

enjoying or showing or marked by joy or pleasure
['a happy smile', 'spent many happy days on the beach', 'a happy marriage']


In [11]:
from nltk.corpus import wordnet 

synonyms = []
for syn in wordnet.synsets('NLP'):
    for lemma in syn.lemmas():
        synonyms.append(lemma.name())
        
print(synonyms)

['natural_language_processing', 'NLP', 'human_language_technology']


#### Get Antonyms

In [12]:
from nltk.corpus import wordnet

antonyms = []
for syn in wordnet.synsets("good"):
    for l in syn.lemmas():
        if l.antonyms():
            antonyms.append(l.antonyms()[0].name())
            
print(antonyms)

['evil', 'evilness', 'bad', 'badness', 'bad', 'evil', 'ill']


### Word Stemming

Removing affixes from words and returning the root word

In [13]:
from nltk.stem import PorterStemmer, SnowballStemmer

stemmer = PorterStemmer() 

print(stemmer.stem('working'))

work


In [14]:
# Languages supported by SnowballStemmer

print(SnowballStemmer.languages)

('arabic', 'danish', 'dutch', 'english', 'finnish', 'french', 'german', 'hungarian', 'italian', 'norwegian', 'porter', 'portuguese', 'romanian', 'russian', 'spanish', 'swedish')


In [15]:
french_stemmer = SnowballStemmer('english')

print(french_stemmer.stem("watching"))

watch


### Words Lemmatizing 

Similar to stemming, but give us a real word. 

In [16]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
print("Lemmatizing: " + lemmatizer.lemmatize('increases'))
print("       vs")
print("Stemming: " + stemmer.stem('increases'))

Lemmatizing: increase
       vs
Stemming: increas


In [17]:
print(lemmatizer.lemmatize('playing', pos="v")) 
print(lemmatizer.lemmatize('playing', pos="n")) 
print(lemmatizer.lemmatize('playing', pos="a")) 
print(lemmatizer.lemmatize('playing', pos="r"))

play
playing
playing
playing
