## Importing the required libraries.

In [119]:
import nltk
import random
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import gutenberg
from nltk.corpus import wordnet
from nltk.corpus import movie_reviews

## Sentence Tokenizing

In [92]:
EXAMPLE_TEXT = "Hello Mr. Smith, how are you doing today? The weather is great, and Python is awesome. The sky is pinkish-blue. You shouldn't eat cardboard."
print(sent_tokenize(EXAMPLE_TEXT))

['Hello Mr. Smith, how are you doing today?', 'The weather is great, and Python is awesome.', 'The sky is pinkish-blue.', "You shouldn't eat cardboard."]


## Word Tokenizing 

In [93]:
print(word_tokenize(EXAMPLE_TEXT))

['Hello', 'Mr.', 'Smith', ',', 'how', 'are', 'you', 'doing', 'today', '?', 'The', 'weather', 'is', 'great', ',', 'and', 'Python', 'is', 'awesome', '.', 'The', 'sky', 'is', 'pinkish-blue', '.', 'You', 'should', "n't", 'eat', 'cardboard', '.']


## Stop words 

In [94]:
example_sent = "This is a sample sentence, showing off the stop words filtration."

# English stop words, held in a set for fast membership tests.
stop_words = set(stopwords.words('english'))

word_tokens = word_tokenize(example_sent)

# Keep only the tokens that are not stop words. The membership test is
# case-sensitive, which is why the capitalised token "This" is kept in the
# output below while "is", "a", "off" and "the" are dropped.
filtered_sentence = [w for w in word_tokens if w not in stop_words]

print(word_tokens)
print(filtered_sentence)

['This', 'is', 'a', 'sample', 'sentence', ',', 'showing', 'off', 'the', 'stop', 'words', 'filtration', '.']
['This', 'sample', 'sentence', ',', 'showing', 'stop', 'words', 'filtration', '.']


## Stemming 

In [95]:
ps = PorterStemmer()
example_words = ["python", "pythoner", "pythoning", "pythoned", "pythonly"]
# Print the Porter stem of every variant, one per line.
for stem in map(ps.stem, example_words):
    print(stem)

python
python
python
python
pythonli


In [96]:
new_text = "It is important to by very pythonly while you are pythoning with python. All pythoners have pythoned poorly at least once."
words = word_tokenize(new_text)

# Stem every token of the sentence (punctuation tokens included), one per line.
for token in words:
    print(ps.stem(token))

It
is
import
to
by
veri
pythonli
while
you
are
python
with
python
.
all
python
have
python
poorli
at
least
onc
.


## Getting dataset from imported module.

In [97]:
# Raw text of two addresses from NLTK's state_union corpus:
# train_text is used below to train the Punkt sentence tokenizer,
# sample_text is the text that the trained tokenizer splits into sentences.
train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")
# Alternative short input for quick experiments:
#sample_text = "the little yellow dog barked at the cat"

## Training the PunktSentenceTokenizer

In [98]:
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)

## Tokenize the sample text using trained PunktSentenceTokenizer

In [99]:
tokenized = custom_sent_tokenizer.tokenize(sample_text)

## Function to process tagged words

In [100]:
def tagged_content():
    """Print the part-of-speech tags of the first five tokenized sentences.

    Reads the notebook-level ``tokenized`` list. Each sentence is split into
    word tokens with ``nltk.word_tokenize`` and tagged with ``nltk.pos_tag``;
    the resulting list of (token, tag) pairs is printed, one sentence per line.

    Any exception raised along the way is caught and only its message is
    printed, so the function always returns ``None``.
    """
    try:
        for sentence in tokenized[:5]:
            print(nltk.pos_tag(nltk.word_tokenize(sentence)))

    except Exception as err:
        print(str(err))

In [101]:
tagged_content()

[('PRESIDENT', 'NNP'), ('GEORGE', 'NNP'), ('W.', 'NNP'), ('BUSH', 'NNP'), ("'S", 'POS'), ('ADDRESS', 'NNP'), ('BEFORE', 'IN'), ('A', 'NNP'), ('JOINT', 'NNP'), ('SESSION', 'NNP'), ('OF', 'IN'), ('THE', 'NNP'), ('CONGRESS', 'NNP'), ('ON', 'NNP'), ('THE', 'NNP'), ('STATE', 'NNP'), ('OF', 'IN'), ('THE', 'NNP'), ('UNION', 'NNP'), ('January', 'NNP'), ('31', 'CD'), (',', ','), ('2006', 'CD'), ('THE', 'NNP'), ('PRESIDENT', 'NNP'), (':', ':'), ('Thank', 'NNP'), ('you', 'PRP'), ('all', 'DT'), ('.', '.')]
[('Mr.', 'NNP'), ('Speaker', 'NNP'), (',', ','), ('Vice', 'NNP'), ('President', 'NNP'), ('Cheney', 'NNP'), (',', ','), ('members', 'NNS'), ('of', 'IN'), ('Congress', 'NNP'), (',', ','), ('members', 'NNS'), ('of', 'IN'), ('the', 'DT'), ('Supreme', 'NNP'), ('Court', 'NNP'), ('and', 'CC'), ('diplomatic', 'JJ'), ('corps', 'NN'), (',', ','), ('distinguished', 'JJ'), ('guests', 'NNS'), (',', ','), ('and', 'CC'), ('fellow', 'JJ'), ('citizens', 'NNS'), (':', ':'), ('Today', 'VB'), ('our', 'PRP$'), ('nat

## Chunking function

In [102]:
def chuck_content():
    """Chunk the first two tokenized sentences and print every subtree.

    Reads the notebook-level ``tokenized`` list. Each sentence is word-tokenized
    and POS-tagged, then parsed with an ``nltk.RegexpParser`` built from a
    single "Chunk" rule. Every subtree yielded by ``subtrees()`` of the parse
    result is printed.

    Any exception is caught and only its message is printed.
    """
    # One chunk = zero or more RB* tags, zero or more VB* tags,
    # one or more NNP tags and an optional trailing NN tag.
    grammar = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""
    try:
        for sentence in tokenized[:2]:
            tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
            tree = nltk.RegexpParser(grammar).parse(tagged)
            # tree.draw() would open a graphical view instead of printing.
            for subtree in tree.subtrees():
                print(subtree)

    except Exception as err:
        print(str(err))

In [103]:
chuck_content()

(S
  (Chunk PRESIDENT/NNP GEORGE/NNP W./NNP BUSH/NNP)
  'S/POS
  (Chunk ADDRESS/NNP)
  BEFORE/IN
  (Chunk A/NNP JOINT/NNP SESSION/NNP)
  OF/IN
  (Chunk THE/NNP CONGRESS/NNP ON/NNP THE/NNP STATE/NNP)
  OF/IN
  (Chunk THE/NNP UNION/NNP January/NNP)
  31/CD
  ,/,
  2006/CD
  (Chunk THE/NNP PRESIDENT/NNP)
  :/:
  (Chunk Thank/NNP)
  you/PRP
  all/DT
  ./.)
(Chunk PRESIDENT/NNP GEORGE/NNP W./NNP BUSH/NNP)
(Chunk ADDRESS/NNP)
(Chunk A/NNP JOINT/NNP SESSION/NNP)
(Chunk THE/NNP CONGRESS/NNP ON/NNP THE/NNP STATE/NNP)
(Chunk THE/NNP UNION/NNP January/NNP)
(Chunk THE/NNP PRESIDENT/NNP)
(Chunk Thank/NNP)
(S
  (Chunk Mr./NNP Speaker/NNP)
  ,/,
  (Chunk Vice/NNP President/NNP Cheney/NNP)
  ,/,
  members/NNS
  of/IN
  (Chunk Congress/NNP)
  ,/,
  members/NNS
  of/IN
  the/DT
  (Chunk Supreme/NNP Court/NNP)
  and/CC
  diplomatic/JJ
  corps/NN
  ,/,
  distinguished/JJ
  guests/NNS
  ,/,
  and/CC
  fellow/JJ
  citizens/NNS
  :/:
  Today/VB
  our/PRP$
  nation/NN
  lost/VBD
  a/DT
  beloved/VBN
  ,/,
  g

## Chinking function

In [104]:
def chicking_content():
    """Chunk the first two tokenized sentences with a chinking rule and draw them.

    Reads the notebook-level ``tokenized`` list. Each sentence is word-tokenized
    and POS-tagged, parsed with an ``nltk.RegexpParser`` and the resulting tree
    is shown via its ``draw()`` method.

    Any exception is caught and only its message is printed.
    """
    try:
        for sentence in tokenized[:2]:
            tagged = nltk.pos_tag(nltk.word_tokenize(sentence))

            # First rule chunks every tag sequence; the second (written with
            # reversed braces) chinks runs of VB*, IN, DT and TO tags.
            grammar = r"""Chunk: {<.*>+}
                                    }<VB.?|IN|DT|TO>+{"""

            nltk.RegexpParser(grammar).parse(tagged).draw()

    except Exception as err:
        print(str(err))

In [105]:
chicking_content()

## Function to process named entities

In [106]:
def process_content():
    """Run named-entity chunking on the first three tokenized sentences.

    Reads the notebook-level ``tokenized`` list. Each sentence is word-tokenized
    and POS-tagged, the tagged tokens are passed to ``nltk.ne_chunk`` and the
    returned tree is shown via its ``draw()`` method.

    Any exception is caught and only its message is printed.
    """
    try:
        for sentence in tokenized[:3]:
            tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
            nltk.ne_chunk(tagged).draw()
    except Exception as err:
        print(str(err))

In [107]:
process_content()

## Lemmatizing

In [108]:
lemmatizer = WordNetLemmatizer()

# Lemmatize with the default part of speech.
for noun in ("cats", "cacti", "geese", "rocks", "python"):
    print(lemmatizer.lemmatize(noun))

# Lemmatize as adjectives (pos="a").
for adjective in ("better", "best"):
    print(lemmatizer.lemmatize(adjective, pos="a"))

# Same word, first with the default part of speech, then as a verb ('v').
print(lemmatizer.lemmatize("run"))
print(lemmatizer.lemmatize("run", 'v'))

cat
cactus
goose
rock
python
good
best
run
run


## Location of nltk files 

In [109]:
print(nltk.__file__)

/home/ns/anaconda3/envs/tf/lib/python3.6/site-packages/nltk/__init__.py


## Accessing a file from corpus 

In [110]:
sample = gutenberg.raw("bible-kjv.txt")

tok = sent_tokenize(sample)

# Show the first five sentences. Iterating over a slice (rather than indexing
# with range(5)) also works if the text yields fewer than five sentences.
for sentence in tok[:5]:
    print(sentence)

[The King James Bible]

The Old Testament of the King James Bible

The First Book of Moses:  Called Genesis


1:1 In the beginning God created the heaven and the earth.
1:2 And the earth was without form, and void; and darkness was upon
the face of the deep.
And the Spirit of God moved upon the face of the
waters.
1:3 And God said, Let there be light: and there was light.
1:4 And God saw the light, that it was good: and God divided the light
from the darkness.


## Wordnet 

In [111]:
syns = wordnet.synsets("program")

In [112]:
print(syns[0].name())

plan.n.01


In [113]:
print(syns[0].lemmas()[0].name())

plan


In [114]:
print(syns[5].examples())

['he was admitted to a new program at the university']


In [115]:
synonyms = []
antonyms = []

# Walk every lemma of every WordNet synset for "good": each lemma name is
# recorded as a synonym, and for lemmas that have antonyms the name of the
# first antonym is recorded as well.
for synset in wordnet.synsets("good"):
    for lemma in synset.lemmas():
        synonyms.append(lemma.name())
        opposites = lemma.antonyms()
        if opposites:
            antonyms.append(opposites[0].name())

## Synonyms and Antonyms

In [116]:
print(set(synonyms))
print(set(antonyms))

{'salutary', 'good', 'trade_good', 'beneficial', 'thoroughly', 'effective', 'undecomposed', 'soundly', 'dependable', 'sound', 'near', 'secure', 'goodness', 'practiced', 'honest', 'in_effect', 'estimable', 'safe', 'right', 'upright', 'full', 'just', 'respectable', 'dear', 'skilful', 'adept', 'ripe', 'proficient', 'expert', 'in_force', 'unspoiled', 'skillful', 'unspoilt', 'commodity', 'well', 'honorable', 'serious'}
{'badness', 'evil', 'bad', 'ill', 'evilness'}


## Similarity of two words

## Text classification 

In [123]:
# Build one (word list, category) pair per review file, category by category,
# then shuffle the pairs in place.
documents = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        review_words = list(movie_reviews.words(fileid))
        documents.append((review_words, category))
random.shuffle(documents)

In [127]:
print(documents[1])

(['synopsis', ':', 'upper', 'middle', 'class', ',', 'suburban', 'family', 'man', 'lester', '(', 'kevin', 'spacey', ')', 'realizes', 'that', 'he', 'is', 'just', 'going', 'through', 'the', 'motions', ':', 'he', 'is', 'unable', 'to', 'feel', 'passion', ';', 'he', 'is', 'soulless', ',', 'cynical', ',', 'and', 'no', 'longer', 'able', 'to', 'feel', 'the', 'edge', 'between', 'success', 'and', 'failure', '.', 'everyone', 'else', 'around', 'him', 'has', 'similar', 'symptoms', '.', 'lester', "'", 's', 'unassertive', 'daughter', 'jane', '(', 'thora', 'birch', ')', 'is', 'too', 'lethargic', 'to', 'change', 'her', 'world', ',', 'yet', 'all', 'too', 'ready', 'to', 'whine', 'and', 'complain', '.', 'his', 'wife', 'carolyn', '(', 'annette', 'bening', ')', 'is', 'reduced', 'to', 'keeping', 'up', 'appearances', 'and', 'reciting', 'commercialist', 'slogans', '.', 'and', 'these', 'are', 'people', 'with', 'a', 'big', 'house', 'and', 'a', 'decent', 'standard', 'of', 'living', '.', 'the', 'only', 'person', 'w

In [129]:
# Every token of the movie_reviews corpus, lowercased, in corpus order.
all_words = [word.lower() for word in movie_reviews.words()]


## Frequency analysis 

In [141]:
# Rebind all_words from the list of lowercased tokens to a frequency
# distribution over those tokens, then show the five most frequent entries
# with their counts. Later cells index all_words by word to read a count.
all_words = nltk.FreqDist(all_words)
print(all_words.most_common(5))

[(',', 77717), ('the', 76529), ('.', 65876), ('a', 38106), ('and', 35576)]


## Occurrence of a word

In [142]:
print(all_words["best"])

1333
