In [1]:
from nltk import word_tokenize, sent_tokenize
sent = "I will walk 500 miles and I would walk 500 more, just to be the man who walks a thousand miles to fall down at your door!"
# Word-level tokens go on the first output line, the sentence-level split on
# the second; sep="\n" keeps the output identical to two separate prints.
print(word_tokenize(sent), sent_tokenize(sent), sep="\n")

['I', 'will', 'walk', '500', 'miles', 'and', 'I', 'would', 'walk', '500', 'more', ',', 'just', 'to', 'be', 'the', 'man', 'who', 'walks', 'a', 'thousand', 'miles', 'to', 'fall', 'down', 'at', 'your', 'door', '!']
['I will walk 500 miles and I would walk 500 more, just to be the man who walks a thousand miles to fall down at your door!']


In [2]:
from nltk.corpus import stopwords        # the corpus module is an 
                                         # extremely useful one. 
                                         # More on that later.
# Full list of English stop-words bundled with nltk.
stop_words = stopwords.words('english')

# Tokenize the sentence, then keep only the tokens that are not stop-words.
# The membership test is case-sensitive; the recorded output still contains
# the capitalised 'I' — presumably because the stop-word list is lowercase
# (TODO confirm).
token = word_tokenize(sent)
cleaned_token = [tok for tok in token if tok not in stop_words]

print("This is the unclean version:", token)
print("This is the cleaned version:", cleaned_token)


This is the unclean version: ['I', 'will', 'walk', '500', 'miles', 'and', 'I', 'would', 'walk', '500', 'more', ',', 'just', 'to', 'be', 'the', 'man', 'who', 'walks', 'a', 'thousand', 'miles', 'to', 'fall', 'down', 'at', 'your', 'door', '!']
This is the cleaned version: ['I', 'walk', '500', 'miles', 'I', 'would', 'walk', '500', ',', 'man', 'walks', 'thousand', 'miles', 'fall', 'door', '!']


In [3]:
# Stemming
# Stemming strips the ‘fluff’ letters (not whole words) from a word so that it can be
# grouped together with its “stem form”.
# For instance, the words ‘play’, ‘playing’, and ‘plays’ convey the same meaning (although, again, not exactly,
# but that level of nuance is not practical to preserve in computer-based analysis).
# So instead of treating them as different words, we can put them together under the same umbrella term ‘play’.

from nltk.stem import PorterStemmer
# Build a single Porter stemmer and run it over several inflections of "play".
stemmer = PorterStemmer()
words = [
    'play', 'playing', 'plays',
    'played', 'playfullness', 'playful',
]
# Apply the stemmer to each word in order, collecting the stems in a list.
stemmed = list(map(stemmer.stem, words))
print(stemmed)

['play', 'play', 'play', 'play', 'playful', 'play']


# We used the PorterStemmer, which is a pre-written stemmer class.
# There are other stemmers, such as SnowballStemmer and LancasterStemmer, but PorterStemmer is arguably the simplest one.
# However, ‘play’ and ‘playful’ should have been recognized as two different words.
# Notice how the last word, ‘playful’, was stemmed to ‘play’ rather than kept as ‘playful’.
# This is where the simplicity of the PorterStemmer becomes undesirable.
# You can also train your own stemmer using unsupervised clustering or supervised classification ML models.
# Now let’s stem an actual sentence!

In [5]:
sent2 = "I played the play playfully as the players were playing in the play with playfullness"
token = word_tokenize(sent2)
# Stem every token and glue the stems back into one string. Each stem is
# followed by a single space, so the result ends with a trailing space.
stemmed = "".join(stemmer.stem(tok) + " " for tok in token)
print(stemmed)

I play the play play as the player were play in the play with playful 


In [8]:
# Tagging Parts of Speech (POS)
# The next essential step is tagging each word in the corpus
# (a corpus is just a ‘bag’ of words) that we created by tokenizing the sentences.

from nltk import pos_tag 
# Rebind `token` to the tokens of both sentences combined.
# NOTE(review): `token` is not used again in this cell — the tagger below is
# fed `cleaned_token` (the stop-word-filtered tokens of `sent` only), which is
# what the recorded output reflects. Confirm whether `token` was meant to be
# tagged instead.
token = word_tokenize(sent) + word_tokenize(sent2)
# Tag each remaining token with its part of speech; the result is printed as
# a list of (token, tag) pairs.
tagged = pos_tag(cleaned_token)                 
print(tagged)

[('I', 'PRP'), ('walk', 'VBP'), ('500', 'CD'), ('miles', 'NNS'), ('I', 'PRP'), ('would', 'MD'), ('walk', 'VB'), ('500', 'CD'), (',', ','), ('man', 'NN'), ('walks', 'NNS'), ('thousand', 'VBP'), ('miles', 'NNS'), ('fall', 'VB'), ('door', 'NN'), ('!', '.')]
