# Tokenize Words

In [4]:
from nltk.tokenize import sent_tokenize, word_tokenize
data = "As it is a heuristic algorithm, there is no guarantee that it will converge to the global optimum, and the result may depend on the initial clusters. As the algorithm is usually very fast, it is common to run it multiple times with different starting conditions. However, in the worst case, k-means can be very slow to converge: in particular it has been shown that there exist certain point sets, even in 2 dimensions, on which k-means takes exponential time"
print(word_tokenize(data))

['As', 'it', 'is', 'a', 'heuristic', 'algorithm', ',', 'there', 'is', 'no', 'guarantee', 'that', 'it', 'will', 'converge', 'to', 'the', 'global', 'optimum', ',', 'and', 'the', 'result', 'may', 'depend', 'on', 'the', 'initial', 'clusters', '.', 'As', 'the', 'algorithm', 'is', 'usually', 'very', 'fast', ',', 'it', 'is', 'common', 'to', 'run', 'it', 'multiple', 'times', 'with', 'different', 'starting', 'conditions', '.', 'However', ',', 'in', 'the', 'worst', 'case', ',', 'k-means', 'can', 'be', 'very', 'slow', 'to', 'converge', ':', 'in', 'particular', 'it', 'has', 'been', 'shown', 'that', 'there', 'exist', 'certain', 'point', 'sets', ',', 'even', 'in', '2', 'dimensions', ',', 'on', 'which', 'k-means', 'takes', 'exponential', 'time']


# Tokenizing sentences

In [5]:
from nltk.tokenize import sent_tokenize, word_tokenize
data = "As it is a heuristic algorithm, there is no guarantee that it will converge to the global optimum, and the result may depend on the initial clusters. As the algorithm is usually very fast, it is common to run it multiple times with different starting conditions. However, in the worst case, k-means can be very slow to converge: in particular it has been shown that there exist certain point sets, even in 2 dimensions, on which k-means takes exponential time"
print(sent_tokenize(data))

['As it is a heuristic algorithm, there is no guarantee that it will converge to the global optimum, and the result may depend on the initial clusters.', 'As the algorithm is usually very fast, it is common to run it multiple times with different starting conditions.', 'However, in the worst case, k-means can be very slow to converge: in particular it has been shown that there exist certain point sets, even in 2 dimensions, on which k-means takes exponential time']


# Remove stop words

In [10]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
 
data = "All work and no play makes jack dull boy. All work and no play makes jack a dull boy."
# get a set of english stop words
stopWords = set(stopwords.words('english'))
words = word_tokenize(data)
wordsFiltered = []
 
for w in words:
    if w not in stopWords:
        wordsFiltered.append(w)
 # wordsFiltered - a list which do not contain any stop words
print(wordsFiltered)

['All', 'work', 'play', 'makes', 'jack', 'dull', 'boy', '.', 'All', 'work', 'play', 'makes', 'jack', 'dull', 'boy', '.']


# NLTK stemming

In [11]:
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
 
words = ["game","gaming","gamed","games"]
ps = PorterStemmer()
 
for word in words:
    print(ps.stem(word))

game
game
game
game


# word stemming for sentences

In [12]:
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
 
ps = PorterStemmer()
 
sentence = "gaming, the gamers play games"
words = word_tokenize(sentence)
 
for word in words:
    print(word + ":" + ps.stem(word))

gaming:game
,:,
the:the
gamers:gamer
play:play
games:game


# Speach tagging

In [16]:
import nltk
from nltk.tokenize import PunktSentenceTokenizer
 
document = 'Whether you\'re new to programming or an experienced developer, it\'s easy to learn and use Python.'
sentences = nltk.sent_tokenize(document)   
for sent in sentences:
    print(nltk.pos_tag(nltk.word_tokenize(sent)))

[('Whether', 'IN'), ('you', 'PRP'), ("'re", 'VBP'), ('new', 'JJ'), ('to', 'TO'), ('programming', 'VBG'), ('or', 'CC'), ('an', 'DT'), ('experienced', 'JJ'), ('developer', 'NN'), (',', ','), ('it', 'PRP'), ("'s", 'VBZ'), ('easy', 'JJ'), ('to', 'TO'), ('learn', 'VB'), ('and', 'CC'), ('use', 'VB'), ('Python', 'NNP'), ('.', '.')]


# Filter words based on the type of the word

In [1]:
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer
 
document = 'Today the Netherlands celebrates King\'s Day. To honor this tradition, the Dutch embassy in San Francisco invited me to'
sentences = nltk.sent_tokenize(document)   
 
data = []
for sent in sentences:
    data = data + nltk.pos_tag(nltk.word_tokenize(sent))
 
for word in data: 
    if 'NNP' in word[1]: 
        print(word)

('Netherlands', 'NNP')
('King', 'NNP')
('Day', 'NNP')
('San', 'NNP')
('Francisco', 'NNP')


# Sentiment analysis with nltk

In [2]:
# import required libraries
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import names

In [3]:
def word_feats(words):
    return dict([(word, True) for word in words])
 
positive_vocab = [ 'awesome', 'outstanding', 'fantastic', 'terrific', 'good', 'nice', 'great', ':)' ]
negative_vocab = [ 'bad', 'terrible','useless', 'hate', ':(' ]
neutral_vocab = [ 'movie','the','sound','was','is','actors','did','know','words','not' ]
 
positive_features = [(word_feats(pos), 'pos') for pos in positive_vocab]
negative_features = [(word_feats(neg), 'neg') for neg in negative_vocab]
neutral_features = [(word_feats(neu), 'neu') for neu in neutral_vocab]
 
train_set = negative_features + positive_features + neutral_features
 
classifier = NaiveBayesClassifier.train(train_set) 

In [10]:
# Predict
neg = 0
pos = 0

# getting the inout via the keyboard
sentence = input("Review: ")
#sentence = "Awesome movie, I liked it :D"
sentence = sentence.lower()
words = sentence.split(' ')
for word in words:
    classResult = classifier.classify( word_feats(word))
    if classResult == 'neg':
        neg = neg + 1
    if classResult == 'pos':
        pos = pos + 1
 
print('Positive: ' + str(float(pos)/len(words)))
print('Negative: ' + str(float(neg)/len(words)))

Review: movie was bad :(
Positive: 0.0
Negative: 0.5
