In [65]:
import nltk
import pandas as pd
text = 'This is just an example sentence for your reference'

In [66]:
from nltk import word_tokenize
words = word_tokenize(text)
words

['This', 'is', 'just', 'an', 'example', 'sentence', 'for', 'your', 'reference']

In [67]:
# Attaching a speech tag to each word
tags = nltk.pos_tag(words)

In [68]:
# To count words, count words with specific properties, count sequences and count tags, nltk defines the class:
# nltk.FreqDist
from nltk.probability import FreqDist

# example
list = ['a','b','a']
fdist = FreqDist(list)
fdist

FreqDist({'a': 2, 'b': 1})

In [69]:
fdist['a'] # 'a' is present twice

2

In [70]:
fdist['c'] # we see 'c' 0 times

0

In [71]:
fdist.max() # the maximum seen letter is 'a'

'a'

In [72]:
len(fdist) # we observed 2 types of objects in the stream

2

In [73]:
fdist.keys() # Return the types of the objects that were observed in the stream

dict_keys(['a', 'b'])

In [74]:
fdist.freq('a') # 2/3 of the samples we saw were 'a'

0.6666666666666666

In [75]:
fdist.N() # How many samples did we count?

3

In [76]:
# NLTK contains a collection of tagged corpora. They are arranged as Python objects for the purpose of convenience.
# I have used brown corpus in this example. The tagged_sents version is a list of sentences.
# Each sentence is a list of pairs (tuples)(word,tag)
# Corpus can also be accessed as a flat list of tagged words

from nltk.corpus import brown
brown_news_tagged = brown.tagged_sents(tagset='universal')
brown_news_words = brown.tagged_words(tagset='universal')

In [77]:
fdistw = FreqDist([w for (w, t) in brown_news_words])
fdistw.N()    # We saw 1,161,192 words in this section of the corpus

1161192

In [78]:
len(fdistw)  # How many distinct words are there

56057

In [79]:
fdistw.max()  # What is the most frequent word?

'the'

In [80]:
fdistw['the']   # How often does 'the' occur in the corpus

62713

In [81]:
# frequency distribution of 'the'
print('%5.2f%%' % (fdistw.freq('the') * 100))

 5.40%


In [82]:
# Process textual data using TF-IDF
s1 = 'The car is driven on the road'
s2 = 'The truck is driven on the highway'
s3 = 'The plane is flying in the air'
s4 = 'People in Austin love keeping cats'

In [83]:
# Import Tfidf vectorizer from sklearn
from sklearn.feature_extraction.text import TfidfVectorizer

In [84]:
# Initialize the vectorizer and then call fit and transform over it to calculate the TF-IDF score of the text
vectorizer = TfidfVectorizer()
response = vectorizer.fit_transform([s1,s2])

In [63]:
print(response)

  (0, 6)	0.604379551537
  (0, 0)	0.42471718587
  (0, 3)	0.302189775769
  (0, 1)	0.302189775769
  (0, 4)	0.302189775769
  (0, 5)	0.42471718587
  (1, 6)	0.604379551537
  (1, 3)	0.302189775769
  (1, 1)	0.302189775769
  (1, 4)	0.302189775769
  (1, 7)	0.42471718587
  (1, 2)	0.42471718587


In [86]:
# Bag of words 
from sklearn.feature_extraction.text import CountVectorizer
corpus = ['All my cats in a row','When my cat sits down, she looks like a Furby toy!','The cat from outer space','Sunshine loves to sit like this for some reason.']

vectorizer = CountVectorizer()
print( vectorizer.fit_transform(corpus).todense() )
print( vectorizer.vocabulary_ )

[[1 0 1 0 0 0 0 1 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0]
 [0 1 0 1 0 0 1 0 1 1 0 1 0 0 0 1 0 1 0 0 0 0 0 0 1 1]
 [0 1 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 1 0 0 0 0]
 [0 0 0 0 1 0 0 0 1 0 1 0 0 1 0 0 1 0 1 0 1 0 1 1 0 0]]
{'all': 0, 'my': 11, 'cats': 2, 'in': 7, 'row': 14, 'when': 25, 'cat': 1, 'sits': 17, 'down': 3, 'she': 15, 'looks': 9, 'like': 8, 'furby': 6, 'toy': 24, 'the': 21, 'from': 5, 'outer': 12, 'space': 19, 'sunshine': 20, 'loves': 10, 'to': 23, 'sit': 16, 'this': 22, 'for': 4, 'some': 18, 'reason': 13}


In [92]:
# Naive Bayes
train = [('I love this sandwich.', 'pos'),
('This is an amazing place!', 'pos'),
('I feel very good about these beers.', 'pos'),
('This is my best work.', 'pos'),
("What an awesome view", 'pos'),
('I do not like this restaurant', 'neg'),
('I am tired of this stuff.', 'neg'),
("I can't deal with this", 'neg'),
('He is my sworn enemy!', 'neg'),
('My boss is horrible.', 'neg')]

In [93]:
# The first element of each tuple should be a dictionary of features. 
# So change the list into a data structure that the classifier can work with
from nltk.tokenize import word_tokenize

In [94]:
all_words = set(word.lower() for passage in train for word in word_tokenize(passage[0]))

In [95]:
t = [({word: (word in word_tokenize(x[0])) for word in all_words}, x[1]) for x in train]

In [96]:
# Data is now structures
t

[({'!': False,
   '.': True,
   'about': False,
   'am': False,
   'amazing': False,
   'an': False,
   'awesome': False,
   'beers': False,
   'best': False,
   'boss': False,
   'ca': False,
   'deal': False,
   'do': False,
   'enemy': False,
   'feel': False,
   'good': False,
   'he': False,
   'horrible': False,
   'i': False,
   'is': False,
   'like': False,
   'love': True,
   'my': False,
   "n't": False,
   'not': False,
   'of': False,
   'place': False,
   'restaurant': False,
   'sandwich': True,
   'stuff': False,
   'sworn': False,
   'these': False,
   'this': True,
   'tired': False,
   'very': False,
   'view': False,
   'what': False,
   'with': False,
   'work': False},
  'pos'),
 ({'!': True,
   '.': False,
   'about': False,
   'am': False,
   'amazing': True,
   'an': True,
   'awesome': False,
   'beers': False,
   'best': False,
   'boss': False,
   'ca': False,
   'deal': False,
   'do': False,
   'enemy': False,
   'feel': False,
   'good': False,
   'he': F

In [97]:
# The first element of each tuple is now a dictionary. The classifier can be trained now.
classifier = nltk.NaiveBayesClassifier.train(t)

In [98]:
classifier.show_most_informative_features()

Most Informative Features
                    this = True              neg : pos    =      2.3 : 1.0
                    this = False             pos : neg    =      1.8 : 1.0
                      an = False             neg : pos    =      1.6 : 1.0
                       . = True              pos : neg    =      1.4 : 1.0
                       . = False             neg : pos    =      1.4 : 1.0
                      am = False             pos : neg    =      1.2 : 1.0
                sandwich = False             neg : pos    =      1.2 : 1.0
                    very = False             neg : pos    =      1.2 : 1.0
                horrible = False             pos : neg    =      1.2 : 1.0
                    love = False             neg : pos    =      1.2 : 1.0


In [99]:
# If you want to use the classifier, you can do it like this. First, begin with a test sentence:

test_sentence = "This is the best band I've ever heard!"

In [100]:
# Tokenize the sentence and figure out which words the sentence shares with all_words. These constitute the sentence's features.

test_sent_features = {word.lower(): (word in word_tokenize(test_sentence.lower())) for word in all_words}

In [101]:
# This is how the features look like
test_sent_features

{'!': True,
 '.': False,
 'about': False,
 'am': False,
 'amazing': False,
 'an': False,
 'awesome': False,
 'beers': False,
 'best': True,
 'boss': False,
 'ca': False,
 'deal': False,
 'do': False,
 'enemy': False,
 'feel': False,
 'good': False,
 'he': False,
 'horrible': False,
 'i': True,
 'is': True,
 'like': False,
 'love': False,
 'my': False,
 "n't": False,
 'not': False,
 'of': False,
 'place': False,
 'restaurant': False,
 'sandwich': False,
 'stuff': False,
 'sworn': False,
 'these': False,
 'this': True,
 'tired': False,
 'very': False,
 'view': False,
 'what': False,
 'with': False,
 'work': False}

In [102]:
# Now classify the features
classifier.classify(test_sent_features)   # This sentence appears to be positive

'pos'