In [11]:
import nltk

In [12]:
from nltk.corpus import brown

In [14]:
brown.categories()

['adventure',
 'belles_lettres',
 'editorial',
 'fiction',
 'government',
 'hobbies',
 'humor',
 'learned',
 'lore',
 'mystery',
 'news',
 'religion',
 'reviews',
 'romance',
 'science_fiction']

In [15]:
brown.words()

['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', ...]

In [58]:
data = brown.sents(categories=["adventure"])

In [59]:
len(data)

4637

In [69]:
" ".join(data[9])

'He found that if he was tired enough at night , he went to sleep simply because he was too exhausted to stay awake .'

## Tokenization

In [22]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [23]:
document = ''' It was a very good movie. The cast was amazing and I liked the story.
I went to the movie hall to see it.
'''

In [24]:
sentence = "Code for cause is too OP"

In [27]:
sents = sent_tokenize(document)
print(sents)

[' It was a very good movie.', 'The cast was amazing and I liked the story.', 'I went to the movie hall to see it.']


In [28]:
words = word_tokenize(sentence)
print(words)

['Code', 'for', 'cause', 'is', 'too', 'OP']


In [29]:
# will also breakdown special characters like @

## Stopword removal

In [31]:
from nltk.corpus import stopwords

In [34]:
sw = set(stopwords.words('english'))
print(sw)

{'of', 'should', "doesn't", 'own', 'ourselves', "hadn't", 'again', 'doesn', 't', 'haven', "you're", 'be', 'and', 'through', 'while', 'once', 'more', 'couldn', 'until', 'up', 'for', 'between', "isn't", 'were', 'themselves', 'been', 'has', 'then', 'hasn', "you've", 'just', 'here', 'very', 'hadn', 'can', 'because', "hasn't", 'than', 'below', 'how', 'mustn', 'yourself', 'y', 'now', "needn't", 'isn', 'our', 'being', 'after', 'or', 'with', 'yours', 'nor', 've', 'did', 'theirs', 'some', 'won', 'yourselves', 'you', "haven't", 'hers', 'out', 'himself', 'whom', 'wouldn', "she's", 'any', 'ain', "aren't", 'down', 'i', "mightn't", "weren't", "shouldn't", 'doing', "don't", 'mightn', "wasn't", "it's", 'd', "shan't", 'against', "mustn't", 'in', 'only', 'we', 'he', 'ours', "you'd", 'does', 'weren', 'a', 'having', 'if', 'them', 'at', 'the', "won't", 'why', 'from', 'to', 'she', 'itself', "should've", 'they', 'his', 'further', 're', 'shan', 'under', 'that', 'ma', 'where', 'herself', 'its', 'll', 'do', 'as

In [36]:
text = "I am not a very good cricket player".split()
text

['I', 'am', 'not', 'a', 'very', 'good', 'cricket', 'player']

In [37]:
def removestopwords(text,stopwords):
    return [w for w in text if w not in stopwords]

In [38]:
useful = removestopwords(text,sw)
useful

['I', 'good', 'cricket', 'player']

In [39]:
from nltk.tokenize import RegexpTokenizer

In [43]:
sentence = "Code for cause is too OP msvn@gmail.com"

In [44]:
tokenizer = RegexpTokenizer('[a-zA-Z@.]+')
useful = tokenizer.tokenize(sentence)
print(useful)

['Code', 'for', 'cause', 'is', 'too', 'OP', 'msvn@gmail.com']


## Stemming

In [45]:
# nltk provides us Porter, Snowball, Lancaster stemmers

In [46]:
from nltk.stem import SnowballStemmer, PorterStemmer, LancasterStemmer

In [47]:
ps = PorterStemmer()

In [48]:
ps.stem("Running")

'run'

In [49]:
# SnowballStemmer is used for multilingual stemming

In [70]:
corpus = [
    'Dan Morgan told himself he would forget Ann Turner .',
    'Sometimes he woke up in the middle of the night thinking of Ann , and then could not get back to sleep .',
    'His plans and dreams had revolved around her so much and for so long that now he felt as if he had nothing .',
    'He found that if he was tired enough at night , he went to sleep simply because he was too exhausted to stay awake .'
]

In [71]:
from sklearn.feature_extraction.text import CountVectorizer

In [72]:
cv = CountVectorizer()

In [73]:
vc = cv.fit_transform(corpus)

In [80]:
vc = vc.toarray()
print(vc[2])
print(cv.vocabulary_)

[2 0 1 1 0 0 0 0 0 0 1 0 0 1 1 0 0 0 2 2 1 0 1 1 0 1 0 0 1 0 0 1 1 0 1 1 0
 0 2 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0]
{'dan': 9, 'morgan': 27, 'told': 47, 'himself': 21, 'he': 19, 'would': 54, 'forget': 15, 'ann': 1, 'turner': 49, 'sometimes': 39, 'woke': 53, 'up': 50, 'in': 24, 'the': 42, 'middle': 26, 'of': 33, 'night': 29, 'thinking': 44, 'and': 0, 'then': 43, 'could': 8, 'not': 30, 'get': 17, 'back': 6, 'to': 46, 'sleep': 37, 'his': 22, 'plans': 34, 'dreams': 10, 'had': 18, 'revolved': 35, 'around': 2, 'her': 20, 'so': 38, 'much': 28, 'for': 14, 'long': 25, 'that': 41, 'now': 32, 'felt': 13, 'as': 3, 'if': 23, 'nothing': 31, 'found': 16, 'was': 51, 'tired': 45, 'enough': 11, 'at': 4, 'went': 52, 'simply': 36, 'because': 7, 'too': 48, 'exhausted': 12, 'stay': 40, 'awake': 5}


In [82]:
print(len(cv.vocabulary_))

55


In [85]:
numbers = vc[2]
print(len(numbers))

55


In [86]:
def myTokenizer(document):
    words = tokenizer.tokenize(document.lower())
    words = removestopwords(words,sw)
    return words

In [87]:
myTokenizer('this is a random text')

['random', 'text']

In [89]:
cv = CountVectorizer(tokenizer=myTokenizer)

In [90]:
vc = cv.fit_transform(corpus).toarray()



In [91]:
print(vc)

[[1 1 0 0 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 1]
 [1 1 0 0 1 1 0 0 0 0 0 0 0 1 0 1 0 0 1 0 0 0 0 1 1 0 1 0 0 0 0 1 0]
 [1 0 1 0 0 0 0 1 0 0 1 0 0 0 1 0 0 1 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 1 0 0 0 0 1 1 0 0 1 0 0 0 0 0 1 0 0 0 1 1 0 1 0 1 0 0 1 0 0]]


In [92]:
len(vc[0])

33

In [93]:
cv.vocabulary_

{'dan': 6,
 'morgan': 16,
 'told': 28,
 'would': 32,
 'forget': 11,
 'ann': 1,
 'turner': 29,
 '.': 0,
 'sometimes': 24,
 'woke': 31,
 'middle': 15,
 'night': 18,
 'thinking': 26,
 'could': 5,
 'get': 13,
 'back': 4,
 'sleep': 23,
 'plans': 20,
 'dreams': 7,
 'revolved': 21,
 'around': 2,
 'much': 17,
 'long': 14,
 'felt': 10,
 'nothing': 19,
 'found': 12,
 'tired': 27,
 'enough': 8,
 'went': 30,
 'simply': 22,
 'exhausted': 9,
 'stay': 25,
 'awake': 3}

In [97]:
len(cv.transform([sentence]).toarray()[0]) # Here, sentence is the test data.. we use fit_transform for training only.

33

In [98]:
# TF-IDF normalization explore: used for ignoring words that repeat often in different document

In [99]:
# explore unigram, bigram, triagram, n-grams