In [7]:
import nltk

In [8]:
# nltk.download('brown')

In [9]:
from nltk.corpus import brown

In [11]:
brown.categories()

['adventure',
 'belles_lettres',
 'editorial',
 'fiction',
 'government',
 'hobbies',
 'humor',
 'learned',
 'lore',
 'mystery',
 'news',
 'religion',
 'reviews',
 'romance',
 'science_fiction']

In [12]:
brown.words()

['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', ...]

In [15]:
data = brown.sents(categories=["adventure"])

In [16]:
# tokenized sentences from the 'adventure' category (each sentence is a list of word tokens)
data

[['Dan', 'Morgan', 'told', 'himself', 'he', 'would', 'forget', 'Ann', 'Turner', '.'], ['He', 'was', 'well', 'rid', 'of', 'her', '.'], ...]

In [17]:
len(data)

4637

In [74]:
" ".join(data[3])

"If he had married her , he'd have been asking for trouble ."

In [20]:
# Tokenization

from nltk.tokenize import sent_tokenize, word_tokenize

In [39]:
# Multi-sentence sample text; passed to sent_tokenize below.
document = """ It was a very good movie. The cast was amazing, and I liked the story.
I went to the movie hall to see it.
"""

# Short sample containing an '@'; passed to word_tokenize and, later, to the regex tokenizer.
sentence = "Leena is OP leena@gmail"

In [40]:
# nltk.download('punkt')

In [41]:
sents = sent_tokenize(document)
print(sents)
len(sents)

[' It was a very good movie.', 'The cast was amazing, and I liked the story.', 'I went to the movie hall to see it.']


3

In [42]:
words = word_tokenize(sentence)
print(words)
len(words)

['Leena', 'is', 'OP', 'leena', '@', 'gmail']


6

# Stopword Removal

In [46]:
from nltk.corpus import stopwords

In [45]:
# nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/leenagoyal/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [47]:
sw = set(stopwords.words('english'))

In [49]:
# ignore these words
sw 

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [50]:
# Plain whitespace split: punctuation stays attached to its word, so the
# last token is 'player.' rather than 'player'.
text = "i am not a very good cricket player.".split()
print(text)

['i', 'am', 'not', 'a', 'very', 'good', 'cricket', 'player.']


In [51]:
def remove_stopwords(text, stopwords):
    """Return the tokens of ``text`` that are not contained in ``stopwords``.

    Args:
        text: iterable of tokens (e.g. a list of words).
        stopwords: container supporting ``in`` (e.g. a set of words to drop).

    Returns:
        A new list with the surviving tokens in their original order.
        Matching is exact, so case and attached punctuation matter.
    """
    kept = []
    for token in text:
        if token in stopwords:
            continue
        kept.append(token)
    return kept

In [52]:
useful_words = remove_stopwords(text,sw)

In [53]:
useful_words

['good', 'cricket', 'player.']

In [54]:
# Tokenization using regex

In [55]:
sent = "My email is leena@gmail.com, please don't spam my inbox"

In [56]:
from nltk.tokenize import RegexpTokenizer

In [58]:
# Tokens are maximal runs of ASCII letters, '@' and ','. Any other character
# (spaces, '.', apostrophes, digits, ...) ends the current token and is dropped.
tokenizer = RegexpTokenizer('[a-zA-Z@,]+')
# Note: this tokenizes `sentence` (defined with the sample texts earlier),
# not the `sent` string assigned just above.
useful = tokenizer.tokenize(sentence)
print(useful)

['Leena', 'is', 'OP', 'leena@gmail']


# Stemming

In [61]:
# love, loved, loving, loves -----> love
from nltk.stem import SnowballStemmer, PorterStemmer, LancasterStemmer

In [62]:
ps = PorterStemmer()

In [63]:
ps.stem('laughing')

'laugh'

In [65]:
# SnowballStemmer is multilingual: the language is selected by name at construction
ss = SnowballStemmer('english')

In [66]:
ss.stem('loving')

'love'

In [67]:
ls = LancasterStemmer()

In [68]:
ls.stem("running")

'run'

In [75]:
# Small demo corpus for CountVectorizer: four space-joined sentences.
# The first and last match data[0] and " ".join(data[3]) shown earlier;
# presumably all four come from the Brown 'adventure' sentences.
corpus = [
    'Dan Morgan told himself he would forget Ann Turner .',
    'He had no idea how much time Budd would give him .',
    'Sometimes he woke up in the middle of the night thinking of Ann , and then could not get back to sleep .',
    "If he had married her , he'd have been asking for trouble ."    
]

In [76]:
from sklearn.feature_extraction.text import CountVectorizer

In [77]:
cv = CountVectorizer()

In [78]:
vc = cv.fit_transform(corpus)

In [86]:
# Row 0 is the count vector of corpus[0]. Its first word, 'dan', maps to
# column 7 in the vocabulary, so vc[0][7] == 1 (the word occurs once).
#
# Re-encode from the already fitted vectorizer instead of calling
# vc.toarray() on vc itself: rebinding vc to its own dense form makes the
# cell fail on a second run, because a NumPy array has no .toarray().
vc = cv.transform(corpus).toarray()
print(vc[0])
print(cv.vocabulary_)

[0 1 0 0 0 0 0 1 0 1 0 0 0 0 1 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
 1 0 1 0 0 1]
{'dan': 7, 'morgan': 24, 'told': 37, 'himself': 17, 'he': 14, 'would': 42, 'forget': 9, 'ann': 1, 'turner': 39, 'had': 12, 'no': 27, 'idea': 19, 'how': 18, 'much': 25, 'time': 35, 'budd': 5, 'give': 11, 'him': 16, 'sometimes': 31, 'woke': 41, 'up': 40, 'in': 21, 'the': 32, 'middle': 23, 'of': 29, 'night': 26, 'thinking': 34, 'and': 0, 'then': 33, 'could': 6, 'not': 28, 'get': 10, 'back': 3, 'to': 36, 'sleep': 30, 'if': 20, 'married': 22, 'her': 15, 'have': 13, 'been': 4, 'asking': 2, 'for': 8, 'trouble': 38}


In [87]:
# vocabulary_ is learned when the vectorizer is fitted; the trailing underscore marks such fitted attributes
print(len(cv.vocabulary_))

43


In [88]:
numbers = vc[2]

In [89]:
print(numbers)

[1 1 0 1 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 1 0 1 0 0 1 0 1 2 1 1 2 1 1 0 1
 0 0 0 1 1 0]


In [91]:
def myTokenizer(document):
    """Lower-case ``document``, split it with the regex tokenizer, and drop stopwords.

    Relies on three names defined in earlier cells: ``tokenizer`` (the
    RegexpTokenizer instance), ``remove_stopwords`` and ``sw`` (the English
    stopword set).

    Returns:
        The remaining tokens as a list, in their original order.
    """
    lowered = document.lower()
    tokens = tokenizer.tokenize(lowered)
    # Filter out stopwords before handing the tokens back.
    return remove_stopwords(tokens, sw)

In [92]:
myTokenizer('this is a random text')

['random', 'text']

In [94]:
# Replace the earlier vectorizer with one that splits documents using
# myTokenizer (regex tokens, lower-cased, stopwords removed).
cv = CountVectorizer(tokenizer=myTokenizer)

In [101]:
# fit learns the vocabulary (the features) from the corpus; transform then encodes each document as a vector of token counts over that vocabulary
vc = cv.fit_transform(corpus).toarray()



In [103]:
print(vc)

[[0 1 0 0 0 0 1 1 0 0 0 0 0 1 0 0 0 0 0 0 1 0 1 0 1]
 [0 0 0 0 1 0 0 0 0 1 1 0 0 0 1 0 0 0 0 1 0 0 0 0 1]
 [1 1 0 1 0 1 0 0 1 0 0 0 1 0 0 1 1 1 1 0 0 0 0 1 0]
 [1 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0]]


In [99]:
# vector length with the custom tokenizer (stopwords removed): 25 features, down from 43 with the default settings
len(vc[0])

25

In [100]:
cv.vocabulary_

{'dan': 6,
 'morgan': 13,
 'told': 20,
 'would': 24,
 'forget': 7,
 'ann': 1,
 'turner': 22,
 'idea': 10,
 'much': 14,
 'time': 19,
 'budd': 4,
 'give': 9,
 'sometimes': 17,
 'woke': 23,
 'middle': 12,
 'night': 15,
 'thinking': 18,
 ',': 0,
 'could': 5,
 'get': 8,
 'back': 3,
 'sleep': 16,
 'married': 11,
 'asking': 2,
 'trouble': 21}

In [105]:
cv.fit(['hey there!'])

In [107]:
# fit_transform re-fits the vectorizer, so the vocabulary now comes from
# `sent` alone; the row length equals the number of distinct tokens kept.
len(cv.fit_transform([sent]).toarray()[0])



6

In [108]:
cv.vocabulary_

{'email': 1, 'leena@gmail': 3, 'com,': 0, 'please': 4, 'spam': 5, 'inbox': 2}