# Introduction to NLTK

In [3]:
import nltk

In [5]:
# NOTE(review): called with no arguments this appears to start NLTK's interactive
# downloader (see the "showing info ...index.xml" output below), which waits for a
# user to pick packages; prefer naming each package, e.g. nltk.download('punkt').
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [24]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /home/harsh_linux/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [6]:
# corpus - A large collection of text
from nltk.corpus import brown

In [8]:
print(brown.categories())
print(len(brown.categories()))

['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']
15


In [9]:
data = brown.sents(categories='adventure')

In [10]:
len(data)

4637

In [18]:
# first sentence of the adventure category, re-joined from its word tokens
' '.join(data[0])

'Dan Morgan told himself he would forget Ann Turner .'

# Bag of words pipeline
 - get the data/corpus
 - tokenisation, stopword removal
 - stemming
 - Building a vocab
 - vectorisation
 - classification

## Tokenisation & stopword removal

In [21]:
document = """It was a very plesant day. The weather was cool and there were light showers.
I went to the market to buy some fruits."""

sentence = "Send all the 50 documents related to chapters 1,2,3 at prateek@cb.com"

In [22]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [29]:
sents = sent_tokenize(document)
print(sents)
print(len(sents))

['It was a very plesant day.', 'The weather was cool and there were light showers.', 'I went to the market to buy some fruits.']
3


In [30]:
sents[2]

'I went to the market to buy some fruits.'

In [32]:
sentence.split()

['Send',
 'all',
 'the',
 '50',
 'documents',
 'related',
 'to',
 'chapters',
 '1,2,3',
 'at',
 'prateek@cb.com']

In [33]:
words = word_tokenize(sentence)

In [34]:
words

['Send',
 'all',
 'the',
 '50',
 'documents',
 'related',
 'to',
 'chapters',
 '1,2,3',
 'at',
 'prateek',
 '@',
 'cb.com']

## Stopwords

In [38]:
from nltk.corpus import stopwords
sw = set(stopwords.words('english'))

In [39]:
print(sw)

{'herself', 'during', 'each', 'about', "didn't", 'had', 'again', 'below', "mightn't", 's', "isn't", 'and', 'while', 'won', 'o', 'by', 're', 'aren', 'here', 'haven', 'an', 'when', 'y', "it's", 'more', 'doesn', 'into', 'this', 'has', 'very', 'hasn', 'if', "shouldn't", 'yourselves', 'mustn', "weren't", 'll', "don't", 'will', 'i', "should've", 'against', 'these', "hadn't", 'most', 'only', 'on', 'be', 'so', 'few', 'm', 'her', 'me', 'such', 'until', 'they', 'doing', 'other', 'all', 'she', 'didn', 'wouldn', 'him', 'couldn', 'theirs', "mustn't", 'being', 'our', 'it', 'off', 'am', "aren't", "you're", 'we', 'after', "you'd", 'its', 'under', "haven't", 'have', 'them', 'can', "won't", 'where', 'too', 'having', 't', 'same', 'a', 'just', 'as', "she's", 'needn', 'how', 'further', 'you', 'did', "that'll", 'between', 'ain', 'is', 'any', 'own', 'or', "doesn't", 'was', 'some', 'their', 'both', "needn't", 'do', 'himself', 'that', 'the', 'don', 'for', 'wasn', 'why', 'there', 'of', 'those', "wouldn't", 'whi

In [40]:
def remove_stopwords(text, stopwords):
    """Return the tokens of ``text`` that do not appear in ``stopwords``.

    Parameters
    ----------
    text : iterable
        Tokens to filter; their relative order is preserved.
    stopwords : container
        Anything supporting the ``in`` operator; tokens found in it are dropped.

    Returns
    -------
    list
        The surviving tokens, in their original order.
    """
    kept = []
    for token in text:
        if token in stopwords:
            continue
        kept.append(token)
    return kept

In [41]:
text = "i am not bothered about her very much".split()
useful_text = remove_stopwords(text, sw)
print(useful_text)

['bothered', 'much']


## Tokenisation using a regular expression
 - reference: "https://www.regexpal.com/"
 - used for creating custom logic for the separation of words

In [42]:
sentence = "Send all the 50 characters related to chapter 1,2,3 at prateek@cb.com"

In [43]:
from nltk.tokenize import RegexpTokenizer

In [46]:
# A token is any run of ASCII letters, '@' or '.', so digits and commas are
# discarded while an e-mail address such as prateek@cb.com stays in one piece.
tokenizer = RegexpTokenizer('[a-zA-Z@.]+')#above link
useful_text = tokenizer.tokenize(sentence)

In [47]:
useful_text

['Send',
 'all',
 'the',
 'characters',
 'related',
 'to',
 'chapter',
 'at',
 'prateek@cb.com']

## Stemming
 - Process that transforms particular words (verbs & plurals) into their radical (root) form
 - Preserve the semantics of the sentence without increasing the number of unique tokens
 - Example - jumps, jumping, jumped, jump ==> jump

In [48]:

text= """Foxes love to make jumps.The quick brown fox was seen jumping over the 
        lovely dog from a 6ft feet high wall"""


There are three types of stemmers
 - Snowball
 - porter
 - lancaster

In [50]:
from nltk.stem.snowball import SnowballStemmer, PorterStemmer
from nltk.stem.lancaster import LancasterStemmer

In [51]:
ps = PorterStemmer()

In [52]:
ps.stem('jumping')

'jump'

In [55]:
ps.stem('worked')

'work'

In [56]:
ps.stem('loving')

'love'

In [58]:
#snowball is a multilingual stemmer
ss = SnowballStemmer('english')

In [59]:
ss.stem("lovely")

'love'

In [60]:
ss.stem("smoking")

'smoke'

In [63]:
##lemmatization - reference: WordNet (wordnet.princeton.edu)
from nltk.stem import WordNetLemmatizer

wn = WordNetLemmatizer()
# No part-of-speech is passed here and the word comes back unchanged ('jumping').
# NOTE(review): lemmatize() presumably treats its input as a noun by default;
# passing pos='v' should give 'jump' - confirm against the NLTK documentation.
wn.lemmatize('jumping')

'jumping'

In [64]:
wn.lemmatize('running')

'running'

## Building a vocab & Vectorization

In [65]:
# Sample Corpus - Contains 4 Documents, each document can have 1 or more sentences
corpus = [
        'Indian cricket team will wins World Cup, says Capt. Virat Kohli. World cup will be held at Sri Lanka.',
        'We will win next Lok Sabha Elections, says confident Indian PM',
        'The nobel laurate won the hearts of the people.',
        'The movie Raazi is an exciting Indian Spy thriller based upon a real story.'
]

In [67]:
from sklearn.feature_extraction.text import CountVectorizer

In [68]:
cv = CountVectorizer()

In [70]:
vectorized_corpus = cv.fit_transform(corpus)

In [73]:
vectorized_corpus = vectorized_corpus.toarray()

In [77]:
vectorized_corpus[0]

array([0, 1, 0, 1, 1, 0, 1, 2, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 2, 0, 1, 0, 2])

In [78]:
print(cv.vocabulary_)

{'indian': 12, 'cricket': 6, 'team': 31, 'will': 37, 'wins': 39, 'world': 41, 'cup': 7, 'says': 27, 'capt': 4, 'virat': 35, 'kohli': 14, 'be': 3, 'held': 11, 'at': 1, 'sri': 29, 'lanka': 15, 'we': 36, 'win': 38, 'next': 19, 'lok': 17, 'sabha': 26, 'elections': 8, 'confident': 5, 'pm': 23, 'the': 32, 'nobel': 20, 'laurate': 16, 'won': 40, 'hearts': 10, 'of': 21, 'people': 22, 'movie': 18, 'raazi': 24, 'is': 13, 'an': 0, 'exciting': 9, 'spy': 28, 'thriller': 33, 'based': 2, 'upon': 34, 'real': 25, 'story': 30}


In [83]:
len(cv.vocabulary_.keys()), len(vectorized_corpus[1])

(42, 42)

In [84]:
#Reverse mapping
numbers = vectorized_corpus[2]
numbers

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 1, 0])

In [85]:
# inverse_transform works on a 2-D (n_samples, n_features) matrix, so present the
# single document row as a 1 x n_features array before mapping counts back to terms.
s = cv.inverse_transform(numbers.reshape(1, -1))
print(s)

[array(['hearts', 'laurate', 'nobel', 'of', 'people', 'the', 'won'],
      dtype='<U9')]


## Vectorization with Stopword Removal

In [86]:
# CountVectorizer accepts a `tokenizer` callable that overrides its built-in
# tokenisation step; run `CountVectorizer?` interactively to read the full docstring.

In [88]:
def myTokenizer(document):
    """Tokenise ``document`` for use as a CountVectorizer ``tokenizer``.

    The text is lower-cased, split with the notebook-level regex ``tokenizer``
    and then stripped of every token present in the stopword set ``sw``.

    Returns the list of remaining tokens.
    """
    lowered = document.lower()
    # Tokenise first, then filter the stopwords out of the token list
    return remove_stopwords(tokenizer.tokenize(lowered), sw)

In [91]:
myTokenizer(sentence)
#print(sentence)

['send', 'characters', 'related', 'chapter', 'prateek@cb.com']

In [92]:
cv = CountVectorizer(tokenizer=myTokenizer)

In [93]:
# Learn the vocabulary with the custom tokenizer and densify the sparse count matrix
vectorized_corpus = cv.fit_transform(corpus).toarray()

In [94]:
print(vectorized_corpus)

[[0 1 0 1 2 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 1 0 1 2]
 [0 0 1 0 0 1 0 0 0 1 0 0 0 1 0 1 0 0 1 0 0 1 1 0 0 0 0 0 0 0 1 0 0]
 [0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0 0 0 0 1 1 0 0 1 0 1 0 1 1 0 0 0 0]]


In [95]:
print(len(vectorized_corpus[0]))

33


In [96]:
cv.inverse_transform(vectorized_corpus)

[array(['capt.', 'cricket', 'cup', 'held', 'indian', 'kohli.', 'lanka.',
        'says', 'sri', 'team', 'virat', 'wins', 'world'], dtype='<U9'),
 array(['confident', 'elections', 'indian', 'lok', 'next', 'pm', 'sabha',
        'says', 'win'], dtype='<U9'),
 array(['hearts', 'laurate', 'nobel', 'people.'], dtype='<U9'),
 array(['based', 'exciting', 'indian', 'movie', 'raazi', 'real', 'spy',
        'story.', 'thriller', 'upon'], dtype='<U9')]

In [97]:
# for test data
test_corpus = [
        'Indian cricket team rock!.',
]

In [98]:
#for training data call the fit_transform() and on the test data call the transform()
cv.transform(test_corpus).toarray()

array([[0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]])

In [99]:
cv.vocabulary_

{'indian': 9,
 'cricket': 3,
 'team': 26,
 'wins': 31,
 'world': 32,
 'cup': 4,
 'says': 22,
 'capt.': 1,
 'virat': 29,
 'kohli.': 10,
 'held': 8,
 'sri': 24,
 'lanka.': 11,
 'win': 30,
 'next': 15,
 'lok': 13,
 'sabha': 21,
 'elections': 5,
 'confident': 2,
 'pm': 18,
 'nobel': 16,
 'laurate': 12,
 'hearts': 7,
 'people.': 17,
 'movie': 14,
 'raazi': 19,
 'exciting': 6,
 'spy': 23,
 'thriller': 27,
 'based': 0,
 'upon': 28,
 'real': 20,
 'story.': 25}

## More ways to Create Features
 - Unigram - every word as a feature
 - Bigrams
 - Trigrams
 - n-grams
 - TF-IDF Normalisation

In [108]:
# Each sample is wrapped in a one-element list, so the raw string lives at index 0
sent_1  = ["this is good movie"]
sent_2 = ["this is good movie but actor is not present"]
sent_3 = ["this is not good movie"]

# Note: this is a list of one-element lists, not a flat list of strings
corpus = [sent_1,sent_2,sent_3]

In [115]:
cv = CountVectorizer(ngram_range=(1,3))#(2, 2) -> bigram

In [116]:
docs = [sent_1[0], sent_2[0]]
cv.fit_transform(docs).toarray()

array([[0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
        1],
       [1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1]])

In [117]:
cv.vocabulary_

{'this': 20,
 'is': 9,
 'good': 6,
 'movie': 14,
 'this is': 21,
 'is good': 10,
 'good movie': 7,
 'this is good': 22,
 'is good movie': 11,
 'but': 3,
 'actor': 0,
 'not': 17,
 'present': 19,
 'movie but': 15,
 'but actor': 4,
 'actor is': 1,
 'is not': 12,
 'not present': 18,
 'good movie but': 8,
 'movie but actor': 16,
 'but actor is': 5,
 'actor is not': 2,
 'is not present': 13}

## TF-IDF Normalisation
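 - Avoid features that occur very often, because they contain less information
 - Information decreases as the number of occurrences increases across different types of documents
 - So we define another term - inverse document frequency (IDF) - which associates a weight with every term

$$\text{tf-idf}(t, d) = \text{tf}(t, d) \times \log\left(\frac{N}{1 + \text{count}(t)}\right)$$

where $N$ is the total number of documents and $\text{count}(t)$ is the number of documents that contain the term $t$.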

In [118]:
sent_1  = "this is good movie"
sent_2 = "this was good movie"
sent_3 = "this is not good movie"

corpus = [sent_1,sent_2,sent_3]

In [119]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [120]:
tfidf = TfidfVectorizer()

In [121]:
vc = tfidf.fit_transform(corpus).toarray()

In [122]:
print(vc)

[[0.46333427 0.59662724 0.46333427 0.         0.46333427 0.        ]
 [0.41285857 0.         0.41285857 0.         0.41285857 0.69903033]
 [0.3645444  0.46941728 0.3645444  0.61722732 0.3645444  0.        ]]


In [123]:
tfidf.vocabulary_

{'this': 4, 'is': 1, 'good': 0, 'movie': 2, 'was': 5, 'not': 3}

In [None]:
# 'good' (column 0) gets the lowest weight (shared with 'movie' and 'this') because it occurs with the same frequency in every document