In [1]:
import numpy as np
from nltk.corpus import brown
from nltk.tokenize import sent_tokenize,word_tokenize,RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.snowball import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

### 1. Data Collection

In [2]:
# List the category names exposed by the Brown corpus reader.
brown.categories()

['adventure',
 'belles_lettres',
 'editorial',
 'fiction',
 'government',
 'hobbies',
 'humor',
 'learned',
 'lore',
 'mystery',
 'news',
 'religion',
 'reviews',
 'romance',
 'science_fiction']

In [3]:
# Load the sentences of the 'editorial' category; each sentence comes back
# as a list of word tokens.
data=brown.sents(categories='editorial')
# Report how many sentences were loaded and show the first two as samples.
print(len(data))
print(data[0])
print(data[1])

2997
['Assembly', 'session', 'brought', 'much', 'good']
['The', 'General', 'Assembly', ',', 'which', 'adjourns', 'today', ',', 'has', 'performed', 'in', 'an', 'atmosphere', 'of', 'crisis', 'and', 'struggle', 'from', 'the', 'day', 'it', 'convened', '.']


## Basic NLP Pipeline
* Data collection
* Tokenization, stopword removal, stemming, lemmatization
* Building a common vocab
* Vectorizing the documents
* Perform Classification/Clustering

### 2. Tokenization

In [4]:
# Toy input: two sentences containing numbers, comma-separated lists and
# e-mail addresses, used to exercise the tokenizers.
text="Send all the 50 documents related to clauses 1,2,3 at abc@xyz.com. Send all the 80 documents related to clauses 4,5,6 at def@uvx.com"
print(text)

Send all the 50 documents related to clauses 1,2,3 at abc@xyz.com. Send all the 80 documents related to clauses 4,5,6 at def@uvx.com


In [5]:
# Normalise case first, then split the text into sentences.
text=text.lower()
sents=sent_tokenize(text)
# The sample text contains exactly two sentences; print both.
print(sents[0])
print(sents[1])

send all the 50 documents related to clauses 1,2,3 at abc@xyz.com.
send all the 80 documents related to clauses 4,5,6 at def@uvx.com


In [6]:
# Split the first sentence into word-level tokens.
# Note in the output that the e-mail address is broken into 'abc', '@', 'xyz.com'.
word_list=word_tokenize(sents[0])
print(word_list)

['send', 'all', 'the', '50', 'documents', 'related', 'to', 'clauses', '1,2,3', 'at', 'abc', '@', 'xyz.com', '.']


### 3. Stopword Removal

In [7]:
# Keep NLTK's English stopword list as a set so membership tests are cheap.
sw=set(stopwords.words('english'))
# Show the size of the set and its contents.
print(len(sw))
print(sw)

179
{'into', 'a', 'do', "shouldn't", 'shan', 'from', 'just', 'how', 'hers', 'ours', 'needn', "you've", 'is', 'in', "don't", 'because', 'll', 'yours', "haven't", 'wasn', "wouldn't", 'its', 'y', 'than', 'were', "hadn't", 're', "couldn't", "isn't", 'down', 'hasn', 'this', 'be', 'up', 'when', 'where', 'but', 'here', 'then', 'our', 'their', 'both', 'has', 'nor', 'does', 'too', 'out', 'more', 'further', 'ma', 'most', 'theirs', "that'll", 'why', 'shouldn', 'any', 'you', 'as', 'between', 'of', 'what', 'until', "doesn't", 'few', 'i', 'd', 'yourself', "didn't", 'once', 'to', 'me', 'are', 'after', 'under', 'can', 'who', 'these', 'm', 'having', 'herself', 'been', 've', "won't", 'myself', "it's", 'which', 'there', "you'll", 'mightn', 'mustn', 'above', 'only', 'during', 'doesn', 'won', 'such', 'did', 'if', 'ourselves', 'each', 'by', 'not', 'itself', 'doing', 'o', 'she', "you'd", 'ain', 'had', 'against', 's', 'him', 'he', 'them', 'couldn', "you're", 'or', 'at', 'other', 'very', 'about', 'didn', 'hadn

In [8]:
def filter_words(word_list, stop_words=None):
    """Return the tokens of ``word_list`` that are not stopwords.

    The comparison is case-insensitive: each token is lowercased before the
    membership test, so capitalised stopwords such as "The" are removed as
    well. Tokens that are kept are returned unchanged, in their original
    order and case.

    Parameters
    ----------
    word_list : iterable of str
        Tokens to filter.
    stop_words : collection of str, optional
        Lowercase stopwords to drop. Defaults to the notebook-level ``sw``
        set when omitted.

    Returns
    -------
    list of str
        The tokens whose lowercase form is not in the stopword collection.
    """
    # Fall back to the global stopword set only when no override is given,
    # so existing one-argument callers behave as before.
    active_stop_words = sw if stop_words is None else stop_words
    return [w for w in word_list if w.lower() not in active_stop_words]

In [9]:
# Drop the stopwords from the word-tokenized first sentence and show what is left.
fw=filter_words(word_list)
print(fw)

['send', '50', 'documents', 'related', 'clauses', '1,2,3', 'abc', '@', 'xyz.com', '.']


### Tokenization using Regular Expression
* NLTK's `word_tokenize` cannot be customised for more complex cases (e.g. dropping numbers or handling e-mail addresses), so we use NLTK's `RegexpTokenizer` class with our own pattern

In [10]:
# Tokens are maximal runs of ASCII letters and '@'; digits and punctuation
# act as separators and are discarded.
# NOTE(review): '.' is not part of the character class, so the e-mail
# address is split at the dot ('abc@xyz', 'com') - confirm this is intended.
tokenizer=RegexpTokenizer("[a-zA-Z@]+")
# Re-tokenize the first sentence with the regex tokenizer, then remove stopwords.
word_list=tokenizer.tokenize(sents[0])
fw=filter_words(word_list)
print(fw)

['send', 'documents', 'related', 'clauses', 'abc@xyz', 'com']


### 4. Stemming
* Process that transforms inflected words (verb forms, plurals) into their root (stem) form.
* Preserve the semantics of the sentence without increasing the number of unique tokens.
* jumps, jumped, jumping, jump => jump


* **Snowball** **Stemmer**
* **Porter** **Stemmer**
* **Lancaster** **Stemmer**


In [11]:
# Porter stemmer instance used by the stemming examples.
ps=PorterStemmer()

In [12]:
# Stem a handful of sample words with the Porter stemmer, one result per line.
print(*map(ps.stem, ("Jumps", "Jumped", "Jumping", "Awesome", "Teeth")), sep="\n")

jump
jump
jump
awesom
teeth


In [13]:
# Lancaster stemmer: create an instance and stem two sample words,
# one result per line.
ls = LancasterStemmer()
print(*map(ls.stem, ("teeth", "Awesome")), sep="\n")

tee
awesom


In [14]:
# Snowball stemmer configured for English: stem two sample words,
# one result per line.
ss = SnowballStemmer('english')
print(*map(ss.stem, ("Teenager", "Awesome")), sep="\n")

teenag
awesom


### 5. Building Common Vocabulary and Vectorizing Documents (Based upon bag of words model)

In [15]:
# Four short documents used as the toy corpus for the vectorizer examples.
corpus = [
        'Indian cricket team will wins World Cup, says Capt. Virat Kohli. World cup will be held at Sri Lanka.',
        'We will win next Lok Sabha Elections, says confident Indian PM',
        'The nobel laurate won the hearts of the people',
        'The movie Raazi is an exciting Indian Spy thriller based upon a real story'
]

In [16]:
# Bag-of-words vectorizer with default settings.
cv=CountVectorizer()

In [17]:
# Learn the vocabulary from the corpus and build the dense document-term
# count matrix (one row per document).
vectorized_corpus = cv.fit_transform(corpus).toarray()
# Print the number of rows first, then the matrix itself.
print(vectorized_corpus.shape[0], vectorized_corpus, sep="\n")

4
[[0 1 0 1 1 0 1 2 0 0 0 1 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 1
  0 2 0 1 0 2]
 [0 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 0 1 0 1 0 0 0 1 0 0 1 1 0 0 0 0 0 0 0 0
  1 1 1 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 3 0 0 0
  0 0 0 0 1 0]
 [1 0 1 0 0 0 0 0 0 1 0 0 1 1 0 0 0 0 1 0 0 0 0 0 1 1 0 0 1 0 1 0 1 1 1 0
  0 0 0 0 0 0]]


In [18]:
# vocabulary_ maps each learned term to its column index in the count matrix.
print(cv.vocabulary_)

{'indian': 12, 'cricket': 6, 'team': 31, 'will': 37, 'wins': 39, 'world': 41, 'cup': 7, 'says': 27, 'capt': 4, 'virat': 35, 'kohli': 14, 'be': 3, 'held': 11, 'at': 1, 'sri': 29, 'lanka': 15, 'we': 36, 'win': 38, 'next': 19, 'lok': 17, 'sabha': 26, 'elections': 8, 'confident': 5, 'pm': 23, 'the': 32, 'nobel': 20, 'laurate': 16, 'won': 40, 'hearts': 10, 'of': 21, 'people': 22, 'movie': 18, 'raazi': 24, 'is': 13, 'an': 0, 'exciting': 9, 'spy': 28, 'thriller': 33, 'based': 2, 'upon': 34, 'real': 25, 'story': 30}


In [19]:
# Build a dense indicator vector: every position "on" except indices 3..6.
# NOTE(review): the length is hard-coded to 37, while the vocabulary printed
# in the previous cell has 42 entries (indices 0-41). Terms at indices 37-41
# therefore never appear in the result - confirm whether
# len(cv.vocabulary_) was intended.
vector = np.ones((37,))
vector[3:7] = 0
print(vector)
print(len(vector))
# inverse_transform expects a 2-D (n_samples, n_features) input; passing the
# bare 1-D vector is rejected by current scikit-learn, so present it as a
# single-row matrix. The result is a one-element list of term arrays.
print(cv.inverse_transform(vector.reshape(1, -1)))

[1. 1. 1. 0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
37
[array(['an', 'at', 'based', 'cup', 'elections', 'exciting', 'hearts',
       'held', 'indian', 'is', 'kohli', 'lanka', 'laurate', 'lok',
       'movie', 'next', 'nobel', 'of', 'people', 'pm', 'raazi', 'real',
       'sabha', 'says', 'spy', 'sri', 'story', 'team', 'the', 'thriller',
       'upon', 'virat', 'we'], dtype='<U9')]


### Custom Tokenizer

In [20]:
def myTokenizer(sentence):
    """Tokenize ``sentence`` and drop its stopwords.

    The text is split with the notebook-level regex ``tokenizer`` and the
    resulting tokens are passed through ``filter_words``; the surviving
    tokens are returned as a list.
    """
    return filter_words(tokenizer.tokenize(sentence))

In [21]:
# Try the custom tokenizer directly on the first document. Called this way
# the text is not lowercased first, so the tokens keep their original case.
myTokenizer(corpus[0])

['Indian',
 'cricket',
 'team',
 'wins',
 'World',
 'Cup',
 'says',
 'Capt',
 'Virat',
 'Kohli',
 'World',
 'cup',
 'held',
 'Sri',
 'Lanka']

In [22]:
# Rebuild the vectorizer around the custom tokenizer (regex tokens with
# stopwords removed). token_pattern=None makes it explicit that the default
# token regex is not used, which avoids the "token_pattern will not be used"
# UserWarning that current scikit-learn emits when a tokenizer is supplied.
cv=CountVectorizer(tokenizer=myTokenizer,token_pattern=None)
# Keep the sparse matrix and a dense copy for printing.
vectorized_corpus=cv.fit_transform(corpus)
vc=vectorized_corpus.toarray()
print(vc)
# Number of columns = size of the learned vocabulary.
print(len(vc[0]))

[[0 1 0 1 2 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 1 0 1 2]
 [0 0 1 0 0 1 0 0 0 1 0 0 0 1 0 1 0 0 1 0 0 1 1 0 0 0 0 0 0 0 1 0 0]
 [0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0 0 0 0 1 1 0 0 1 0 1 0 1 1 0 0 0 0]]
33


In [23]:
# Term -> column index mapping learned with the custom tokenizer.
print(cv.vocabulary_)

{'indian': 9, 'cricket': 3, 'team': 26, 'wins': 31, 'world': 32, 'cup': 4, 'says': 22, 'capt': 1, 'virat': 29, 'kohli': 10, 'held': 8, 'sri': 24, 'lanka': 11, 'win': 30, 'next': 15, 'lok': 13, 'sabha': 21, 'elections': 5, 'confident': 2, 'pm': 18, 'nobel': 16, 'laurate': 12, 'hearts': 7, 'people': 17, 'movie': 14, 'raazi': 19, 'exciting': 6, 'spy': 23, 'thriller': 27, 'based': 0, 'upon': 28, 'real': 20, 'story': 25}


In [24]:
# Count row of the first document.
v=vc[0]
# inverse_transform expects a 2-D (n_samples, n_features) input; a bare 1-D
# row is rejected by current scikit-learn, so pass it as a single-row matrix.
# The result is a one-element list holding the terms present in the document.
print(cv.inverse_transform(v.reshape(1, -1)))

[array(['capt', 'cricket', 'cup', 'held', 'indian', 'kohli', 'lanka',
       'says', 'sri', 'team', 'virat', 'wins', 'world'], dtype='<U9')]


### Features in Bag of words model
* Uni-grams
* Bi-grams, Tri-grams
* N-Grams

In [25]:
# Same custom tokenizer, but now count uni-, bi- and tri-grams
# (ngram_range=(1,3)). token_pattern=None makes it explicit that the default
# token regex is not used, which avoids the "token_pattern will not be used"
# UserWarning that current scikit-learn emits when a tokenizer is supplied.
cv=CountVectorizer(tokenizer=myTokenizer,token_pattern=None,ngram_range=(1,3))
vectorized_corpus=cv.fit_transform(corpus)
vc=vectorized_corpus.toarray()
# Number of columns = number of distinct n-grams, followed by the count matrix.
print(len(vc[0]))
print(vc)

96
[[0 0 0 1 1 1 0 0 0 1 1 1 2 1 1 1 1 0 0 0 0 0 0 0 0 1 1 1 1 1 1 0 0 0 1 1
  1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 1
  1 0 1 1 1 0 0 0 0 0 0 1 1 1 0 0 0 1 1 1 2 2 1 1]
 [0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0
  0 0 0 0 0 1 1 1 0 0 0 1 1 1 0 0 0 0 1 0 0 0 0 0 1 1 1 1 0 0 1 1 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0
  0 0 1 1 1 0 0 0 0 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 1 0 0 0 1 1 0 0
  0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 1 1 1 1 1 0 0 0 0 0 0 0 0 1 1 1 0
  0 1 0 0 0 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0]]


In [26]:
# Term -> column index mapping; the keys now include bi-grams and tri-grams.
print(cv.vocabulary_)

{'indian': 28, 'cricket': 9, 'team': 74, 'wins': 89, 'world': 92, 'cup': 12, 'says': 63, 'capt': 3, 'virat': 83, 'kohli': 34, 'held': 25, 'sri': 71, 'lanka': 37, 'indian cricket': 29, 'cricket team': 10, 'team wins': 75, 'wins world': 90, 'world cup': 93, 'cup says': 15, 'says capt': 64, 'capt virat': 4, 'virat kohli': 84, 'kohli world': 35, 'cup held': 13, 'held sri': 26, 'sri lanka': 72, 'indian cricket team': 30, 'cricket team wins': 11, 'team wins world': 76, 'wins world cup': 91, 'world cup says': 95, 'cup says capt': 16, 'says capt virat': 65, 'capt virat kohli': 5, 'virat kohli world': 85, 'kohli world cup': 36, 'world cup held': 94, 'cup held sri': 14, 'held sri lanka': 27, 'win': 86, 'next': 47, 'lok': 41, 'sabha': 60, 'elections': 17, 'confident': 6, 'pm': 54, 'win next': 87, 'next lok': 48, 'lok sabha': 42, 'sabha elections': 61, 'elections says': 18, 'says confident': 66, 'confident indian': 7, 'indian pm': 31, 'win next lok': 88, 'next lok sabha': 49, 'lok sabha elections'

In [27]:
# N-gram count row of the first document.
v=vc[0]
# inverse_transform expects a 2-D (n_samples, n_features) input; a bare 1-D
# row is rejected by current scikit-learn, so pass it as a single-row matrix.
# The result is a one-element list holding the n-grams present in the document.
print(cv.inverse_transform(v.reshape(1, -1)))

[array(['capt', 'capt virat', 'capt virat kohli', 'cricket',
       'cricket team', 'cricket team wins', 'cup', 'cup held',
       'cup held sri', 'cup says', 'cup says capt', 'held', 'held sri',
       'held sri lanka', 'indian', 'indian cricket',
       'indian cricket team', 'kohli', 'kohli world', 'kohli world cup',
       'lanka', 'says', 'says capt', 'says capt virat', 'sri',
       'sri lanka', 'team', 'team wins', 'team wins world', 'virat',
       'virat kohli', 'virat kohli world', 'wins', 'wins world',
       'wins world cup', 'world', 'world cup', 'world cup held',
       'world cup says'], dtype='<U24')]


### Tf-idf Normalisation

* Avoid features that occur very often, because they contain less information
* Information decreases as the number of occurrences increases across different types of documents
* So we define another term - the inverse document frequency (idf) - which assigns every term a weight that shrinks as the term appears in more documents



In [28]:
# Tf-idf weighting over unigrams produced by the custom tokenizer, with each
# document row scaled to unit L2 norm. token_pattern=None makes it explicit
# that the default token regex is not used, which avoids the "token_pattern
# will not be used" UserWarning that current scikit-learn emits when a
# tokenizer is supplied.
tfidf_vectorizer = TfidfVectorizer(tokenizer=myTokenizer,token_pattern=None,ngram_range=(1,1),norm='l2')
# Fit on the corpus and convert the weighted document-term matrix to a dense array.
vectorized_corpus = tfidf_vectorizer.fit_transform(corpus).toarray()
print(vectorized_corpus)

[[0.         0.2355126  0.         0.2355126  0.4710252  0.
  0.         0.         0.2355126  0.15032464 0.2355126  0.2355126
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.18568084 0.
  0.2355126  0.         0.2355126  0.         0.         0.2355126
  0.         0.2355126  0.4710252 ]
 [0.         0.         0.35291425 0.         0.         0.35291425
  0.         0.         0.         0.22526059 0.         0.
  0.         0.35291425 0.         0.35291425 0.         0.
  0.35291425 0.         0.         0.35291425 0.27824164 0.
  0.         0.         0.         0.         0.         0.
  0.35291425 0.         0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.5        0.         0.         0.         0.
  0.5        0.         0.         0.         0.5        0.5
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0. 

In [29]:
# Term -> column index mapping learned by the tf-idf vectorizer.
print(tfidf_vectorizer.vocabulary_)

{'indian': 9, 'cricket': 3, 'team': 26, 'wins': 31, 'world': 32, 'cup': 4, 'says': 22, 'capt': 1, 'virat': 29, 'kohli': 10, 'held': 8, 'sri': 24, 'lanka': 11, 'win': 30, 'next': 15, 'lok': 13, 'sabha': 21, 'elections': 5, 'confident': 2, 'pm': 18, 'nobel': 16, 'laurate': 12, 'hearts': 7, 'people': 17, 'movie': 14, 'raazi': 19, 'exciting': 6, 'spy': 23, 'thriller': 27, 'based': 0, 'upon': 28, 'real': 20, 'story': 25}
