In [1]:
from nltk.corpus import brown

### 1.Data Collection

In [2]:
print(brown.categories())

['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']


In [4]:
brown.sents?

In [5]:
data = brown.sents(categories = "editorial")
print(data)

[['Assembly', 'session', 'brought', 'much', 'good'], ['The', 'General', 'Assembly', ',', 'which', 'adjourns', 'today', ',', 'has', 'performed', 'in', 'an', 'atmosphere', 'of', 'crisis', 'and', 'struggle', 'from', 'the', 'day', 'it', 'convened', '.'], ...]


In [6]:
print(data[0])

['Assembly', 'session', 'brought', 'much', 'good']


### 2.Tokenization

In [8]:
text = "It was a pleasant day,the weather was cool and there were light showers. I went to the market to but some fruits."
print(text)

It was a pleasant day,the weather was cool and there were light showers. I went to the market to but some fruits.


In [15]:
from nltk.tokenize import sent_tokenize,word_tokenize

In [14]:
sents = (sent_tokenize(text))
print(sents)

['It was a pleasant day,the weather was cool and there were light showers.', 'I went to the market to but some fruits.']


In [26]:
word_list = word_tokenize(sents[0].lower())
print(word_list)

['it', 'was', 'a', 'pleasant', 'day', ',', 'the', 'weather', 'was', 'cool', 'and', 'there', 'were', 'light', 'showers', '.']


### 3.Stopword Removal

In [19]:
from nltk.corpus import stopwords

In [21]:
sw = set(stopwords.words('english'))

In [22]:
print(sw)
print(len(sw))

{'was', 'can', 'y', 'while', 'most', "shan't", 'their', 'which', 'who', 'any', 'both', 'is', 'each', 'himself', 'such', 'below', "mustn't", 'they', 'hadn', 'themselves', 'how', 'doing', 'and', 'after', 'own', 'don', 'no', 'll', 'ain', "doesn't", "you'd", 'me', 'yours', 'about', 'a', "aren't", 'couldn', 'where', 'aren', 'yourself', 'myself', 'few', 'same', 'against', 'off', "mightn't", 'd', 'then', 'ours', "hasn't", 'my', 'needn', 'should', 'during', 'through', 'why', 'o', 'those', 'up', 'them', 'with', 'of', 'that', 'the', 't', 'nor', 'mightn', 'shouldn', 'into', 'your', 'him', 'doesn', 'ourselves', 'here', 'm', 'do', 'were', 'until', "you're", 'are', 'hers', 'have', 'all', "you've", 'by', 'having', 'been', 'again', 'haven', 'now', 'or', 'our', "couldn't", 'i', 've', 'we', 'isn', 'his', 'at', 'on', 'than', 'am', 'itself', 'this', 'over', "don't", 'will', "isn't", "it's", 'above', 'under', 'when', 'shan', "needn't", 'herself', 'whom', "you'll", 'only', 'has', 'did', 'more', "she's", 'on

### Filtering useful words

In [34]:
def filter_words(word_list):
    useful_words = [w for w in word_list if w not in sw]
    return useful_words

useful_words = filter_words(word_list)
print(useful_words)

['foxes', 'love', 'make', 'jump', 'quicker', 'brown', 'fox', 'seen', 'jumping', 'lovely', 'dog', 'ft', 'high', 'wall']


## Tokenization using Regular Expression

In [35]:
from nltk.tokenize import RegexpTokenizer

In [36]:
tokenizer = RegexpTokenizer("[a-zA-Z@]+")

In [37]:
text = "Send all 50 docs related to clause 1,2 at abc@xyz.com"
print(tokenizer.tokenize(text))

['Send', 'all', 'docs', 'related', 'to', 'clause', 'at', 'abc@xyz', 'com']


## Stemming
- Transforms Particular words(verbs,plurals) - Radical Forms.
- Prevents increase in number of unique tokens with same meaning
- Eg - jumps,jumped,jumping --> jump

In [38]:
text = """Foxes love to make jump. The quicker brown fox was seen jumping over the lovely dog from a 6ft high wall"""

word_list = tokenizer.tokenize(text.lower())
print(word_list)

['foxes', 'love', 'to', 'make', 'jump', 'the', 'quicker', 'brown', 'fox', 'was', 'seen', 'jumping', 'over', 'the', 'lovely', 'dog', 'from', 'a', 'ft', 'high', 'wall']


In [39]:
word_list = filter_words(word_list)
print(word_list)

['foxes', 'love', 'make', 'jump', 'quicker', 'brown', 'fox', 'seen', 'jumping', 'lovely', 'dog', 'ft', 'high', 'wall']


## Stemmers Type
- snowball stemmer
- Porter Stemmer
- Lancaster Stemmer

In [41]:
from nltk.stem.snowball import PorterStemmer,SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer

In [42]:
ps = PorterStemmer()

print(ps.stem("Lovely"))
print(ps.stem("jumped"))
print(ps.stem("awesome"))

love
jump
awesom


In [43]:
ls = LancasterStemmer()

print(ls.stem("teenager"))
print(ps.stem("teenager"))

teen
teenag


In [48]:
ss = SnowballStemmer('english')

print(ss.stem("teenager"))

teenag


In [49]:
ss_f = SnowballStemmer('french')

print(ss_f.stem("cousine"))

cousin


In [50]:
## Lemmatisation 

from nltk.stem import WordNetLemmatizer

l = WordNetLemmatizer()
print(l.lemmatize("crying"))

cry


## Building Common Vocabulary and Vectorizing Documents (based upon Bag of Words Model)

In [67]:
corpus = [
        'Indian cricket team will wins World Cup, says Capt. Virat Kohli. World cup will be held at Sri Lanka.',
        'We will win next Lok Sabha Elections, says confident Indian PM',
        'The nobel laurate won the hearts of the people',
        'The movie Raazi is an exciting Indian Spy thriller based upon a real story'
]

In [68]:
from sklearn.feature_extraction.text import CountVectorizer

In [69]:
cv = CountVectorizer()

vectorized_corpus = cv.fit_transform(corpus).toarray()

In [70]:
print(vectorized_corpus)
print(len(vectorized_corpus[0]))

[[0 1 0 1 1 0 1 2 0 0 0 1 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 1
  0 2 0 1 0 2]
 [0 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 0 1 0 1 0 0 0 1 0 0 1 1 0 0 0 0 0 0 0 0
  1 1 1 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 3 0 0 0
  0 0 0 0 1 0]
 [1 0 1 0 0 0 0 0 0 1 0 0 1 1 0 0 0 0 1 0 0 0 0 0 1 1 0 0 1 0 1 0 1 1 1 0
  0 0 0 0 0 0]]
42


In [71]:
print(cv.vocabulary_) #Dictionary - Word -> Index

{'indian': 12, 'cricket': 6, 'team': 31, 'will': 37, 'wins': 39, 'world': 41, 'cup': 7, 'says': 27, 'capt': 4, 'virat': 35, 'kohli': 14, 'be': 3, 'held': 11, 'at': 1, 'sri': 29, 'lanka': 15, 'we': 36, 'win': 38, 'next': 19, 'lok': 17, 'sabha': 26, 'elections': 8, 'confident': 5, 'pm': 23, 'the': 32, 'nobel': 20, 'laurate': 16, 'won': 40, 'hearts': 10, 'of': 21, 'people': 22, 'movie': 18, 'raazi': 24, 'is': 13, 'an': 0, 'exciting': 9, 'spy': 28, 'thriller': 33, 'based': 2, 'upon': 34, 'real': 25, 'story': 30}


In [72]:
# Given a Vector what is the sentenence 
import numpy as np
vector = np.ones((42,))
vector[3:7] = 0

print(vector)
print(len(vector))

[1. 1. 1. 0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
42


In [73]:
print(cv.inverse_transform(vector))

[array(['an', 'at', 'based', 'cup', 'elections', 'exciting', 'hearts',
       'held', 'indian', 'is', 'kohli', 'lanka', 'laurate', 'lok',
       'movie', 'next', 'nobel', 'of', 'people', 'pm', 'raazi', 'real',
       'sabha', 'says', 'spy', 'sri', 'story', 'team', 'the', 'thriller',
       'upon', 'virat', 'we', 'will', 'win', 'wins', 'won', 'world'],
      dtype='<U9')]


In [74]:
cv.vocabulary_["capt"]

4

### Reducing size of vector

In [75]:
def myTokenizer(sentence):
    words = tokenizer.tokenize(sentence.lower())
    return filter_words(words) ##Filter out the stopwords

print(myTokenizer(corpus[0]))

['indian', 'cricket', 'team', 'wins', 'world', 'cup', 'says', 'capt', 'virat', 'kohli', 'world', 'cup', 'held', 'sri', 'lanka']


In [76]:
cv = CountVectorizer(tokenizer=myTokenizer)
vectorized_corpus = cv.fit_transform(corpus)
vc = vectorized_corpus.toarray()
print(vc[0])
print(len(vc[0]))

[0 1 0 1 2 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 1 0 1 2]
33


In [80]:
v = vc[0]
print(vc[0])
print(cv.inverse_transform(v))

vc[0][2] = 1
print(vc[0])
print(cv.inverse_transform(v)) ## confident added as index corresponding to it is turned on

[1 1 1 1 2 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 1 0 1 2]
[array(['based', 'capt', 'confident', 'cricket', 'cup', 'held', 'indian',
       'kohli', 'lanka', 'says', 'sri', 'team', 'virat', 'wins', 'world'],
      dtype='<U9')]
[1 1 1 1 2 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 1 0 1 2]
[array(['based', 'capt', 'confident', 'cricket', 'cup', 'held', 'indian',
       'kohli', 'lanka', 'says', 'sri', 'team', 'virat', 'wins', 'world'],
      dtype='<U9')]


## Features in Bag of Words Model
- Unigrams
- Bigrams, Trigrams
- N-Grams

In [82]:
cv = CountVectorizer(tokenizer=myTokenizer,ngram_range=(1,2))
vectorized_corpus = cv.fit_transform(corpus)
vc = vectorized_corpus.toarray()

print(cv.vocabulary_)

{'indian': 19, 'cricket': 6, 'team': 52, 'wins': 62, 'world': 64, 'cup': 8, 'says': 44, 'capt': 2, 'virat': 58, 'kohli': 23, 'held': 17, 'sri': 49, 'lanka': 25, 'indian cricket': 20, 'cricket team': 7, 'team wins': 53, 'wins world': 63, 'world cup': 65, 'cup says': 10, 'says capt': 45, 'capt virat': 3, 'virat kohli': 59, 'kohli world': 24, 'cup held': 9, 'held sri': 18, 'sri lanka': 50, 'win': 60, 'next': 32, 'lok': 28, 'sabha': 42, 'elections': 11, 'confident': 4, 'pm': 37, 'win next': 61, 'next lok': 33, 'lok sabha': 29, 'sabha elections': 43, 'elections says': 12, 'says confident': 46, 'confident indian': 5, 'indian pm': 21, 'nobel': 34, 'laurate': 26, 'hearts': 15, 'people': 36, 'nobel laurate': 35, 'laurate hearts': 27, 'hearts people': 16, 'movie': 30, 'raazi': 38, 'exciting': 13, 'spy': 47, 'thriller': 54, 'based': 0, 'upon': 56, 'real': 40, 'story': 51, 'movie raazi': 31, 'raazi exciting': 39, 'exciting indian': 14, 'indian spy': 22, 'spy thriller': 48, 'thriller based': 55, 'b

In [83]:
print(len(vc[0]))

66


## Tf-idf Normalisation
- Avoid features that occur very often, becauase they contain less information
- Information decreases as the number of occurences increases across different type of documents
- So we define another term - term-document-frequency which associates a weight with every term

In [84]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [87]:
tfidf_vectorizer = TfidfVectorizer(tokenizer=myTokenizer,ngram_range=(1,2),norm='l2')

vectorized_corpus = tfidf_vectorizer.fit_transform(corpus).toarray()
print(vectorized_corpus)

[[0.         0.         0.17142549 0.17142549 0.         0.
  0.17142549 0.17142549 0.34285097 0.17142549 0.17142549 0.
  0.         0.         0.         0.         0.         0.17142549
  0.17142549 0.10941867 0.17142549 0.         0.         0.17142549
  0.17142549 0.17142549 0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.13515382 0.17142549 0.         0.
  0.         0.17142549 0.17142549 0.         0.17142549 0.17142549
  0.         0.         0.         0.         0.17142549 0.17142549
  0.         0.         0.17142549 0.17142549 0.34285097 0.34285097]
 [0.         0.         0.         0.         0.24977372 0.24977372
  0.         0.         0.         0.         0.         0.24977372
  0.24977372 0.         0.         0.         0.         0.
  0.         0.15942733 0.         0.24977372 0.         0.
  0.         0.         0.         0.      

In [88]:
print(tfidf_vectorizer.vocabulary_)

{'indian': 19, 'cricket': 6, 'team': 52, 'wins': 62, 'world': 64, 'cup': 8, 'says': 44, 'capt': 2, 'virat': 58, 'kohli': 23, 'held': 17, 'sri': 49, 'lanka': 25, 'indian cricket': 20, 'cricket team': 7, 'team wins': 53, 'wins world': 63, 'world cup': 65, 'cup says': 10, 'says capt': 45, 'capt virat': 3, 'virat kohli': 59, 'kohli world': 24, 'cup held': 9, 'held sri': 18, 'sri lanka': 50, 'win': 60, 'next': 32, 'lok': 28, 'sabha': 42, 'elections': 11, 'confident': 4, 'pm': 37, 'win next': 61, 'next lok': 33, 'lok sabha': 29, 'sabha elections': 43, 'elections says': 12, 'says confident': 46, 'confident indian': 5, 'indian pm': 21, 'nobel': 34, 'laurate': 26, 'hearts': 15, 'people': 36, 'nobel laurate': 35, 'laurate hearts': 27, 'hearts people': 16, 'movie': 30, 'raazi': 38, 'exciting': 13, 'spy': 47, 'thriller': 54, 'based': 0, 'upon': 56, 'real': 40, 'story': 51, 'movie raazi': 31, 'raazi exciting': 39, 'exciting indian': 14, 'indian spy': 22, 'spy thriller': 48, 'thriller based': 55, 'b