# Nafisur Rahman
nafisur21@gmail.com<br>
https://www.linkedin.com/in/nafisur-rahman

# NLP Preprocessing tasks

In [1]:
import nltk

In [2]:
# Sample two-sentence text used by the tokenization and stopword-removal cells.
# NOTE(review): 'fieece' looks like a typo for 'fleece' — confirm before fixing,
# since the recorded outputs in this notebook contain the misspelled token.
text='Mary had a little lamb. Her fieece was white as snow'

## Some common tasks in NLP
1. Tokenization
2. Stopword Removal
3. N-Grams
4. Normalization and Stemming
5. Part-of-speech (POS) tagging
6. Word Sense Disambiguation



## 1. Tokenization
* word_tokenize 
* sent_tokenize

In [3]:
from nltk.tokenize import word_tokenize,sent_tokenize

Splitting the text into sentences

In [4]:
# Split the raw text into a list of sentence strings.
sents=sent_tokenize(text)
sents

['Mary had a little lamb.', 'Her fieece was white as snow']

Splitting the text into words

In [5]:
# Split the raw text into word-level tokens; the '.' ends up as its own token.
words=word_tokenize(text)
words

['Mary',
 'had',
 'a',
 'little',
 'lamb',
 '.',
 'Her',
 'fieece',
 'was',
 'white',
 'as',
 'snow']

In [6]:
# Number of tokens, counting the '.' punctuation token as well.
len(words)

12

Splitting each sentence into words

In [7]:
# Tokenize sentence by sentence: the result holds one list of word tokens per sentence.
words=list(map(word_tokenize,sents))
print(words)

[['Mary', 'had', 'a', 'little', 'lamb', '.'], ['Her', 'fieece', 'was', 'white', 'as', 'snow']]


## 2. Stopword Removal

In [8]:
from nltk.corpus import stopwords
from string import punctuation

In [9]:
# string.punctuation is a single string; list() breaks it into one-character entries.
list(punctuation)

['!',
 '"',
 '#',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 '+',
 ',',
 '-',
 '.',
 '/',
 ':',
 ';',
 '<',
 '=',
 '>',
 '?',
 '@',
 '[',
 '\\',
 ']',
 '^',
 '_',
 '`',
 '{',
 '|',
 '}',
 '~']

Adding punctuation to the stopwords

In [10]:
# Build one lookup set holding the English stopwords plus every punctuation character.
customStopwords=set(stopwords.words('english')).union(punctuation)

In [11]:
# Size of the combined stopword/punctuation set.
len(customStopwords)

211

Removing stopwords from text

In [12]:
# Keep only the tokens that are absent from customStopwords.
# NOTE(review): the membership test is case-sensitive, so the capitalised 'Her'
# survives in the recorded output — lowercase each token before the lookup if
# that is not intended (the recorded outputs would then need to be refreshed).
wordsWOstopwords=[
    token
    for token in word_tokenize(text)
    if token not in customStopwords
]
print(wordsWOstopwords)

['Mary', 'little', 'lamb', 'Her', 'fieece', 'white', 'snow']


## 3. N-grams

Finding frequency of bigrams

In [13]:
from nltk.collocations import BigramCollocationFinder

In [14]:
# Build a bigram finder over the stopword-filtered token list.
finder=BigramCollocationFinder.from_words(wordsWOstopwords)
# ngram_fd holds (bigram, count) entries; sorting its items orders them by bigram.
sorted(finder.ngram_fd.items())

[(('Her', 'fieece'), 1),
 (('Mary', 'little'), 1),
 (('fieece', 'white'), 1),
 (('lamb', 'Her'), 1),
 (('little', 'lamb'), 1),
 (('white', 'snow'), 1)]

## 4. Normalization and stemming
* PorterStemmer
* WordNetLemmatizer
* LancasterStemmer

In [15]:
# Sentence containing several inflections of "close" (closed, closing, close).
text2 = "Mary closed on closing night when she was in the mood to close."

In [16]:
# Variants of "list" that differ in letter case and suffix.
text3="list List lists listed listing LIST"

In [17]:
# word_tokenize separates the trailing '.' from 'close' into its own token.
word_tokenize(text2)

['Mary',
 'closed',
 'on',
 'closing',
 'night',
 'when',
 'she',
 'was',
 'in',
 'the',
 'mood',
 'to',
 'close',
 '.']

In [18]:
# text3 has no punctuation, so the tokens are simply its six words (case preserved).
word_tokenize(text3)

['list', 'List', 'lists', 'listed', 'listing', 'LIST']

In [19]:
# Plain whitespace split: the final period stays attached to the last word ('close.').
text2.split()

['Mary',
 'closed',
 'on',
 'closing',
 'night',
 'when',
 'she',
 'was',
 'in',
 'the',
 'mood',
 'to',
 'close.']

In [20]:
# Split on a single space character; text3 is single-spaced, so this yields its six words.
text3.split(sep=' ')

['list', 'List', 'lists', 'listed', 'listing', 'LIST']

Converting all capital letters to lowercase and then splitting

In [21]:
# Lowercase the sentence, then split it on single spaces.
# NOTE(review): split() leaves the period attached ('close.'), and this token list is
# what the stemming, lemmatizing and POS-tagging cells consume; word_tokenize(text2.lower())
# would separate the period into its own token.
words=text2.lower().split(sep=' ')
print(words)

['mary', 'closed', 'on', 'closing', 'night', 'when', 'she', 'was', 'in', 'the', 'mood', 'to', 'close.']


In [22]:
# Lowercased, whitespace-split tokens of text3.
words2=text3.lower().split()
words2

['list', 'list', 'lists', 'listed', 'listing', 'list']

In [23]:
from nltk.stem import PorterStemmer
# Porter stemmer instance used by the next cell.
porter1=PorterStemmer()

In [24]:
# Stem every token: in the recorded output 'closed' and 'closing' both become 'close',
# while 'close.' comes back unchanged (presumably because of its attached period).
[porter1.stem(w) for w in words]

['mari',
 'close',
 'on',
 'close',
 'night',
 'when',
 'she',
 'wa',
 'in',
 'the',
 'mood',
 'to',
 'close.']

In [25]:
from nltk.stem import LancasterStemmer
# Lancaster stemmer instance used by the following cells.
lancaster=LancasterStemmer()

In [26]:
# Stem every token with Lancaster; the recorded output shows 'closed'/'closing' -> 'clos'.
[lancaster.stem(w) for w in words]

['mary',
 'clos',
 'on',
 'clos',
 'night',
 'when',
 'she',
 'was',
 'in',
 'the',
 'mood',
 'to',
 'close.']

In [27]:
# Run the Lancaster stemmer over each token of text3.
list(map(lancaster.stem,word_tokenize(text3)))

['list', 'list', 'list', 'list', 'list', 'list']

In [28]:
from nltk.stem import WordNetLemmatizer
# WordNet-based lemmatizer instance used by the following cells.
WNlemma=WordNetLemmatizer()

In [29]:
# lemmatize() is called without a part-of-speech argument here.
# NOTE(review): in the recorded output 'closed' and 'closing' come back unchanged and
# 'was' becomes 'wa' — presumably because the default part of speech is noun; passing
# pos='v' for verbs should yield the verb lemmas. Confirm against the NLTK docs.
[WNlemma.lemmatize(w) for w in words]

['mary',
 'closed',
 'on',
 'closing',
 'night',
 'when',
 'she',
 'wa',
 'in',
 'the',
 'mood',
 'to',
 'close.']

In [30]:
# In the recorded output only the plural 'lists' is reduced; 'listed' and 'listing'
# are returned unchanged.
[WNlemma.lemmatize(w) for w in words2]

['list', 'list', 'list', 'listed', 'listing', 'list']

## 5. Part of speech (POS)

In [31]:
# Tag each token with a part-of-speech label.
# NOTE(review): the input is the lowercased, space-split token list, so the name is
# passed as 'mary' and the last token is still 'close.' with its period attached.
nltk.pos_tag(words)

[('mary', 'NN'),
 ('closed', 'VBD'),
 ('on', 'IN'),
 ('closing', 'NN'),
 ('night', 'NN'),
 ('when', 'WRB'),
 ('she', 'PRP'),
 ('was', 'VBD'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('mood', 'NN'),
 ('to', 'TO'),
 ('close.', 'VB')]

In [32]:
# Tag the lowercased text3 variants; each token is tagged as part of this six-token sequence.
nltk.pos_tag(words2)

[('list', 'NN'),
 ('list', 'NN'),
 ('lists', 'NNS'),
 ('listed', 'VBD'),
 ('listing', 'JJ'),
 ('list', 'NN')]

In [33]:
# Print the tagset help entry (description and example words) for the 'VBD' tag.
nltk.help.upenn_tagset('VBD')

VBD: verb, past tense
    dipped pleaded swiped regummed soaked tidied convened halted registered
    cushioned exacted snubbed strode aimed adopted belied figgered
    speculated wore appreciated contemplated ...


## 6. Word Sense Disambiguation
Identifying the meaning of a word from the context in which it occurs

In [34]:
from nltk.corpus import wordnet

In [35]:
# Show every WordNet sense of 'bass' together with its definition.
for synset in wordnet.synsets('bass'):
    print(synset,synset.definition())

Synset('bass.n.01') the lowest part of the musical range
Synset('bass.n.02') the lowest part in polyphonic music
Synset('bass.n.03') an adult male singer with the lowest voice
Synset('sea_bass.n.01') the lean flesh of a saltwater fish of the family Serranidae
Synset('freshwater_bass.n.01') any of various North American freshwater fish with lean flesh (especially of the genus Micropterus)
Synset('bass.n.06') the lowest adult male singing voice
Synset('bass.n.07') the member with the lowest range of a family of musical instruments
Synset('bass.n.08') nontechnical name for any of numerous edible marine and freshwater spiny-finned fishes
Synset('bass.s.01') having or denoting a low vocal or instrumental range


In [36]:
from nltk.wsd import lesk

In [37]:
# Ask the Lesk algorithm which sense of 'bass' fits this tokenized context sentence.
meaning=lesk(word_tokenize('sing in a lower tone, along with the bass'),'bass')
# NOTE(review): lesk presumably returns None when it finds no candidate sense, in which
# case .definition() would raise AttributeError — not an issue for this fixed input.
print(meaning,meaning.definition())

Synset('bass.n.07') the member with the lowest range of a family of musical instruments


In [38]:
# Ask the Lesk algorithm which sense of 'rule' fits this tokenized context sentence.
meaning=lesk(word_tokenize('I follow the rule define by the company'),'rule')
# NOTE(review): lesk presumably returns None when it finds no candidate sense, in which
# case .definition() would raise AttributeError — not an issue for this fixed input.
print(meaning,meaning.definition())

Synset('rule.n.08') directions that define the way a game or sport is to be conducted
