# Text Analysis with NLTK

Based on Rocky DeRaze's YouTube channel. Some of the code has been updated to current syntax, and a few additional examples are included to clarify NLP concepts.

In [1]:
import warnings
# NOTE(review): with no category or module filter this suppresses every
# warning for the rest of the session, so deprecation notices raised by
# later cells are hidden as well.
warnings.filterwarnings('ignore')

In [2]:
from IPython.core.display import HTML
# Read each stylesheet inside a `with` block so its file handle is closed
# right away instead of staying open until garbage collection.
css_parts = []
for css_path in ('style-table.css', 'style-notebook.css'):
    with open(css_path) as css_file:
        css_parts.append(css_file.read())
css = ''.join(css_parts)
# Keep HTML(...) as the last expression so the notebook renders the <style> tag.
HTML('<style>{}</style>'.format(css))

In [3]:
import nltk 

### 1)- Sentence Tokenizer

In [4]:
parag = "WASHINGTON — In his first full cabinet meeting last June, President Trump invited a chorus of gushing praise from his top aides by boasting that he had assembled a “phenomenal team of people, a great group of talent.” \
But in the nine months since then, Mr. Trump has fired or forced out a half-dozen of the “incredible, talented” people in the Cabinet Room that day: his secretaries of state and health, along with his chief strategist, his chief of staff, his top economic aide and his press secretary. \
And the purge at the top may not be over. Mr. Trump, who is famously fickle, appears to have soured on additional members of his senior leadership team — and his frequent mulling about making changes has some people around him convinced that he could act soon. \
“There will always be change. I think you want to see change,” Mr. Trump said, ominously, on Thursday. “I want to also see different ideas.”"

In [5]:
from nltk.tokenize import sent_tokenize

In [6]:
tokenized_parag = sent_tokenize(parag)

In [7]:
print(tokenized_parag)

['WASHINGTON — In his first full cabinet meeting last June, President Trump invited a chorus of gushing praise from his top aides by boasting that he had assembled a “phenomenal team of people, a great group of talent.” But in the nine months since then, Mr. Trump has fired or forced out a half-dozen of the “incredible, talented” people in the Cabinet Room that day: his secretaries of state and health, along with his chief strategist, his chief of staff, his top economic aide and his press secretary.', 'And the purge at the top may not be over.', 'Mr. Trump, who is famously fickle, appears to have soured on additional members of his senior leadership team — and his frequent mulling about making changes has some people around him convinced that he could act soon.', '“There will always be change.', 'I think you want to see change,” Mr. Trump said, ominously, on Thursday.', '“I want to also see different ideas.”']


In [8]:
print(len(tokenized_parag))

6


In [9]:
import nltk.data

In [10]:
# Load the English Punkt sentence tokenizer from the NLTK data resource
# 'tokenizers/punkt/english.pickle'.
# NOTE(review): the name `tokenizer` is rebound to a RegexpTokenizer in a
# later cell, so the sentence-tokenizer cells below must run before that one.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [11]:
for w in tokenizer.tokenize(parag):
	print(w+'\n')

WASHINGTON — In his first full cabinet meeting last June, President Trump invited a chorus of gushing praise from his top aides by boasting that he had assembled a “phenomenal team of people, a great group of talent.” But in the nine months since then, Mr. Trump has fired or forced out a half-dozen of the “incredible, talented” people in the Cabinet Room that day: his secretaries of state and health, along with his chief strategist, his chief of staff, his top economic aide and his press secretary.

And the purge at the top may not be over.

Mr. Trump, who is famously fickle, appears to have soured on additional members of his senior leadership team — and his frequent mulling about making changes has some people around him convinced that he could act soon.

“There will always be change.

I think you want to see change,” Mr. Trump said, ominously, on Thursday.

“I want to also see different ideas.”



In [12]:
arrayT = tokenizer.tokenize(parag)

In [13]:
print(arrayT[5])

“I want to also see different ideas.”


In [14]:
print(arrayT[1])

And the purge at the top may not be over.


### 2)- Word Tokenizer

In [15]:
from nltk.tokenize import word_tokenize

In [16]:
sent1 = "Let\'s hack this freaking Linux droid! We shan't wait."

In [17]:
arr = word_tokenize(sent1)

In [18]:
from nltk.tokenize import TreebankWordTokenizer

In [19]:
tok2 = TreebankWordTokenizer()

In [20]:
from nltk.tokenize import WordPunctTokenizer 

tok3 = WordPunctTokenizer()

In [21]:
print(arr)
print(tok2.tokenize(sent1))
print(tok3.tokenize(sent1))


['Let', "'s", 'hack', 'this', 'freaking', 'Linux', 'droid', '!', 'We', 'sha', "n't", 'wait', '.']
['Let', "'s", 'hack', 'this', 'freaking', 'Linux', 'droid', '!', 'We', 'sha', "n't", 'wait', '.']
['Let', "'", 's', 'hack', 'this', 'freaking', 'Linux', 'droid', '!', 'We', 'shan', "'", 't', 'wait', '.']


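Note how WordPunctTokenizer handles punctuation differently: it splits every punctuation mark into its own token, so `Let's` becomes `Let`, `'`, `s` and `shan't` becomes `shan`, `'`, `t`.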

### 3)-Regular Expression Tokenizer

In [22]:
from nltk.tokenize import regexp_tokenize 

In [23]:
parag1 = "I won't do this, you shan't do that."

In [24]:
from nltk.tokenize import word_tokenize 

In [25]:
print(word_tokenize(parag1))

['I', 'wo', "n't", 'do', 'this', ',', 'you', 'sha', "n't", 'do', 'that', '.']


In [26]:
# Raw string: "\w" in a normal string literal is an invalid escape sequence.
# The pattern value is unchanged: runs of word characters and apostrophes.
print(regexp_tokenize(parag1, r"[\w']+"))

['I', "won't", 'do', 'this', 'you', "shan't", 'do', 'that']


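With the pattern `[\w']+`, contractions such as `won't` and `shan't` stay together as single tokens (and punctuation such as commas and periods is dropped), so the word-segmentation problem is pretty much solved.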

In [27]:
# Without the trailing +, every match is a single character.
# Raw string: "\w" in a normal string literal is an invalid escape sequence.
print(regexp_tokenize(parag1, r"[\w']"))

['I', 'w', 'o', 'n', "'", 't', 'd', 'o', 't', 'h', 'i', 's', 'y', 'o', 'u', 's', 'h', 'a', 'n', "'", 't', 'd', 'o', 't', 'h', 'a', 't']


In [28]:
from nltk.tokenize import RegexpTokenizer

In [29]:
# Raw string: "\w" in a normal string literal is an invalid escape sequence.
# The pattern value is unchanged: runs of word characters only (no apostrophes).
tokenizer = RegexpTokenizer(r"[\w]+")

In [30]:
print(tokenizer.tokenize(parag1))

['I', 'won', 't', 'do', 'this', 'you', 'shan', 't', 'do', 'that']


### 4)-Stop words

In [31]:
from nltk.corpus import stopwords 

In [32]:
ensw = stopwords.words('english')

In [33]:
print(ensw)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [34]:
len(ensw)

179

In [35]:
from nltk.tokenize import word_tokenize

In [36]:
parag1 = "WASHINGTON — In his first full cabinet meeting last June, President Trump invited a chorus of gushing praise from his top aides by boasting that he had assembled a “phenomenal team of people, a great group of talent.” \
But in the nine months since then, Mr. Trump has fired or forced out a half-dozen of the “incredible, talented” people in the Cabinet Room that day: his secretaries of state and health, along with his chief strategist, his chief of staff, his top economic aide and his press secretary. \
And the purge at the top may not be over. Mr. Trump, who is famously fickle, appears to have soured on additional members of his senior leadership team — and his frequent mulling about making changes has some people around him convinced that he could act soon. \
“There will always be change. I think you want to see change,” Mr. Trump said, ominously, on Thursday. “I want to also see different ideas.”"

In [37]:
paragArr = word_tokenize(parag1.lower())

In [38]:
print(paragArr)

['washington', '—', 'in', 'his', 'first', 'full', 'cabinet', 'meeting', 'last', 'june', ',', 'president', 'trump', 'invited', 'a', 'chorus', 'of', 'gushing', 'praise', 'from', 'his', 'top', 'aides', 'by', 'boasting', 'that', 'he', 'had', 'assembled', 'a', '“', 'phenomenal', 'team', 'of', 'people', ',', 'a', 'great', 'group', 'of', 'talent.', '”', 'but', 'in', 'the', 'nine', 'months', 'since', 'then', ',', 'mr.', 'trump', 'has', 'fired', 'or', 'forced', 'out', 'a', 'half-dozen', 'of', 'the', '“', 'incredible', ',', 'talented', '”', 'people', 'in', 'the', 'cabinet', 'room', 'that', 'day', ':', 'his', 'secretaries', 'of', 'state', 'and', 'health', ',', 'along', 'with', 'his', 'chief', 'strategist', ',', 'his', 'chief', 'of', 'staff', ',', 'his', 'top', 'economic', 'aide', 'and', 'his', 'press', 'secretary', '.', 'and', 'the', 'purge', 'at', 'the', 'top', 'may', 'not', 'be', 'over', '.', 'mr.', 'trump', ',', 'who', 'is', 'famously', 'fickle', ',', 'appears', 'to', 'have', 'soured', 'on', '

In [39]:
len(paragArr)

187

In [40]:
# List comprehension that drops stop words. `ensw` is a list, so it is turned
# into a set once to make each membership test a hash lookup instead of a scan.
ensw_set = set(ensw)
filterArr = [item for item in paragArr if item not in ensw_set]

In [41]:
print(filterArr)

['washington', '—', 'first', 'full', 'cabinet', 'meeting', 'last', 'june', ',', 'president', 'trump', 'invited', 'chorus', 'gushing', 'praise', 'top', 'aides', 'boasting', 'assembled', '“', 'phenomenal', 'team', 'people', ',', 'great', 'group', 'talent.', '”', 'nine', 'months', 'since', ',', 'mr.', 'trump', 'fired', 'forced', 'half-dozen', '“', 'incredible', ',', 'talented', '”', 'people', 'cabinet', 'room', 'day', ':', 'secretaries', 'state', 'health', ',', 'along', 'chief', 'strategist', ',', 'chief', 'staff', ',', 'top', 'economic', 'aide', 'press', 'secretary', '.', 'purge', 'top', 'may', '.', 'mr.', 'trump', ',', 'famously', 'fickle', ',', 'appears', 'soured', 'additional', 'members', 'senior', 'leadership', 'team', '—', 'frequent', 'mulling', 'making', 'changes', 'people', 'around', 'convinced', 'could', 'act', 'soon', '.', '“', 'always', 'change', '.', 'think', 'want', 'see', 'change', ',', '”', 'mr.', 'trump', 'said', ',', 'ominously', ',', 'thursday', '.', '“', 'want', 'also',

In [42]:
len(filterArr)

119

### 5)-Synsets, Hypernyms and Hyponyms

In [43]:
from nltk.corpus import wordnet

In [44]:
word1 = "Movie"

In [45]:
synArray = wordnet.synsets(word1)

In [46]:
print(synArray)

[Synset('movie.n.01')]


In [47]:
# word of interest
woi = synArray[0]

In [48]:
woi

Synset('movie.n.01')

In [49]:
woi.definition()

'a form of entertainment that enacts a story by sound and a sequence of images giving the illusion of continuous movement'

In [50]:
woi.pos()

'n'

In [51]:
woi.name()

'movie.n.01'

In [52]:
woi.hypernyms()

[Synset('product.n.02'), Synset('show.n.03')]

In [53]:
woi.hyponyms()

[Synset('cinema_verite.n.01'),
 Synset('collage_film.n.01'),
 Synset('coming_attraction.n.01'),
 Synset('documentary.n.01'),
 Synset('feature.n.03'),
 Synset('film_noir.n.01'),
 Synset('final_cut.n.01'),
 Synset('home_movie.n.01'),
 Synset('musical.n.01'),
 Synset('rough_cut.n.01'),
 Synset('shoot-'em-up.n.01'),
 Synset('short_subject.n.01'),
 Synset('silent_movie.n.01'),
 Synset('skin_flick.n.01'),
 Synset('slow_motion.n.01'),
 Synset('talking_picture.n.01'),
 Synset('telefilm.n.01'),
 Synset('three-d.n.01')]

In [54]:
# Select the hyponym by synset name rather than by list position, so the cell
# does not depend on the order in which hyponyms() returns its results.
woi2 = next(h for h in woi.hyponyms() if h.name() == 'silent_movie.n.01')

In [55]:
woi2

Synset('silent_movie.n.01')

In [56]:
woi2.hypernyms()

[Synset('movie.n.01')]

In [57]:
woi2.definition()

'a movie without a soundtrack'

### 6)-Lemmas, Synonyms and Antonyms

In [58]:
from nltk.corpus import wordnet 

In [59]:
sArr = wordnet.synsets('win')

In [60]:
sArr

[Synset('win.n.01'),
 Synset('winnings.n.01'),
 Synset('win.v.01'),
 Synset('acquire.v.05'),
 Synset('gain.v.05'),
 Synset('succeed.v.01')]

In [61]:
woi = sArr[2]

In [62]:
woi.pos()

'v'

In [63]:
woi.lemmas()

[Lemma('win.v.01.win')]

In [64]:
woi.definition()

'be the winner in a contest or competition; be victorious'

In [65]:
woi.lemmas()[0].name()

'win'

In [66]:
synArr = []
antArr = []

In [67]:
# Collect the name of every lemma of every synset. Assigning the whole list in
# one expression means re-running this cell rebuilds synArr from scratch
# instead of appending duplicates to the previous contents.
synArr = [lem.name() for syn in sArr for lem in syn.lemmas()]

In [68]:
print(synArr)

['win', 'winnings', 'win', 'profits', 'win', 'acquire', 'win', 'gain', 'gain', 'advance', 'win', 'pull_ahead', 'make_headway', 'get_ahead', 'gain_ground', 'succeed', 'win', 'come_through', 'bring_home_the_bacon', 'deliver_the_goods']


In [69]:
print(len(synArr))

20


In [70]:
# to get a unique value, we will use "set"
print(set(synArr))

{'winnings', 'make_headway', 'get_ahead', 'come_through', 'pull_ahead', 'succeed', 'win', 'gain', 'advance', 'acquire', 'profits', 'deliver_the_goods', 'bring_home_the_bacon', 'gain_ground'}


In [71]:
print(len(set(synArr)))

14


In [72]:
print(woi.lemmas()[0].antonyms())

[Lemma('lose.v.02.lose')]


In [73]:
# Collect the name of every antonym of every lemma of every synset. Assigning
# the whole list in one expression means re-running this cell rebuilds antArr
# from scratch instead of appending duplicates to the previous contents.
antArr = [
    ant.name()
    for syn in sArr
    for lem in syn.lemmas()
    for ant in lem.antonyms()
]

In [74]:
print(antArr)
print(len(antArr))

['losings', 'lose', 'lose', 'fall_back', 'fail']
5


In [75]:
print(set(antArr))
print(len(set(antArr)))

{'fall_back', 'lose', 'fail', 'losings'}
4


### 7)-Wu Palmer Similarity

In [76]:
from nltk.corpus import wordnet

In [77]:
sarr1 = wordnet.synsets('cake')
sarr2 = wordnet.synsets('loaf')
sarr3 = wordnet.synsets('bread')

In [78]:
print(sarr1)

[Synset('cake.n.01'), Synset('patty.n.01'), Synset('cake.n.03'), Synset('coat.v.03')]


In [79]:
print(sarr2)

[Synset('loaf_of_bread.n.01'), Synset('loaf.n.02'), Synset('bum.v.02'), Synset('loiter.v.01')]


In [80]:
print(sarr3)

[Synset('bread.n.01'), Synset('boodle.n.01'), Synset('bread.v.01')]


In [81]:
cake = sarr1[0]

In [82]:
loafb = sarr2[0]

In [83]:
loaf = sarr2[1]

In [84]:
bread = sarr3[0]

In [85]:
print(cake.wup_similarity(loaf))

0.3076923076923077


This means the Wu-Palmer similarity between `cake` and `loaf` (the `loaf.n.02` sense) is about 0.31, i.e. roughly 30.77% on a scale from 0 to 1.

In [86]:
print(cake.wup_similarity(loafb))
print(loaf.wup_similarity(loafb))
print(bread.wup_similarity(loaf))
print(bread.wup_similarity(loafb))

0.26666666666666666
0.7142857142857143
0.7692307692307693
0.9411764705882353


In [87]:
print(loaf.hypernyms()[0])

Synset('food.n.02')


In [88]:
ref = loaf.hypernyms()[0]

In [89]:
print(loaf.shortest_path_distance(ref))
print(bread.shortest_path_distance(ref))
print(loafb.shortest_path_distance(ref))
print(cake.shortest_path_distance(ref))

1
2
3
8


### 8)-Path and LCH Similarities

In [90]:
catArr = wordnet.synsets("cat")
dogArr = wordnet.synsets("dog")

In [91]:
catArr

[Synset('cat.n.01'),
 Synset('guy.n.01'),
 Synset('cat.n.03'),
 Synset('kat.n.01'),
 Synset('cat-o'-nine-tails.n.01'),
 Synset('caterpillar.n.02'),
 Synset('big_cat.n.01'),
 Synset('computerized_tomography.n.01'),
 Synset('cat.v.01'),
 Synset('vomit.v.01')]

In [92]:
dogArr

[Synset('dog.n.01'),
 Synset('frump.n.01'),
 Synset('dog.n.03'),
 Synset('cad.n.01'),
 Synset('frank.n.02'),
 Synset('pawl.n.01'),
 Synset('andiron.n.01'),
 Synset('chase.v.01')]

In [93]:
doi = dogArr[0]
coi = catArr[0]

In [94]:
doi

Synset('dog.n.01')

In [95]:
coi

Synset('cat.n.01')

In [96]:
doi.wup_similarity(coi)

0.8571428571428571

In [97]:
doi.path_similarity(coi)

0.2

A path similarity of 1 means the two synsets are the same; the closer the value is to 1, the more similar they are.

In [98]:
doi.path_similarity(doi)

1.0

**Leacock Chodorow similarity**

In [99]:
doi.lch_similarity(coi)

2.0281482472922856

In [100]:
doi.lch_similarity(doi)

3.6375861597263857

In [101]:
coi.lch_similarity(doi)

2.0281482472922856

In [102]:
coi.lch_similarity(coi)

3.6375861597263857

Here 3.64 (3.6375…) is the highest value because it is the similarity of a synset with itself (dog with dog, or cat with cat).

### 9)-Bigrams

In [103]:
from nltk.corpus import webtext 
from nltk.collocations import BigramCollocationFinder 
from nltk.metrics import BigramAssocMeasures

In [104]:
nltk.download('webtext')

[nltk_data] Downloading package webtext to
[nltk_data]     C:\Users\Hassan\AppData\Roaming\nltk_data...
[nltk_data]   Package webtext is already up-to-date!


True

In [105]:
# getting data
textWords = [w.lower() for w in webtext.words('pirates.txt')]

In [106]:
len(textWords)

22679

In [107]:
finder = BigramCollocationFinder.from_words(textWords)

In [108]:
likeliestW = finder.nbest(BigramAssocMeasures.likelihood_ratio, 20)

In [109]:
print(likeliestW)

[("'", 's'), ('jack', 'sparrow'), (']', '['), ('will', 'turner'), ('sparrow', ':'), ('elizabeth', 'swann'), ('turner', ':'), ('davy', 'jones'), ('swann', ':'), ("'", 't'), ('flying', 'dutchman'), ('lord', 'cutler'), ('cutler', 'beckett'), ('black', 'pearl'), ('gibbs', ':'), ('[', 'jack'), ('tia', 'dalma'), ("'", 're'), ('of', 'the'), ('!', '[')]


In [110]:
len(likeliestW)

20

In [111]:
# Removing stopwords
from nltk.corpus import stopwords 
ignored_words = set(stopwords.words('english'))

In [112]:
def filterStops(w):
    """Return True for tokens shorter than 3 characters or listed in ignored_words."""
    return len(w) < 3 or w in ignored_words

In [113]:
finder.apply_word_filter(filterStops)
likeliestW = finder.nbest(BigramAssocMeasures.likelihood_ratio, 20)

In [114]:
print(likeliestW)

[('jack', 'sparrow'), ('elizabeth', 'swann'), ('davy', 'jones'), ('flying', 'dutchman'), ('lord', 'cutler'), ('cutler', 'beckett'), ('black', 'pearl'), ('tia', 'dalma'), ('cannibal', 'island'), ('port', 'royal'), ('bamboo', 'pole'), ('edinburgh', 'trader'), ('east', 'india'), ('india', 'trading'), ('wounded', 'sailor'), ('black', 'spot'), ('scuttled', 'ship'), ('isla', 'cruces'), ('slow', 'motion'), ('trading', 'company')]


Character names such as Jack Sparrow and Davy Jones are examples of bigrams (pairs of words that occur together).

### 10)- Trigrams

In [115]:
from nltk.collocations import TrigramCollocationFinder
from nltk.corpus import webtext 
from nltk.metrics import TrigramAssocMeasures

In [116]:
textWords = [w.lower() for w in webtext.words('grail.txt')]

In [117]:
len(textWords)

16967

In [118]:
print(textWords[:50])

['scene', '1', ':', '[', 'wind', ']', '[', 'clop', 'clop', 'clop', ']', 'king', 'arthur', ':', 'whoa', 'there', '!', '[', 'clop', 'clop', 'clop', ']', 'soldier', '#', '1', ':', 'halt', '!', 'who', 'goes', 'there', '?', 'arthur', ':', 'it', 'is', 'i', ',', 'arthur', ',', 'son', 'of', 'uther', 'pendragon', ',', 'from', 'the', 'castle', 'of', 'camelot']


In [119]:
finder = TrigramCollocationFinder.from_words(textWords)

In [120]:
likeliestW = finder.nbest(TrigramAssocMeasures.likelihood_ratio, 20)

In [121]:
print(likeliestW)

[('[', 'boom', ']'), ('[', 'singing', ']'), ('[', 'music', ']'), ('[', 'clang', ']'), ('.', 'arthur', ':'), ('[', 'chanting', ']'), ('[', 'pause', ']'), ('[', 'squeak', ']'), ('[', 'thud', ']'), ('[', 'bonk', ']'), ('[', 'clunk', ']'), ('[', 'crash', ']'), ('[', 'howl', ']'), ('[', 'roar', ']'), ('[', 'kick', ']'), ('[', 'trumpets', ']'), ('[', 'twang', ']'), ('[', 'twong', ']'), ('[', 'whop', ']'), ('[', 'clank', ']')]


In [122]:
len(likeliestW)

20

In [123]:
# again applying stopwords and filters
from nltk.corpus import stopwords 

ignored_words = set(stopwords.words('english'))

def filterStops(w):
    """Return True for tokens shorter than 3 characters or listed in ignored_words."""
    return len(w) < 3 or w in ignored_words

In [124]:
finder.apply_word_filter(filterStops)

In [125]:
likeliestW = finder.nbest(TrigramAssocMeasures.likelihood_ratio, 30)

In [126]:
print(likeliestW)

[('clop', 'clop', 'clop'), ('mumble', 'mumble', 'mumble'), ('squeak', 'squeak', 'squeak'), ('saw', 'saw', 'saw'), ('black', 'knight', 'kills'), ('black', 'knight', 'always'), ('pie', 'iesu', 'domine'), ('clap', 'clap', 'clap'), ('squeak', 'squeak', '...]'), ('...', 'head', 'knight'), ('dona', 'eis', 'requiem'), ('brave', 'sir', 'robin'), ('holy', 'grail', 'returns'), ('holy', 'grail', 'could'), ('heh', 'heh', 'heh'), ('king', 'arthur', 'music'), ('iesu', 'domine', '...'), ('haw', 'haw', 'haw'), ('hee', 'hee', 'hee'), ('bold', 'sir', 'robin'), ('round', 'table', 'narrator'), ('sir', 'robin', 'ran'), ('sir', 'robin', 'rode'), ('sir', 'robin', 'set'), ('round', 'table', 'shall'), ('sir', 'robin', 'turned'), ('round', 'table', 'scene'), ('holy', 'hand', 'grenade'), ('arthur', 'music', 'stops'), ('boom', 'boom', 'boom')]


In [127]:
finder.apply_freq_filter(3)

In [128]:
likeliestW = finder.nbest(TrigramAssocMeasures.likelihood_ratio, 30)

In [129]:
print(likeliestW)

[('clop', 'clop', 'clop'), ('mumble', 'mumble', 'mumble'), ('squeak', 'squeak', 'squeak'), ('saw', 'saw', 'saw'), ('pie', 'iesu', 'domine'), ('clap', 'clap', 'clap'), ('dona', 'eis', 'requiem'), ('brave', 'sir', 'robin'), ('heh', 'heh', 'heh'), ('king', 'arthur', 'music'), ('hee', 'hee', 'hee'), ('holy', 'hand', 'grenade'), ('boom', 'boom', 'boom'), ('...', 'dona', 'eis'), ('already', 'got', 'one'), ('good', 'sir', 'knight')]


### 11)- Stemming

In [130]:
from nltk.stem import PorterStemmer 
from nltk.stem import LancasterStemmer 
from nltk.stem import RegexpStemmer

In [131]:
# applying PorterSTemmer - least strict on word cutting
pstemmer = PorterStemmer()

In [132]:
print(pstemmer.stem('dancing'))
print(pstemmer.stem('dancer'))
print(pstemmer.stem('cooking'))
print(pstemmer.stem('cookery'))

danc
dancer
cook
cookeri


In [133]:
# applying LancasterStemmer - most strict on word cutting
lstemmer = LancasterStemmer()

In [134]:
print(lstemmer.stem('dancing'))
print(lstemmer.stem('dance'))
print(lstemmer.stem('dancer'))
print(lstemmer.stem('cooking'))
print(lstemmer.stem('cookery'))

dant
dant
dant
cook
cookery


In [135]:
# RegexStemmer
rstemmer = RegexpStemmer('ing')

In [136]:
print(rstemmer.stem('skiing'))
print(rstemmer.stem('cooking'))
print(rstemmer.stem('king'))

ski
cook
k


In `skiing` and `cooking`, `ing` is a suffix that can be removed, but in `king` it is part of the word itself. Because the stemmer removes `ing` wherever the pattern matches, `king` is reduced to just `k`.

### 12)- Lemmatization

In [137]:
from nltk.stem import WordNetLemmatizer 
lzr = WordNetLemmatizer()

In [138]:
print(lzr.lemmatize('dancing'))
print(lzr.lemmatize('working'))

dancing
working


**Passing `pos` tells the lemmatizer whether the word should be treated as a verb, noun, or another part of speech; without it, `dancing` and `working` above are returned unchanged**

In [139]:
print(lzr.lemmatize('dancing', pos='v'))
print(lzr.lemmatize('working', pos='v'))
print(lzr.lemmatize('working', pos='a'))

dance
work
working


In [140]:
print(lzr.lemmatize('kings'))

king


In [141]:
print(lzr.lemmatize('sings'))
print(lzr.lemmatize('sings', pos='v'))

sings
sing


In [142]:
print(lzr.lemmatize('abruptly', pos='r'))

abruptly


In [143]:
# for seeing difference in lemma and stemmer
from nltk.stem import PorterStemmer
stm = PorterStemmer()

In [144]:
print(stm.stem('dancing'))
print(lzr.lemmatize('dancing', pos='v'))

danc
dance


In [145]:
print(stm.stem('believes'))
print(lzr.lemmatize('believes'))
print(lzr.lemmatize('believes', pos='v'))

believ
belief
believe


In [146]:
print(stm.stem('buses'))
print(lzr.lemmatize('buses'))
print(stm.stem('bus'))

buse
bus
bu


### 13)-Regular Expression

In [147]:
import re 

In [148]:
regex = re.compile(r'(?i)don\'t')
fst = "Don't you dare. I don't."
sst = regex.sub('do not', fst)

In [149]:
print(fst)
print(sst)

Don't you dare. I don't.
do not you dare. I do not.


In [150]:
fst = "I won't go there. He's a mad man. He won't end that. He'd have to go now."

In [151]:
givenpatterns = [
					(r'won\'t', 'will not'), 
					(r'\'s', ' is'), 
					(r'\'d', ' would'),
					(r'mad man', 'crazy arse mother fucking anthropoid')
				]

In [152]:
def replace(text, patterns):
    """Apply each (regex, replacement) pair to `text` in order and print the result.

    text: the string to transform.
    patterns: iterable of (pattern, replacement) pairs; each pattern is applied
        to the output of the previous substitution.
    Prints the final string and returns None.
    """
    result = text
    for pattern, substitute in patterns:
        result = re.sub(pattern, substitute, result)
    print(result)

In [153]:
print(fst)
replace(fst, givenpatterns)

I won't go there. He's a mad man. He won't end that. He'd have to go now.
I will not go there. He is a crazy arse mother fucking anthropoid. He will not end that. He would have to go now.


In [154]:
import re 
regex = re.compile(r'(\w*)(\w)\2(\w*)')

In [155]:
fw = 'dramaaaatiiiic'
sw = regex.sub(r'\1\2\3', fw)

In [156]:
print(sw)

dramaaaatiiic


In [157]:
# Compiled once and kept private to looper, so the function no longer depends
# on whatever the notebook-level name `regex` is bound to when it is called.
_REPEATED_CHAR = re.compile(r'(\w*)(\w)\2(\w*)')


def looper(word):
    """Remove repeated adjacent word characters until none are left.

    word: the string to clean up, e.g. 'dramaaaatiiiic'.
    Returns the string once a substitution pass no longer changes it,
    e.g. 'dramatic'. Legitimate double letters are collapsed as well
    ('book' becomes 'bok').

    Implemented as a loop rather than recursion so that long runs of a
    repeated character cannot exceed Python's recursion limit.
    """
    current = word
    while True:
        # The replacement omits the back-reference, so each match loses one
        # copy of the doubled character per pass.
        reduced = _REPEATED_CHAR.sub(r'\1\2\3', current)
        # To see the process, uncomment the line:
        # print(reduced)
        if reduced == current:
            return reduced
        current = reduced

In [158]:
sw = looper(fw)

In [159]:
sw

'dramatic'

### 14)-Replacer

In [160]:
from replacer import RegexReplacer 

In [161]:
givenpatterns = [
					(r'won\'t', 'will not'), 
					(r'\'s', ' is'), 
					(r'\'d', ' would'),
					(r'mad man', 'crazy arse mother fucking anthropoid')
				]

In [162]:
replacer = RegexReplacer(givenpatterns)

In [163]:
txt = replacer.replace("He's gone")

In [164]:
txt

'He is gone'

In [165]:
#RepeatReplacer
from replacer import RepeatReplacer
replacer = RepeatReplacer()

In [166]:
txt = replacer.replace("Anthhhhropoiiid")

In [167]:
txt

'Anthropoid'

In [168]:
txt = replacer.replace("Book")
print(txt)

Book


In [169]:
txt = replacer.replace("cattle")
print(txt)

cattle


In [170]:
txt = replacer.replace("botttleeee")
print(txt)

bottle


In [171]:
# Replace same meaning word i.e Synonym
from replacer import WordReplacer
from nltk.tokenize import word_tokenize

In [172]:
wordmapobj = {
				'bday' : 'birthday',
				'sup' : 'what\'s up',
				'brb' : 'be right back'
			}

In [173]:
replacer = WordReplacer(wordmapobj)

In [174]:
result = replacer.replace("bday")
print(result)

birthday


In [175]:
result = replacer.replace("sup")
print(result)

what's up


In [176]:
result = replacer.replace("brb")
print(result)

be right back


In [177]:
sw = word_tokenize('Sup! Awesome bday? brb!')
sw2 = ""

In [178]:
# Build sw2 in a single assignment so re-running this cell rebuilds the string
# rather than appending to its previous value. Each replaced token is still
# followed by one space, exactly as before.
sw2 = "".join(replacer.replace(word) + " " for word in sw)

In [179]:
sw2

'Sup ! Awesome birthday ? be right back ! '

In [180]:
# Replace opposite meaning i.e Antonym
from replacer import AntonymReplacer
rep = AntonymReplacer()

In [181]:
antn = rep.replace('cowardice')
print(antn)

antn = rep.replace('heavy')
print(antn)

antn = rep.replace('weak')
print(antn)

antn = rep.replace('blue')
print(antn)

courage
light
strong
None


In [182]:
sent = rep.negreplace('this man is not salty')
print(sent)

this man is fresh 
