### Stemming in NLTK

In [44]:
import pandas as pd
from nltk.stem import PorterStemmer
stem = PorterStemmer()

In [11]:
words = ["eating", "eats", "eat", "ate", "adjustable", "rafting", "ability", "meeting"]

for w in words:
    print(w, " | ", stem.stem(w))

eating  |  eat
eats  |  eat
eat  |  eat
ate  |  ate
adjustable  |  adjust
rafting  |  raft
ability  |  abil
meeting  |  meet


#### Lemmatization in Spacy

In [15]:
import spacy

In [25]:
nlp = spacy.load('en_core_web_sm')
doc = nlp("Mando talked for 3 hours although talking isn't his thing")
doc1 = nlp("eating eats eat ate adjustable rafting ability meeting better")
for d in doc:
    print(d, " | ", d.lemma_)

Mando  |  Mando
talked  |  talk
for  |  for
3  |  3
hours  |  hour
although  |  although
talking  |  talk
is  |  be
n't  |  not
his  |  his
thing  |  thing


## Customizing lemmatizer

In [28]:
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [30]:
ar = nlp.get_pipe('attribute_ruler')
ar.add([[{'TEXT':'Bro'}], [{'TEXT':'Brah'}]], {'LEMMA':"Brother"})

In [32]:
doc = nlp("Bro, you wanna go? Brah, don't say no! I am exhausted")
for token in doc:
    print(token, " | ", token.lemma_)

Bro  |  Brother
,  |  ,
you  |  you
wanna  |  wanna
go  |  go
?  |  ?
Brah  |  Brother
,  |  ,
do  |  do
n't  |  not
say  |  say
no  |  no
!  |  !
I  |  I
am  |  be
exhausted  |  exhaust


# Exercies

In [35]:
import nltk
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
nltk.download('all')


import spacy
nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     C:\Users\muham\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\abc.zip.
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     C:\Users\muham\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     C:\Users\muham\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping taggers\averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     C:\Users\muham\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers\averaged_perceptron_tagger_eng.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     C:\Users\muham\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       tagge

#### Exercise1:

- Convert these list of words into base form using Stemming and Lemmatization and observe the transformations
- Write a short note on the words that have different base words using stemming and Lemmatization

In [46]:
lst_words = ['running', 'painting', 'walking', 'dressing', 'likely', 'children', 'whom', 'good', 'ate', 'fishing']
stemmed_words = {'Before Stemming': [w for w in lst_words], 'After Stemming': [stemmer.stem(w) for w in lst_words]}
stemmed_words_df = pd.DataFrame(stemmed_words)
stemmed_words_df    

Unnamed: 0,Before Stemming,After Stemming
0,running,run
1,painting,paint
2,walking,walk
3,dressing,dress
4,likely,like
5,children,children
6,whom,whom
7,good,good
8,ate,ate
9,fishing,fish


In [48]:
doc = nlp("running painting walking dressing likely children who good ate fishing")
for token in doc:
    print(token, "|", token.lemma_)

running | run
painting | paint
walking | walk
dressing | dress
likely | likely
children | child
who | who
good | good
ate | eat
fishing | fishing
