<a href="https://colab.research.google.com/github/Justabhi96/NLP/blob/master/03_Stemming%2C_Lammetization_and_Stop_words.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Stemming

spaCy does not include a stemmer; it only provides a lemmatizer.

So, as an alternative, we will use NLTK for stemming.

In [0]:
from nltk.stem.porter import PorterStemmer

In [0]:
# Porter stemmer instance; reused by the stemming loop in the next cell.
p_stemmer = PorterStemmer()

In [0]:
# Print each sample word next to the stem the Porter stemmer produces for it.
words = ["run", "runner", "ran", "runs", "easily", "fairly"]
for word in words:
  print(f"{word} -----> {p_stemmer.stem(word)}")

run -----> run
runner -----> runner
ran -----> ran
runs -----> run
easily -----> easili
fairly -----> fairli


In [0]:
from nltk.stem.snowball import SnowballStemmer

In [0]:
# English Snowball stemmer instance; reused by the two stemming loops below.
s_stemmer = SnowballStemmer(language="english")

In [0]:
# Same sample words as the Porter cell, so the two stemmers can be compared line by line.
words = ["run", "runner", "ran", "runs", "easily", "fairly"]
for word in words:
  print(f"{word} -----> {s_stemmer.stem(word)}")

run -----> run
runner -----> runner
ran -----> ran
runs -----> run
easily -----> easili
fairly -----> fair


In [0]:
# Related-looking words: check which of them the Snowball stemmer collapses to the same stem.
words = ["generous", "generation", "generously", "generate"]
for word in words:
  print(f"{word} -----> {s_stemmer.stem(word)}")

generous -----> generous
generation -----> generat
generously -----> generous
generate -----> generat


# Lemmatization

In [0]:
import spacy
# Load the `en_core_web_sm` pipeline once; `nlp` is reused by every
# lemmatization and stop-word cell below.
nlp = spacy.load("en_core_web_sm")

In [0]:
# NOTE(review): "beacause" is misspelled in the input sentence. It is left unchanged
# because the recorded output below was produced from this exact text (the token
# shows up there tagged NOUN with its lemma equal to the misspelled form).
doc = nlp("I am a runner running in a race beacause I love to run since I ran today.")

In [0]:
# One row per token: text, part-of-speech tag, integer lemma ID (`lemma`),
# and the lemma as a string (`lemma_`), padded into fixed-width columns.
for token in doc:
  print(f"{token.text:10} {token.pos_:7} {token.lemma:<25} {token.lemma_}")

I          PRON    561228191312463089        -PRON-
am         VERB    10382539506755952630      be
a          DET     11901859001352538922      a
runner     NOUN    12640964157389618806      runner
running    VERB    12767647472892411841      run
in         ADP     3002984154512732771       in
a          DET     11901859001352538922      a
race       NOUN    8048469955494714898       race
beacause   NOUN    7794235611920507838       beacause
I          PRON    561228191312463089        -PRON-
love       VERB    3702023516439754181       love
to         PART    3791531372978436496       to
run        VERB    12767647472892411841      run
since      ADP     10066841407251338481      since
I          PRON    561228191312463089        -PRON-
ran        VERB    12767647472892411841      run
today      NOUN    11042482332948150395      today
.          PUNCT   12646065887601541794      .


In [0]:
# Second example with irregular forms ("saw", "mice"); prints the same
# text / POS / lemma ID / lemma string table as the previous cell.
doc = nlp("I saw ten mice today!")
for token in doc:
  print(f"{token.text:10} {token.pos_:7} {token.lemma:<25} {token.lemma_}")

I          PRON    561228191312463089        -PRON-
saw        VERB    11925638236994514241      see
ten        NUM     7970704286052693043       ten
mice       NOUN    1384165645700560590       mouse
today      NOUN    11042482332948150395      today
!          PUNCT   17494803046312582752      !


# Stop words

In [0]:
# Dump the pipeline's default stop-word collection. It is a set (it is mutated with
# .add/.remove further down), so the order of the printed words is arbitrary.
print(nlp.Defaults.stop_words)

{'they', 'else', 'below', 'itself', 'myself', 'can', 'others', 'though', 'those', 'together', 'beyond', 'whoever', 'she', 'against', 'any', 'regarding', 'serious', 'their', 'within', 'is', 'there', 'although', 'nowhere', 'hundred', 'hers', 'six', 'herein', 'once', 'thereby', 'too', 'this', 'thereafter', 'over', 'after', 'yet', 'either', 'or', 'at', 'been', '‘ve', 'would', 'doing', '‘m', 'amount', 'hereupon', 'again', 'anything', 'front', 'into', 'latter', 'ca', 'first', 'become', 'further', 'might', 'name', 'toward', 'due', 'various', 'almost', 'someone', 'without', 'your', "n't", 'somehow', 'why', 'yourselves', 'until', 'everyone', 'me', 'i', 'hereafter', 'not', 'should', 'above', 'often', '‘ll', 'rather', 'therefore', 'yours', 'eight', 'fifty', 'his', 'among', 'anyone', 'last', 'more', 'out', 'several', 'whether', 'then', 'such', 'made', 'same', 'themselves', "'d", 'top', 'anyhow', 'enough', 'part', 'hereby', 'down', 'five', 'than', 'yourself', "'ll", 'latterly', 'n’t', '’re', 'somet

In [0]:
# Number of default stop words before any modification.
len(nlp.Defaults.stop_words)

326

In [0]:
# Stop-word flag for a known stop word ("is") and for a word that is not one yet ("btw").
tuple(nlp.vocab[candidate].is_stop for candidate in ("is", "btw"))

(True, False)

### Adding a word to the stop words

In [0]:
# Register "btw" in the default stop-word set ...
nlp.Defaults.stop_words.add("btw")
# ... and also set the flag on its vocabulary entry.
# NOTE(review): presumably the flag is not refreshed automatically when the set
# changes, which is why both steps are done here; confirm against the spaCy docs.
nlp.vocab["btw"].is_stop = True

In [0]:
# Count again after adding "btw"; expected to be one more than before.
len(nlp.Defaults.stop_words)

327

In [0]:
# Check the stop-word flag on the "btw" vocabulary entry after the addition.
nlp.vocab["btw"].is_stop

True

### Removing a stop word

In [0]:
# Undo the addition: take "btw" out of the default stop-word set and clear the
# flag on its vocabulary entry.
# `discard` is used instead of `remove` so the cell can be re-run (or run before
# the "add" cell) without raising KeyError when "btw" is already absent.
nlp.Defaults.stop_words.discard("btw")
nlp.vocab["btw"].is_stop = False

In [0]:
# Count again after removing "btw"; expected to be back at the original size.
len(nlp.Defaults.stop_words)

326

In [0]:
# Check the stop-word flag on the "btw" vocabulary entry after the removal.
nlp.vocab["btw"].is_stop

False