# Stemming

In [5]:
# Sample sentence used as the input text by the cells in this notebook
data = "Natural language processing (NLP) is a subfield of linguistics."

In [6]:
# Porter stemming: tokenize the sentence, then show every token next to
# the stem produced by NLTK's PorterStemmer.
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

# Stemmer instance; `ps` is used again when the stemmers are compared side by side.
ps = PorterStemmer()

# Split the input sentence into word and punctuation tokens
tokens = word_tokenize(data)

# print() joins its arguments with one space, so the separator renders as "  :  ".
for token in tokens:
    stem = ps.stem(token)
    print(token, " : ", stem)

Natural  :  natur
language  :  languag
processing  :  process
(  :  (
NLP  :  nlp
)  :  )
is  :  is
a  :  a
subfield  :  subfield
of  :  of
linguistics  :  linguist
.  :  .


In [8]:
# Lancaster stemming: tokenize the sentence, then show every token next to
# the stem produced by NLTK's LancasterStemmer.
from nltk.tokenize import word_tokenize
from nltk.stem import LancasterStemmer

# Stemmer instance; `ls` is used again when the stemmers are compared side by side.
ls = LancasterStemmer()

# Split the input sentence into word and punctuation tokens
tokens = word_tokenize(data)

# print() joins its arguments with one space, so the separator renders as "  :  ".
for token in tokens:
    stem = ls.stem(token)
    print(token, " : ", stem)

Natural  :  nat
language  :  langu
processing  :  process
(  :  (
NLP  :  nlp
)  :  )
is  :  is
a  :  a
subfield  :  subfield
of  :  of
linguistics  :  lingu
.  :  .


In [17]:
# Snowball stemming: tokenize the sentence, then show every token next to
# the stem produced by NLTK's SnowballStemmer.
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer

# SnowballStemmer has to be told which language's rules to apply.
ss = SnowballStemmer(language="english")

# Split the input sentence into word and punctuation tokens
tokens = word_tokenize(data)

# print() joins its arguments with one space, so the separator renders as "  :  ".
for token in tokens:
    stem = ss.stem(token)
    print(token, " : ", stem)

Natural  :  natur
language  :  languag
processing  :  process
(  :  (
NLP  :  nlp
)  :  )
is  :  is
a  :  a
subfield  :  subfield
of  :  of
linguistics  :  linguist
.  :  .


We can see that the Porter and Snowball stemmers both reduce "Natural" to "natur", while the Lancaster stemmer reduces it to "nat".

In [13]:
# Compare the Porter and Lancaster stemmers side by side on a set of words.

# Words to be stemmed
word_list = [
    "friend", "friendship", "friends", "friendships",
    "stabil", "destabilize", "misunderstanding",
    "railroad", "moonlight", "football",
    "singing", "sang", "sing",
]

# Three left-aligned columns, each padded to a width of 20 characters.
row_format = "{0:20}{1:20}{2:20}"

print(row_format.format("Word", "Porter Stemmer", "lancaster Stemmer"))
for entry in word_list:
    porter_stem = ps.stem(entry)
    lancaster_stem = ls.stem(entry)
    print(row_format.format(entry, porter_stem, lancaster_stem))

Word                Porter Stemmer      lancaster Stemmer   
friend              friend              friend              
friendship          friendship          friend              
friends             friend              friend              
friendships         friendship          friend              
stabil              stabil              stabl               
destabilize         destabil            dest                
misunderstanding    misunderstand       misunderstand       
railroad            railroad            railroad            
moonlight           moonlight           moonlight           
football            footbal             footbal             
singing             sing                sing                
sang                sang                sang                
sing                sing                sing                


# Lemmatization

In [21]:
# Download the WordNet corpus into the local nltk_data directory.
# The call's return value is the last expression, so it is shown as the cell output.
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/jaiminmungalpara/nltk_data...


True

In [22]:
# Lemmatization with WordNet: tokenize the sentence, then show every token
# next to the lemma returned by NLTK's WordNetLemmatizer.
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Create the lemmatizer instance
WL = WordNetLemmatizer()

# Split the input sentence into word and punctuation tokens
tokens = word_tokenize(data)

# lemmatize() receives only the token, so its default part of speech applies.
for token in tokens:
    lemma = WL.lemmatize(token)
    print(token, " : ", lemma)

Natural  :  Natural
language  :  language
processing  :  processing
(  :  (
NLP  :  NLP
)  :  )
is  :  is
a  :  a
subfield  :  subfield
of  :  of
linguistics  :  linguistics
.  :  .


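We can see that, unlike the stemmers, the lemmatizer returns every token unchanged here: `lemmatize` treats each word as a noun by default, and each of these words is already in its dictionary (base) form.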

# StopWords

In [27]:
# Download NLTK's stop word corpus into the local nltk_data directory.
# The call's return value is the last expression, so it is shown as the cell output.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jaiminmungalpara/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [29]:
# Remove English stop words from the tokenized sentence using NLTK's stop word corpus.
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Keep the stop words in a set so the membership test in the filter below is cheap.
english_stopwords = stopwords.words('english')
stop_words = set(english_stopwords)

# Split the input sentence into word and punctuation tokens
tokens = word_tokenize(data)

# Keep only the tokens that are not stop words (tokens are compared as-is, without lowercasing).
filtered_words = [token for token in tokens if token not in stop_words]

# Show the original sentence followed by the filtered token list
print(data)
print(filtered_words)

Natural language processing (NLP) is a subfield of linguistics.
['Natural', 'language', 'processing', '(', 'NLP', ')', 'subfield', 'linguistics', '.']


## Let's create a custom stop word list, starting from gensim's built-in stop words

In [30]:
# STOPWORDS is gensim's built-in stop word collection (extended in a later cell);
# remove_stopwords takes the raw sentence string and returns it with those words removed.
from gensim.parsing.preprocessing import STOPWORDS, remove_stopwords

# Works on the untokenized string, so the result is a sentence rather than a token list.
filtered_sentence = remove_stopwords(data)

print(filtered_sentence)

Natural language processing (NLP) subfield linguistics.


## Now we will add "subfield" to the stop word list and check the result

In [31]:
# Extend gensim's built-in stop words with the extra word "subfield".
extra_stopwords = {'subfield'}
all_stopwords_gensim = STOPWORDS.union(extra_stopwords)

# Tokenize the sentence and drop every token found in the extended stop word set.
tokens = word_tokenize(data)
clean_words = [token for token in tokens if token not in all_stopwords_gensim]

print(clean_words)

['Natural', 'language', 'processing', '(', 'NLP', ')', 'linguistics', '.']


# N-Grams

In [32]:
# Unigrams: n-grams of size 1 built from the whitespace-separated words of the sentence.
from nltk.util import ngrams

n = 1

# Splitting on whitespace keeps punctuation attached to its word, e.g. "(NLP)".
words = data.split()
unigrams = ngrams(words, n)

# Each n-gram is a tuple, printed one per line
for gram in unigrams:
    print(gram)

('Natural',)
('language',)
('processing',)
('(NLP)',)
('is',)
('a',)
('subfield',)
('of',)
('linguistics.',)


In [33]:
# Bigrams: n-grams of size 2, i.e. every pair of adjacent whitespace-separated words.
n = 2

# Bound to a name that matches its content (pairs, not single words).
bigrams = ngrams(data.split(), n)

# Each bigram is a 2-tuple, printed one per line
for item in bigrams:
    print(item)

('Natural', 'language')
('language', 'processing')
('processing', '(NLP)')
('(NLP)', 'is')
('is', 'a')
('a', 'subfield')
('subfield', 'of')
('of', 'linguistics.')


In [34]:
# Trigrams: n-grams of size 3, i.e. every run of three adjacent whitespace-separated words.
n = 3

# Bound to a name that matches its content (triples, not single words).
trigrams = ngrams(data.split(), n)

# Each trigram is a 3-tuple, printed one per line
for item in trigrams:
    print(item)

('Natural', 'language', 'processing')
('language', 'processing', '(NLP)')
('processing', '(NLP)', 'is')
('(NLP)', 'is', 'a')
('is', 'a', 'subfield')
('a', 'subfield', 'of')
('subfield', 'of', 'linguistics.')


In [37]:
# everygrams produces the n-grams of every length for the sentence in a single call.
from nltk.util import everygrams

# Whitespace split: punctuation stays attached to its word, e.g. "(NLP)".
tokens = data.split()

# Materialize the result; as the cell's last expression it is displayed as the output.
all_grams = list(everygrams(tokens))
all_grams

[('Natural',),
 ('Natural', 'language'),
 ('Natural', 'language', 'processing'),
 ('Natural', 'language', 'processing', '(NLP)'),
 ('Natural', 'language', 'processing', '(NLP)', 'is'),
 ('Natural', 'language', 'processing', '(NLP)', 'is', 'a'),
 ('Natural', 'language', 'processing', '(NLP)', 'is', 'a', 'subfield'),
 ('Natural', 'language', 'processing', '(NLP)', 'is', 'a', 'subfield', 'of'),
 ('Natural',
  'language',
  'processing',
  '(NLP)',
  'is',
  'a',
  'subfield',
  'of',
  'linguistics.'),
 ('language',),
 ('language', 'processing'),
 ('language', 'processing', '(NLP)'),
 ('language', 'processing', '(NLP)', 'is'),
 ('language', 'processing', '(NLP)', 'is', 'a'),
 ('language', 'processing', '(NLP)', 'is', 'a', 'subfield'),
 ('language', 'processing', '(NLP)', 'is', 'a', 'subfield', 'of'),
 ('language',
  'processing',
  '(NLP)',
  'is',
  'a',
  'subfield',
  'of',
  'linguistics.'),
 ('processing',),
 ('processing', '(NLP)'),
 ('processing', '(NLP)', 'is'),
 ('processing', '(

## We can also use the TextBlob library for N-gram implementation.