In [1]:
import spacy

# Load the 'en_core_web_sm' pipeline once; every later spaCy cell reuses this `nlp` object.
nlp = spacy.load('en_core_web_sm')

In [2]:
# Run the pipeline on a single sentence so its tokenization can be inspected.
doc = nlp('In computing, plain text is a loose term for data that represent only characters of readable material but not its graphical representation nor other objects.')

# Print each token followed by '|' so the token boundaries are visible on one line.
for tok in doc:
    print(tok.text, end='|')

In|computing|,|plain|text|is|a|loose|term|for|data|that|represent|only|characters|of|readable|material|but|not|its|graphical|representation|nor|other|objects|.|

In [3]:
# doc.sents yields sentence spans (not tokens); show the text of each one.
[sentence.text for sentence in doc.sents]

['In computing, plain text is a loose term for data that represent only characters of readable material but not its graphical representation nor other objects.']

In [4]:
import nltk

# Same sentence as the spaCy example, now split into sentences with NLTK.
sample = 'In computing, plain text is a loose term for data that represent only characters of readable material but not its graphical representation nor other objects.'
tokens = nltk.sent_tokenize(sample)  # despite the name, each item is a whole sentence

print(tokens)
print(f'No of tokens: {len(tokens)}')

['In computing, plain text is a loose term for data that represent only characters of readable material but not its graphical representation nor other objects.']
No of tokens: 1


In [5]:
# Word-level tokenization of the same sample; punctuation marks become their own tokens.
tokens = nltk.word_tokenize(sample)

print(tokens)
print(f'No of tokens: {len(tokens)}')

['In', 'computing', ',', 'plain', 'text', 'is', 'a', 'loose', 'term', 'for', 'data', 'that', 'represent', 'only', 'characters', 'of', 'readable', 'material', 'but', 'not', 'its', 'graphical', 'representation', 'nor', 'other', 'objects', '.']
No of tokens: 27


Stemming
Stemming is a somewhat crude method for cataloging related words; it essentially chops off letters from the end until the stem is reached. This works fairly well in most cases, but unfortunately English has many exceptions where a more sophisticated process is required. In fact, spaCy doesn't include a stemmer, opting instead to rely entirely on lemmatization.

One of the most common and effective stemming tools is Porter's Algorithm developed by Martin Porter in 1980. The algorithm employs five phases of word reduction, each with its own set of mapping rules.

In [6]:
import nltk

# Import only the class that is used; a wildcard import hides where names come from
# and pulls unrelated names into the notebook namespace.
from nltk.stem.porter import PorterStemmer

p_stemmer = PorterStemmer()

words = ['run','runner','running','ran','runs','easily','fairly']

# Show each word next to its Porter stem.
for word in words:
    print(f'{word} --> {p_stemmer.stem(word)}')

run --> run
runner --> runner
running --> run
ran --> ran
runs --> run
easily --> easili
fairly --> fairli


Snowball Stemmer
This is somewhat of a misnomer, as Snowball is the name of a stemming language developed by Martin Porter. The algorithm used here is more accurately called the "English Stemmer" or "Porter2 Stemmer". It offers a slight improvement over the original Porter stemmer, both in logic and speed. Since nltk uses the name SnowballStemmer, we'll use it here.

In [7]:
from nltk.stem.snowball import SnowballStemmer

# A language must be passed when constructing the Snowball stemmer.
s_stemmer = SnowballStemmer(language='english')
words = ['run','runner','running','ran','runs','easily','fairly']

# Show each word next to its Snowball stem.
for w in words:
    stem = s_stemmer.stem(w)
    print(f'{w} --> {stem}')

run --> run
runner --> runner
running --> run
ran --> ran
runs --> run
easily --> easili
fairly --> fair


In [8]:
# A second word list run through the same Snowball stemmer.
words = ['generous','generation','generously','generate']

for w in words:
    stem = s_stemmer.stem(w)
    print(f'{w} --> {stem}')

generous --> generous
generation --> generat
generously --> generous
generate --> generat


Lancaster Stemmer
The Lancaster stemming algorithm is another option, and it is the most aggressive stemmer of the bunch. However, if you use the NLTK implementation, you can add your own custom rules to it very easily.

In [9]:
from nltk.stem import LancasterStemmer
ls = LancasterStemmer()

# Same word list as the Porter and Snowball cells, so the outputs can be compared.
words = ['run','runner','running','ran','runs','easily','fairly']

for w in words:
    stem = ls.stem(w)
    print(f'{w} --> {stem}')

run --> run
runner --> run
running --> run
ran --> ran
runs --> run
easily --> easy
fairly --> fair


Lemmatization
In contrast to stemming, lemmatization looks beyond word reduction, and considers a language's full vocabulary to apply a morphological analysis to words. The lemma of 'was' is 'be' and the lemma of 'mice' is 'mouse'. Further, the lemma of 'meeting' might be 'meet' or 'meeting' depending on its use in a sentence.

In [10]:
## spaCy
doc1 = nlp("I am a runner running in a race because I love to run since I ran today")

# Two left-aligned, 10-character-wide columns: the token text and its lemma.
for tok in doc1:
    print(f'{tok.text:10} -- {tok.lemma_:10}')

I          -- -PRON-    
am         -- be        
a          -- a         
runner     -- runner    
running    -- run       
in         -- in        
a          -- a         
race       -- race      
because    -- because   
I          -- -PRON-    
love       -- love      
to         -- to        
run        -- run       
since      -- since     
I          -- -PRON-    
ran        -- run       
today      -- today     


In [11]:
## NLTK lemmatization
from nltk.stem import WordNetLemmatizer

wnl = WordNetLemmatizer()  # lemmatizer instance, reused by later cells

text = 'The brown foxes are quick and they are jumping over the sleeping lazy dogs!'
tokens = nltk.word_tokenize(text)

# No part-of-speech tag is passed here; NLTK's lemmatizer needs one to perform well.
lemmatized_text = ' '.join(map(wnl.lemmatize, tokens))

lemmatized_text

'The brown fox are quick and they are jumping over the sleeping lazy dog !'

In [12]:
# Tag each token from the previous cell with its part of speech; the result is a list of (word, tag) pairs.
tagged_tokens = nltk.pos_tag(tokens)
print(tagged_tokens)

[('The', 'DT'), ('brown', 'JJ'), ('foxes', 'NNS'), ('are', 'VBP'), ('quick', 'JJ'), ('and', 'CC'), ('they', 'PRP'), ('are', 'VBP'), ('jumping', 'VBG'), ('over', 'IN'), ('the', 'DT'), ('sleeping', 'VBG'), ('lazy', 'JJ'), ('dogs', 'NNS'), ('!', '.')]


In [13]:
from nltk.corpus import wordnet

## Part-of-speech tagging of the words
def pos_tag_wordnet(tagged_tokens):
    """Translate the tags of (word, tag) pairs into WordNet part-of-speech constants.

    Only the first letter of each tag is inspected, case-insensitively:
    'j' -> wordnet.ADJ, 'v' -> wordnet.VERB, 'n' -> wordnet.NOUN, 'r' -> wordnet.ADV.
    Any other tag, including an empty one, falls back to wordnet.NOUN.

    Args:
        tagged_tokens: iterable of (word, tag) pairs, e.g. the result of nltk.pos_tag.

    Returns:
        A list of (word, wordnet_pos) pairs in the same order as the input.
    """
    tag_map = {'j': wordnet.ADJ, 'v': wordnet.VERB, 'n': wordnet.NOUN, 'r': wordnet.ADV}
    # tag[:1] instead of tag[0]: an empty tag yields '' and hits the NOUN fallback
    # rather than raising IndexError.
    return [(word, tag_map.get(tag[:1].lower(), wordnet.NOUN))
            for word, tag in tagged_tokens]

# NLTK's lemmatization
def wordnet_lemmatize_text(text):
    """Return `text` with every token replaced by its part-of-speech-aware lemma.

    The text is tokenized and tagged with NLTK, the tags are converted by
    pos_tag_wordnet, and each (word, tag) pair is passed to the module-level
    `wnl` lemmatizer. The lemmas are joined with single spaces.
    """
    words = nltk.word_tokenize(text)
    tagged_tokens = nltk.pos_tag(words)  # part-of-speech tagging

    # The lemmatizer does not understand all the tags, so pos_tag_wordnet
    # converts them into the basic form first.
    lemmas = []
    for word, tag in pos_tag_wordnet(tagged_tokens):
        lemmas.append(wnl.lemmatize(word, tag))
    return ' '.join(lemmas)

# Lemmatize the sample sentence defined earlier, this time with part-of-speech information.
lemma_words = wordnet_lemmatize_text(text)

# Bare expression so the notebook displays the result.
lemma_words

'The brown fox be quick and they be jump over the sleep lazy dog !'

Stop words
Words like "a" and "the" appear so frequently that they don't require tagging as thoroughly as nouns, verbs and modifiers. We call these stop words, and they can be filtered from the text to be processed. spaCy holds a built-in list of a few hundred English stop words; the exact count depends on the spaCy version (326 in the output below).

In [14]:
# Number of entries in the pipeline's default stop-word set.
print(len(nlp.Defaults.stop_words))

326


In [15]:
# To add a stop word
# There may be times when you wish to add a stop word to the default set. Perhaps you decide that 'btw' (common shorthand for "by the way") should be considered a stop word

# Report the size first, then add the word to the set of stop words. Use lowercase!
print(f'Default words: {len(nlp.Defaults.stop_words)}')
nlp.Defaults.stop_words.add('btw')

# Set the stop_word tag on the lexeme
nlp.vocab['btw'].is_stop = True

print(f'After adding: {len(nlp.Defaults.stop_words)}')

nlp.vocab['btw'].is_stop

Default words: 326
After adding: 327


True

In [16]:
# To remove a stop word
# Alternatively, you may decide that 'beyond' should not be considered a stop word.

# Remove the word from the set of stop words.
# discard() instead of remove(): re-running this cell after 'beyond' is already gone
# must not raise KeyError.
nlp.Defaults.stop_words.discard('beyond')

# Remove the stop_word tag from the lexeme
nlp.vocab['beyond'].is_stop = False

print(len(nlp.Defaults.stop_words))

nlp.vocab['beyond'].is_stop

326


False

In [17]:
# NLTK

# words() returns a plain list, so entries are removed and appended with list methods.
stopwords = nltk.corpus.stopwords.words('english')
print(f'Default length: {len(stopwords)}')

stopwords.remove('the')
print(f'After removing a word length: {len(stopwords)}')

stopwords.append('brown')
print(f'After adding a word length: {len(stopwords)}')

Default length: 179
After removing a word length: 178
After adding a word length: 179
