In [1]:
# https://github.com/Rishikumar04/Natural-Language-Processing/blob/main/03-Preprocessing%20Steps%20-%20spacy%20vs%20nltk.ipynb
import spacy

# Load the 'en_core_web_sm' spaCy pipeline once; the spaCy cells below all
# reuse this `nlp` object.
nlp = spacy.load('en_core_web_sm')

In [2]:
# The sentence is written as adjacent string literals so that the run of ten
# spaces before "only" is explicit; spaCy emits that run as its own
# whitespace token in the output below.
doc = nlp(
    'In computing, plain text is a loose term for data that represent '
    '          only characters of readable material but not its graphical '
    'representation nor other objects.'
)

# Show every token on a single line, each one followed by a '|' separator.
print(*doc, sep='|', end='|')

In|computing|,|plain|text|is|a|loose|term|for|data|that|represent|          |only|characters|of|readable|material|but|not|its|graphical|representation|nor|other|objects|.|

In [3]:
# Text of each sentence span that spaCy found in `doc`.
[sentence.text for sentence in doc.sents]

['In computing, plain text is a loose term for data that represent           only characters of readable material but not its graphical representation nor other objects.']

In [4]:
##Importing NLTK
import nltk

In [5]:
# Same sentence as in the spaCy cell, including the run of ten spaces before
# "only"; written as adjacent literals so the spacing is explicit.
sample = (
    'In computing, plain text is a loose term for data that represent '
    '          only characters of readable material but not its graphical '
    'representation nor other objects.'
)

# Sentence segmentation: each element of the result is one sentence.
tokens = nltk.sent_tokenize(sample)

print(tokens)
print(f'No of tokens: {len(tokens)}')

['In computing, plain text is a loose term for data that represent           only characters of readable material but not its graphical representation nor other objects.']
No of tokens: 1


In [6]:
# Word-level tokenization of the same sample text.
tokens = nltk.word_tokenize(sample)

# Print the token list, then its length on the following line.
print(tokens, f'No of tokens: {len(tokens)}', sep='\n')

['In', 'computing', ',', 'plain', 'text', 'is', 'a', 'loose', 'term', 'for', 'data', 'that', 'represent', 'only', 'characters', 'of', 'readable', 'material', 'but', 'not', 'its', 'graphical', 'representation', 'nor', 'other', 'objects', '.']
No of tokens: 27


In [7]:
# Tokenize the sample with NLTK's Toktok tokenizer.
# The class is referenced through its defining module, nltk.tokenize.toktok,
# instead of `nltk.toktok`: the latter is not a documented top-level attribute
# and appears to resolve only as a side effect of NLTK's internal wildcard
# re-exports, so it may not exist in every NLTK release.
tok = nltk.tokenize.toktok.ToktokTokenizer()
tokens = tok.tokenize(sample)

print(tokens)
print('No of tokens:', len(tokens))

['In', 'computing', ',', 'plain', 'text', 'is', 'a', 'loose', 'term', 'for', 'data', 'that', 'represent', 'only', 'characters', 'of', 'readable', 'material', 'but', 'not', 'its', 'graphical', 'representation', 'nor', 'other', 'objects', '.']
No of tokens: 27


In [8]:
# Porter stemmer demo.
# Only PorterStemmer is imported (instead of `from nltk.stem.porter import *`)
# so that no unrelated names leak into the notebook namespace and it stays
# obvious where each name comes from.
import nltk

from nltk.stem.porter import PorterStemmer

p_stemmer = PorterStemmer()

words = ['run','runner','running','ran','runs','easily','fairly']

# Print each word next to the stem the Porter stemmer produces for it.
for word in words:
    print(f'{word} --> {p_stemmer.stem(word)}')

run --> run
runner --> runner
running --> run
ran --> ran
runs --> run
easily --> easili
fairly --> fairli


In [9]:
# Snowball stemmer demo.
from nltk.stem.snowball import SnowballStemmer

# SnowballStemmer has to be told which language's rules to apply.
s_stemmer = SnowballStemmer(language='english')
words = ['run','runner','running','ran','runs','easily','fairly']

# Print each word next to its Snowball stem.
for word in words:
    print(word, '-->', s_stemmer.stem(word))

run --> run
runner --> runner
running --> run
ran --> ran
runs --> run
easily --> easili
fairly --> fair


In [10]:
# Run a family of related words through the Snowball stemmer created above.
words = ['generous','generation','generously','generate']

for word in words:
    print(f'{word} --> {s_stemmer.stem(word)}')

generous --> generous
generation --> generat
generously --> generous
generate --> generat


In [11]:
# Lancaster stemmer demo.
from nltk.stem import LancasterStemmer
ls = LancasterStemmer()

words = ['run','runner','running','ran','runs','easily','fairly']

# Print each word next to its Lancaster stem.
for word in words:
    print(word, '-->', ls.stem(word))

run --> run
runner --> run
running --> run
ran --> ran
runs --> run
easily --> easy
fairly --> fair


In [13]:
# spaCy lemmatization: print each token next to its lemma.
doc1 = nlp("I am a runner running in a race because I love to run since I ran today")

# Both columns are left-aligned and padded to a width of 10 characters.
for token in doc1:
    print(token.text.ljust(10), '--', token.lemma_.ljust(10))

I          -- -PRON-    
am         -- be        
a          -- a         
runner     -- runner    
running    -- run       
in         -- in        
a          -- a         
race       -- race      
because    -- because   
I          -- -PRON-    
love       -- love      
to         -- to        
run        -- run       
since      -- since     
I          -- -PRON-    
ran        -- run       
today      -- today     


In [14]:
# NLTK lemmatization with WordNet.
from nltk.stem import WordNetLemmatizer

wnl = WordNetLemmatizer()  # lemmatizer instance reused for every token
text = 'The brown foxes are quick and they are jumping over the sleeping lazy dogs!'
tokens = nltk.word_tokenize(text)

# lemmatize() is called with the token only. NLTK's lemmatizer needs a
# part-of-speech tag to perform well, so without one words such as "are" and
# "jumping" are left unchanged in the result below.
lemmatized_text = ' '.join(map(wnl.lemmatize, tokens))

lemmatized_text

'The brown fox are quick and they are jumping over the sleeping lazy dog !'

In [15]:
# Tag every entry of the current `tokens` list with nltk.pos_tag and print
# the resulting (token, tag) pairs.
tagged_tokens = nltk.pos_tag(tokens)
print(tagged_tokens)

[('The', 'DT'), ('brown', 'JJ'), ('foxes', 'NNS'), ('are', 'VBP'), ('quick', 'JJ'), ('and', 'CC'), ('they', 'PRP'), ('are', 'VBP'), ('jumping', 'VBG'), ('over', 'IN'), ('the', 'DT'), ('sleeping', 'VBG'), ('lazy', 'JJ'), ('dogs', 'NNS'), ('!', '.')]


In [16]:
# Number of entries in the pipeline's default stop-word set.
print(len(nlp.Defaults.stop_words))

326


In [17]:
# Register 'btw' as an additional stop word (stop words are kept lowercase).
print(f'Default words: {len(nlp.Defaults.stop_words)}')
nlp.Defaults.stop_words.add('btw')

# The lexeme carries its own stop-word flag, so set that as well.
nlp.vocab['btw'].is_stop = True

print(f'After adding: {len(nlp.Defaults.stop_words)}')

# Display the flag as the cell result.
nlp.vocab['btw'].is_stop

Default words: 326
After adding: 327


True

In [18]:
# Remove 'beyond' from the default stop-word set.
# discard() is used instead of remove() so that re-running this cell does not
# raise KeyError once the word has already been removed.
nlp.Defaults.stop_words.discard('beyond')

# Clear the stop-word flag on the lexeme as well.
nlp.vocab['beyond'].is_stop = False

print(len(nlp.Defaults.stop_words))

# Display the flag as the cell result.
nlp.vocab['beyond'].is_stop

326


False

In [19]:
# Fetch NLTK's English stop-word list and report its size.
stopwords = nltk.corpus.stopwords.words('english')
print(f'Default length: {len(stopwords)}')

# Drop one word from the list.
stopwords.remove('the')
print(f'After removing a word length: {len(stopwords)}')

# Append a custom word to the list.
stopwords.append('brown')
print(f'After adding a word length: {len(stopwords)}')

Default length: 179
After removing a word length: 178
After adding a word length: 179
