# Stemming

For NLTK

In [1]:
# Porter Stemmer

import nltk
from nltk.stem import PorterStemmer
nltk.download('punkt')

porter_stemmer = PorterStemmer()


words = ["running", "happily", "studies", "connection",
         "argument", "national", "bouncing", "relational", "cats", "leaves"]

for word in words:

    print(f"Original : {word} --------> Stemmed : {porter_stemmer.stem(word)}")

Original : running --------> Stemmed : run
Original : happily --------> Stemmed : happili
Original : studies --------> Stemmed : studi
Original : connection --------> Stemmed : connect
Original : argument --------> Stemmed : argument
Original : national --------> Stemmed : nation
Original : bouncing --------> Stemmed : bounc
Original : relational --------> Stemmed : relat
Original : cats --------> Stemmed : cat
Original : leaves --------> Stemmed : leav


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kisha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Snowball Stemmer

from nltk.stem import SnowballStemmer

snowball_stemmer = SnowballStemmer(language='english')

words = ["running", "happily", "studies", "connection",
         "argument", "national", "bouncing", "relational", "cats", "leaves"]

for word in words:

    print(f"Original : {word} --------> Stemmed : {snowball_stemmer.stem(word)}")

Original : running --------> Stemmed : run
Original : happily --------> Stemmed : happili
Original : studies --------> Stemmed : studi
Original : connection --------> Stemmed : connect
Original : argument --------> Stemmed : argument
Original : national --------> Stemmed : nation
Original : bouncing --------> Stemmed : bounc
Original : relational --------> Stemmed : relat
Original : cats --------> Stemmed : cat
Original : leaves --------> Stemmed : leav


In [3]:
print("Porter : ", porter_stemmer.stem('fairly'))
print("Snowball : ",snowball_stemmer.stem('fairly'))

print("Porter : ", porter_stemmer.stem('sportingly'))
print("Snowball : ",snowball_stemmer.stem('sportingly'))

print("Porter : ", porter_stemmer.stem('going'))
print("Snowball : ",snowball_stemmer.stem('going'))

Porter :  fairli
Snowball :  fair
Porter :  sportingli
Snowball :  sport
Porter :  go
Snowball :  go


In [4]:
word_1 = "relate"
word_2 = "relativity"

print(f"Word : {word_1} --------> Stemmed : {porter_stemmer.stem(word_1)}")
print(f"Word : {word_2} --------> Stemmed : {porter_stemmer.stem(word_2)}")

Word : relate --------> Stemmed : relat
Word : relativity --------> Stemmed : rel


In [5]:
word_1 = "probe"
word_2 = "probate"

print(f"Word : {word_1} --------> Stemmed : {porter_stemmer.stem(word_1)}")
print(f"Word : {word_2} --------> Stemmed : {porter_stemmer.stem(word_2)}")

Word : probe --------> Stemmed : probe
Word : probate --------> Stemmed : probat


# Lemmatization
For spaCy

In [6]:
import spacy

nlp = spacy.load('en_core_web_sm')

doc = nlp(u"running happily studies connection argument national bouncing relational cats leaves")

for token in doc:
    print(token.text, token.lemma_)

running run
happily happily
studies study
connection connection
argument argument
national national
bouncing bounce
relational relational
cats cat
leaves leave


In [7]:
# NLTK

from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()

print(lemmatizer.lemmatize('going'))

print(lemmatizer.lemmatize('going', pos='n'))

print(lemmatizer.lemmatize('going', pos='v'))

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\kisha\AppData\Roaming\nltk_data...


going
going
go


In [15]:
import nltk
nltk.download('punkt_tab')

words = """
He is playing the music beatifully.
She painted this picture so well.
They will be going to Mumbai tommorrow.
By when will they have returned.
Historically batting is better than bowling first."""

tokenized_words = word_tokenize(words)

for word in tokenized_words:

    print(f"Word : {word} => Stemmed from : {lemmatizer.lemmatize(word)}")

Word : He => Stemmed from : He
Word : is => Stemmed from : is
Word : playing => Stemmed from : playing
Word : the => Stemmed from : the
Word : music => Stemmed from : music
Word : beatifully => Stemmed from : beatifully
Word : . => Stemmed from : .
Word : She => Stemmed from : She
Word : painted => Stemmed from : painted
Word : this => Stemmed from : this
Word : picture => Stemmed from : picture
Word : so => Stemmed from : so
Word : well => Stemmed from : well
Word : . => Stemmed from : .
Word : They => Stemmed from : They
Word : will => Stemmed from : will
Word : be => Stemmed from : be
Word : going => Stemmed from : going
Word : to => Stemmed from : to
Word : Mumbai => Stemmed from : Mumbai
Word : tommorrow => Stemmed from : tommorrow
Word : . => Stemmed from : .
Word : By => Stemmed from : By
Word : when => Stemmed from : when
Word : will => Stemmed from : will
Word : they => Stemmed from : they
Word : have => Stemmed from : have
Word : returned => Stemmed from : returned
Word : . =

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\kisha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [9]:
for word in tokenized_words:

    print(f"Word : {word} => Stemmed from : {lemmatizer.lemmatize(word, pos='v')}")

Word : He => Stemmed from : He
Word : is => Stemmed from : be
Word : playing => Stemmed from : play
Word : the => Stemmed from : the
Word : music => Stemmed from : music
Word : beatifully => Stemmed from : beatifully
Word : . => Stemmed from : .
Word : She => Stemmed from : She
Word : painted => Stemmed from : paint
Word : this => Stemmed from : this
Word : picture => Stemmed from : picture
Word : so => Stemmed from : so
Word : well => Stemmed from : well
Word : . => Stemmed from : .
Word : They => Stemmed from : They
Word : will => Stemmed from : will
Word : be => Stemmed from : be
Word : going => Stemmed from : go
Word : to => Stemmed from : to
Word : Mumbai => Stemmed from : Mumbai
Word : tommorrow => Stemmed from : tommorrow
Word : . => Stemmed from : .
Word : By => Stemmed from : By
Word : when => Stemmed from : when
Word : will => Stemmed from : will
Word : they => Stemmed from : they
Word : have => Stemmed from : have
Word : returned => Stemmed from : return
Word : . => Stemmed 

In [10]:
# spaCy

import spacy

nlp = spacy.load('en_core_web_sm')

sentences = ["The cats are running in the garden.",
             "She bettered her skills by practicing regularly.",
             "Mice and rats can be found in the bins."]

for sentence  in sentences:

    doc = nlp(sentence)

    for token in doc:

        print(f"Original : {token.text} \t Lemmatized : {token.lemma_}")
        print("-"*40)

Original : The 	 Lemmatized : the
----------------------------------------
Original : cats 	 Lemmatized : cat
----------------------------------------
Original : are 	 Lemmatized : be
----------------------------------------
Original : running 	 Lemmatized : run
----------------------------------------
Original : in 	 Lemmatized : in
----------------------------------------
Original : the 	 Lemmatized : the
----------------------------------------
Original : garden 	 Lemmatized : garden
----------------------------------------
Original : . 	 Lemmatized : .
----------------------------------------
Original : She 	 Lemmatized : she
----------------------------------------
Original : bettered 	 Lemmatized : better
----------------------------------------
Original : her 	 Lemmatized : her
----------------------------------------
Original : skills 	 Lemmatized : skill
----------------------------------------
Original : by 	 Lemmatized : by
----------------------------------------
Original :

In [11]:
doc1 = nlp(u"I am a runner in a race because I love to run since I ran today")

for token in doc1:

    print(token.text, '\t', token.pos_, '\t', token.lemma, '\t', token.lemma_)

I 	 PRON 	 4690420944186131903 	 I
am 	 AUX 	 10382539506755952630 	 be
a 	 DET 	 11901859001352538922 	 a
runner 	 NOUN 	 12640964157389618806 	 runner
in 	 ADP 	 3002984154512732771 	 in
a 	 DET 	 11901859001352538922 	 a
race 	 NOUN 	 8048469955494714898 	 race
because 	 SCONJ 	 16950148841647037698 	 because
I 	 PRON 	 4690420944186131903 	 I
love 	 VERB 	 3702023516439754181 	 love
to 	 PART 	 3791531372978436496 	 to
run 	 VERB 	 12767647472892411841 	 run
since 	 SCONJ 	 10066841407251338481 	 since
I 	 PRON 	 4690420944186131903 	 I
ran 	 VERB 	 12767647472892411841 	 run
today 	 NOUN 	 11042482332948150395 	 today


# Span

In [12]:
from spacy.tokens import Span

nlp = spacy.load('en_core_web_sm')

doc = nlp(u"SpaCy is a great tool for natural language processing.")

span = Span(doc, start=2, end=6)

print(span.text)
print(span.start)
print(span.end)
print(type(span))
print(type(doc))

a great tool for
2
6
<class 'spacy.tokens.span.Span'>
<class 'spacy.tokens.doc.Doc'>


In [13]:
for token in span:

    print(token.text, token.pos_, token.dep_)

a DET det
great ADJ amod
tool NOUN attr
for ADP prep


# Sentences / Tokenizer types


In [17]:
import pandas as pd
from nltk.tokenize import TweetTokenizer, word_tokenize

tweets_df = pd.read_csv("tweets.csv")

tweet_tokenizer = TweetTokenizer()

first_10_tweets = tweets_df['Tweet Content'].dropna().astype(str).head(10)

first_10_tweets

0    Pets change our lives &amp; become a part of o...
1    Another spot of our #morethanmedicine bus in #...
2    What a great team ⁦@HealthSourceOH⁩ ⁦@Local12⁩...
3    What a great team ⁦@HealthSourceOH⁩ ⁦@Local12⁩...
4    What a great team ⁦@HealthSourceOH⁩ ⁦@Local12⁩...
5    What a great team ⁦@HealthSourceOH⁩ ⁦@Local12⁩...
6    Will you be at #FIX19? Want a preview of @AG_E...
7    Will you be at #FIX19? Want a preview of @AG_E...
8    Will you be at #FIX19? Want a preview of @AG_E...
9    Will you be at #FIX19? Want a preview of @AG_E...
Name: Tweet Content, dtype: object

In [18]:
tweet_tokenized_tweet = [tweet_tokenizer.tokenize(tweet) for tweet in first_10_tweets]

tweet_tokenized_tweet

[['Pets',
  'change',
  'our',
  'lives',
  '&',
  'become',
  'a',
  'part',
  'of',
  'our',
  'families',
  '❤',
  '️',
  "That's",
  'why',
  'our',
  'members',
  'offer',
  'many',
  'solutions',
  'to',
  'help',
  'you',
  'to',
  'enjoy',
  'a',
  'long-lasting',
  'bond',
  'with',
  'your',
  'happy',
  '&',
  'healthy',
  'pet',
  '🐱',
  '🐶',
  '#MorethanMedicine',
  '#PetCare',
  '#PetsareFamily',
  'https://t.co/fZNIXge9a3'],
 ['Another',
  'spot',
  'of',
  'our',
  '#morethanmedicine',
  'bus',
  'in',
  '#bristol',
  'this',
  'week',
  '!',
  'If',
  'you',
  'need',
  'support',
  'with',
  'your',
  'cancer',
  'diagnosis',
  'call',
  'us',
  'on',
  '0303 3000',
  '118',
  '.',
  '#livingwellwithcancer',
  'https://t.co/eZGLz0BkXB'],
 ['What',
  'a',
  'great',
  'team',
  '\u2066',
  '@HealthSourceOH',
  '\u2069',
  '\u2066',
  '@Local12',
  '\u2069',
  '#morethanmedicine',
  'https://t.co/g2YzMDUpVA'],
 ['What',
  'a',
  'great',
  'team',
  '\u2066',
  '@Health

In [19]:
default_tokenized_tweet = [word_tokenize(tweet) for tweet in first_10_tweets]

default_tokenized_tweet

[['Pets',
  'change',
  'our',
  'lives',
  '&',
  'amp',
  ';',
  'become',
  'a',
  'part',
  'of',
  'our',
  'families',
  '❤️',
  'That',
  "'s",
  'why',
  'our',
  'members',
  'offer',
  'many',
  'solutions',
  'to',
  'help',
  'you',
  'to',
  'enjoy',
  'a',
  'long-lasting',
  'bond',
  'with',
  'your',
  'happy',
  '&',
  'amp',
  ';',
  'healthy',
  'pet',
  '🐱🐶',
  '#',
  'MorethanMedicine',
  '#',
  'PetCare',
  '#',
  'PetsareFamily',
  'https',
  ':',
  '//t.co/fZNIXge9a3'],
 ['Another',
  'spot',
  'of',
  'our',
  '#',
  'morethanmedicine',
  'bus',
  'in',
  '#',
  'bristol',
  'this',
  'week',
  '!',
  'If',
  'you',
  'need',
  'support',
  'with',
  'your',
  'cancer',
  'diagnosis',
  'call',
  'us',
  'on',
  '0303',
  '3000',
  '118',
  '.',
  '#',
  'livingwellwithcancer',
  'https',
  ':',
  '//t.co/eZGLz0BkXB'],
 ['What',
  'a',
  'great',
  'team',
  '\u2066',
  '@',
  'HealthSourceOH\u2069',
  '\u2066',
  '@',
  'Local12\u2069',
  '#',
  'morethanmedi

In [20]:
print("Using TweetTokenizer : \n")

for i, token in enumerate(tweet_tokenized_tweet, start=1):

    print(f"Tweet {i}  Token : {token}")

Using TweetTokenizer : 

Tweet 1  Token : ['Pets', 'change', 'our', 'lives', '&', 'become', 'a', 'part', 'of', 'our', 'families', '❤', '️', "That's", 'why', 'our', 'members', 'offer', 'many', 'solutions', 'to', 'help', 'you', 'to', 'enjoy', 'a', 'long-lasting', 'bond', 'with', 'your', 'happy', '&', 'healthy', 'pet', '🐱', '🐶', '#MorethanMedicine', '#PetCare', '#PetsareFamily', 'https://t.co/fZNIXge9a3']
Tweet 2  Token : ['Another', 'spot', 'of', 'our', '#morethanmedicine', 'bus', 'in', '#bristol', 'this', 'week', '!', 'If', 'you', 'need', 'support', 'with', 'your', 'cancer', 'diagnosis', 'call', 'us', 'on', '0303 3000', '118', '.', '#livingwellwithcancer', 'https://t.co/eZGLz0BkXB']
Tweet 3  Token : ['What', 'a', 'great', 'team', '\u2066', '@HealthSourceOH', '\u2069', '\u2066', '@Local12', '\u2069', '#morethanmedicine', 'https://t.co/g2YzMDUpVA']
Tweet 4  Token : ['What', 'a', 'great', 'team', '\u2066', '@HealthSourceOH', '\u2069', '\u2066', '@Local12', '\u2069', '#morethanmedicine', 'h

In [21]:
print("Using Default Word Tokenizer (nltk.word_tokenize) : \n")

for i, token in enumerate(default_tokenized_tweet, start=1):

    print(f"Tweet {i}  Token : {token}")

Using Default Word Tokenizer (nltk.word_tokenize) : 

Tweet 1  Token : ['Pets', 'change', 'our', 'lives', '&', 'amp', ';', 'become', 'a', 'part', 'of', 'our', 'families', '❤️', 'That', "'s", 'why', 'our', 'members', 'offer', 'many', 'solutions', 'to', 'help', 'you', 'to', 'enjoy', 'a', 'long-lasting', 'bond', 'with', 'your', 'happy', '&', 'amp', ';', 'healthy', 'pet', '🐱🐶', '#', 'MorethanMedicine', '#', 'PetCare', '#', 'PetsareFamily', 'https', ':', '//t.co/fZNIXge9a3']
Tweet 2  Token : ['Another', 'spot', 'of', 'our', '#', 'morethanmedicine', 'bus', 'in', '#', 'bristol', 'this', 'week', '!', 'If', 'you', 'need', 'support', 'with', 'your', 'cancer', 'diagnosis', 'call', 'us', 'on', '0303', '3000', '118', '.', '#', 'livingwellwithcancer', 'https', ':', '//t.co/eZGLz0BkXB']
Tweet 3  Token : ['What', 'a', 'great', 'team', '\u2066', '@', 'HealthSourceOH\u2069', '\u2066', '@', 'Local12\u2069', '#', 'morethanmedicine', 'https', ':', '//t.co/g2YzMDUpVA']
Tweet 4  Token : ['What', 'a', 'great'

# Text Cleaning / Preprocessing

In [22]:
import string

print("Punctuation character", string.punctuation)

Punctuation character !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [23]:
print("\nPunctuation list")

for punct in string.punctuation:

    print(punct)


Punctuation list
!
"
#
$
%
&
'
(
)
*
+
,
-
.
/
:
;
<
=
>
?
@
[
\
]
^
_
`
{
|
}
~


In [24]:
eu_defn = """
The European Union (EU) is a supranational political and economic union of 27 member states that are located primarily in Europe.[9][10] The union has a total area of 4,233,255 km2 (1,634,469 sq mi) and an estimated population of over 449 million as of 2024. The EU is often described as a sui generis political entity combining characteristics of both a federation and a confederation.[11][12]

Containing 5.5% of the world population in 2023,[13] EU member states generated a nominal gross domestic product (GDP) of around €17.935 trillion in 2024, accounting for approximately one sixth of global economic output.[14] Its cornerstone, the Customs Union, paved the way to establishing an internal single market based on standardised legal framework and legislation that applies in all member states in those matters, and only those matters, where the states have agreed to act as one. EU policies aim to ensure the free movement of people, goods, services and capital within the internal market;[15] enact legislation in justice and home affairs; and maintain common policies on trade,[16] agriculture,[17] fisheries and regional development.[18] Passport controls have been abolished for travel within the Schengen Area.[19] The eurozone is a group composed of the 20 EU member states that have fully implemented the EU's economic and monetary union and use the euro currency. Through the Common Foreign and Security Policy, the union has developed a role in external relations and defence. It maintains permanent diplomatic missions throughout the world and represents itself at the United Nations, the World Trade Organization, the G7 and the G20. Due to its global influence, the European Union has been described by some scholars as an emerging superpower.[20][21][22][needs update]

The EU was established, along with its citizenship, when the Maastricht Treaty came into force in 1993, and was incorporated as an international legal juridical person[clarification needed] upon entry into force of the Treaty of Lisbon in 2009.[23] Its beginnings can be traced to the Inner Six states (Belgium, France, Italy, Luxembourg, the Netherlands, and West Germany) at the start of modern European integration in 1948, and to the Western Union, the International Authority for the Ruhr, the European Coal and Steel Community, the European Economic Community and the European Atomic Energy Community, which were established by treaties. These increasingly amalgamated bodies grew, with their legal successor the EU, both in size through the accessions of a further 22 states from 1973 to 2013, and in power through acquisitions of policy areas.

In 2012, the EU was awarded the Nobel Peace Prize. In 2020, the United Kingdom became the only member state to leave the EU; ten countries are aspiring or negotiating to join it."""



In [25]:
import nltk
nltk.download('punkt')

eu_defn = eu_defn.replace("\n", " ")

for punct in string.punctuation:

    eu_defn = eu_defn.replace(punct, " ")

print("Text without Punctuation  : \n")
print(eu_defn)

Text without Punctuation  : 

 The European Union  EU  is a supranational political and economic union of 27 member states that are located primarily in Europe  9  10  The union has a total area of 4 233 255 km2  1 634 469 sq mi  and an estimated population of over 449 million as of 2024  The EU is often described as a sui generis political entity combining characteristics of both a federation and a confederation  11  12   Containing 5 5  of the world population in 2023  13  EU member states generated a nominal gross domestic product  GDP  of around €17 935 trillion in 2024  accounting for approximately one sixth of global economic output  14  Its cornerstone  the Customs Union  paved the way to establishing an internal single market based on standardised legal framework and legislation that applies in all member states in those matters  and only those matters  where the states have agreed to act as one  EU policies aim to ensure the free movement of people  goods  services and capital

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kisha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [26]:
eu_defn = eu_defn.lower()

print("Text in Lower Case : \n")
print(eu_defn)

Text in Lower Case : 

 the european union  eu  is a supranational political and economic union of 27 member states that are located primarily in europe  9  10  the union has a total area of 4 233 255 km2  1 634 469 sq mi  and an estimated population of over 449 million as of 2024  the eu is often described as a sui generis political entity combining characteristics of both a federation and a confederation  11  12   containing 5 5  of the world population in 2023  13  eu member states generated a nominal gross domestic product  gdp  of around €17 935 trillion in 2024  accounting for approximately one sixth of global economic output  14  its cornerstone  the customs union  paved the way to establishing an internal single market based on standardised legal framework and legislation that applies in all member states in those matters  and only those matters  where the states have agreed to act as one  eu policies aim to ensure the free movement of people  goods  services and capital within

In [27]:
tokenized_eu_defn = word_tokenize(eu_defn)

print("Tokenized Text : \n")
print(tokenized_eu_defn)

Tokenized Text : 

['the', 'european', 'union', 'eu', 'is', 'a', 'supranational', 'political', 'and', 'economic', 'union', 'of', '27', 'member', 'states', 'that', 'are', 'located', 'primarily', 'in', 'europe', '9', '10', 'the', 'union', 'has', 'a', 'total', 'area', 'of', '4', '233', '255', 'km2', '1', '634', '469', 'sq', 'mi', 'and', 'an', 'estimated', 'population', 'of', 'over', '449', 'million', 'as', 'of', '2024', 'the', 'eu', 'is', 'often', 'described', 'as', 'a', 'sui', 'generis', 'political', 'entity', 'combining', 'characteristics', 'of', 'both', 'a', 'federation', 'and', 'a', 'confederation', '11', '12', 'containing', '5', '5', 'of', 'the', 'world', 'population', 'in', '2023', '13', 'eu', 'member', 'states', 'generated', 'a', 'nominal', 'gross', 'domestic', 'product', 'gdp', 'of', 'around', '€17', '935', 'trillion', 'in', '2024', 'accounting', 'for', 'approximately', 'one', 'sixth', 'of', 'global', 'economic', 'output', '14', 'its', 'cornerstone', 'the', 'customs', 'union', 'pa

In [28]:
print("\nFirst 10 tokens : \n")

print(tokenized_eu_defn[:10])


First 10 tokens : 

['the', 'european', 'union', 'eu', 'is', 'a', 'supranational', 'political', 'and', 'economic']


In [29]:
from pprint import pprint

from nltk.probability import FreqDist

fdist = FreqDist(tokenized_eu_defn)

top_20_tokens = fdist.most_common(20)

pprint(top_20_tokens)



[('the', 41),
 ('and', 21),
 ('of', 15),
 ('in', 14),
 ('eu', 10),
 ('a', 9),
 ('to', 9),
 ('union', 8),
 ('states', 7),
 ('european', 6),
 ('member', 5),
 ('as', 5),
 ('economic', 4),
 ('an', 4),
 ('its', 4),
 ('is', 3),
 ('that', 3),
 ('has', 3),
 ('world', 3),
 ('for', 3)]


# Stop Words

In [30]:
# NLTK

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kisha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [31]:
paragraph ="""
Language is one of the most powerful tools available to humanity. It allows us to communicate our thoughts, express emotions, and share ideas across time and space. From ancient manuscripts to modern AI-driven conversation systems, the evolution of language has been intertwined with human progress. Words carry meaning, but not all words contribute equally to understanding. This is where Natural Language Processing (NLP) techniques, such as stopword removal, become essential in extracting useful information from text.
In computational linguistics, stopwords refer to commonly used words—such as "the," "is," "and," or "of"—which often add little semantic value in text analysis. Removing stopwords helps reduce noise and improves the efficiency of NLP models, enabling them to focus on meaningful content. Consider a search engine processing billions of documents: by filtering out stopwords, it can enhance relevance and speed. However, stopword removal isn't always beneficial; in some cases, the presence of these words provides necessary context, making their elimination counterproductive.
Beyond stopword removal, NLP encompasses other crucial techniques like stemming and lemmatization, which help normalize words by reducing them to their base forms. Additionally, named entity recognition (NER) identifies key figures, places, or organizations within text, while sentiment analysis determines whether a given passage conveys positive, negative, or neutral emotions. With the advent of deep learning and transformer models, NLP applications have expanded beyond traditional text processing to advanced tasks such as speech recognition and machine translation.
As AI continues to evolve, NLP is transforming industries, enhancing virtual assistants, automating customer support, and even detecting patterns in medical texts. The ability to process and understand human language at scale has opened up endless possibilities, bridging the gap between humans and machines. Whether in academia, business, or creative storytelling, NLP-driven insights are shaping the future of communication in exciting ways.
"""

In [32]:
stemmer = PorterStemmer()

sentences = nltk.sent_tokenize(paragraph)

stop_words = set(stopwords.words('english'))

for i in range(len(sentences)):

    words = nltk.word_tokenize(sentences[i])

    filtered_words = [word for word in words if word.lower() not in stop_words]

    stemmed_words = [stemmer.stem(word) for word in filtered_words]

    print(f"Original Sentence : {sentences[i]}")
    print(f"Filtered Words : {filtered_words}")
    print(f"Stemmed Words : {stemmed_words}")
    print()

Original Sentence : 
Language is one of the most powerful tools available to humanity.
Filtered Words : ['Language', 'one', 'powerful', 'tools', 'available', 'humanity', '.']
Stemmed Words : ['languag', 'one', 'power', 'tool', 'avail', 'human', '.']

Original Sentence : It allows us to communicate our thoughts, express emotions, and share ideas across time and space.
Filtered Words : ['allows', 'us', 'communicate', 'thoughts', ',', 'express', 'emotions', ',', 'share', 'ideas', 'across', 'time', 'space', '.']
Stemmed Words : ['allow', 'us', 'commun', 'thought', ',', 'express', 'emot', ',', 'share', 'idea', 'across', 'time', 'space', '.']

Original Sentence : From ancient manuscripts to modern AI-driven conversation systems, the evolution of language has been intertwined with human progress.
Filtered Words : ['ancient', 'manuscripts', 'modern', 'AI-driven', 'conversation', 'systems', ',', 'evolution', 'language', 'intertwined', 'human', 'progress', '.']
Stemmed Words : ['ancient', 'manus

In [33]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

processed_sentences = []

for i in range(len(sentences)):

    words = nltk.word_tokenize(sentences[i])

    filtered_words = [word for word in words if word.lower() not in stop_words]

    lemmatized_words = [lemmatizer.lemmatize(word, pos='v') for word in filtered_words]

    print(f"Original Sentence : {sentences[i]}")
    print(f"Filtered Words : {filtered_words}")
    print(f"Lemmatized Words : {lemmatized_words}")
    print()

Original Sentence : 
Language is one of the most powerful tools available to humanity.
Filtered Words : ['Language', 'one', 'powerful', 'tools', 'available', 'humanity', '.']
Lemmatized Words : ['Language', 'one', 'powerful', 'tool', 'available', 'humanity', '.']

Original Sentence : It allows us to communicate our thoughts, express emotions, and share ideas across time and space.
Filtered Words : ['allows', 'us', 'communicate', 'thoughts', ',', 'express', 'emotions', ',', 'share', 'ideas', 'across', 'time', 'space', '.']
Lemmatized Words : ['allow', 'us', 'communicate', 'thoughts', ',', 'express', 'emotions', ',', 'share', 'ideas', 'across', 'time', 'space', '.']

Original Sentence : From ancient manuscripts to modern AI-driven conversation systems, the evolution of language has been intertwined with human progress.
Filtered Words : ['ancient', 'manuscripts', 'modern', 'AI-driven', 'conversation', 'systems', ',', 'evolution', 'language', 'intertwined', 'human', 'progress', '.']
Lemmat

In [34]:
processed_sentence = ' '.join(lemmatized_words)

processed_sentences.append(processed_sentence)

processed_sentences

['Whether academia , business , creative storytelling , NLP-driven insights shape future communication excite ways .']

In [35]:
# spaCy

import spacy

nlp = spacy.load('en_core_web_sm')

sentence = """
Language is one of the most powerful tools available to humanity.
It allows us to communicate our thoughts, express emotions, and share ideas across time and space.
From ancient manuscripts to modern AI-driven conversation systems,
the evolution of language has been intertwined with human progress.
Words carry meaning, but not all words contribute equally to understanding.
This is where Natural Language Processing (NLP) techniques,
such as stopword removal, become essential in extracting useful information from text."""

doc = nlp(sentence)

for token in doc:

    print(f"Word : {token.text} \t Is Stopword : {token.is_stop}")

Word : 
 	 Is Stopword : False
Word : Language 	 Is Stopword : False
Word : is 	 Is Stopword : True
Word : one 	 Is Stopword : True
Word : of 	 Is Stopword : True
Word : the 	 Is Stopword : True
Word : most 	 Is Stopword : True
Word : powerful 	 Is Stopword : False
Word : tools 	 Is Stopword : False
Word : available 	 Is Stopword : False
Word : to 	 Is Stopword : True
Word : humanity 	 Is Stopword : False
Word : . 	 Is Stopword : False
Word : 
 	 Is Stopword : False
Word : It 	 Is Stopword : True
Word : allows 	 Is Stopword : False
Word : us 	 Is Stopword : True
Word : to 	 Is Stopword : True
Word : communicate 	 Is Stopword : False
Word : our 	 Is Stopword : True
Word : thoughts 	 Is Stopword : False
Word : , 	 Is Stopword : False
Word : express 	 Is Stopword : False
Word : emotions 	 Is Stopword : False
Word : , 	 Is Stopword : False
Word : and 	 Is Stopword : True
Word : share 	 Is Stopword : False
Word : ideas 	 Is Stopword : False
Word : across 	 Is Stopword : True
Word : time 	 I

In [36]:
# Default Stop words
print(nlp.Defaults.stop_words)
print()
print(len(nlp.Defaults.stop_words))

{'was', 'i', 'his', 'quite', 'otherwise', 'eight', "'ll", 'almost', 'move', 'such', 'what', 'last', 'thereafter', 'it', 'you', 'themselves', 'indeed', 'behind', 'though', 'only', "'re", 'amount', 'five', 'although', 'towards', 'when', 'in', 'whither', 'per', 'perhaps', 'the', 'yet', 'an', 'whatever', 'anyway', 'ten', 'off', 'with', 'becomes', 'others', 'further', 'her', '’ve', 'fifty', 'also', 'since', 'whereas', 'she', 'herein', 'top', 'through', 'anyone', 'whom', 'than', 'yourselves', 'anyhow', 'so', 'anything', 'out', 'four', 'under', 'all', 'else', 'sometimes', 'your', 'them', 'n’t', 'see', 'twenty', 'itself', 'into', 'many', 'even', 'bottom', 'several', 'this', 'together', 'get', 'whereupon', 'yourself', 'toward', 'just', 'take', 'some', 'while', 'unless', '’ll', 'hence', 'among', "'m", 'another', 'their', 'formerly', 'anywhere', 'whose', 'say', 'put', 'of', 'how', 'herself', 'within', 'they', 'who', 'would', '‘d', 'wherever', 'as', 'often', 'is', 'been', 'same', 'least', 'neverth

In [37]:
# check Default Stop words
print(nlp.vocab['is'].is_stop)
print(nlp.vocab['student'].is_stop)

True
False


In [38]:
# Add

nlp.Defaults.stop_words.add('btw')
nlp.vocab['btw'].is_stop = True

print(len(nlp.Defaults.stop_words))

327


In [39]:
# Remove

nlp.Defaults.stop_words.remove('beyond')
nlp.vocab['beyond'].is_stop = False

print(len(nlp.Defaults.stop_words))

326
