## Tokenization

In [1]:
import nltk
from string import punctuation
# nltk.download('punkt')

from nltk import sent_tokenize,word_tokenize,wordpunct_tokenize

corpus="Artificial Intelligence (AI) is transforming industries! From healthcare to finance, AI is helping automate processes, improve decision-making, and create innovative products. But, it also raises ethical concerns—such as job displacement and data privacy."

sent=sent_tokenize(corpus)
words=word_tokenize(corpus)
wordswithoutpun=wordpunct_tokenize(corpus)

print("Sentences: ")
for i in sent:
    print(i)

print("Words:")
for i in words:
    print(i)

total=0
for i in wordswithoutpun:
    if i not in punctuation:
        total+=1

print(f"Total num of words is {total}")


Sentences: 
Artificial Intelligence (AI) is transforming industries!
From healthcare to finance, AI is helping automate processes, improve decision-making, and create innovative products.
But, it also raises ethical concerns—such as job displacement and data privacy.
Words:
Artificial
Intelligence
(
AI
)
is
transforming
industries
!
From
healthcare
to
finance
,
AI
is
helping
automate
processes
,
improve
decision-making
,
and
create
innovative
products
.
But
,
it
also
raises
ethical
concerns—such
as
job
displacement
and
data
privacy
.
Total num of words is 36


## Stemming + Tokenization

In [19]:
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer,RegexpStemmer,SnowballStemmer
from string import punctuation

text="""The children were happily playing in the playgrounds while the wolves were hunting nearby.  
Studies about modernization, globalizations, and digitalizations are rapidly increasing.  
However, some organizations are disabling unnecessary features to simplify their systems."""

tokens=word_tokenize(text)


porter=PorterStemmer()
regex=RegexpStemmer('ing$|s$|e$|able$', min=3)
snowball=SnowballStemmer('english')
lst=[]
print("word ----> porter ----> regex ----> snowball")
for i in tokens:
    if i not in punctuation and len(i)>=4:
        print(f"{i} ----> {porter.stem(i)} ----> {regex.stem(i)} ----> {snowball.stem(i)}")
        lst.append([i,porter.stem(i),regex.stem(i),snowball.stem(i)])

for i in lst:
    if i[1]!=i[2] or i[1]!=i[3] or i[2]!=i[3]:
        print(i)



word ----> porter ----> regex ----> snowball
children ----> children ----> children ----> children
were ----> were ----> wer ----> were
happily ----> happili ----> happily ----> happili
playing ----> play ----> play ----> play
playgrounds ----> playground ----> playground ----> playground
while ----> while ----> whil ----> while
wolves ----> wolv ----> wolve ----> wolv
were ----> were ----> wer ----> were
hunting ----> hunt ----> hunt ----> hunt
nearby ----> nearbi ----> nearby ----> nearbi
Studies ----> studi ----> Studie ----> studi
about ----> about ----> about ----> about
modernization ----> modern ----> modernization ----> modern
globalizations ----> global ----> globalization ----> global
digitalizations ----> digit ----> digitalization ----> digit
rapidly ----> rapidli ----> rapidly ----> rapid
increasing ----> increas ----> increas ----> increas
However ----> howev ----> However ----> howev
some ----> some ----> som ----> some
organizations ----> organ ----> organization ----> 

## Lemitisation

In [None]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk
nltk.download('wordnet')
lemmatizer=WordNetLemmatizer()
Sent="""The cats were chasing the mice while the children were running quickly towards the playgrounds.  
She has been studying the effects of modernizations on societies.  
Many people are happier when they are given responsibilities and recognition."""
words2=word_tokenize(Sent)

for i in words2:
    print(lemmatizer.lemmatize(i))

## Stopwords

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation
nltk.download('stopwords')

text="Machine learning is a subset of artificial intelligence that enables systems to learn from data without being explicitly programmed."
def rem_stopwords(text):
    words=word_tokenize(text)
    lst=[]
    stop_words=stopwords.words('english')
    for i in words:
        if i.lower() not in stop_words and i not in punctuation:
            lst.append(i)
    return lst

rem_stopwords(text)




## Parts of Speech tag

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk import pos_tag
from string import punctuation
nltk.download('averaged_perceptron_tagger_eng')

text="John is playing football in the park while his friends are studying at home."
lst=[]
nountag=["NN",'NNS' ,"NNP","NNPS"]
verbtag=["VB" ,'VBD','VBG' ,'VBN' ,'VBP' ,'VBZ']
noun=[]
verb=[]
words=word_tokenize(text)
stop_words=stopwords.words('english')

for i in words:
    if i not in stop_words and i not in punctuation:
        if i not in lst:
            lst.append(i)

aftlst=pos_tag(lst,tagset=None, lang="eng")
print(f"{'word':<15} pos")
for i in aftlst:
    print(f"{i[0]:<15} {i[1]}")
    if i[1] in nountag:
        noun.append(i[0])
    elif i[1] in verbtag :
        verb.append(i[0])

print(f"Noun: {noun}")
print(f"verb= {verb}")



In [69]:
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag, ne_chunk
nltk.download('maxent_ne_chunker_tab')
nltk.download('words')
text="""Barack Obama was born in Hawaii. He was elected President of the United States in 2008. 
Later, he gave a speech in Paris about climate change."""
persons=[]

others=[]
lst=word_tokenize(text)
tagged=pos_tag(lst,lang='eng')

chunked=ne_chunk(tagged)

# go throug this once again

for subtree in chunked:
    if hasattr(subtree, 'label'):   # Named Entity found
        label = subtree.label()
        name = " ".join([token for token, pos in subtree.leaves()])
        
        if label == "PERSON":
            persons.append(name)
        else:
            others.append((name, label))  # keep label too (like GPE, ORGANIZATION, etc.)

print("Persons:", persons)
print("Others :", others)

[nltk_data] Downloading package maxent_ne_chunker_tab to
[nltk_data]     /Users/kishan/nltk_data...
[nltk_data]   Package maxent_ne_chunker_tab is already up-to-date!
[nltk_data] Downloading package words to /Users/kishan/nltk_data...
[nltk_data]   Package words is already up-to-date!


Persons: ['Barack', 'Obama']
Others : [('Hawaii', 'GPE'), ('United States', 'GPE'), ('Paris', 'GPE')]
