In [43]:
!pip install nltk




In [44]:
corpus="""Hello Welcome, this is a sample text for demonstrating text processing using NLTK in Python.
NLTK is a powerful library for natural language processing tasks!
Please revise the concept learning here.
"""

# Tokenization


In [45]:
#Tokenization
#Sentence->Paragraph
import nltk
nltk.download('punkt_tab')
from nltk.tokenize import sent_tokenize
documents=sent_tokenize(corpus, language='english')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/jay.khandelwal/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [46]:
type(documents)

list

In [47]:
for i in documents:
    print(i)


Hello Welcome, this is a sample text for demonstrating text processing using NLTK in Python.
NLTK is a powerful library for natural language processing tasks!
Please revise the concept learning here.


In [48]:
#Tokenization
#Paragraph->Words
from nltk.tokenize import word_tokenize
words=word_tokenize(corpus, language='english')

In [49]:
words

['Hello',
 'Welcome',
 ',',
 'this',
 'is',
 'a',
 'sample',
 'text',
 'for',
 'demonstrating',
 'text',
 'processing',
 'using',
 'NLTK',
 'in',
 'Python',
 '.',
 'NLTK',
 'is',
 'a',
 'powerful',
 'library',
 'for',
 'natural',
 'language',
 'processing',
 'tasks',
 '!',
 'Please',
 'revise',
 'the',
 'concept',
 'learning',
 'here',
 '.']

In [50]:
for sentences in documents:
    word=word_tokenize(sentences)
    print(word)

['Hello', 'Welcome', ',', 'this', 'is', 'a', 'sample', 'text', 'for', 'demonstrating', 'text', 'processing', 'using', 'NLTK', 'in', 'Python', '.']
['NLTK', 'is', 'a', 'powerful', 'library', 'for', 'natural', 'language', 'processing', 'tasks', '!']
['Please', 'revise', 'the', 'concept', 'learning', 'here', '.']


In [51]:
from nltk.tokenize import TreebankWordTokenizer    #In between sentence fullstop doesn't split (e.g [...,'in', 'Python.', 'NLTK',...])
from nltk.tokenize import WordPunctTokenizer      #I'm->['I', "'", 'm']

In [52]:
tokenizer=TreebankWordTokenizer()
tokenizer.tokenize(corpus)

['Hello',
 'Welcome',
 ',',
 'this',
 'is',
 'a',
 'sample',
 'text',
 'for',
 'demonstrating',
 'text',
 'processing',
 'using',
 'NLTK',
 'in',
 'Python.',
 'NLTK',
 'is',
 'a',
 'powerful',
 'library',
 'for',
 'natural',
 'language',
 'processing',
 'tasks',
 '!',
 'Please',
 'revise',
 'the',
 'concept',
 'learning',
 'here',
 '.']

# Stemming

In [53]:
#Stemming is a process of reducing words to its word stem that affixes to suffixes and prefixes or to the roots of words known as a lemma.
from nltk.stem import PorterStemmer
ps=PorterStemmer()
words=["program","programs","programmer","programming","programmers","ran","running","runner","easily","fairly","fairness","studies","studying","studied"]
for w in words:
    print(f"{w} --> {ps.stem(w)}")


program --> program
programs --> program
programmer --> programm
programming --> program
programmers --> programm
ran --> ran
running --> run
runner --> runner
easily --> easili
fairly --> fairli
fairness --> fair
studies --> studi
studying --> studi
studied --> studi


In [54]:
from nltk.stem import RegexpStemmer
rs=RegexpStemmer('ing$|s$|ed$|ly$|er$|ies$|ness$',min=4)
w="fairly"
print(f"{w} --> {rs.stem(w)}")

fairly --> fair


In [55]:
from nltk.stem import SnowballStemmer  #A little bit more advanced than PorterStemmer
ss=SnowballStemmer('english')
for w in words:
    print(f"{w} --> {ss.stem(w)}")

program --> program
programs --> program
programmer --> programm
programming --> program
programmers --> programm
ran --> ran
running --> run
runner --> runner
easily --> easili
fairly --> fair
fairness --> fair
studies --> studi
studying --> studi
studied --> studi


# Lemmatization


In [56]:
# This class use morphy() function internally. It is a built-in function in WordNetLemmatizer which looks for the lemma of a word in WordNet's built-in morphy function.
from nltk.stem import WordNetLemmatizer
wn=WordNetLemmatizer()
wn.lemmatize("studies",pos='n')   #pos can be 'a' for adjective, 'r' for adverb, 'v' for verb, 'n' for noun


'study'

# Stopwords

In [57]:
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words=set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jay.khandelwal/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [58]:
print(stop_words)

{'below', 'there', 'was', 'be', 'so', 'y', "i've", 'ma', "doesn't", "she's", 'themselves', "i'd", 'an', 'had', "shan't", 'any', 'me', "it's", 'm', 'himself', 'in', 'own', 'its', 'them', 'who', 'her', "mightn't", 'o', 'do', 'whom', 'yours', "isn't", 'about', 'this', 's', 'it', 'mustn', 'where', "he's", 'he', 'herself', 'shan', "you're", 'after', 'are', 'because', 'under', 'for', 'myself', "don't", 'just', 'or', 'into', 'how', 'weren', "it'll", 'been', 'they', 've', 'isn', "that'll", 'these', 'him', "we'd", 'between', "you'd", 're', 'ours', 'against', 'off', "didn't", 'on', "she'll", 'his', 'yourselves', 'very', 'from', 'couldn', "haven't", 'having', 'a', 'above', 'did', 'i', "i'll", 'other', 'if', "they'll", 'further', 'some', 'each', 'but', "mustn't", 'will', 'why', 'don', 'wouldn', 'itself', 'needn', 'shouldn', 'theirs', 'our', "we're", "needn't", 'is', "wouldn't", 'you', "you'll", 'which', 'your', 'doing', 'haven', 'all', 'wasn', 'such', "we've", "it'd", 'mightn', "he'd", 'nor', 'not

In [59]:
paragraph="APJ Abdul Kalam was born in Rameswaram on 15 October 1931. He studied physics and aerospace engineering. He then joined DRDO and ISRO. He was also a key figure in India's Pokhran-II nuclear tests in 1998. He served as the 11th President of India from 2002 to 2007. His presidency was marked by efforts to promote education, particularly in the fields of science and technology. Kalam was known as the 'People's President' and inspired millions with his vision for a developed India. He passed away on 27 July 2015 while delivering a lecture at the Indian Institute of Management Shillong."

In [60]:
sentences=nltk.sent_tokenize(paragraph)

In [61]:
for i in range(len(sentences)):
    words=nltk.word_tokenize(sentences[i])
    words=[wn.lemmatize(word,pos='v') for word in words if word not in set(stop_words)]
    sentences[i]=' '.join(words)

sentences

['APJ Abdul Kalam bear Rameswaram 15 October 1931 .',
 'He study physics aerospace engineer .',
 'He join DRDO ISRO .',
 "He also key figure India 's Pokhran-II nuclear test 1998 .",
 'He serve 11th President India 2002 2007 .',
 'His presidency mark efforts promote education , particularly field science technology .',
 "Kalam know 'People 's President ' inspire millions vision develop India .",
 'He pass away 27 July 2015 deliver lecture Indian Institute Management Shillong .']

# Speech Tagging

In [62]:
nltk.download('averaged_perceptron_tagger_eng')
paragraph="APJ Abdul Kalam was born in Rameswaram on 15 October 1931. He studied physics and aerospace engineering. He then joined DRDO and ISRO. He was also a key figure in India's Pokhran-II nuclear tests in 1998. He served as the 11th President of India from 2002 to 2007. His presidency was marked by efforts to promote education, particularly in the fields of science and technology. Kalam was known as the 'People's President' and inspired millions with his vision for a developed India. He passed away on 27 July 2015 while delivering a lecture at the Indian Institute of Management Shillong."
sentences=nltk.sent_tokenize(paragraph)
for i in range(len(sentences)):
    words=nltk.word_tokenize(sentences[i])
    words=[word for word in words if word not in set(stop_words)]
    pos_tagged=nltk.pos_tag(words)
    print(pos_tagged)


[('APJ', 'NNP'), ('Abdul', 'NNP'), ('Kalam', 'NNP'), ('born', 'VBD'), ('Rameswaram', 'NNP'), ('15', 'CD'), ('October', 'NNP'), ('1931', 'CD'), ('.', '.')]
[('He', 'PRP'), ('studied', 'VBD'), ('physics', 'NNS'), ('aerospace', 'NN'), ('engineering', 'NN'), ('.', '.')]
[('He', 'PRP'), ('joined', 'VBD'), ('DRDO', 'NNP'), ('ISRO', 'NNP'), ('.', '.')]
[('He', 'PRP'), ('also', 'RB'), ('key', 'JJ'), ('figure', 'NN'), ('India', 'NNP'), ("'s", 'POS'), ('Pokhran-II', 'JJ'), ('nuclear', 'JJ'), ('tests', 'NNS'), ('1998', 'CD'), ('.', '.')]
[('He', 'PRP'), ('served', 'VBD'), ('11th', 'CD'), ('President', 'NNP'), ('India', 'NNP'), ('2002', 'CD'), ('2007', 'CD'), ('.', '.')]
[('His', 'PRP$'), ('presidency', 'NN'), ('marked', 'VBD'), ('efforts', 'NNS'), ('promote', 'JJ'), ('education', 'NN'), (',', ','), ('particularly', 'RB'), ('fields', 'NNS'), ('science', 'NN'), ('technology', 'NN'), ('.', '.')]
[('Kalam', 'NNP'), ('known', 'VBN'), ("'People", 'NNP'), ("'s", 'POS'), ('President', 'NNP'), ("'", 'POS'

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/jay.khandelwal/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


# Named Entity Recognition

In [65]:
sentence="Eiffel Tower is located in Paris. It is one of the most famous landmarks in the world. The tower was constructed in 1889 and stands at a height of 324 meters. It is named after the engineer Gustave Eiffel, whose company designed and built the tower. The Eiffel Tower attracts millions of visitors each year and offers stunning views of the city from its observation decks.It buiilds in year 1889."
nltk.download('maxent_ne_chunker_tab')
nltk.download('words')
words=nltk.word_tokenize(sentence)
pos_tagged=nltk.pos_tag(words)
named_entity=nltk.ne_chunk(pos_tagged)


[nltk_data] Downloading package maxent_ne_chunker_tab to
[nltk_data]     /Users/jay.khandelwal/nltk_data...
[nltk_data]   Package maxent_ne_chunker_tab is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     /Users/jay.khandelwal/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [66]:
for chunk in named_entity:
    if hasattr(chunk, 'label'):
        entity = " ".join(c[0] for c in chunk)
        print(entity, "→", chunk.label())

Eiffel → PERSON
Tower → ORGANIZATION
Paris → GPE
Gustave Eiffel → PERSON
Eiffel Tower → ORGANIZATION
