In [1]:
import nltk

In [2]:
import string
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import pos_tag, ne_chunk

In [4]:
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [7]:
sentence = """At eight o'clock on Thursday morning Arthur didn't feel very good."""

Normalization

In [8]:
normalized_text = sentence.lower()
print("\nNormalized Text:")
print(normalized_text)


Normalized Text:
at eight o'clock on thursday morning arthur didn't feel very good.


Sentence Tokenization

In [9]:
sentences=sent_tokenize(normalized_text)
print('\nSentence Tokenization: ')
print(sentences)


Sentence Tokenization: 
["at eight o'clock on thursday morning arthur didn't feel very good."]


Word Tokenization

In [10]:
words = word_tokenize(normalized_text)
print("\nWord Tokenization:")
print(words)


Word Tokenization:
['at', 'eight', "o'clock", 'on', 'thursday', 'morning', 'arthur', 'did', "n't", 'feel', 'very', 'good', '.']


Punctuation removal

In [11]:
words_no_punct = [word for word in words if word not in string.punctuation]
print("\nPunctuation Removal:")
print(words_no_punct)


Punctuation Removal:
['at', 'eight', "o'clock", 'on', 'thursday', 'morning', 'arthur', 'did', "n't", 'feel', 'very', 'good']


Stemming

In [12]:
stemmer = PorterStemmer()
stemmed_words = [stemmer.stem(word) for word in words_no_punct]
print("\nStemming:")
print(stemmed_words)


Stemming:
['at', 'eight', "o'clock", 'on', 'thursday', 'morn', 'arthur', 'did', "n't", 'feel', 'veri', 'good']


Lemmatization

In [13]:
lemmatizer = WordNetLemmatizer()
lemmatized_words = [lemmatizer.lemmatize(word) for word in words_no_punct]
print("\nLemmatization:")
print(lemmatized_words)


Lemmatization:
['at', 'eight', "o'clock", 'on', 'thursday', 'morning', 'arthur', 'did', "n't", 'feel', 'very', 'good']


POS Tagging

In [14]:
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True

In [15]:
pos_tags = pos_tag(words_no_punct)
print("\nPOS Tagging:")
print(pos_tags)


POS Tagging:
[('at', 'IN'), ('eight', 'CD'), ("o'clock", 'NN'), ('on', 'IN'), ('thursday', 'JJ'), ('morning', 'NN'), ('arthur', 'NN'), ('did', 'VBD'), ("n't", 'RB'), ('feel', 'VB'), ('very', 'RB'), ('good', 'JJ')]


Named entity recognition

In [17]:
nltk.download("maxent_ne_chunker_tab")

[nltk_data] Downloading package maxent_ne_chunker_tab to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker_tab.zip.


True

In [20]:
ner_chunks = ne_chunk(pos_tags)
print("\nNamed Entity Recognition (NER):")
print(ner_chunks)


Named Entity Recognition (NER):
(S
  at/IN
  eight/CD
  o'clock/NN
  on/IN
  thursday/JJ
  morning/NN
  arthur/NN
  did/VBD
  n't/RB
  feel/VB
  very/RB
  good/JJ)


Parse tree

In [24]:
import nltk
import matplotlib.pyplot as plt
from nltk.tree import Tree
from nltk.draw.util import CanvasFrame
from nltk.draw import TreeWidget
from IPython.display import display

In [25]:
# Convert NER chunk to string tree format
tree_str = str(ner_chunks)

In [30]:
# Convert string format to NLTK Tree
tree = Tree.fromstring(tree_str)

# Plot the tree using matplotlib
plt.figure(figsize=(10, 6))
tree.pretty_print()
plt.show()

                                                S                                                      
   _____________________________________________|__________________________________________________     
at/IN eight/CD o'clock/NN on/IN thursday/JJ morning/NN arthur/NN did/VBD n't/RB feel/VB very/RB good/JJ



<Figure size 1000x600 with 0 Axes>