# NLP Lab 1 Assignment

### Import necessary libraries

In [1]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords,wordnet

In [2]:
# Fetch the NLTK data packages the notebook relies on:
#   punkt     - tokenizer data (unpacked under tokenizers/)
#   stopwords - stop-word corpus (unpacked under corpora/)
#   wordnet   - the WordNet corpus behind the `wordnet` reader imported above
# The final call is the cell's last expression, so its return value is what
# the notebook displays as the cell output.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [3]:
# Sample text (two sentences) fed to the tokenization, stop-word removal and
# lemmatization cells that follow.
text="The leaves on the tree are falling and the wind is blowing gently. This is a beautiful sight to behold"

### Tokenization - Sentence Tokenization

In [8]:
# sent_tokenize raised a missing-resource error until 'punkt_tab' was
# downloaded, so fetch it before tokenizing.
nltk.download('punkt_tab')

# Split the sample text into sentences and show the heading and the result
# on separate lines.
sentences = sent_tokenize(text)
print("Sentence tokenization:", sentences, sep="\n")

Sentence tokenization:
['The leaves on the tree are falling and the wind is blowing gently.', 'This is a beautiful sight to behold']


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


### Tokenization - Word tokenization

In [9]:
# Split the sample text into word and punctuation tokens.
words = word_tokenize(text)

# Heading and token list, one per line.
print('\nWord Tokenization', words, sep='\n')


Word Tokenization
['The', 'leaves', 'on', 'the', 'tree', 'are', 'falling', 'and', 'the', 'wind', 'is', 'blowing', 'gently', '.', 'This', 'is', 'a', 'beautiful', 'sight', 'to', 'behold']


### Removing stopwords

In [10]:
# A set gives constant-time membership checks for the filter below.
stop_words = set(stopwords.words('english'))

# Keep a token only when its lowercase form is not a stop word; the token
# itself is kept in its original casing.
filtered_words = list(
    filter(lambda token: token.lower() not in stop_words, words)
)

print("\nWords after removing stop words:")
print(filtered_words)


Words after removing stop words:
['leaves', 'tree', 'falling', 'wind', 'blowing', 'gently', '.', 'beautiful', 'sight', 'behold']


## Using wordnet

### Lemmatization

In [11]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

# No part-of-speech is passed, so each token is lemmatized with the
# lemmatizer's default setting (note that 'falling' and 'blowing' come back
# unchanged in the output).
lemmatized_words = list(map(lemmatizer.lemmatize, filtered_words))

print("\nLemmatized Words")
print(lemmatized_words)


Lemmatized Words
['leaf', 'tree', 'falling', 'wind', 'blowing', 'gently', '.', 'beautiful', 'sight', 'behold']


### Synonyms and antonyms using wordnet

In [12]:
word = "beautiful"

# Flatten every lemma of every synset WordNet returns for the word.
lemmas = [lemma for syn in wordnet.synsets(word) for lemma in syn.lemmas()]

# Every lemma name counts as a synonym (the word itself included).
synonyms = [lemma.name() for lemma in lemmas]

# For lemmas that have antonyms, record the name of the first one.
antonyms = [lemma.antonyms()[0].name() for lemma in lemmas if lemma.antonyms()]

In [15]:
# Report the de-duplicated synonym and antonym collections, one per line.
print(
    f"\nSynonyms of '{word}': {set(synonyms)}",
    f"Antonyms of '{word}': {set(antonyms)}",
    sep="\n",
)


Synonyms of 'beautiful': {'beautiful'}
Antonyms of 'beautiful': {'ugly'}


### Definitions from Wordnet

In [16]:
# One gloss per synset that WordNet returns for the current word.
definitions = [synset.definition() for synset in wordnet.synsets(word)]

print(f"\nDefn of '{word}':")
# Number the definitions starting from 1.
for i, definition in enumerate(definitions, start=1):
    print(f"{i}, {definition}")


Defn of 'beautiful':
1, delighting the senses or exciting intellectual or emotional admiration
2, (of weather) highly enjoyable


## Stemming using nltk

In [18]:
from nltk.stem import PorterStemmer,LancasterStemmer

# Both stemmers are created here; the Lancaster one is used in the next cell.
porter_stemmer = PorterStemmer()
lancaster_stemmer = LancasterStemmer()

# Small word list for comparing the two stemmers (rebinds `words`).
words = ["running", "ran", "runs", "easily", "fairness"]

porter_stems = list(map(porter_stemmer.stem, words))
print("Stemming with porterstemmer", porter_stems, sep="\n")

Stemming with porterstemmer
['run', 'ran', 'run', 'easili', 'fair']


In [19]:
# Stem the same word list with the Lancaster stemmer for comparison.
lancaster_stems = list(map(lancaster_stemmer.stem, words))
print('\nStemming with LancasterStemmer:', lancaster_stems, sep='\n')


Stemming with LancasterStemmer:
['run', 'ran', 'run', 'easy', 'fair']


## Word replacement

In [64]:
# Informal, SMS-style input for the word-replacement demo. This rebinds the
# `text` variable that the earlier tokenization cells used.
text="U r going 2 the mall aren't u?"

In [65]:
# Shorthand -> expansion table for the word-replacement demo.
# The padding spaces in the keys (mirrored in the values) approximate word
# boundaries, so that a bare letter such as "r" is not matched in the middle
# of another word.
replacements={
    "U ":" you ",
    " r ":" are ",
    " 2 ":" to ",
    ' u': " you",
}

In [66]:
import re

# Treat every table entry as a whole token. The padding spaces in the keys and
# values only approximated word boundaries for plain substring replacement
# (e.g. ' u' would also rewrite the start of ' under', and the padded values
# leave stray spaces behind), so strip them and let the regex enforce the
# boundaries instead.
token_map = {
    key.strip(): value.strip()
    for key, value in replacements.items()
    if key.strip()
}

if token_map:
  # Longest shorthand first, so a longer entry wins over a shorter one that
  # is its prefix.
  alternatives = "|".join(
      re.escape(token) for token in sorted(token_map, key=len, reverse=True)
  )
  # The lookarounds require that the shorthand is not glued to a word
  # character on either side, i.e. that it stands alone as a token.
  token_pattern = re.compile(rf"(?<!\w)(?:{alternatives})(?!\w)")
  # A single pass over the text: text produced by one replacement is never
  # re-scanned and rewritten by another entry.
  text = token_pattern.sub(lambda match: token_map[match.group(0)], text)

In [67]:
# Show the sentence after the shorthand replacements have been applied.
print("After replacement :",text)

After replacement :  you are going to the mall aren't you?


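**Note:** plain `str.replace` matches substrings anywhere in the text, not whole words. A bare `r` would be replaced by `are` in the middle of any word, and a key such as `' u'` still rewrites the start of a word like ` under`. The padding spaces in the table are only a rough guard; matching on word boundaries is the robust approach.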

### Synonym replacement

In [31]:
from nltk.corpus import wordnet

In [34]:
def get_synonyms(word):
  """Return the WordNet synonyms of ``word`` as a set of lemma names.

  Every lemma of every synset returned by ``wordnet.synsets(word)`` is
  collected, except lemmas that spell the query word itself.

  Args:
    word: The word to look up.

  Returns:
    A set of lemma-name strings; empty when no synset (or no other lemma)
    is found.
  """
  # Compare case-insensitively: a query such as "Happy" would otherwise fail
  # to exclude the lemma "happy" and the word would be listed as its own
  # synonym.
  target = word.lower()
  return {
      lemma.name()
      for synset in wordnet.synsets(word)
      for lemma in synset.lemmas()
      if lemma.name().lower() != target
  }

In [35]:
# Look up the synonyms of a sample word with the helper defined above.
word = "happy"
synonyms = get_synonyms(word)
# Label spelling corrected ("Synonys" -> "Synonyms").
print(f"Synonyms of '{word}':", synonyms)

Synonys of 'happy': {'felicitous', 'well-chosen', 'glad'}


### Antonym Replacement

In [68]:
from nltk.corpus import wordnet

def get_antonyms(word):
  """Return the WordNet antonyms of ``word`` as a set of lemma names.

  Walks every lemma of every synset returned by ``wordnet.synsets(word)``
  and collects the names of all antonym lemmas attached to it.

  Args:
    word: The word to look up.

  Returns:
    A set of antonym lemma-name strings; empty when none are found.
  """
  # Iterate over the full antonym list of each lemma (an empty list simply
  # contributes nothing) instead of keeping only its first entry, and call
  # antonyms() once per lemma rather than twice.
  return {
      antonym.name()
      for synset in wordnet.synsets(word)
      for lemma in synset.lemmas()
      for antonym in lemma.antonyms()
  }


In [69]:
# Look up the antonyms of a sample word with the helper defined above.
word = "happy"
antonyms = get_antonyms(word)
# The closing quote belongs right after the word, before the colon, matching
# the format of the synonym printout.
print(f"Antonyms of '{word}':", antonyms)

Antonyms of 'happy:' {'unhappy'}


## Parse Tree

In [71]:
from nltk import CFG

# Toy context-free grammar for the parse-tree demo.
# The non-terminal names look like the usual Penn-Treebank-style tags
# (NP noun phrase, VP verb phrase, DT determiner, JJ adjective, NN noun,
# VBZ verb):
#   S   -> an NP followed by a VP
#   NP  -> DT NN, optionally with a JJ between them
#   VP  -> VBZ followed by an NP, or VBZ on its own
#   DT / JJ / NN / VBZ -> the quoted terminal words
grammar = CFG.fromstring('''
          S -> NP VP
          NP -> DT NN | DT JJ NN
          VP -> VBZ NP | VBZ
          DT -> 'the' | 'a'
          JJ -> 'red'
          NN -> 'cat'|'dog'
          VBZ -> 'chases'|'sleeps'
''')

In [73]:
# Whitespace-split the sentence into the token list the parser consumes.
sentence = 'the red cat chases the dog'.split()
parser = nltk.ChartParser(grammar)

print("Parse Tree")
# `tree` is kept as the loop variable: the next cell draws the last parse
# through that name.
for tree in parser.parse(sentence):
    print(tree)

Parse Tree
(S
  (NP (DT the) (JJ red) (NN cat))
  (VP (VBZ chases) (NP (DT the) (NN dog))))


In [74]:
# `tree` is the loop variable left over from the previous cell, i.e. the last
# parse yielded by the parser; render it as a text diagram.
tree.pretty_print()

         S                    
      ___|__________           
     |              VP        
     |         _____|___       
     NP       |         NP    
  ___|___     |      ___|___   
 DT  JJ  NN  VBZ    DT      NN
 |   |   |    |     |       |  
the red cat chases the     dog



## Named Entity Recognition

In [80]:
import nltk
from nltk import word_tokenize, pos_tag, ne_chunk
from nltk.tree import Tree

# Data packages fetched for the tokenize -> POS-tag -> NE-chunk pipeline,
# downloaded in the same order as before.
for resource in (
    'punkt',
    'averaged_perceptron_tagger',
    'maxent_ne_chunker',
    'words',
):
    nltk.download(resource)

# Input sentence for the named-entity demo (rebinds `text`).
text = 'Barack Obama was the 44th president of the united states and he lives in washington'

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [79]:
# Extra data packages fetched for this cell (presumably the resource names
# pos_tag / ne_chunk look up in the installed NLTK version - confirm).
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('maxent_ne_chunker_tab')

# Pipeline: tokens -> (token, POS tag) pairs -> chunk tree with entity nodes.
tokens = word_tokenize(text)
pos_tags = pos_tag(tokens)
named_entities = ne_chunk(pos_tags)

print("Named Entity Recog")
for subtree in named_entities:
  # Only children that are Tree nodes represent recognised entities; the
  # other children are skipped.
  if isinstance(subtree, Tree):
    # Join the entity's tokens with a space so a multi-token entity reads
    # "Barack Obama" rather than "BarackObama".
    entity_name = " ".join(token for token, pos in subtree.leaves())
    entity_type = subtree.label()
    print(f"Entity:{entity_name},Type:{entity_type}")

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker_tab to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker_tab.zip.


Named Entity Recog
Entity:Barack,Type:PERSON
Entity:Obama,Type:PERSON
