In [1]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Joel\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

Get Synonyms from WordNet

In [2]:
from nltk.corpus import wordnet

In [3]:
words = ["cricket", "gym", "fitness", "body", "cardio"]

for word in words:
    synsets = wordnet.synsets(word)
    print(f"\nWord: {word}")
    for i, syn in enumerate(synsets[:5], start=1):
        print(f"{i}. {syn.name()} - {syn.definition()}")


Word: cricket
1. cricket.n.01 - leaping insect; male makes chirping noises by rubbing the forewings together
2. cricket.n.02 - a game played with a ball and bat by two teams of 11 players; teams take turns trying to score runs
3. cricket.v.01 - play cricket

Word: gym
1. gymnasium.n.02 - athletic facility equipped for sports or physical training

Word: fitness
1. fitness.n.01 - the quality of being suitable
2. fitness.n.02 - good physical condition; being in shape or in condition
3. seaworthiness.n.01 - fitness to traverse the seas
4. fitness.n.04 - the quality of being qualified

Word: body
1. body.n.01 - the entire structure of an organism (an animal, plant, or human being)
2. body.n.02 - a group of persons associated by some common tie or occupation and regarded as an entity
3. body.n.03 - a natural object consisting of a dead animal or person
4. body.n.04 - an individual 3-dimensional object that has mass and that is distinguishable from other objects
5. torso.n.01 - the body excl

***Find the 3rd meaning of the word***

In [4]:
word = "talk"
synsets = wordnet.synsets(word)

In [5]:
if len(synsets) >= 3:
    third_meaning = synsets[2]
    print(f"3rd meaning of '{word}': {third_meaning.definition()}")
else:
    print(f"Less than 3 meanings found for '{word}'")

3rd meaning of 'talk': the act of giving a talk to an audience


***Extract different PoS from synonyms***

In [6]:
def get_synonyms_by_pos(word, pos):
    synsets = wordnet.synsets(word, pos=pos)
    synonyms = set()
    for syn in synsets:
        for lemma in syn.lemmas():
            synonyms.add(lemma.name())
    return list(synonyms)

word = "talk"
print("Nouns:", get_synonyms_by_pos(word, wordnet.NOUN))
print("Verbs:", get_synonyms_by_pos(word, wordnet.VERB))
print("Adjectives:", get_synonyms_by_pos(word, wordnet.ADJ))
print("Adverbs:", get_synonyms_by_pos(word, wordnet.ADV))

Nouns: ['talk_of_the_town', 'lecture', 'talk', 'public_lecture', 'talking']
Verbs: ['verbalise', 'tattle', 'talk', 'blab', 'peach', 'lecture', 'blab_out', 'speak', 'verbalize', 'babble_out', 'utter', 'spill', 'sing', 'babble', 'spill_the_beans', 'let_the_cat_out_of_the_bag', 'mouth']
Adjectives: []
Adverbs: []


***Extract the definition of the word***

In [7]:
word = "strong"
synsets = wordnet.synsets(word)

if synsets:
    print(f"Definition of '{word}': {synsets[0].definition()}")

Definition of 'strong': having strength or power greater than average or expected


***Get Antonyms from WordNet***

In [8]:
def get_antonyms(word):
    antonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            if lemma.antonyms():
                antonyms.add(lemma.antonyms()[0].name())
    return list(antonyms)

In [9]:
word = "happy"
print(f"Antonyms of '{word}': {get_antonyms(word)}")

Antonyms of 'happy': ['unhappy']


In [10]:
word = "talk"
print(f"Antonyms of '{word}': {get_antonyms(word)}")

Antonyms of 'talk': ['keep_quiet']


In [11]:
word = "Good"
print(f"Antonyms of '{word}': {get_antonyms(word)}")

Antonyms of 'Good': ['ill', 'evilness', 'badness', 'evil', 'bad']


***Lemmatizing words using WordNet***

In [12]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
words = ["running", "speaking", "stronger", "happiest"]

for word in words:
    print(f"Lemmatized '{word}': {lemmatizer.lemmatize(word, wordnet.VERB)}")

Lemmatized 'running': run
Lemmatized 'speaking': speak
Lemmatized 'stronger': stronger
Lemmatized 'happiest': happiest


***Differentiate Stemming and Lemmatizing***

In [13]:
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
words = ["running", "talked", "happily", "strongest","flies","studies"]

print("Word | Stemming | Lemmatizing")
for word in words:
    print(f"{word} | {stemmer.stem(word)} | {lemmatizer.lemmatize(word, wordnet.VERB)}")

Word | Stemming | Lemmatizing
running | run | run
talked | talk | talk
happily | happili | happily
strongest | strongest | strongest
flies | fli | fly
studies | studi | study


***PoS Tagging***

In [14]:
import spacy

nlp = spacy.load("en_core_web_sm")
sentence = "She happily runs to the bright and strong tower."

doc = nlp(sentence)
print("POS Tagging:")
for token in doc:
    print(f"{token.text} - {token.pos_}")

POS Tagging:
She - PRON
happily - ADV
runs - VERB
to - ADP
the - DET
bright - ADJ
and - CCONJ
strong - ADJ
tower - NOUN
. - PUNCT


***Named Entity Recognition (NER)***

In [15]:
import spacy

nlp = spacy.load("en_core_web_sm")
text = "Albert Einstein was born in Germany and worked at Princeton University."

doc = nlp(text)
print("Named Entities:")
for ent in doc.ents:
    print(f"{ent.text} - {ent.label_}")

Named Entities:
Albert Einstein - PERSON
Germany - GPE
Princeton University - ORG


***Dependency & Constituency Parsing***

In [16]:
pip install benepar

Collecting benepar
  Downloading benepar-0.2.0.tar.gz (33 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting torch-struct>=0.5 (from benepar)
  Downloading torch_struct-0.5-py3-none-any.whl.metadata (4.3 kB)
Collecting tokenizers>=0.9.4 (from benepar)
  Downloading tokenizers-0.21.0-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Collecting transformers>=4.2.2 (from transformers[tokenizers,torch]>=4.2.2->benepar)
  Downloading transformers-4.48.3-py3-none-any.whl.metadata (44 kB)
     ---------------------------------------- 0.0/44.4 kB ? eta -:--:--
     ---------------------------------------- 44.4/44.4 kB 2.1 MB/s eta 0:00:00
Collecting sentencepiece>=0.1.91 (from benepar)
  Downloading sentencepiece-0.2.0-cp311-cp311-win_amd64.whl.metadata (8.3 kB)
Collecting huggingface-hub<1.0,>=0.16.4 (from tokenizers>=0.9.4->benepar)
  Downloading huggingface_hub-0.28.1-py3-none-any.whl.metadata (13 kB)
Collecting safetensors>=0.4.1 (f

In [17]:
import benepar
benepar.download('benepar_en3')

[nltk_data] Downloading package benepar_en3 to
[nltk_data]     C:\Users\Joel\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping models\benepar_en3.zip.


True

In [18]:
import spacy
import benepar

if "benepar" not in nlp.pipe_names:
    nlp.add_pipe("benepar", config={"model": "benepar_en3"})

text = "The bright student talks happily about science."
doc = nlp(text)

print("Dependency Parsing:")
spacy.displacy.render(doc, style="dep", jupyter=True)

print("Constituency Parsing :")
for sent in doc.sents:
    print(sent._.parse_string)

  state_dict = torch.load(
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Dependency Parsing:


Constituency Parsing :
(S (NP (DT The) (JJ bright) (NN student)) (VP (VBZ talks) (ADVP (RB happily)) (PP (IN about) (NP (NN science)))) (. .))
