In [2]:
!pip install nltk



In [1]:
# Setup: imports for the whole notebook plus the NLTK data packages it needs.
import nltk
from nltk.corpus import wordnet as wn
from nltk.wsd import lesk
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# WordNet lexical database
nltk.download('wordnet')
# Open Multilingual WordNet (WordNet in other languages)
nltk.download('omw-1.4')
# Tokenizer data for nltk.word_tokenize. NLTK releases that ship the
# 'averaged_perceptron_tagger_eng' package load 'punkt_tab'; older releases
# load 'punkt'. Fetch both so a fresh machine works with either.
nltk.download('punkt')
nltk.download('punkt_tab')
# Part-of-speech tagger
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Luis\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Luis\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Luis\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\Luis\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [5]:
# Lesk is a simple knowledge-based WSD method (nltk.wsd.lesk). As a warm-up, this cell picks the sense of 'bank' with hand-written rules instead.
from nltk.wsd import lesk
from nltk.corpus import wordnet as wn

#example sentence
#sentence = "I deposited money at the bank."
# Example sentences for the manual disambiguation demo.
sentences = ["The riverbank was overflowing.", "He withdrew money from the bank."]

# Define a function to manually find the appropriate sense
def get_best_sense(word, context):
    """Pick a WordNet sense for ``word`` using hand-written keyword rules.

    Prints every synset WordNet lists for ``word`` and then chooses one by
    looking for cue words in ``context``:

    * "river" or "overflowing" -> the first listed synset (for 'bank' this is
      bank.n.01, the sloping land beside a body of water);
    * "money" or "withdrew" -> the second listed synset (for 'bank' this is
      depository_financial_institution.n.01).

    Cue matching is a case-insensitive substring test, so "riverbank" and
    "River" both count as a river cue.

    Args:
        word: The ambiguous word to look up in WordNet.
        context: The sentence (a string) in which ``word`` occurs.

    Returns:
        The selected synset, or None when no cue matches or when WordNet lists
        too few synsets for ``word`` to reach the selected position.
    """
    synsets = wn.synsets(word)

    print(f"All possible senses for '{word}': ")
    for i, syn in enumerate(synsets, 1):
        print(f"{i}. {syn.name()}: {syn.definition()}")

    # Lower-case once so a cue at the start of a sentence ("River ...") still matches.
    lowered = context.lower()
    if "river" in lowered or "overflowing" in lowered:
        sense_index = 0  # geographical sense (river bank)
    elif "money" in lowered or "withdrew" in lowered:
        sense_index = 1  # financial-institution sense
    else:
        return None

    # The positions are hard-coded for 'bank'; a word with fewer synsets (or
    # none at all) must yield None rather than raise IndexError.
    if sense_index >= len(synsets):
        return None
    return synsets[sense_index]

# Disambiguate 'bank' in each example sentence with the keyword rules of get_best_sense.
for sentence in sentences:
    # get_best_sense inspects the raw sentence string, so no tokenization is needed here.
    best_sense = get_best_sense('bank', sentence)

    if best_sense:
        print(f"\nSentence: {sentence}")
        print(f"Best sense of 'bank': {best_sense.name()}")
        print(f"Definition: {best_sense.definition()}")
    else:
        print("No suitable sense found for 'bank' in the sentence.")

All possible senses for 'bank': 
1. bank.n.01: sloping land (especially the slope beside a body of water)
2. depository_financial_institution.n.01: a financial institution that accepts deposits and channels the money into lending activities
3. bank.n.03: a long ridge or pile
4. bank.n.04: an arrangement of similar objects in a row or in tiers
5. bank.n.05: a supply or stock held in reserve for future use (especially in emergencies)
6. bank.n.06: the funds held by a gambling house or the dealer in some gambling games
7. bank.n.07: a slope in the turn of a road or track; the outside is higher than the inside in order to reduce the effects of centrifugal force
8. savings_bank.n.02: a container (usually with a slot in the top) for keeping money at home
9. bank.n.09: a building in which the business of banking transacted
10. bank.n.10: a flight maneuver; aircraft tips laterally about its longitudinal axis (especially in turning)
11. bank.v.01: tip laterally
12. bank.v.02: enclose with a ban

 Mini Activity - WSD

Activity: Building a Simple WSD System (1 hour)  
Objective: Students will implement a basic WSD model using Python and apply it to a dataset of sentences with ambiguous words.

Instructions:  
Dataset Preparation:

Provide a dataset containing sentences with ambiguous words like "bank," "plant," "bark," etc.  
Task 1: Implement the Lesk Algorithm

Use the Lesk algorithm in Python to disambiguate each ambiguous word in the dataset.  
Task 2: Compare Results with Gold Standard:

Compare the algorithm’s output with human-labeled correct senses.  
Task 3: Evaluate Performance:

Measure accuracy using metrics like precision, recall, and F1 score.  
Discuss areas of improvement in the algorithm.


### Our chosen ambiguous words are:

1. bank
2. plant
3. bass

Displayed below are all their possible meanings.

In [15]:
# The three ambiguous words examined in this activity.
words_highlighted: list[str] = ['bank', 'plant', 'bass']

def get_all_senses(word: str) -> None:
    """Print a numbered list of every WordNet synset found for ``word``.

    A header line is printed first. Each following line shows a 1-based
    position, the synset name and its definition. Nothing is returned.
    """
    candidates = wn.synsets(word)

    print(f"All possible senses for '{word}': ")
    position = 1
    for candidate in candidates:
        print(f"{position}. {candidate.name()}: {candidate.definition()}")
        position += 1

# List the senses of each chosen word, leaving a blank line after each list.
for word in words_highlighted:
    get_all_senses(word)
    print()

All possible senses for 'bank': 
1. bank.n.01: sloping land (especially the slope beside a body of water)
2. depository_financial_institution.n.01: a financial institution that accepts deposits and channels the money into lending activities
3. bank.n.03: a long ridge or pile
4. bank.n.04: an arrangement of similar objects in a row or in tiers
5. bank.n.05: a supply or stock held in reserve for future use (especially in emergencies)
6. bank.n.06: the funds held by a gambling house or the dealer in some gambling games
7. bank.n.07: a slope in the turn of a road or track; the outside is higher than the inside in order to reduce the effects of centrifugal force
8. savings_bank.n.02: a container (usually with a slot in the top) for keeping money at home
9. bank.n.09: a building in which the business of banking transacted
10. bank.n.10: a flight maneuver; aircraft tips laterally about its longitudinal axis (especially in turning)
11. bank.v.01: tip laterally
12. bank.v.02: enclose with a ban

### Providing statements for NLTK’s Lesk algorithm to disambiguate

I added three statements for each ambiguous word.

In [18]:
# Test statements as (sentence, ambiguous word) pairs: three sentences per word.
# The value is a list of 2-tuples, so it is annotated as such (it was annotated `tuple`).
statement_input: list[tuple[str, str]] = [
    ("I need to go to the bank to deposit some money.", "bank"),
    ("She sat on the river bank and watched the sunset.", "bank"),
    ("That highway is dangerous; too many motorcyclists bank on every turn!", "bank"),
    ("The gardener watered the plant in the morning.", "plant"),
    ("I wish peas were available on every supermarket. We need to plant them more.", "plant"),
    ("She was a plant by an offshore government to infiltrate the regional center.", "plant"),
    ("The fisherman caught a large bass in the river.", "bass"),
    ("You should turn up the bass, people will like it more that way.", "bass"),
    ("This bass tasted good!", "bass"),
]

def auto_sense(chosen_statement: str, highlighted_word: str, pos: "str | None" = None):
    """Disambiguate ``highlighted_word`` in ``chosen_statement`` with NLTK's ``lesk``.

    Prints the statement and the word as Markdown-style lines, tokenizes the
    statement with ``nltk.word_tokenize`` and hands the tokens to ``lesk``.

    Args:
        chosen_statement: Sentence containing the ambiguous word.
        highlighted_word: The word to disambiguate.
        pos: Optional WordNet part-of-speech tag (for example 'n' or 'v')
            forwarded to ``lesk`` so only senses of that part of speech are
            considered. The default, None, keeps every part of speech, which
            is the behaviour this function had before the parameter existed.

    Returns:
        Whatever ``lesk`` returns: the chosen synset, or None.
    """
    print(f"**Chosen statement:** {chosen_statement}", end="  \n")
    print(f"**Highlighted word:** {highlighted_word}", end="\n\n")

    tokens = nltk.word_tokenize(chosen_statement)
    return lesk(tokens, highlighted_word, pos=pos)

# Run Lesk on every (sentence, word) pair and print a Markdown-style report entry for each.
for statement, word in statement_input:
    sense = auto_sense(statement, word)

    print(f"> **Best sense of highlighted word:** {sense}")
    if sense:
        # The definition line is followed by one blank line.
        print(f"> **Definition:** {sense.definition()}")
        print()
    # Separator between entries, also followed by one blank line.
    print("----")
    print()



**Chosen statement:** I need to go to the bank to deposit some money.  
**Highlighted word:** bank

> **Best sense of highlighted word:** Synset('savings_bank.n.02')
> **Definition:** a container (usually with a slot in the top) for keeping money at home

----

**Chosen statement:** She sat on the river bank and watched the sunset.  
**Highlighted word:** bank

> **Best sense of highlighted word:** Synset('depository_financial_institution.n.01')
> **Definition:** a financial institution that accepts deposits and channels the money into lending activities

----

**Chosen statement:** That highway is dangerous; too many motorcyclists bank on every turn!  
**Highlighted word:** bank

> **Best sense of highlighted word:** Synset('bank.n.07')
> **Definition:** a slope in the turn of a road or track; the outside is higher than the inside in order to reduce the effects of centrifugal force

----

**Chosen statement:** The gardener watered the plant in the morning.  
**Highlighted word:** plan

## Task 2: Compare Results with Gold Standard:
> Compare the algorithm’s output with human-labeled correct senses.

Unfortunately, we see many incorrect meanings assigned by the algorithm.

### Correcting the computer’s assumptions

**Chosen statement:** I need to go to the bank to deposit some money.  
**Highlighted word:** bank

> **Best sense of highlighted word:** Synset('savings_bank.n.02')
> **Definition:** a container (usually with a slot in the top) for keeping money at home

**CORRECT SENSE:** Synset('depository_financial_institution.n.01')  
**CORRECT DEFINITION:** a financial institution that accepts deposits and channels the money into lending activities

----

**Chosen statement:** She sat on the river bank and watched the sunset.  
**Highlighted word:** bank

> **Best sense of highlighted word:** Synset('depository_financial_institution.n.01')
> **Definition:** a financial institution that accepts deposits and channels the money into lending activities

**CORRECT SENSE / DEFINITION:**  
bank.n.01: sloping land (especially the slope beside a body of water)

----

**Chosen statement:** That highway is dangerous; too many motorcyclists bank on every turn!  
**Highlighted word:** bank

> **Best sense of highlighted word:** Synset('bank.n.07')
> **Definition:** a slope in the turn of a road or track; the outside is higher than the inside in order to reduce the effects of centrifugal force

**CORRECT SENSE / DEFINITION:**  
bank.v.01: tip laterally

----

**Chosen statement:** The gardener watered the plant in the morning.  
**Highlighted word:** plant

> **Best sense of highlighted word:** Synset('plant.v.06')
> **Definition:** put firmly in the mind

**CORRECT SENSE / DEFINITION:**  
plant.n.02: (botany) a living organism lacking the power of locomotion

----

**Chosen statement:** I wish peas were available on every supermarket. We need to plant them more.  
**Highlighted word:** plant

> **Best sense of highlighted word:** Synset('plant.v.05')
> **Definition:** place something or someone in a certain position in order to secretly observe or deceive

**CORRECT SENSE / DEFINITION:**  
plant.n.02: (botany) a living organism lacking the power of locomotion

----

**Chosen statement:** She was a plant by an offshore government to infiltrate the regional center.  
**Highlighted word:** plant

> **Best sense of highlighted word:** Synset('plant.n.03')
> **Definition:** an actor situated in the audience whose acting is rehearsed but seems spontaneous to the audience

**CORRECT SENSE:** Synset('plant.v.05')  
**CORRECT DEFINITION:** place something or someone in a certain position in order to secretly observe or deceive

----

**Chosen statement:** The fisherman caught a large bass in the river.  
**Highlighted word:** bass

> **Best sense of highlighted word:** Synset('sea_bass.n.01')
> **Definition:** the lean flesh of a saltwater fish of the family Serranidae

***✅ — THIS IS CORRECT!***

----

**Chosen statement:** You should turn up the bass, people will like it more that way.  
**Highlighted word:** bass

> **Best sense of highlighted word:** Synset('sea_bass.n.01')
> **Definition:** the lean flesh of a saltwater fish of the family Serranidae

**CORRECT SENSE / DEFINITION:**  
bass.n.01: the lowest part of the musical range

----

**Chosen statement:** This bass tasted good!  
**Highlighted word:** bass

> **Best sense of highlighted word:** Synset('sea_bass.n.01')
> **Definition:** the lean flesh of a saltwater fish of the family Serranidae

***✅ — THIS IS CORRECT!***

----


### Comparison of Results

The Lesk algorithm received a score of **2 out of 9** in the quiz. This is a failing grade of epic proportions.

### Calculating the AI’s Scores

We _could_ actually just count it ourselves, but aren't we learning so we can automate all the rank-and-file labor away?

In [20]:
# Manual metric computation from hand-counted outcomes, left disabled.
# NOTE(review): if re-enabled as written, the assignment to `accuracy_score`
# would replace sklearn's accuracy_score function imported in the setup cell,
# which the scikit-learn metrics cell calls. Rename these variables before
# uncommenting.
# NOTE(review): `accuracy_score` here is true_positives / overall_score, which
# leaves true_negatives out of the numerator. Confirm that is intended.
# true_positives: int = 2
# true_negatives: int = 3
# false_positives: int = 1
# false_negatives: int = 3
# overall_score: int = true_positives + true_negatives + false_positives + false_negatives

# accuracy_score: float = true_positives / overall_score
# precision_score: float = true_positives / (true_positives + false_positives)
# recall_score: float = true_positives / (true_positives + false_negatives)
# f1_score: float = 2 * ((precision_score * recall_score) / (precision_score + recall_score))

# print(f"Accuracy: {accuracy_score}")
# print(f"Precision: {precision_score}")
# print(f"Recall: {recall_score}")
# print(f"F-1: {f1_score}")

Accuracy: 0.2222222222222222
Precision: 0.6666666666666666
Recall: 0.4
F-1: 0.5


Let's measure accuracy, precision, recall, and F1 using scikit-learn’s `metrics` module.

In [2]:
# Human-labelled senses, one per test statement, listed in the same order as the statements.
# NOTE(review): two labels have a part of speech that does not match how the word
# is used in its sentence (marked below). Confirm the intended senses.
gold_standard = [
    wn.synset(sense_name)
    for sense_name in (
        'depository_financial_institution.n.01',  # I need to go to the bank to deposit some money.
        'bank.n.01',                              # She sat on the river bank and watched the sunset.
        'bank.v.01',                              # That highway is dangerous; too many motorcyclists bank on every turn!
        'plant.n.02',                             # The gardener watered the plant in the morning.
        'plant.n.02',                             # I wish peas were available on every supermarket. We need to plant them more.  (verb usage, noun label)
        'plant.v.05',                             # She was a plant by an offshore government to infiltrate the regional center.  (noun usage, verb label)
        'sea_bass.n.01',                          # The fisherman caught a large bass in the river.
        'bass.n.01',                              # You should turn up the bass, people will like it more that way.
        'sea_bass.n.01',                          # This bass tasted good!
    )
]

# Senses attributed to the Lesk run, one per test statement, in the same order as gold_standard.
# NOTE(review): these are written out by hand rather than taken from the lesk()
# calls, so they must be kept in sync manually if the statements change.
predicted_senses = [
    wn.synset(sense_name)
    for sense_name in (
        'savings_bank.n.02',                      # I need to go to the bank to deposit some money.
        'depository_financial_institution.n.01',  # She sat on the river bank and watched the sunset.
        'bank.n.07',                              # That highway is dangerous; too many motorcyclists bank on every turn!
        'plant.v.06',                             # The gardener watered the plant in the morning.
        'plant.v.05',                             # I wish peas were available on every supermarket. We need to plant them more.
        'plant.n.03',                             # She was a plant by an offshore government to infiltrate the regional center.
        'sea_bass.n.01',                          # The fisherman caught a large bass in the river.
        'sea_bass.n.01',                          # You should turn up the bass, people will like it more that way.
        'sea_bass.n.01',                          # This bass tasted good!
    )
]

# Compare senses by synset name so the metric functions receive plain string labels.
gold_standard_names = [syn.name() for syn in gold_standard]
predicted_sense_names = [syn.name() for syn in predicted_senses]

# Accuracy: fraction of statements whose predicted sense equals the gold sense.
accuracy = accuracy_score(gold_standard_names, predicted_sense_names)
print(f"Accuracy: {accuracy}")

# Precision, recall and F1, macro-averaged: every sense is its own class and
# all classes weigh the same.
# Several senses occur only in the gold labels or only in the predictions, so
# their per-class precision or recall is 0/0. zero_division=0 scores those as 0
# explicitly, the same value the default setting uses, without the warnings.
precision, recall, f1, _ = precision_recall_fscore_support(
    gold_standard_names, predicted_sense_names, average='macro', zero_division=0
)

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")


Accuracy: 0.2222222222222222
Precision: 0.0606060606060606
Recall: 0.09090909090909091
F1 Score: 0.07272727272727274


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Task 3: Evaluate Performance:

> - Measure accuracy using metrics like precision, recall, and F1 score.
> - Discuss areas of improvement in the algorithm.

The Lesk algorithm from the Natural Language Toolkit (NLTK) is incredibly simplistic. It counts how many of the tokenized words in a sentence also appear in each candidate sense's definition and returns the sense with the largest overlap, even when that overlap is tiny or zero; it returns nothing only when WordNet has no senses for the word.

While this is a good start to learning more about machine learning as a field, it is far from how humans are able to speak and understand through language, as we rely on a series of synonyms and predefined cultural contexts that artificial intelligence cannot easily retrieve.

To expand the Word Sense Disambiguation (WSD) model, we would also have to teach it the meaning of each token it ingests, and make it remember important synonyms that would match better with the definitions it is provided.