<a href="https://colab.research.google.com/github/NeoZ666/classroom_NLP/blob/main/NLP_exp4_25.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Dictionary of known lemma forms
# Surface forms that need a dictionary lookup, grouped under their lemma.
# The grouping is flattened below into the surface-form -> lemma table that
# the lemmatizer consults first.
_forms_by_lemma = (
    ("run", ("running", "ran")),
    ("eat", ("eaten", "ate")),
    ("good", ("better",)),
    ("bad", ("worse",)),
    ("happy", ("happier", "happiest")),
    ("cat", ("cats",)),
    ("watch", ("watches",)),
)
lemma_dict = {
    form: lemma
    for lemma, forms in _forms_by_lemma
    for form in forms
}

# Simplified POS tagging function
def pos_tag(word):
    """Guess a part-of-speech tag for a single word from its spelling.

    Suffix rules are tried first, in order: 'ing' -> 'VBG', 'ed' -> 'VBD',
    'es'/'s' -> 'NNS'.  If none applies, a short list of known graded
    adjectives yields 'JJR'; everything else is tagged 'NN'.
    """
    suffix_tags = (
        ('ing', 'VBG'),         # present participle / gerund
        ('ed', 'VBD'),          # past tense
        (('es', 's'), 'NNS'),   # plural noun
    )
    for suffix, tag in suffix_tags:
        if word.endswith(suffix):
            return tag

    graded_adjectives = ("better", "worse", "happier", "happiest")
    return 'JJR' if word in graded_adjectives else 'NN'

# Function to lemmatize a word with POS tagging
def lemmatize(word):
    """Return the lemma of ``word``.

    The lookup table ``lemma_dict`` wins when it contains the word.
    Otherwise the word is tagged with ``pos_tag`` and the first matching
    suffix rule for that tag is applied; a word that matches no rule is
    returned unchanged.
    """
    try:
        return lemma_dict[word]
    except KeyError:
        pass

    # Each rule: (suffix or suffixes, word must be longer than this, chars to drop)
    verb_rules = (('ing', 4, 3), ('ed', 3, 2))
    rules_by_tag = {
        'VBG': verb_rules,
        'VBD': verb_rules,
        'NNS': (('es', 0, 2), ('s', 2, 1)),
        'JJR': ((('er', 'est'), 0, 2),),
    }

    for suffix, longer_than, drop in rules_by_tag.get(pos_tag(word), ()):
        if word.endswith(suffix) and len(word) > longer_than:
            return word[:-drop]

    return word

# Sample words exercising both the lookup table and the suffix rules
words = ["running", "ran", "eaten", "ate", "better", "happier", "cats", "watches", "played", "thinking"]
lemmatized_words = list(map(lemmatize, words))

# Render word / lemma / tag triples as a table
from prettytable import PrettyTable

table = PrettyTable()
table.field_names = ["Original Word", "Lemmatized Word", "Part of Speech"]

for word, lemma in zip(words, lemmatized_words):
    pos = pos_tag(word)
    table.add_row([word, lemma, pos])

print(table)

# Lemmatize a whole sentence, token by whitespace-separated token
sentence = "The happy cats were thinking about running and playing in the garden."
lemmatized_sentence = " ".join(map(lemmatize, sentence.split()))
print(f"Original Sentence: {sentence}")
print(f"Lemmatized Sentence: {lemmatized_sentence}")


In [None]:
# @title Without NLTK
# Dictionary of known lemma forms
# Build the surface-form -> lemma lookup table from (lemma, forms) groups.
lemma_dict = {}
for _lemma, _forms in (
    ("run", ("running", "ran")),
    ("eat", ("eaten", "ate")),
    ("good", ("better",)),
    ("bad", ("worse",)),
    ("happy", ("happier", "happiest")),
    ("cat", ("cats",)),
    ("watch", ("watches",)),
    ("play", ("played",)),
    ("think", ("thinking",)),
):
    lemma_dict.update(dict.fromkeys(_forms, _lemma))

# Simplified POS tagging function
def pos_tag(word):
    """Assign a heuristic part-of-speech tag to one word.

    Suffixes are checked first ('ing' -> 'VBG', 'ed' -> 'VBD', 'es'/'s' ->
    'NNS').  Words without such a suffix are looked up in a small lexicon of
    conjunctions ('CC') and graded adjectives ('JJR'); anything else is 'NN'.
    """
    for suffix, tag in (('ing', 'VBG'), ('ed', 'VBD'), (('es', 's'), 'NNS')):
        if word.endswith(suffix):
            return tag

    lexical_tags = {
        "and": 'CC',
        "or": 'CC',
        "better": 'JJR',
        "worse": 'JJR',
        "happier": 'JJR',
        "happiest": 'JJR',
    }
    return lexical_tags.get(word, 'NN')

# Function to lemmatize a word with POS tagging
def lemmatize(word):
    """Reduce ``word`` to its lemma.

    A hit in ``lemma_dict`` is returned directly.  Otherwise the tag from
    ``pos_tag`` selects which suffix to strip; if no suffix rule fits, the
    word comes back unchanged.
    """
    if word in lemma_dict:
        return lemma_dict[word]

    tag = pos_tag(word)
    size = len(word)
    stem = word  # stays as-is unless one of the rules below fires

    if tag == 'NNS':
        # Plural nouns: drop 'es', else a trailing 's' on words longer than 2
        if word.endswith('es'):
            stem = word[:-2]
        elif size > 2 and word.endswith('s'):
            stem = word[:-1]
    elif tag == 'JJR':
        # Graded adjectives: two characters are dropped for 'er' and 'est' alike
        if word.endswith(('er', 'est')):
            stem = word[:-2]
    elif tag in ('VBG', 'VBD'):
        # Verbs: drop 'ing' (words longer than 4) or 'ed' (words longer than 3)
        if size > 4 and word.endswith('ing'):
            stem = word[:-3]
        elif size > 3 and word.endswith('ed'):
            stem = word[:-2]

    return stem

# Input sentence, split on whitespace, and the lemma of each token
sentence = "The happy cats were thinking about running and playing in the garden."
words = sentence.split()
lemmatized_words = list(map(lemmatize, words))

# Table rendering
from prettytable import PrettyTable

# One table for every token, one for verbs only
table_words = PrettyTable()
table_words.field_names = ["Original Word", "Lemmatized Word", "Part of Speech"]

table_verbs = PrettyTable()
table_verbs.field_names = ["Original Word", "Lemmatized Word", "Tense"]

# Tense label for the verb tags that have a dedicated name
tense_by_tag = {
    'VBG': "Present Continuous",  # present participle / gerund
    'VBD': "Past Tense",
}

# Fill both tables in a single pass over the tokens
for word, lemma in zip(words, lemmatized_words):
    pos = pos_tag(word)
    table_words.add_row([word, lemma, pos])

    if pos in tense_by_tag:
        tense = tense_by_tag[pos]
    else:
        # Any other verb tag is grouped together; non-verbs get no tense
        tense = "Other Tenses" if pos.startswith('VB') else None

    if tense:
        table_verbs.add_row([word, lemma, tense])

# Report
print("Words and Their Lemmatized Forms with POS Tags:")
print(table_words)
print("\nVerbs Divided by Tenses:")
print(table_verbs)

# Original sentence next to its lemmatized counterpart
lemmatized_sentence = " ".join(lemmatized_words)
print(f"\nOriginal Sentence: {sentence}")
print(f"Lemmatized Sentence: {lemmatized_sentence}")


Words and Their Lemmatized Forms with POS Tags:
+---------------+-----------------+----------------+
| Original Word | Lemmatized Word | Part of Speech |
+---------------+-----------------+----------------+
|      The      |       The       |       NN       |
|     happy     |      happy      |       NN       |
|      cats     |       cat       |      NNS       |
|      were     |       were      |       NN       |
|    thinking   |      think      |      VBG       |
|     about     |      about      |       NN       |
|    running    |       run       |      VBG       |
|      and      |       and       |       CC       |
|    playing    |       play      |      VBG       |
|       in      |        in       |       NN       |
|      the      |       the       |       NN       |
|    garden.    |     garden.     |       NN       |
+---------------+-----------------+----------------+

Verbs Divided by Tenses:
+---------------+-----------------+--------------------+
| Original Word | Lem

In [None]:
import nltk
from nltk.corpus import wordnet
from prettytable import PrettyTable
import spacy

# Download necessary resources from nltk
for _resource in ('averaged_perceptron_tagger', 'wordnet', 'punkt'):
    nltk.download(_resource)

# spaCy English pipeline used by the functions below
nlp = spacy.load("en_core_web_sm")

# Function to map nltk POS tags to wordnet POS tags
def get_wordnet_pos(nltk_tag):
    """Translate a POS tag into the matching WordNet POS constant.

    The decision is made on the tag's prefix: 'J' -> ``wordnet.ADJ``,
    'V' -> ``wordnet.VERB``, 'N' -> ``wordnet.NOUN``, 'R' -> ``wordnet.ADV``.
    Returns ``None`` for every other tag.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wn_pos in prefix_to_pos:
        if nltk_tag.startswith(prefix):
            return wn_pos
    return None

# Function to lemmatize using nltk and spaCy
def lemmatize(word, nltk_tag):
    """Lemmatize ``word`` with WordNet first and spaCy as a fallback.

    Args:
        word: The token to lemmatize.
        nltk_tag: The POS tag assigned to ``word`` in its sentence; it is
            mapped to a WordNet POS by ``get_wordnet_pos``.

    Returns:
        The WordNet lemma when WordNet (given the POS) changes the word;
        otherwise spaCy's lemma for the word; the word itself if spaCy
        produces no token.
    """
    wordnet_pos = get_wordnet_pos(nltk_tag)
    if wordnet_pos:
        lemma = nltk.WordNetLemmatizer().lemmatize(word, pos=wordnet_pos)
        # The WordNet result is informed by the in-sentence POS tag, so it
        # takes priority.  spaCy only sees the isolated word here and was
        # observed to turn the adjective "better" into "well" instead of
        # "good" when it was allowed to override WordNet.
        if lemma != word:
            return lemma

    # WordNet had nothing to say (no usable POS, or the word is already its
    # own lemma): let spaCy try, e.g. to normalise "The" -> "the".
    doc = nlp(word)
    if len(doc) == 0:
        # Nothing was tokenized (e.g. empty input); avoid indexing doc[0].
        return word
    return doc[0].lemma_

# Function to determine verb tense using spaCy
def determine_verb_tense(word, pos_tag):
    """Return a tense label for a verb or modal token, else ``None``.

    Args:
        word: The token text; it is re-tagged in isolation with spaCy to
            pick the fine-grained verb form.
        pos_tag: The POS tag the token received in its sentence.

    Returns:
        A human-readable tense label, or ``None`` when ``pos_tag`` is
        neither a verb tag (prefix 'V') nor the modal tag 'MD'.
    """
    # Modals carry the tag 'MD', which does not start with 'V'; they must be
    # recognised before the verb gate or "will" is never reported.
    if pos_tag == 'MD':
        return "Future Tense (Modal)"

    if not pos_tag.startswith('V'):
        return None

    # Only run the spaCy pipeline for tokens that are actually verbs.
    doc = nlp(word)
    tag = doc[0].tag_ if len(doc) else pos_tag

    tense_by_tag = {
        'VBG': "Present Continuous",
        'VBD': "Past Tense",
        'VBN': "Past Participle",
        'VB': "Simple Present",
        'VBZ': "Simple Present (3rd Person Singular)",
        # spaCy may still call the isolated word a modal even though the
        # sentence-level tag was a verb tag.
        'MD': "Future Tense (Modal)",
    }
    return tense_by_tag.get(tag, "Other Tenses")

# Sentence for lemmatization
sentence = "The happier cats will be thinking about running and playing in the better garden."
words = nltk.word_tokenize(sentence)
nltk_pos_tags = nltk.pos_tag(words)

# Creating the tables
table_words = PrettyTable()
table_words.field_names = ["Original Word", "Lemmatized Word", "Part of Speech"]

table_verbs = PrettyTable()
table_verbs.field_names = ["Original Word", "Lemmatized Word", "Tense"]

table_adjectives = PrettyTable()
table_adjectives.field_names = ["Original Word", "Lemmatized Word", "Degree"]

# Expected lemmas for accuracy check
expected_lemmas = {
    "The": "the",
    "happier": "happy",
    "cats": "cat",
    "will": "will",
    "be": "be",
    "thinking": "think",
    "about": "about",
    "running": "run",
    "and": "and",
    "playing": "play",
    "in": "in",
    "the": "the",
    "better": "good",
    "garden": "garden"
}

# Counters for accuracy calculation.  Only tokens that have a reference
# lemma are scored: the tokenizer also emits "." which has no entry in
# expected_lemmas and would otherwise always be counted as a miss.
correct = 0
total = sum(1 for word in words if word in expected_lemmas)

# Lemma of every token, collected once so the sentence can be rebuilt
# without running the lemmatizer a second time.
lemmas = []

# Populate the tables
for word, nltk_tag in nltk_pos_tags:
    lemma = lemmatize(word, nltk_tag)
    lemmas.append(lemma)
    table_words.add_row([word, lemma, nltk_tag])

    # Check for accuracy
    if word in expected_lemmas and expected_lemmas[word] == lemma:
        correct += 1

    # Determine the tense
    tense = determine_verb_tense(word, nltk_tag)
    if tense:
        table_verbs.add_row([word, lemma, tense])

    # Determine if the word is an adjective and categorize it
    if nltk_tag.startswith('JJ'):
        degree = "Positive"
        if nltk_tag == 'JJR':
            degree = "Comparative"
        elif nltk_tag == 'JJS':
            degree = "Superlative"
        table_adjectives.add_row([word, lemma, degree])

# Calculate accuracy (guarding against a sentence with no scorable token)
accuracy = correct / total * 100 if total else 0.0

# Print the results
print("Words and Their Lemmatized Forms with POS Tags:")
print(table_words)
print("\nVerbs Divided by Tenses:")
print(table_verbs)
print("\nAdjectives Divided by Degrees of Comparison:")
print(table_adjectives)

# Print the original and lemmatized sentence
lemmatized_sentence = " ".join(lemmas)
print(f"\nOriginal Sentence: {sentence}")
print(f"Lemmatized Sentence: {lemmatized_sentence}")

# Print accuracy
print(f"\nLemmatization Accuracy: {accuracy:.2f}%")


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Words and Their Lemmatized Forms with POS Tags:
+---------------+-----------------+----------------+
| Original Word | Lemmatized Word | Part of Speech |
+---------------+-----------------+----------------+
|      The      |       the       |       DT       |
|    happier    |      happy      |      JJR       |
|      cats     |       cat       |      NNS       |
|      will     |       will      |       MD       |
|       be      |        be       |       VB       |
|    thinking   |      think      |      VBG       |
|     about     |      about      |       IN       |
|    running    |       run       |      VBG       |
|      and      |       and       |       CC       |
|    playing    |       play      |      VBG       |
|       in      |        in       |       IN       |
|      the      |       the       |       DT       |
|     better    |       well      |      JJR       |
|     garden    |      garden     |       NN       |
|       .       |        .        |       .        

In [None]:
import re
from prettytable import PrettyTable

# Function to determine POS tag using simple rules
# Ordered (pattern, tag) rules; the first pattern found in the word wins.
# Closed-class words are matched as whole words, case-insensitively, and come
# before the suffix heuristics so that e.g. "being" or "was" is not claimed by
# the 'ing' / 's' suffix rules.  Suffix patterns are anchored at the end of
# the word.
_POS_RULES = (
    (re.compile(r'\b(and|or|but|so|yet|for|nor)\b', re.IGNORECASE), 'CC'),   # Conjunction
    (re.compile(r'\b(a|an|the)\b', re.IGNORECASE), 'DT'),                    # Determiner
    (re.compile(r'\b(was|were)\b', re.IGNORECASE), 'VBD'),                   # Past tense of "be"
    (re.compile(r'\b(is|am|are|be|being|been)\b', re.IGNORECASE), 'VB'),     # Other forms of "be"
    (re.compile(r'\b(better|worse|happier|happiest)\b', re.IGNORECASE), 'JJR'),  # Comparative/Superlative
    (re.compile(r'ing$'), 'VBG'),   # Present participle
    (re.compile(r'ed$'), 'VBD'),    # Past tense verb
    (re.compile(r's$'), 'NNS'),     # Plural noun (also covers 'es')
    (re.compile(r'ly$'), 'RB'),     # Adverb
)


def pos_tag(word):
    """Return a heuristic part-of-speech tag for a single word.

    The rules in ``_POS_RULES`` are tried in order and the tag of the first
    matching rule is returned.  Words matching no rule default to 'NN'.

    Args:
        word: The token to tag.

    Returns:
        One of 'CC', 'DT', 'VBD', 'VB', 'JJR', 'VBG', 'NNS', 'RB' or 'NN'.
    """
    for pattern, tag in _POS_RULES:
        if pattern.search(word):
            return tag
    return 'NN'  # Default to noun

# Function to lemmatize words based on simple rules
# Graded adjectives whose base form cannot be derived by suffix stripping.
_IRREGULAR_ADJECTIVES = {
    "better": "good",
    "best": "good",
    "worse": "bad",
    "worst": "bad",
}

# Plural endings where the whole 'es' (not just 's') belongs to the inflection.
_ES_PLURAL_ENDINGS = ('sses', 'xes', 'zzes', 'ches', 'shes')


def _undouble(stem):
    """Collapse a doubled final consonant left by stripping ('runn' -> 'run').

    Heuristic: 'l', 's', 'f' and 'z' are left alone because stems commonly end
    in those letters doubled ('fall', 'pass', 'stuff', 'buzz').
    """
    if (
        len(stem) > 2
        and stem[-1] == stem[-2]
        and stem[-1] not in "aeiou"
        and stem[-1] not in "lsfz"
    ):
        return stem[:-1]
    return stem


def lemmatize(word, pos_tag):
    """Lemmatize ``word`` with simple, POS-dependent suffix rules.

    Args:
        word: The token to lemmatize.
        pos_tag: Its part-of-speech tag; only 'VBG', 'VBD', 'NNS' and 'JJR'
            trigger a rule.

    Returns:
        The heuristic lemma.  The word is returned unchanged when no rule
        applies or when stripping would leave fewer than two characters.
    """
    lemma = word

    # Verb forms: strip the inflection, then undo consonant doubling
    if pos_tag in ['VBG', 'VBD']:
        if word.endswith('ing'):
            lemma = _undouble(word[:-3])
        elif word.endswith('ed'):
            lemma = _undouble(word[:-2])
    # Nouns
    elif pos_tag == 'NNS':
        if len(word) > 4 and word.endswith('ies'):
            lemma = word[:-3] + 'y'          # 'cities' -> 'city'
        elif word.endswith(_ES_PLURAL_ENDINGS):
            lemma = word[:-2]                # 'watches' -> 'watch'
        elif word.endswith('s'):
            lemma = word[:-1]                # 'cats' -> 'cat'
    # Adjectives
    elif pos_tag == 'JJR':
        irregular = _IRREGULAR_ADJECTIVES.get(word.lower())
        if irregular is not None:
            return irregular                 # 'better' -> 'good'
        if word.endswith('iest'):
            lemma = word[:-4] + 'y'          # 'happiest' -> 'happy'
        elif word.endswith('ier'):
            lemma = word[:-3] + 'y'          # 'happier' -> 'happy'
        elif word.endswith('est'):
            lemma = _undouble(word[:-3])
        elif word.endswith('er'):
            lemma = _undouble(word[:-2])     # 'bigger' -> 'big'

    # Never return an empty or one-letter stump (e.g. for 'ing' or 'as')
    if len(lemma) < 2:
        return word
    return lemma

# Function to determine verb tense based on patterns
def determine_verb_tense(word, pos_tag):
    """Label the tense of a verb token by matching it against word patterns.

    Returns ``None`` unless ``pos_tag`` starts with 'VB'.  For verb tags the
    first pattern found in ``word`` decides the label; a verb matching none
    of the patterns is labelled "Simple Present".
    """
    if not pos_tag.startswith('VB'):
        return None

    tense_patterns = (
        (r'\b(be|being|been)\b', "Present Continuous"),
        (r'\b(was|were)\b', "Past Tense"),
        (r'\b(will|shall)\b', "Future Tense (Modal)"),
    )
    for pattern, tense in tense_patterns:
        if re.search(pattern, word):
            return tense

    return "Simple Present"

# Sentence for lemmatization
sentence = "The happier cats will be thinking about running and playing in the better garden."

# Split on whitespace and trim surrounding punctuation, so that the final
# token is "garden" rather than "garden." (which could never match its
# expected lemma or any suffix rule).
_stripped_tokens = (token.strip(".,;:!?\"'()") for token in sentence.split())
words = [token for token in _stripped_tokens if token]

# Creating the tables
table_words = PrettyTable()
table_words.field_names = ["Original Word", "Lemmatized Word", "Part of Speech"]

table_verbs = PrettyTable()
table_verbs.field_names = ["Original Word", "Lemmatized Word", "Tense"]

table_adjectives = PrettyTable()
table_adjectives.field_names = ["Original Word", "Lemmatized Word", "Degree"]

# Expected lemmas for accuracy check
expected_lemmas = {
    "The": "the",
    "happier": "happy",
    "cats": "cat",
    "will": "will",
    "be": "be",
    "thinking": "think",
    "about": "about",
    "running": "run",
    "and": "and",
    "playing": "play",
    "in": "in",
    "the": "the",
    "better": "good",
    "garden": "garden"
}

# Counters for accuracy calculation
correct = 0
total = len(words)

# Lemma of every token, collected once so the sentence can be rebuilt
# without tagging and lemmatizing everything a second time.
lemmas = []

# Populate the tables
for word in words:
    pos = pos_tag(word)
    lemma = lemmatize(word, pos)
    lemmas.append(lemma)
    table_words.add_row([word, lemma, pos])

    # Check for accuracy
    if expected_lemmas.get(word) == lemma:
        correct += 1

    # Determine the tense
    tense = determine_verb_tense(word, pos)
    if tense:
        table_verbs.add_row([word, lemma, tense])

    # Determine if the word is an adjective and categorize it
    if pos.startswith('J'):
        degree = "Positive"
        if pos == 'JJR':
            degree = "Comparative"
        elif pos == 'JJS':
            degree = "Superlative"
        table_adjectives.add_row([word, lemma, degree])

# Calculate accuracy (guarding against an empty token list)
accuracy = correct / total * 100 if total else 0.0

# Print the results
print("Words and Their Lemmatized Forms with POS Tags:")
print(table_words)
print("\nVerbs Divided by Tenses:")
print(table_verbs)
print("\nAdjectives Divided by Degrees of Comparison:")
print(table_adjectives)

# Print the original and lemmatized sentence
lemmatized_sentence = " ".join(lemmas)
print(f"\nOriginal Sentence: {sentence}")
print(f"Lemmatized Sentence: {lemmatized_sentence}")

# Print accuracy
print(f"\nLemmatization Accuracy: {accuracy:.2f}%")


Words and Their Lemmatized Forms with POS Tags:
+---------------+-----------------+----------------+
| Original Word | Lemmatized Word | Part of Speech |
+---------------+-----------------+----------------+
|      The      |       The       |       NN       |
|    happier    |      happi      |      JJR       |
|      cats     |       cats      |       NN       |
|      will     |       will      |       NN       |
|       be      |        be       |       VB       |
|    thinking   |     thinking    |       NN       |
|     about     |      about      |       NN       |
|    running    |     running     |       NN       |
|      and      |       and       |       CC       |
|    playing    |     playing     |       NN       |
|       in      |        in       |       NN       |
|      the      |       the       |       DT       |
|     better    |       bett      |      JJR       |
|    garden.    |     garden.     |       NN       |
+---------------+-----------------+----------------

In [None]:
import nltk
from nltk.corpus import brown
from nltk.corpus import wordnet
from prettytable import PrettyTable

# Download necessary resources from nltk
for _resource in ('averaged_perceptron_tagger', 'wordnet', 'punkt', 'brown'):
    nltk.download(_resource)

# Sentences of the Brown corpus
brown_sents = brown.sents()

# Function to map nltk POS tags to wordnet POS tags
def get_wordnet_pos(nltk_tag):
    """Map a POS tag onto a WordNet POS constant by its leading letter.

    'J...' gives ``wordnet.ADJ``, 'V...' ``wordnet.VERB``, 'N...'
    ``wordnet.NOUN`` and 'R...' ``wordnet.ADV``; any other tag gives ``None``.
    """
    candidates = [
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    ]
    matches = [wn_pos for prefix, wn_pos in candidates if nltk_tag.startswith(prefix)]
    return matches[0] if matches else None

# Function to lemmatize using nltk and wordnet
def lemmatize(word, nltk_tag):
    """Lemmatize ``word`` with WordNet, using ``nltk_tag`` to pick the POS.

    When ``get_wordnet_pos`` yields no WordNet POS for the tag, the word is
    returned unchanged.
    """
    wordnet_pos = get_wordnet_pos(nltk_tag)
    if not wordnet_pos:
        return word
    return nltk.WordNetLemmatizer().lemmatize(word, pos=wordnet_pos)

# Function to determine verb tense based on patterns
def determine_verb_tense(word, pos_tag):
    """Return a tense label for a verb or modal tag, else ``None``.

    Args:
        word: The token text (not used by the tag-based rules; kept so the
            signature stays the same for callers).
        pos_tag: The POS tag of the token.

    Returns:
        A human-readable tense label for 'MD' and for tags starting with
        'V' ("Other Tenses" for verb tags without a dedicated label);
        ``None`` for every other tag.
    """
    # 'MD' does not start with 'V', so it has to be checked before the verb
    # gate; nested inside it, the modal case could never be reached.
    if pos_tag == 'MD':
        # Modal verbs (e.g., "will") often indicate future tense
        return "Future Tense (Modal)"

    if not pos_tag.startswith('V'):
        return None

    tense_by_tag = {
        'VBG': "Present Continuous",
        'VBD': "Past Tense",
        'VBN': "Past Participle",
        'VB': "Simple Present",
        'VBZ': "Simple Present (3rd Person Singular)",
    }
    return tense_by_tag.get(pos_tag, "Other Tenses")

# Example sentence for lemmatization
sentence = "The happier cats will be thinking about running and playing in the better garden."
words = nltk.word_tokenize(sentence)
nltk_pos_tags = nltk.pos_tag(words)

# Creating the tables
table_words = PrettyTable()
table_words.field_names = ["Original Word", "Lemmatized Word", "Part of Speech"]

table_verbs = PrettyTable()
table_verbs.field_names = ["Original Word", "Lemmatized Word", "Tense"]

table_adjectives = PrettyTable()
table_adjectives.field_names = ["Original Word", "Lemmatized Word", "Degree"]

# Expected lemmas for accuracy check
expected_lemmas = {
    "The": "the",
    "happier": "happy",
    "cats": "cat",
    "will": "will",
    "be": "be",
    "thinking": "think",
    "about": "about",
    "running": "run",
    "and": "and",
    "playing": "play",
    "in": "in",
    "the": "the",
    "better": "good",
    "garden": "garden"
}

# Counters for accuracy calculation.  Only tokens that have a reference
# lemma are scored: the tokenizer also emits "." which has no entry in
# expected_lemmas and would otherwise always be counted as a miss.
correct = 0
total = sum(1 for word in words if word in expected_lemmas)

# Lemma of every token, collected once so the sentence is rebuilt from the
# same NLTK-tagged results that are shown in the tables.
lemmas = []

# Populate the tables
for word, nltk_tag in nltk_pos_tags:
    lemma = lemmatize(word, nltk_tag)
    lemmas.append(lemma)
    table_words.add_row([word, lemma, nltk_tag])

    # Check for accuracy
    if word in expected_lemmas and expected_lemmas[word] == lemma:
        correct += 1

    # Determine the tense
    tense = determine_verb_tense(word, nltk_tag)
    if tense:
        table_verbs.add_row([word, lemma, tense])

    # Determine if the word is an adjective and categorize it
    if nltk_tag.startswith('JJ'):
        degree = "Positive"
        if nltk_tag == 'JJR':
            degree = "Comparative"
        elif nltk_tag == 'JJS':
            degree = "Superlative"
        table_adjectives.add_row([word, lemma, degree])

# Calculate accuracy (guarding against a sentence with no scorable token)
accuracy = correct / total * 100 if total else 0.0

# Print the results
print("Words and Their Lemmatized Forms with POS Tags:")
print(table_words)
print("\nVerbs Divided by Tenses:")
print(table_verbs)
print("\nAdjectives Divided by Degrees of Comparison:")
print(table_adjectives)

# Print the original and lemmatized sentence.  The lemmas come from the
# NLTK tags computed above; this cell defines no single-argument pos_tag()
# helper to re-tag the words with.
lemmatized_sentence = " ".join(lemmas)
print(f"\nOriginal Sentence: {sentence}")
print(f"Lemmatized Sentence: {lemmatized_sentence}")

# Print accuracy
print(f"\nLemmatization Accuracy: {accuracy:.2f}%")


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.


Words and Their Lemmatized Forms with POS Tags:
+---------------+-----------------+----------------+
| Original Word | Lemmatized Word | Part of Speech |
+---------------+-----------------+----------------+
|      The      |       The       |       DT       |
|    happier    |      happy      |      JJR       |
|      cats     |       cat       |      NNS       |
|      will     |       will      |       MD       |
|       be      |        be       |       VB       |
|    thinking   |      think      |      VBG       |
|     about     |      about      |       IN       |
|    running    |       run       |      VBG       |
|      and      |       and       |       CC       |
|    playing    |       play      |      VBG       |
|       in      |        in       |       IN       |
|      the      |       the       |       DT       |
|     better    |       good      |      JJR       |
|     garden    |      garden     |       NN       |
|       .       |        .        |       .        