In [9]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, LancasterStemmer, SnowballStemmer, WordNetLemmatizer
from nltk.corpus import wordnet

# Ensure necessary NLTK data is downloaded
nltk.download('punkt', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)

# 1. Initialize the Models
porter = PorterStemmer()
lancaster = LancasterStemmer()
snowball = SnowballStemmer("english")
lemmatizer = WordNetLemmatizer()

# 2. Define the Stress Test Sentences
sentences = [
    "The generic general currently planned the organ organization.", # Test 1: Over-Aggressive
    "The women left the leaves by the bank."                 # Test 2: Irregular Morphology
]

# 3. Helper function to map NLTK POS tags to WordNet POS tags
# (The Lemmatizer needs to know if a word is a Verb or Noun to work correctly)
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN # Default fallback

# 4. Processing Loop
for i, sentence in enumerate(sentences,1):
    print(f"\n{'='*60}")
    print(f"TEST SENTENCE {i}: \"{sentence}\"")
    print(f"{'='*60}")
    
    # Tokenize
    tokens = word_tokenize(sentence)
    
    # Get POS Tags for the Lemmatizer
    pos_tags = nltk.pos_tag(tokens)
    
    # Print Header
    print(f"{'Token':<15} | {'Porter':<15} | {'Lancaster':<15} | {'Snowball':<15} | {'Lemmatizer':<15}")
    print("-" * 85)
    
    for token, tag in pos_tags:
        # Apply Stemmers
        p_stem = porter.stem(token)
        l_stem = lancaster.stem(token)
        s_stem = snowball.stem(token)
        
        # Apply Lemmatizer (using the dynamic POS tag)
        wn_tag = get_wordnet_pos(tag)
        lemma = lemmatizer.lemmatize(token, wn_tag)
        
        print(f"{token:<15} | {p_stem:<15} | {l_stem:<15} | {s_stem:<15} | {lemma:<15}")
    


TEST SENTENCE 1: "The generic general currently planned the organ organization."
Token           | Porter          | Lancaster       | Snowball        | Lemmatizer     
-------------------------------------------------------------------------------------
The             | the             | the             | the             | The            
generic         | gener           | gen             | generic         | generic        
general         | gener           | gen             | general         | general        
currently       | current         | cur             | current         | currently      
planned         | plan            | plan            | plan            | plan           
the             | the             | the             | the             | the            
organ           | organ           | org             | organ           | organ          
organization    | organ           | org             | organ           | organization   
.               | .               | .   

In [7]:
import nltk
from nltk import CFG, ChartParser

# ==========================================
# SCENARIO 1: The Ambiguous Stemmed World
# ==========================================
# Correction: Added "NP -> Det N" to allow for "the organization"
stemmed_grammar = CFG.fromstring("""
  S -> NP VP
  VP -> V NP
  
  # The Rules for Noun Phrases
  NP -> Det ADJ N 
  NP -> Det N N 
  NP -> Det N 
  
  # The Vocabulary
  Det -> 'the'
  ADJ -> 'gen'
  N   -> 'gen' | 'organization'
  V   -> 'planned'
""")

print("--- 1. PARSING STEMMED TEXT (Lancaster) ---")
print("Input: 'the gen gen planned the organization'")
stemmed_parser = ChartParser(stemmed_grammar)
stemmed_sentence = "the gen gen planned the organization".split()

trees = list(stemmed_parser.parse(stemmed_sentence))
print(f"Found {len(trees)} valid parse trees (AMBIGUITY DETECTED!):\n")

for i, tree in enumerate(trees, 1):
    print(f"Tree #{i} Interpretation:")
    tree.pretty_print()


# ==========================================
# SCENARIO 2: The Clear Lemmatized World
# ==========================================
lemma_grammar = CFG.fromstring("""
  S -> NP VP
  VP -> V NP
  
  # The Rules for Noun Phrases
  NP -> Det ADJ N 
  NP -> Det N N 
  NP -> Det N
  
  # The Vocabulary
  Det -> 'the'
  ADJ -> 'generic'
  N   -> 'general' | 'organization'
  V   -> 'planned'
""")

print("\n--- 2. PARSING LEMMATIZED TEXT ---")
print("Input: 'the generic general planned the organization'")
lemma_parser = ChartParser(lemma_grammar)
lemma_sentence = "the generic general planned the organization".split()

trees = list(lemma_parser.parse(lemma_sentence))
print(f"Found {len(trees)} valid parse tree (CLEAR MEANING):\n")

for tree in trees:
    tree.pretty_print()

--- 1. PARSING STEMMED TEXT (Lancaster) ---
Input: 'the gen gen planned the organization'
Found 2 valid parse trees (AMBIGUITY DETECTED!):

Tree #1 Interpretation:
         S                              
      ___|___________                    
     |               VP                 
     |          _____|___                
     NP        |         NP             
  ___|___      |      ___|_______        
Det ADJ  N     V    Det          N      
 |   |   |     |     |           |       
the gen gen planned the     organization

Tree #2 Interpretation:
         S                              
      ___|___________                    
     |               VP                 
     |          _____|___                
     NP        |         NP             
  ___|___      |      ___|_______        
Det  N   N     V    Det          N      
 |   |   |     |     |           |       
the gen gen planned the     organization


--- 2. PARSING LEMMATIZED TEXT ---
Input: 'the generic general 