DEPENDENCIES

In [None]:
import nltk
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.stem import PorterStemmer, LancasterStemmer, SnowballStemmer, WordNetLemmatizer
from nltk.corpus import wordnet


NLTK RESOURCE DOWNLOADS (MODELS AND CORPORA)

In [None]:
# Fetch every NLTK data package this notebook relies on; quiet=True keeps the
# downloader from printing progress output into the cell.
for resource in ('punkt_tab', 'wordnet', 'omw-1.4', 'averaged_perceptron_tagger_eng'):
    nltk.download(resource, quiet=True)

STEMMERS AND LEMMATIZER

In [None]:
# The three stemmers compared throughout the notebook.
porter = PorterStemmer()
lancaster = LancasterStemmer()
snowball = SnowballStemmer(language="english")

# Dictionary-based alternative to stemming; it is called with a WordNet POS
# tag further down (see get_wordnet_pos).
lemmatizer = WordNetLemmatizer()

TEXT

In [None]:

# Two-sentence sample used by every later cell. Adjacent literals are
# concatenated by Python, so the value is a single string.
text = (
    "The generic general currently planned the organ organization. "
    "The women left the leaves by the bank."
)

SENTENCE TOKENIZATION

In [None]:
# Split the sample into sentences, then each sentence into word tokens.
# nested_tokens[k] is the token list of sentences[k].
sentences = sent_tokenize(text)
nested_tokens = list(map(word_tokenize, sentences))

print(f"Sentences : {sentences}")
print(f"Tokens : {nested_tokens}")

POS-TAG MAPPING AND STEMMER / LEMMATIZER COMPARISON

In [None]:
def get_wordnet_pos(treebank_tag):
    """Map a POS tag string to the matching WordNet part-of-speech constant.

    The tag is classified by its leading letter: 'J' -> wordnet.ADJ,
    'V' -> wordnet.VERB, 'N' -> wordnet.NOUN, 'R' -> wordnet.ADV.

    Args:
        treebank_tag: POS tag string, e.g. the second element of a pair
            returned by nltk.pos_tag.

    Returns:
        One of wordnet.ADJ / VERB / NOUN / ADV; wordnet.NOUN when the tag
        starts with none of the four letters above.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return pos
    # Anything else (determiners, punctuation, ...) falls back to noun.
    return wordnet.NOUN

# Layout shared by the header and every data row of the comparison table.
column_names = ('Token', 'Porter', 'Lancaster', 'Snowball', 'Lemmatizer')
column_width = 15


def _format_row(cells, width=column_width):
    """Left-align each cell to `width` characters and join the cells with ' | '."""
    return " | ".join(f"{cell:<{width}}" for cell in cells)


header = _format_row(column_names)

for i, sentence in enumerate(sentences, 1):
    print(f"\n{'='*60}")
    print(f"TEST SENTENCE {i}: \"{sentence}\"")
    print(f"{'='*60}")

    # Tokenize, then POS-tag so the lemmatizer can be given a part of speech.
    tokens = word_tokenize(sentence)
    pos_tags = nltk.pos_tag(tokens)

    # The rule is derived from the header so the two always have equal width
    # (a fixed 85-character rule is two characters shorter than the header).
    print(header)
    print("-" * len(header))

    for token, tag in pos_tags:
        # The lemmatizer receives the WordNet POS derived from this token's tag;
        # the stemmers work on the bare token.
        lemma = lemmatizer.lemmatize(token, get_wordnet_pos(tag))
        print(_format_row((
            token,
            porter.stem(token),
            lancaster.stem(token),
            snowball.stem(token),
            lemma,
        )))

STEMMED SENTENCES

In [None]:
def _stem_sentences(stemmer, token_lists):
    """Return token_lists with every token replaced by stemmer.stem(token)."""
    return [[stemmer.stem(tok) for tok in toks] for toks in token_lists]


# One stemmed copy of nested_tokens per stemmer; these feed the grammar cells.
p_sentences = _stem_sentences(porter, nested_tokens)
l_sentences = _stem_sentences(lancaster, nested_tokens)
s_sentences = _stem_sentences(snowball, nested_tokens)

for stemmed in (p_sentences, l_sentences, s_sentences):
    print(stemmed)

TAGGING

In [None]:
# POS-tag each sentence's token list; pos_tags_nested[k] holds the
# (token, tag) pairs for nested_tokens[k].
pos_tags_nested = list(map(nltk.pos_tag, nested_tokens))
print(pos_tags_nested)

LEMMATIZATION

In [None]:
# Lemmatize every token using the WordNet POS derived from its tag.
# The zip mirrors the pairing of token lists with their tagged counterparts;
# only the tagged pairs are needed to build the lemmas.
lemmatized_sentences = [
    [
        lemmatizer.lemmatize(word, get_wordnet_pos(treebank_tag))
        for word, treebank_tag in tagged
    ]
    for _tokens, tagged in zip(nested_tokens, pos_tags_nested)
]

print(lemmatized_sentences)

PORTER GRAMMAR

In [None]:
from nltk import CFG,ChartParser

# Hand-written context-free grammar whose terminals are stems rather than
# full words; it is parsed against p_sentences in the next cell.
# 'gener' is listed under both N and ADJ, so a "the gener gener" phrase can be
# read as Det ADJ N or as Det N N, i.e. the grammar is deliberately ambiguous.
# NOTE(review): the terminals are presumably the Porter stems of the words in
# `text`; verify against the printed p_sentences if the NLTK version changes.
porter_grammar = CFG.fromstring("""
    S -> NP VP Punct
    NP -> Det N | Det ADJ N | Det N N
    VP -> Adv V NP | V PP | V NP PP
    PP -> P NP
    Det -> 'The' | 'the'  
    N -> 'gener' | 'organ' | 'women' | 'leav' | 'bank'
    ADJ -> 'gener'
    V -> 'plan' | 'left'
    Adv -> 'current'
    P -> 'by'
    Punct -> '.'

    """)

# Chart parser that enumerates every parse tree the grammar licenses.
porter_parser = ChartParser(porter_grammar)


PORTER PARSE TREE CONSTRUCTION


In [None]:
# Materialise the parse generators so the trees can be counted and printed.
porter_tree1 = list(porter_parser.parse(p_sentences[0]))
porter_tree2 = list(porter_parser.parse(p_sentences[1]))

# Report both sentences with one loop: count first, then each tree drawn as ASCII art.
for sentence_no, trees in enumerate((porter_tree1, porter_tree2), start=1):
    print(f"Found {len(trees)} valid parse trees for sentence {sentence_no}\n")
    for tree_no, parse in enumerate(trees, start=1):
        print(f"Tree #{tree_no} Interpretation:")
        parse.pretty_print()

LANCASTER GRAMMAR

In [None]:
# Grammar for the Lancaster-stemmed sentences (l_sentences), parsed in the next cell.
# 'gen' appears under both N and ADJ and 'org' under N, so the first noun
# phrase is ambiguous between Det ADJ N and Det N N.
# Det lists only lower-case 'the'.
# NOTE(review): this presumably relies on the stemmer lower-casing its input;
# confirm against the printed l_sentences.
lancaster_grammar = CFG.fromstring("""
    # --- STRUCTURE ---
    S -> NP VP PUNCT
    
    # NP Rules: 
    # Det N N covers "gen gen" (noun-noun) AND "org org"
    # Det ADJ N covers "gen gen" (adj-noun) 
    NP -> Det N | Det ADJ N | Det N N
    
    # VP Rules: Matches both sentence structures
    VP -> Adv V NP | V NP PP
    PP -> P NP
    
    # --- LANCASTER VOCABULARY ---
    Det -> 'the'
    N   -> 'gen' | 'org' | 'wom' | 'leav' | 'bank'
    ADJ -> 'gen'
    V   -> 'plan' | 'left'
    Adv -> 'cur'
    P   -> 'by'
    PUNCT -> '.'
""")

# Chart parser over the Lancaster grammar.
lancaster_parser = ChartParser(lancaster_grammar)


LANCASTER PARSE TREE CONSTRUCTION

In [None]:
# Collect every parse of the two Lancaster-stemmed sentences.
lancaster_tree1 = list(lancaster_parser.parse(l_sentences[0]))
lancaster_tree2 = list(lancaster_parser.parse(l_sentences[1]))

# Same report for both sentences: tree count, then each tree pretty-printed.
for sentence_no, trees in enumerate((lancaster_tree1, lancaster_tree2), start=1):
    print(f"Found {len(trees)} valid parse trees for sentence {sentence_no}\n")
    for tree_no, parse in enumerate(trees, start=1):
        print(f"Tree #{tree_no} Interpretation:")
        parse.pretty_print()

SNOWBALL GRAMMAR

In [None]:
# Grammar for the Snowball-stemmed sentences (s_sentences), parsed in the next cell.
# Unlike the Porter and Lancaster grammars, the adjective ('generic') and the
# noun ('general') are distinct terminals here, so the Det ADJ N / Det N N
# ambiguity of the first noun phrase does not arise from the vocabulary.
# NOTE(review): terminals are presumably the Snowball stems of the words in
# `text`; verify against the printed s_sentences.
snowball_grammar = CFG.fromstring("""
    # --- STRUCTURE ---
    S -> NP VP PUNCT
    
    NP -> Det N | Det ADJ N | Det N N
    VP -> Adv V NP | V NP PP
    PP -> P NP
    
    # --- SNOWBALL VOCABULARY ---
    Det -> 'the'
    
    # Notice: 'generic' is NOT here, it is in ADJ
    N   -> 'general' | 'organ' | 'women' | 'leav' | 'bank'
    
    # Distinct Adjective!
    ADJ -> 'generic'
    
    V   -> 'plan' | 'left'
    Adv -> 'current'
    P   -> 'by'
    PUNCT -> '.'
""")

# Chart parser over the Snowball grammar.
snowball_parser = ChartParser(snowball_grammar)

SNOWBALL PARSE TREE CONSTRUCTION

In [None]:
# Collect every parse of the two Snowball-stemmed sentences.
snowball_tree1 = list(snowball_parser.parse(s_sentences[0]))
snowball_tree2 = list(snowball_parser.parse(s_sentences[1]))

# Same report for both sentences: tree count, then each tree pretty-printed.
for sentence_no, trees in enumerate((snowball_tree1, snowball_tree2), start=1):
    print(f"Found {len(trees)} valid parse trees for sentence {sentence_no}\n")
    for tree_no, parse in enumerate(trees, start=1):
        print(f"Tree #{tree_no} Interpretation:")
        parse.pretty_print()

LEMMA GRAMMAR

In [None]:
# Grammar for the lemmatized sentences (lemmatized_sentences), parsed in the next cell.
# Terminals are dictionary forms ('woman', 'leaf', 'leave', 'plan') and Det
# accepts both 'The' and 'the', so sentence-initial capitalisation is covered.
# NOTE(review): which lemma each token receives depends on the POS tag fed to
# the lemmatizer; verify against the printed lemmatized_sentences.
lemma_grammar = CFG.fromstring("""
    # --- STRUCTURE ---
    S -> NP VP PUNCT
    
    NP -> Det N | Det ADJ N | Det N N
    VP -> Adv V NP | V NP PP
    PP -> P NP
    
    # --- LEMMATIZER VOCABULARY ---
    # Case Sensitive!
    Det -> 'The' | 'the'
    
    # Full words restored
    N   -> 'general' | 'organ' | 'organization' | 'woman' | 'leaf' | 'bank'
    ADJ -> 'generic'
    
    # Verbs in root form
    V   -> 'plan' | 'leave'
    Adv -> 'currently'
    P   -> 'by'
    PUNCT -> '.'
""")

# Chart parser over the lemma grammar.
lemma_parser = ChartParser(lemma_grammar)

LEMMA PARSE TREE CONSTRUCTION

In [None]:
# Collect every parse of the two lemmatized sentences.
lemma_tree1 = list(lemma_parser.parse(lemmatized_sentences[0]))
lemma_tree2 = list(lemma_parser.parse(lemmatized_sentences[1]))

# Same report for both sentences: tree count, then each tree pretty-printed.
for sentence_no, trees in enumerate((lemma_tree1, lemma_tree2), start=1):
    print(f"Found {len(trees)} valid parse trees for sentence {sentence_no}\n")
    for tree_no, parse in enumerate(trees, start=1):
        print(f"Tree #{tree_no} Interpretation:")
        parse.pretty_print()

NAMED ENTITY RECOGNITION

In [None]:
import nltk
from nltk import word_tokenize, pos_tag, ne_chunk
nltk.download('maxent_ne_chunker_tab', quiet=True)
nltk.download('words', quiet=True)


# Capitalise every token of the second sentence before tagging, then run the
# NLTK named-entity chunker over the tagged tokens.
capitalized_tokens = [tok.capitalize() for tok in nested_tokens[1]]
tags = nltk.pos_tag(capitalized_tokens)
ner_tags = nltk.ne_chunk(tags)

print("Named Entity Recognition Output:\n")

# Only the chunker's direct children that are subtrees are reported; plain
# (word, tag) leaves at the top level are skipped.
for chunk in ner_tags:
    if not isinstance(chunk, nltk.Tree):
        continue
    entity_name = " ".join(word for word, _tag in chunk.leaves())
    entity_label = chunk.label()
    print(f"Entity: {entity_name}, Label: {entity_label}")


In [None]:
!pip install spacy
!python -m spacy download en_core_web_sm

In [None]:
import spacy
# Load the spaCy pipeline downloaded in the previous cell.
nlp = spacy.load("en_core_web_sm")

# NOTE(review): str.capitalize() upper-cases the first character and
# lower-cases every other one, so the string analysed here is not the same as
# the `text` printed on the next line -- confirm this is intended.
doc = nlp(text.capitalize())
print(text)
for entity in doc.ents:
    print(f"Entity: {entity.text}, Label: {entity.label_}")


# The apostrophe is written as the escape \u2019 (right single quotation mark)
# so it cannot be corrupted into mis-decoded bytes by an encoding round trip.
t = "Macavity is a Mystery Person in Madurai: he\u2019s called the CEO of Hidden Paw!"
doc1 = nlp(t)
for entity in doc1.ents:
    print(f"Entity: {entity.text}, Label: {entity.label_}")

In [None]:
from nltk.corpus import wordnet as wn

import nltk

# Make sure the data packages needed by the lookups below are available.
for resource in ('wordnet', 'omw-1.4'):
    nltk.download(resource, quiet=True)

def get_semantic_info(word):
    """Summarise the first WordNet synset found for `word`.

    Args:
        word: The word to look up with wn.synsets.

    Returns:
        None when no synset is found. Otherwise a dict with the keys
        "Definition" (the synset's definition), "Hypernyms" and "Hyponyms"
        (lists of synset names) and "Root" (name of the first entry returned
        by root_hypernyms()).
    """
    matches = wn.synsets(word)
    if not matches:
        return None

    # Only the first synset returned is described; other senses are ignored.
    first = matches[0]
    definition = first.definition()
    hypernym_names = [parent.name() for parent in first.hypernyms()]
    hyponym_names = [child.name() for child in first.hyponyms()]
    root_name = first.root_hypernyms()[0].name()

    return {
        "Definition": definition,
        "Hypernyms" : hypernym_names,
        "Hyponyms" : hyponym_names,
        "Root": root_name,
    }

# Look up 'cat' and print the resulting summary dict (or None when no synset is found).
# NOTE(review): the variable is called `bliss_semantics` although the word
# queried is 'cat' -- confirm whether the name or the argument is stale.
bliss_semantics = get_semantic_info('cat')

print(bliss_semantics)



In [None]:
from nltk.corpus import stopwords

nltk.download('stopwords',quiet=True)

# A set gives constant-time membership tests for the filter below.
english_stops = {*stopwords.words('english')}

# Whitespace-split tokens; `words` is reused by the antonym-replacement cell.
t = "Macavity is not guilty in Madurai ; he is just a misunderstood feline"
words = t.split()

# Displayed as the cell's result: tokens that are not in the stop list
# (the comparison is an exact, case-sensitive string match).
[candidate for candidate in words if candidate not in english_stops]

In [None]:
class word_syn_replacer:
    """Replace words using a caller-supplied word -> replacement mapping."""

    def __init__(self, word_map):
        """Store `word_map`, the mapping consulted by replace()."""
        self.word_map = word_map

    def replace(self, word):
        """Return the mapped replacement for `word`, or `word` itself when unmapped."""
        if word in self.word_map:
            return self.word_map[word]
        return word

# Example: expand the abbreviation 'bday'; the replaced word is shown as the
# cell's result.
rep_syn = word_syn_replacer({'bday': 'birthday'})

rep_syn.replace('bday')

In [None]:
from nltk.corpus import wordnet

class word_antonym_replacer:
    """Rewrite "not <word>" as the word's antonym when that antonym is unambiguous."""

    def replace(self, word, pos=None):
        """Return the single antonym of `word`, or None.

        Antonym names are gathered from every lemma of every synset returned
        by wordnet.synsets(word, pos=pos). A result is returned only when
        exactly one distinct name was collected; zero or several yield None.
        """
        candidates = {
            opposite.name()
            for synset in wordnet.synsets(word, pos=pos)
            for lem in synset.lemmas()
            for opposite in lem.antonyms()
        }

        # Decide only after all candidates are known, so an early first hit
        # cannot hide an ambiguity.
        if len(candidates) != 1:
            return None
        return next(iter(candidates))

    def replace_negations(self, sent):
        """Return a new word list with each "not X" pair replaced by X's antonym.

        Args:
            sent: Sequence of word strings.

        Returns:
            A list in which 'not' followed by a word with an unambiguous
            antonym is collapsed into that antonym; every other word
            (including a 'not' that cannot be resolved) is copied unchanged.
        """
        rewritten = []
        position = 0
        total = len(sent)

        while position < total:
            current = sent[position]
            has_follower = position + 1 < total

            if current == 'not' and has_follower:
                opposite = self.replace(sent[position + 1])
                if opposite:
                    # Consume both 'not' and the word it negated.
                    rewritten.append(opposite)
                    position += 2
                    continue

            # Reached for ordinary words and for a 'not' with no usable antonym.
            rewritten.append(current)
            position += 1

        return rewritten

# Usage
# Usage
# `words` is the whitespace-split token list built in the stop-word cell, so
# that cell must have been run first.
rep_antonym = word_antonym_replacer()
result = rep_antonym.replace_negations(words)
print(result)