In [None]:
# -------------------------------
# Part C - Question 1 
# Complete Text Preprocessing Flow
# -------------------------------

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag

# Download required resources (first-time use).
# Both the legacy and the newer resource names are requested: recent NLTK
# releases look up the tokenizer/tagger models as 'punkt_tab' and
# 'averaged_perceptron_tagger_eng'. nltk.download() reports an unknown or
# unreachable package instead of raising, so the extra names are harmless on
# older installations. quiet=True keeps the download log out of the output.
for resource in (
    "punkt",
    "punkt_tab",
    "stopwords",
    "wordnet",
    "averaged_perceptron_tagger",
    "averaged_perceptron_tagger_eng",
):
    nltk.download(resource, quiet=True)


def _wordnet_pos(penn_tag):
    """Translate a Penn Treebank tag into the POS letter WordNetLemmatizer expects.

    Adjective (J*), verb (V*) and adverb (R*) tags map to 'a', 'v' and 'r';
    every other tag (nouns, punctuation, ...) falls back to 'n', which is the
    lemmatizer's own default.
    """
    return {"J": "a", "V": "v", "R": "r"}.get(penn_tag[:1], "n")


# Sample text
sentence = "Machine learning models become more accurate when the data is cleaned and properly preprocessed."

# Step 1: Convert sentence into tokens
tokens_list = word_tokenize(sentence)

# Tag the complete token sequence *before* anything is removed or rewritten:
# the tagger uses neighbouring words as context, so tagging a stopword-free,
# already-lemmatized list gives it a sentence it was never meant to see.
tagged_tokens = pos_tag(tokens_list)

# Step 2: Remove stopwords (each surviving token keeps its tag)
stopword_set = set(stopwords.words("english"))
tagged_no_stop = [
    (tok, tag) for tok, tag in tagged_tokens if tok.lower() not in stopword_set
]
tokens_no_stop = [tok for tok, _ in tagged_no_stop]

# Step 3: Lemmatize each token with its own part of speech.
# Without the POS argument every token is treated as a noun, so verb forms
# such as "cleaned" are returned unchanged.
lemmatizer = WordNetLemmatizer()
lemmatized = [
    lemmatizer.lemmatize(tok, _wordnet_pos(tag)) for tok, tag in tagged_no_stop
]

# Step 4: POS filtering → keep only nouns (tags starting with "NN")
pos_info = [(lemma, tag) for lemma, (_, tag) in zip(lemmatized, tagged_no_stop)]
nouns_only = [word for word, tag in pos_info if tag.startswith("NN")]

print("Original text:", sentence)
print("After tokenization:", tokens_list)
print("Without stopwords:", tokens_no_stop)
print("Lemmatized tokens:", lemmatized)
print("Filtered nouns:", nouns_only)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Filtered tokens (Word – Lemma – POS):
enjoys     → enjoy      (VERB)
playing    → play       (VERB)
football   → football   (NOUN)
reading    → read       (VERB)
books      → book       (NOUN)
library    → library    (NOUN)

Lemmatized result: ['enjoy', 'play', 'football', 'read', 'book', 'library']


In [2]:
# --------------------------------------
# Part C - Question 2 
# Pronoun Ambiguity Detection using NLTK
# --------------------------------------

import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag, ne_chunk

# Make sure required NLTK data is available.
# Both the legacy and the newer resource names are requested: recent NLTK
# releases look up these models as 'punkt_tab',
# 'averaged_perceptron_tagger_eng' and 'maxent_ne_chunker_tab'.
# nltk.download() reports an unknown or unreachable package instead of
# raising, so the extra names are harmless on older installations.
for resource in (
    "punkt",
    "punkt_tab",
    "averaged_perceptron_tagger",
    "averaged_perceptron_tagger_eng",
    "maxent_ne_chunker",
    "maxent_ne_chunker_tab",
    "words",
):
    nltk.download(resource, quiet=True)

# Input text
text_input = "John met Alex after work, and he gave him the project files."

# Step 1: Tokenize
tokens = word_tokenize(text_input)

# Step 2: POS tagging
tagged = pos_tag(tokens)

# Step 3: Named Entity Recognition (Tree format)
ner_tree = ne_chunk(tagged)

# Extract all PERSON names from NER.
# Only chunk subtrees carry a label(); plain (word, tag) leaves are skipped.
person_mentions = [
    " ".join(leaf[0] for leaf in subtree.leaves())
    for subtree in ner_tree
    if hasattr(subtree, "label") and subtree.label() == "PERSON"
]

# Collapse repeated mentions (first-seen order preserved) so that one person
# named twice is not mistaken for two different candidate referents.
person_names = list(dict.fromkeys(person_mentions))

# Third-person pronouns indicating potential ambiguity
# (subject, object and possessive forms).
pronoun_set = {
    "he", "him", "his",
    "she", "her", "hers",
    "they", "them", "their", "theirs",
}

# Detect pronouns in sentence
found_prons = [tok.lower() for tok in tokens if tok.lower() in pronoun_set]

print("Sentence:", text_input)
print("Detected Persons:", person_names)
print("Detected Pronouns:", found_prons)

# Ambiguity condition (heuristic):
# more than one distinct person AND at least one pronoun present
if len(person_names) > 1 and found_prons:
    print("\nConclusion: The pronoun reference is AMBIGUOUS.")
else:
    print("\nConclusion: No pronoun ambiguity detected.")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dmkr1\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\dmkr1\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\dmkr1\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\dmkr1\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Sentence: John met Alex after work, and he gave him the project files.
Detected Persons: ['John', 'Alex']
Detected Pronouns: ['he', 'him']

Conclusion: The pronoun reference is AMBIGUOUS.
