<a href="https://colab.research.google.com/github/Hasinireddy-Ainavole/nlp-text-processing/blob/main/nlp_text_processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag

# download necessary resources
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)
nltk.download('averaged_perceptron_tagger_eng', quiet=True)
nltk.download('omw-1.4', quiet=True)

# helper function to convert pos tags
def get_wordnet_pos(tag):
    if tag.startswith('V'):
        return 'v'
    elif tag.startswith('N'):
        return 'n'
    elif tag.startswith('R'):
        return 'r'
    elif tag.startswith('J'):
        return 'a'
    else:
        return 'n'

text = "John enjoys playing football while Mary loves reading books in the library."

# step 1: tokenize
tokens = word_tokenize(text)
print(f"Tokens: {tokens}\n")

# step 2: remove stopwords
stop_words = set(stopwords.words('english'))
filtered_tokens = [token for token in tokens if token.lower() not in stop_words]
print(f"After stopwords removal: {filtered_tokens}\n")

# step 3: lemmatization
lemmatizer = WordNetLemmatizer()
pos_tags = pos_tag(filtered_tokens)
print(f"POS tags: {pos_tags}\n")

lemmatized = [(lemmatizer.lemmatize(word, get_wordnet_pos(tag)), tag)
              for word, tag in pos_tags]
print(f"After lemmatization: {lemmatized}\n")

# step 4: keep only verbs and nouns
result = [word for word, tag in lemmatized if tag.startswith('V') or tag.startswith('N')]
print(f"Final result: {result}")

Tokens: ['John', 'enjoys', 'playing', 'football', 'while', 'Mary', 'loves', 'reading', 'books', 'in', 'the', 'library', '.']

After stopwords removal: ['John', 'enjoys', 'playing', 'football', 'Mary', 'loves', 'reading', 'books', 'library', '.']

POS tags: [('John', 'NNP'), ('enjoys', 'VBZ'), ('playing', 'VBG'), ('football', 'NN'), ('Mary', 'NNP'), ('loves', 'VBZ'), ('reading', 'VBG'), ('books', 'NNS'), ('library', 'JJ'), ('.', '.')]

After lemmatization: [('John', 'NNP'), ('enjoy', 'VBZ'), ('play', 'VBG'), ('football', 'NN'), ('Mary', 'NNP'), ('love', 'VBZ'), ('read', 'VBG'), ('book', 'NNS'), ('library', 'JJ'), ('.', '.')]

Final result: ['John', 'enjoy', 'play', 'football', 'Mary', 'love', 'read', 'book']


In [10]:
import spacy

# load spacy model
nlp = spacy.load("en_core_web_sm")

text = "Chris met Alex at Apple headquarters in California. He told him about the new iPhone launch."

# process text
doc = nlp(text)

# perform NER
print("Named Entities:")
for ent in doc.ents:
    print(f"  {ent.text} - {ent.label_}")

print()

# check for pronoun ambiguity
pronouns = ["he", "she", "they", "him", "her", "them"]
found_pronouns = [token.text.lower() for token in doc if token.text.lower() in pronouns]

if found_pronouns:
    print("Warning: Possible pronoun ambiguity detected!")
    print(f"Pronouns found: {', '.join(set(found_pronouns))}")

Named Entities:
  Chris - PERSON
  Alex - PERSON
  Apple - ORG
  California - GPE
  iPhone - ORG

Pronouns found: he, him
