In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Download the required NLTK resources (only needed once)
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)

def nlp_preprocessing_pipeline(sentence):
    """Tokenize a sentence, remove English stopwords, and stem the rest.

    The output of each stage is printed so the effect of every step can be
    inspected. The final stemmed tokens are also returned so that callers
    can use the result instead of only reading it from stdout.

    Args:
        sentence: The text to preprocess; it is passed to ``word_tokenize``.

    Returns:
        The list of stemmed tokens, in the order they appear in the input.
    """
    # Step 1: Tokenization
    tokens = word_tokenize(sentence)
    print("Original Tokens:", tokens)

    # Step 2: Remove Stopwords
    # The stopword list is turned into a set for fast membership tests.
    # Tokens are lowercased only for the comparison, so the surviving
    # tokens keep their original casing. Only stopwords are filtered here;
    # tokens that are not in the list (e.g. punctuation) pass through.
    stop_words = set(stopwords.words('english'))
    tokens_without_stopwords = [
        word for word in tokens if word.lower() not in stop_words
    ]
    print("Tokens Without Stopwords:", tokens_without_stopwords)

    # Step 3: Apply Stemming
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(word) for word in tokens_without_stopwords]
    print("Stemmed Words:", stemmed_tokens)

    # Hand the result back to the caller (previously the function
    # implicitly returned None).
    return stemmed_tokens

# Demo: run the pipeline on a sample sentence
sentence = (
    "NLP techniques are used in virtual assistants like Alexa and Siri."
)
nlp_preprocessing_pipeline(sentence)


[nltk_data] Downloading package punkt to C:\Users\Manikanta
[nltk_data]     Rajulapati\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Manikanta
[nltk_data]     Rajulapati\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Original Tokens: ['NLP', 'techniques', 'are', 'used', 'in', 'virtual', 'assistants', 'like', 'Alexa', 'and', 'Siri', '.']
Tokens Without Stopwords: ['NLP', 'techniques', 'used', 'virtual', 'assistants', 'like', 'Alexa', 'Siri', '.']
Stemmed Words: ['nlp', 'techniqu', 'use', 'virtual', 'assist', 'like', 'alexa', 'siri', '.']


# 1. What is the difference between stemming and lemmatization? Provide examples with the word "running".

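Stemming is a rule-based heuristic that strips suffixes from words to reduce them to a root form, often producing strings that are not real words (for example, in the output above "techniques" is stemmed to "techniqu").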

Lemmatization is more advanced; it uses vocabulary and morphological analysis to return the base or dictionary form of a word (called a lemma).

Example with the word “running”:

Stemming using PorterStemmer might return: run

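Lemmatization (using WordNetLemmatizer with the POS tag set to verb) would return: run. Note that the POS tag matters: without it, WordNetLemmatizer treats the word as a noun and returns "running" unchanged.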

Although both approaches return the same result in this case, they differ for other words, such as “better”:

Stemmer → better

Lemmatizer → good (if POS is adjective)

So, lemmatization provides more accurate results, but is slower and more complex than stemming.

# 2. Why might removing stop words be useful in some NLP tasks, and when might it actually be harmful?

Useful:
Removing stopwords helps reduce noise in the data by eliminating common, low-information words like "the", "in", "is". This is especially helpful in tasks like:

Topic modeling

Document classification

Information retrieval

Harmful:
However, removing stopwords can be harmful when:

The meaning of the sentence depends on those words.

For example, in sentiment analysis, removing negation words like "not" (which is in NLTK's English stopword list) can invert the meaning:

Original: "This movie is not good."

After stopword removal: "movie good" → wrongly interpreted as positive.

So, whether stopwords should be removed depends on the task: it helps when only content words matter, and hurts when function words carry meaning.