In [1]:
import shutil
import os
import nltk

# Remove the entire NLTK data folder in the user's home directory.
# Be careful: shutil.rmtree deletes everything under that folder, not only
# the 'punkt' and 'stopwords' packages that are re-downloaded below.
nltk_data_path = os.path.expanduser('~/nltk_data')
if os.path.exists(nltk_data_path):
    shutil.rmtree(nltk_data_path)

# Re-download cleanly. The last call is left as a bare expression, so the
# notebook shows its return value as the cell result.
nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [2]:
# If not already installed: pip install spacy
# And download the English model: python -m spacy download en_core_web_sm

import spacy
from nltk.stem import PorterStemmer

# Load spaCy English pipeline once at module level; nlp_preprocessing_spacy
# reads both `nlp` and `stemmer` as globals instead of taking them as arguments.
nlp = spacy.load("en_core_web_sm")
# NLTK Porter stemmer instance, used for the stemming step.
stemmer = PorterStemmer()

def nlp_preprocessing_spacy(sentence):
    """Print the three preprocessing stages for ``sentence``.

    The sentence is run through the module-level spaCy pipeline ``nlp`` and
    the results of each stage are printed, in this order:

    1. every token produced by spaCy,
    2. the tokens that are not stop words and are purely alphabetic
       (``is_alpha`` filters out punctuation as well as numeric tokens),
    3. the stems of the tokens from stage 2, computed with the module-level
       ``stemmer``.

    Args:
        sentence: Text to process; passed directly to ``nlp``.

    Returns:
        None. All results are written to stdout.
    """
    parsed = nlp(sentence)

    # Stage 1: tokenization.
    print("Original Tokens:", [tok.text for tok in parsed])

    # Stage 2: keep only alphabetic tokens that are not stop words.
    content_words = []
    for tok in parsed:
        if tok.is_stop or not tok.is_alpha:
            continue
        content_words.append(tok.text)
    print("Tokens Without Stopwords:", content_words)

    # Stage 3: stemming of the surviving tokens.
    print("Stemmed Words:", list(map(stemmer.stem, content_words)))

# Example: run the pipeline on a single sentence. The function prints each
# stage and returns nothing, so the cell shows only the printed lines.
sentence = "NLP techniques are used in virtual assistants like Alexa and Siri."
nlp_preprocessing_spacy(sentence)


Original Tokens: ['NLP', 'techniques', 'are', 'used', 'in', 'virtual', 'assistants', 'like', 'Alexa', 'and', 'Siri', '.']
Tokens Without Stopwords: ['NLP', 'techniques', 'virtual', 'assistants', 'like', 'Alexa', 'Siri']
Stemmed Words: ['nlp', 'techniqu', 'virtual', 'assist', 'like', 'alexa', 'siri']


1. What is the difference between stemming and lemmatization?
Stemming strips word endings using heuristic rules to produce a stem, which may not be a real word.
Lemmatization returns the dictionary base form (lemma) of a word, which is always a valid word.

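Example with “techniques” (see the stemmed output above):

Stemming (Porter): "techniques" → "techniqu" (not a real word)

Lemmatization: "techniques" → "technique"

For “running”, the Porter stemmer and a lemmatizer (with the word tagged as a verb) both return "run"; only a cruder suffix-stripping stemmer would produce something like "runn".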

2. Why might removing stop words be useful, and when might it be harmful?
Useful: It removes common words (like “the”, “is”) that don’t carry much meaning, helping in tasks like text classification or topic modeling.
Harmful: In tasks like machine translation or question answering, stop words are important for grammar and meaning, so removing them can hurt performance.