<a href="https://colab.research.google.com/github/Namanm23/NLP/blob/main/Stemming_%26_Lemmatization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer, LancasterStemmer, RegexpStemmer, SnowballStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Download necessary NLTK resources
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('omw-1.4')

# Input text
text = "The quick brown foxes are jumping over the lazy dogs. They also love running in the fields."

# Tokenize text
tokens = word_tokenize(text)

# --- Stemming ---
# Initialize stemmers
porter = PorterStemmer()
lancaster = LancasterStemmer()
regexp = RegexpStemmer('ing$|s$|ed$', min=4)  # Rule-based stemmer
snowball = SnowballStemmer('english')

# Apply stemmers
porter_stemmed = [porter.stem(word) for word in tokens]
lancaster_stemmed = [lancaster.stem(word) for word in tokens]
regexp_stemmed = [regexp.stem(word) for word in tokens]
snowball_stemmed = [snowball.stem(word) for word in tokens]

# Print results of stemming
print("Porter Stemmer:", porter_stemmed)
print("Lancaster Stemmer:", lancaster_stemmed)
print("Regexp Stemmer:", regexp_stemmed)
print("Snowball Stemmer:", snowball_stemmed)

# --- Lemmatization ---
# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()

# Apply lemmatization
lemmatized_words = [lemmatizer.lemmatize(word) for word in tokens]

# Print results of lemmatization
print("\nLemmatized Words:", lemmatized_words)

# --- Optional: Sample words for further stemming comparison ---
words = ['running', 'cats', 'fishing', 'trouble', 'connection', 'organization', 'truthful', 'probabilities']

# Apply stemmers to sample words
porter_stemmed_words = [porter.stem(word) for word in words]
lancaster_stemmed_words = [lancaster.stem(word) for word in words]
regexp_stemmed_words = [regexp.stem(word) for word in words]
snowball_stemmed_words = [snowball.stem(word) for word in words]

# Print stemmed results for sample words
print("\nPorter Stemmer (Sample Words):", porter_stemmed_words)
print("Lancaster Stemmer (Sample Words):", lancaster_stemmed_words)
print("Regexp Stemmer (Sample Words):", regexp_stemmed_words)
print("Snowball Stemmer (Sample Words):", snowball_stemmed_words)


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Porter Stemmer: ['the', 'quick', 'brown', 'fox', 'are', 'jump', 'over', 'the', 'lazi', 'dog', '.', 'they', 'also', 'love', 'run', 'in', 'the', 'field', '.']
Lancaster Stemmer: ['the', 'quick', 'brown', 'fox', 'ar', 'jump', 'ov', 'the', 'lazy', 'dog', '.', 'they', 'also', 'lov', 'run', 'in', 'the', 'field', '.']
Regexp Stemmer: ['The', 'quick', 'brown', 'foxe', 'are', 'jump', 'over', 'the', 'lazy', 'dog', '.', 'They', 'also', 'love', 'runn', 'in', 'the', 'field', '.']
Snowball Stemmer: ['the', 'quick', 'brown', 'fox', 'are', 'jump', 'over', 'the', 'lazi', 'dog', '.', 'they', 'also', 'love', 'run', 'in', 'the', 'field', '.']

Lemmatized Words: ['The', 'quick', 'brown', 'fox', 'are', 'jumping', 'over', 'the', 'lazy', 'dog', '.', 'They', 'also', 'love', 'running', 'in', 'the', 'field', '.']

Porter Stemmer (Sample Words): ['run', 'cat', 'fish', 'troubl', 'connect', 'organ', 'truth', 'probabl']
Lancaster Stemmer (Sample Words): ['run', 'cat', 'fish', 'troubl', 'connect', 'org', 'truth', 'pr