In [3]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import re

# Download required resources.
# 'punkt' holds the tokenizer models used by older NLTK releases, while
# recent releases (3.9 and later) make word_tokenize load 'punkt_tab'
# instead. Requesting both keeps the tokenization step working on either.
# nltk.download() reports an unknown package by printing a message and
# returning False, so the extra request is harmless on older installs.
nltk.download('punkt')
nltk.download('punkt_tab')
# Stop-word lists read later through stopwords.words('english').
nltk.download('stopwords')

# Sample sentence that is pushed through every preprocessing stage below.
text = "NLP is a fascinating field of study! It involves tasks like tokenization, filtration, and stemming."

### 1. Tokenization
# Break the sentence into individual word and punctuation tokens.
tokens = word_tokenize(text)
print("Tokenized Words:", tokens)

### 2. Filtration (remove punctuation and non-alphanumeric)
# str.isalnum is False for punctuation-only tokens, so those are dropped.
filtered_tokens = list(filter(str.isalnum, tokens))
print("Filtered Tokens:", filtered_tokens)

### 3. Script Validation (only English alphabets)
# Keep a token only when it is made up entirely of the letters A-Z / a-z.
_english_letters = re.compile(r'^[A-Za-z]+$')
validated_tokens = list(filter(_english_letters.match, filtered_tokens))
print("Script Validated Tokens:", validated_tokens)

### 4. Stop Word Removal
# The stop-word list is turned into a set for fast membership tests; tokens
# are lower-cased for the comparison only, so their original casing is kept.
stop_words = set(stopwords.words('english'))
tokens_without_stopwords = list(
    filter(lambda token: token.lower() not in stop_words, validated_tokens)
)
print("Tokens without Stop Words:", tokens_without_stopwords)

### 5. Stemming
# Reduce each remaining token to its Porter stem.
stemmer = PorterStemmer()
stemmed_tokens = list(map(stemmer.stem, tokens_without_stopwords))
print("Stemmed Tokens:", stemmed_tokens)

# Final preprocessed text: the stems joined back into one space-separated string.
preprocessed_text = " ".join(stemmed_tokens)
print("\nFinal Preprocessed Text:", preprocessed_text)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DSATM\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


Tokenized Words: ['NLP', 'is', 'a', 'fascinating', 'field', 'of', 'study', '!', 'It', 'involves', 'tasks', 'like', 'tokenization', ',', 'filtration', ',', 'and', 'stemming', '.']
Filtered Tokens: ['NLP', 'is', 'a', 'fascinating', 'field', 'of', 'study', 'It', 'involves', 'tasks', 'like', 'tokenization', 'filtration', 'and', 'stemming']
Script Validated Tokens: ['NLP', 'is', 'a', 'fascinating', 'field', 'of', 'study', 'It', 'involves', 'tasks', 'like', 'tokenization', 'filtration', 'and', 'stemming']
Tokens without Stop Words: ['NLP', 'fascinating', 'field', 'study', 'involves', 'tasks', 'like', 'tokenization', 'filtration', 'stemming']
Stemmed Tokens: ['nlp', 'fascin', 'field', 'studi', 'involv', 'task', 'like', 'token', 'filtrat', 'stem']

Final Preprocessed Text: nlp fascin field studi involv task like token filtrat stem


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DSATM\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
