In [1]:
sample_doc = "Natural Language Processing allows machines to understand human language."


In [3]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag

# Download resources (only once)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

# Tokenization
tokens = word_tokenize(sample_doc)

# POS Tagging
pos = pos_tag(tokens)

# Remove stop words
stop_words = set(stopwords.words('english'))
filtered = [w for w in tokens if w.lower() not in stop_words]

# Stemming
stemmer = PorterStemmer()
stemmed = [stemmer.stem(w) for w in filtered]

# Lemmatization
lemmatizer = WordNetLemmatizer()
lemmatized = [lemmatizer.lemmatize(w) for w in filtered]

# Print all results

print("Tokens:", tokens)
print("POS Tags:", pos)
print("Without Stop Words:", filtered)
print("Stemmed:", stemmed)
print("Lemmatized:", lemmatized)


Tokens: ['Natural', 'Language', 'Processing', 'allows', 'machines', 'to', 'understand', 'human', 'language', '.']
POS Tags: [('Natural', 'JJ'), ('Language', 'NNP'), ('Processing', 'NNP'), ('allows', 'VBZ'), ('machines', 'NNS'), ('to', 'TO'), ('understand', 'VB'), ('human', 'JJ'), ('language', 'NN'), ('.', '.')]
Without Stop Words: ['Natural', 'Language', 'Processing', 'allows', 'machines', 'understand', 'human', 'language', '.']
Stemmed: ['natur', 'languag', 'process', 'allow', 'machin', 'understand', 'human', 'languag', '.']
Lemmatized: ['Natural', 'Language', 'Processing', 'allows', 'machine', 'understand', 'human', 'language', '.']


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mansi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mansi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mansi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\mansi\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [4]:

docs = [
    "NLP helps machines understand language.",
    "Language is used to communicate.",
    "Machines use NLP for language processing."
]


In [5]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Term Frequency
tf = CountVectorizer()
tf_result = tf.fit_transform(docs)
print("TF Result:\n", tf_result.toarray())
print("Words:", tf.get_feature_names_out())

# TF-IDF
tfidf = TfidfVectorizer()
tfidf_result = tfidf.fit_transform(docs)
print("TF-IDF Result:\n", tfidf_result.toarray())
print("Words:", tfidf.get_feature_names_out())


TF Result:
 [[0 0 1 0 1 1 1 0 0 1 0 0]
 [1 0 0 1 1 0 0 0 1 0 0 1]
 [0 1 0 0 1 1 1 1 0 0 1 0]]
Words: ['communicate' 'for' 'helps' 'is' 'language' 'machines' 'nlp' 'processing'
 'to' 'understand' 'use' 'used']
TF-IDF Result:
 [[0.         0.         0.53409337 0.         0.31544415 0.40619178
  0.40619178 0.         0.         0.53409337 0.         0.        ]
 [0.47952794 0.         0.         0.47952794 0.28321692 0.
  0.         0.         0.47952794 0.         0.         0.47952794]
 [0.         0.4711101  0.         0.         0.27824521 0.35829137
  0.35829137 0.4711101  0.         0.         0.4711101  0.        ]]
Words: ['communicate' 'for' 'helps' 'is' 'language' 'machines' 'nlp' 'processing'
 'to' 'understand' 'use' 'used']
