In [4]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tag import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer

# Sample document: a short paragraph that every step below operates on.
document = """
Machine learning is a subfield of artificial intelligence. It focuses on the development of algorithms and models that enable computers to learn and make predictions or decisions without explicit programming. Machine learning techniques have found applications in various domains, including image recognition, natural language processing, and data analysis.
"""

# Tokenization: split the raw text into word and punctuation tokens.
tokens = word_tokenize(document)

# POS tagging: pair every token with a part-of-speech tag.
pos_tags = pos_tag(tokens)

# Stop words removal: the membership test is case-insensitive (token.lower()),
# so capitalised stop words are dropped too. Only entries of the stop-word
# list are removed; this step applies no other filtering.
stop_words = set(stopwords.words('english'))
filtered_tokens = [token for token in tokens if token.lower() not in stop_words]

# Stemming: reduce each remaining token to its Porter stem.
stemmer = PorterStemmer()
stemmed_tokens = [stemmer.stem(token) for token in filtered_tokens]

# Lemmatization: applied to the same filtered tokens (not to the stems).
# No POS is passed, so the lemmatizer's default part of speech is used.
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]

# Printing the results
print("Tokenization:")
print(tokens)
print("\nPOS Tagging:")
print(pos_tags)
print("\nStop Words Removal:")
print(filtered_tokens)
print("\nStemming:")
print(stemmed_tokens)
print("\nLemmatization:")
print(lemmatized_tokens)

# Term Frequency-Inverse Document Frequency (TF-IDF)
# NOTE(review): the vectorizer is fitted on a one-document corpus, so every
# term occurs in every document and the IDF factor cannot tell terms apart;
# the weights below are effectively normalised term frequencies. Fit on
# several documents if real IDF weighting is wanted.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform([document])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older installations.
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    feature_names = tfidf_vectorizer.get_feature_names_out()
else:
    feature_names = tfidf_vectorizer.get_feature_names()
tfidf_representation = tfidf_matrix.toarray()

print("\nTF-IDF Representation:")
# Row 0 is the only row because exactly one document was vectorised.
for i, feature in enumerate(feature_names):
    print(f"{feature}: {tfidf_representation[0][i]}")


Tokenization:
['Machine', 'learning', 'is', 'a', 'subfield', 'of', 'artificial', 'intelligence', '.', 'It', 'focuses', 'on', 'the', 'development', 'of', 'algorithms', 'and', 'models', 'that', 'enable', 'computers', 'to', 'learn', 'and', 'make', 'predictions', 'or', 'decisions', 'without', 'explicit', 'programming', '.', 'Machine', 'learning', 'techniques', 'have', 'found', 'applications', 'in', 'various', 'domains', ',', 'including', 'image', 'recognition', ',', 'natural', 'language', 'processing', ',', 'and', 'data', 'analysis', '.']

POS Tagging:
[('Machine', 'NN'), ('learning', 'NN'), ('is', 'VBZ'), ('a', 'DT'), ('subfield', 'NN'), ('of', 'IN'), ('artificial', 'JJ'), ('intelligence', 'NN'), ('.', '.'), ('It', 'PRP'), ('focuses', 'VBZ'), ('on', 'IN'), ('the', 'DT'), ('development', 'NN'), ('of', 'IN'), ('algorithms', 'NN'), ('and', 'CC'), ('models', 'NNS'), ('that', 'IN'), ('enable', 'JJ'), ('computers', 'NNS'), ('to', 'TO'), ('learn', 'VB'), ('and', 'CC'), ('make', 'VB'), ('predicti

In [3]:
# One-time download of the NLTK resources used by the processing cell.
# The execution counts show this cell was run before that cell even though
# it appears after it; on a fresh kernel, run it first.
nltk.download('punkt')                        # models for word_tokenize
nltk.download('averaged_perceptron_tagger')   # model for pos_tag
nltk.download('stopwords')                    # corpus for stopwords.words('english'); was missing
# NOTE(review): newer NLTK releases may also require 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' - confirm against the installed version.
nltk.download('wordnet')                      # data for WordNetLemmatizer

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ketul\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\ketul\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ketul\AppData\Roaming\nltk_data...


True