In [8]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Sample Document
# Four short paragraphs about NLP and text analytics, one paragraph per line
# with a blank line between them. The triple-quoted literal begins and ends
# with a newline because the text starts on the line after the opening quotes
# and the closing quotes sit on their own line.
sample_document = """
Natural language processing (NLP) is a subfield of linguistics, computer science, and artificial intelligence concerned with the interactions between computers and human language, in particular how to program computers to process and analyze large amounts of natural language data. Challenges in natural language processing frequently involve speech recognition, natural language understanding, and natural-language generation.

Text analytics, also known as text mining, is a process of deriving high-quality information from text. It involves the extraction of useful patterns and insights from unstructured text data. Text analytics techniques include text preprocessing, sentiment analysis, named entity recognition, and topic modeling.

Machine learning algorithms play a crucial role in text analytics tasks. Supervised learning algorithms are used for tasks such as sentiment analysis and text classification, while unsupervised learning algorithms are used for tasks such as clustering and topic modeling. Deep learning techniques, especially neural networks, have shown promising results in various text analytics applications.

In summary, text analytics is a powerful tool for extracting valuable insights from textual data, enabling businesses to make data-driven decisions and improve their processes.
"""

# Text preprocessing pipeline: tokenize -> POS-tag -> drop stopwords ->
# stem and lemmatize.
# NOTE(review): these NLTK calls need the tokenizer, tagger, stopword and
# WordNet data packages to be installed locally (via nltk.download); the exact
# package names depend on the installed NLTK version.

# Tokenization
tokens = word_tokenize(sample_document)

# POS Tagging
# Runs on the full token sequence, before stopwords are removed, so the tagger
# sees every word in its original sentence context.
pos_tags = pos_tag(tokens)

# Stopwords Removal
# Matching is case-insensitive. The (word, tag) pairs are filtered together so
# each surviving token keeps its POS tag for the lemmatizer below.
stop_words = set(stopwords.words('english'))
filtered_tagged = [
    (word, tag) for word, tag in pos_tags if word.lower() not in stop_words
]
filtered_tokens = [word for word, _ in filtered_tagged]

# Stemming
stemmer = PorterStemmer()
stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]

# Lemmatization
# WordNetLemmatizer treats every word as a noun unless told otherwise, which
# leaves verb forms such as "involves" or "concerned" untouched. Map the first
# letter of the Penn Treebank tag to the matching WordNet POS code and fall
# back to noun ('n') for every other tag (punctuation, determiners, ...).
lemmatizer = WordNetLemmatizer()
wordnet_pos_by_prefix = {'J': 'a', 'V': 'v', 'N': 'n', 'R': 'r'}
lemmatized_tokens = [
    lemmatizer.lemmatize(word, wordnet_pos_by_prefix.get(tag[:1], 'n'))
    for word, tag in filtered_tagged
]

# Print results
# Each entry pairs a heading with the list printed under it; headings after
# the first start with a newline so the sections are separated by a blank line.
report_sections = (
    ("Original Tokens:", tokens),
    ("\nPOS Tagging:", pos_tags),
    ("\nFiltered Tokens (after removing stopwords):", filtered_tokens),
    ("\nStemmed Tokens:", stemmed_tokens),
    ("\nLemmatized Tokens:", lemmatized_tokens),
)
for heading, values in report_sections:
    print(heading)
    print(values)

# Calculate TF-IDF
# Treat every non-blank line of the sample (one paragraph per line) as its own
# document. With a single-document corpus each term appears in every document,
# so all IDF weights are identical and the scores collapse to normalised term
# frequencies; several documents are needed for IDF to discriminate terms.
corpus = [line.strip() for line in sample_document.splitlines() if line.strip()]

# Initialize TF-IDF Vectorizer (library defaults)
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the documents: one row per document, one column per term
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)

# Get feature names (terms), in the same order as the matrix columns
feature_names = tfidf_vectorizer.get_feature_names_out()

# Create a DataFrame for TF-IDF representation
# The matrix is converted to a dense array so each term becomes a labelled
# column.
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)

print("\nTF-IDF Representation:")
print(tfidf_df)

Original Tokens:
['Natural', 'language', 'processing', '(', 'NLP', ')', 'is', 'a', 'subfield', 'of', 'linguistics', ',', 'computer', 'science', ',', 'and', 'artificial', 'intelligence', 'concerned', 'with', 'the', 'interactions', 'between', 'computers', 'and', 'human', 'language', ',', 'in', 'particular', 'how', 'to', 'program', 'computers', 'to', 'process', 'and', 'analyze', 'large', 'amounts', 'of', 'natural', 'language', 'data', '.', 'Challenges', 'in', 'natural', 'language', 'processing', 'frequently', 'involve', 'speech', 'recognition', ',', 'natural', 'language', 'understanding', ',', 'and', 'natural-language', 'generation', '.', 'Text', 'analytics', ',', 'also', 'known', 'as', 'text', 'mining', ',', 'is', 'a', 'process', 'of', 'deriving', 'high-quality', 'information', 'from', 'text', '.', 'It', 'involves', 'the', 'extraction', 'of', 'useful', 'patterns', 'and', 'insights', 'from', 'unstructured', 'text', 'data', '.', 'Text', 'analytics', 'techniques', 'include', 'text', 'prepro