In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
import string

In [2]:
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [3]:
# Sample document
sample_document = "Text analytics is the process of converting unstructured text data into meaningful and actionable information."

# Tokenization
tokens = word_tokenize(sample_document)
print("Tokenization:", tokens)
Tokenization: ['Text', 'analytics', 'is', 'the', 'process', 'of', 'converting', 'unstructured', 'text', 'data', 'into', 'meaningful', 'and', 'actionable', 'information', '.']

Tokenization: ['Text', 'analytics', 'is', 'the', 'process', 'of', 'converting', 'unstructured', 'text', 'data', 'into', 'meaningful', 'and', 'actionable', 'information', '.']


In [4]:
# POS Tagging
pos_tags = pos_tag(tokens)
print("\nPOS Tagging:", pos_tags)


POS Tagging: [('Text', 'NN'), ('analytics', 'NNS'), ('is', 'VBZ'), ('the', 'DT'), ('process', 'NN'), ('of', 'IN'), ('converting', 'VBG'), ('unstructured', 'JJ'), ('text', 'NN'), ('data', 'NNS'), ('into', 'IN'), ('meaningful', 'JJ'), ('and', 'CC'), ('actionable', 'JJ'), ('information', 'NN'), ('.', '.')]


In [5]:
# Stopwords removal
stop_words = set(stopwords.words('english'))
filtered_tokens = [word for word in tokens if word.lower() not in
stop_words]
print("\nStopwords Removal:", filtered_tokens)


Stopwords Removal: ['Text', 'analytics', 'process', 'converting', 'unstructured', 'text', 'data', 'meaningful', 'actionable', 'information', '.']


In [6]:
# Stemming
stemmer = PorterStemmer()
stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]
print("\nStemming:", stemmed_tokens)


Stemming: ['text', 'analyt', 'process', 'convert', 'unstructur', 'text', 'data', 'meaning', 'action', 'inform', '.']


In [8]:
# Lemmatization
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [lemmatizer.lemmatize(word) for word in
filtered_tokens]
print("\nLemmatization:", lemmatized_tokens)


Lemmatization: ['Text', 'analytics', 'process', 'converting', 'unstructured', 'text', 'data', 'meaningful', 'actionable', 'information', '.']


In [9]:
# Term Frequency - Inverse Document Frequency (TF-IDF) representation
tfidf_vectorizer = TfidfVectorizer()
tfidf_representation = tfidf_vectorizer.fit_transform([sample_document])
print("\nTF-IDF Representation")
print(tfidf_representation)


TF-IDF Representation
  (0, 5)	0.24253562503633297
  (0, 0)	0.24253562503633297
  (0, 2)	0.24253562503633297
  (0, 8)	0.24253562503633297
  (0, 6)	0.24253562503633297
  (0, 4)	0.24253562503633297
  (0, 13)	0.24253562503633297
  (0, 3)	0.24253562503633297
  (0, 9)	0.24253562503633297
  (0, 10)	0.24253562503633297
  (0, 12)	0.24253562503633297
  (0, 7)	0.24253562503633297
  (0, 1)	0.24253562503633297
  (0, 11)	0.48507125007266594
