In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tag import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [3]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [4]:
text_data = "The quick brown fox jumps over the lazy dog. The dog barks at the cat."

In [5]:
# Stopwords removal
stop_words = set(stopwords.words('english'))
words = word_tokenize(text_data)
filtered_words = [word for word in words if word not in stop_words]
print("Stopwords removed:", filtered_words)

Stopwords removed: ['The', 'quick', 'brown', 'fox', 'jumps', 'lazy', 'dog', '.', 'The', 'dog', 'barks', 'cat', '.']


In [6]:
# Tokenization
sentences = sent_tokenize(text_data)
print("Tokenized sentences:", sentences)

Tokenized sentences: ['The quick brown fox jumps over the lazy dog.', 'The dog barks at the cat.']


In [7]:
# Stemming
stemmer = PorterStemmer()
stemmed_words = [stemmer.stem(word) for word in filtered_words]
print("Stemmed words:", stemmed_words)

Stemmed words: ['the', 'quick', 'brown', 'fox', 'jump', 'lazi', 'dog', '.', 'the', 'dog', 'bark', 'cat', '.']


In [8]:
# Lemmatization
lemmatizer = WordNetLemmatizer()
lemmatized_words = [lemmatizer.lemmatize(word) for word in filtered_words]
print("Lemmatized words:", lemmatized_words)

Lemmatized words: ['The', 'quick', 'brown', 'fox', 'jump', 'lazy', 'dog', '.', 'The', 'dog', 'bark', 'cat', '.']


In [9]:
# POS tagging
pos_tags = pos_tag(filtered_words)
print("POS tags:", pos_tags)

POS tags: [('The', 'DT'), ('quick', 'JJ'), ('brown', 'NN'), ('fox', 'NN'), ('jumps', 'NNS'), ('lazy', 'JJ'), ('dog', 'NN'), ('.', '.'), ('The', 'DT'), ('dog', 'NN'), ('barks', 'VBZ'), ('cat', 'NN'), ('.', '.')]


In [10]:
# TF-IDF vectorization
tfidf_vectorizer = TfidfVectorizer()
tfidf_vectors = tfidf_vectorizer.fit_transform(sentences)
print("TF-IDF vectors:", tfidf_vectors)

TF-IDF vectors:   (0, 4)	0.24359836256665346
  (0, 7)	0.34236899897971346
  (0, 8)	0.34236899897971346
  (0, 6)	0.34236899897971346
  (0, 5)	0.34236899897971346
  (0, 2)	0.34236899897971346
  (0, 9)	0.34236899897971346
  (0, 10)	0.4871967251333069
  (1, 3)	0.4251963615908802
  (1, 0)	0.4251963615908802
  (1, 1)	0.4251963615908802
  (1, 4)	0.3025307132406998
  (1, 10)	0.6050614264813996


In [11]:
# One-hot encoding
vectorizer = CountVectorizer(binary=True)
one_hot_vectors = vectorizer.fit_transform(filtered_words)
print("One-hot vectors:", one_hot_vectors)

One-hot vectors:   (0, 8)	1
  (1, 7)	1
  (2, 1)	1
  (3, 4)	1
  (4, 5)	1
  (5, 6)	1
  (6, 3)	1
  (8, 8)	1
  (9, 3)	1
  (10, 0)	1
  (11, 2)	1


In [12]:
# Bag of Words
bow_vectorizer = CountVectorizer()
bow_vectors = bow_vectorizer.fit_transform(sentences)
print("Bag of Words vectors:", bow_vectors)

Bag of Words vectors:   (0, 10)	2
  (0, 9)	1
  (0, 2)	1
  (0, 5)	1
  (0, 6)	1
  (0, 8)	1
  (0, 7)	1
  (0, 4)	1
  (1, 10)	2
  (1, 4)	1
  (1, 1)	1
  (1, 0)	1
  (1, 3)	1


In [13]:
# Unigram, Bigram, n-gram
ngram_vectorizer = CountVectorizer(ngram_range=(1, 3))  # Unigrams, bigrams, and trigrams
ngram_vectors = ngram_vectorizer.fit_transform(sentences)
print("n-gram vectors:", ngram_vectors)

n-gram vectors:   (0, 27)	2
  (0, 24)	1
  (0, 6)	1
  (0, 13)	1
  (0, 16)	1
  (0, 21)	1
  (0, 19)	1
  (0, 10)	1
  (0, 33)	1
  (0, 25)	1
  (0, 7)	1
  (0, 14)	1
  (0, 17)	1
  (0, 22)	1
  (0, 31)	1
  (0, 20)	1
  (0, 34)	1
  (0, 26)	1
  (0, 8)	1
  (0, 15)	1
  (0, 18)	1
  (0, 23)	1
  (0, 32)	1
  (1, 27)	2
  (1, 10)	1
  (1, 3)	1
  (1, 0)	1
  (1, 9)	1
  (1, 29)	1
  (1, 11)	1
  (1, 4)	1
  (1, 1)	1
  (1, 28)	1
  (1, 30)	1
  (1, 12)	1
  (1, 5)	1
  (1, 2)	1
