<a href="https://colab.research.google.com/github/Gousepasha789/Information-Retrieval-System/blob/main/Tokenization_bow_lemmatization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [18]:
import nltk
import spacy
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn.feature_extraction.text import CountVectorizer

# Fetch the 'punkt_tab' resource up front; the NLTK tokenizers used in this
# notebook fail with a LookupError when it is missing.
nltk.download('punkt_tab')

# Sample text shared by every example in this notebook.
text = (
    "Tokenization is the process of breaking text into words or sentences. "
    "It's an essential step in NLP!"
)

# ------------------ 1️⃣ WORD TOKENIZATION ------------------
# NLTK: punctuation becomes its own token and the contraction "It's" is
# split into "It" and "'s".
word_tokens = nltk.tokenize.word_tokenize(text)
print("\n🔹 Word Tokens (NLTK):", word_tokens)


🔹 Word Tokens (NLTK): ['Tokenization', 'is', 'the', 'process', 'of', 'breaking', 'text', 'into', 'words', 'or', 'sentences', '.', 'It', "'s", 'an', 'essential', 'step', 'in', 'NLP', '!']


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [19]:
# ------------------ 2️⃣ CHARACTER TOKENIZATION ------------------
# Unpack the string into a list with one entry per character; spaces and
# punctuation are kept as tokens of their own.
char_tokens = [*text]
print("\n🔹 Character Tokens:", char_tokens)


🔹 Character Tokens: ['T', 'o', 'k', 'e', 'n', 'i', 'z', 'a', 't', 'i', 'o', 'n', ' ', 'i', 's', ' ', 't', 'h', 'e', ' ', 'p', 'r', 'o', 'c', 'e', 's', 's', ' ', 'o', 'f', ' ', 'b', 'r', 'e', 'a', 'k', 'i', 'n', 'g', ' ', 't', 'e', 'x', 't', ' ', 'i', 'n', 't', 'o', ' ', 'w', 'o', 'r', 'd', 's', ' ', 'o', 'r', ' ', 's', 'e', 'n', 't', 'e', 'n', 'c', 'e', 's', '.', ' ', 'I', 't', "'", 's', ' ', 'a', 'n', ' ', 'e', 's', 's', 'e', 'n', 't', 'i', 'a', 'l', ' ', 's', 't', 'e', 'p', ' ', 'i', 'n', ' ', 'N', 'L', 'P', '!']


In [20]:
# ------------------ 3️⃣ SENTENCE TOKENIZATION ------------------
# Using NLTK
sentence_tokens = sent_tokenize(text)
print("\n🔹 Sentence Tokens (NLTK):", sentence_tokens)


# Using spaCy
# The original cell read `doc` without ever creating it, so a fresh
# top-to-bottom run failed with NameError. Load the small English pipeline
# and run it over the sample text here so the cell is self-contained.
# If the model is not installed: `python -m spacy download en_core_web_sm`.
nlp = spacy.load("en_core_web_sm")
doc = nlp(text)

# doc.sents yields one Span per detected sentence; keep only the raw text.
sentence_tokens_spacy = [sent.text for sent in doc.sents]
print("\n🔹 Sentence Tokens (spaCy):", sentence_tokens_spacy)


🔹 Sentence Tokens (NLTK): ['Tokenization is the process of breaking text into words or sentences.', "It's an essential step in NLP!"]

🔹 Sentence Tokens (spaCy): ['Tokenization is the process of breaking text into words or sentences.', "It's an essential step in NLP!"]


In [21]:
from tokenizers import Tokenizer, models, pre_tokenizers, trainers

# ------------------ WordPiece Tokenization (Fixed) ------------------
# Reserved tokens handed to the trainer alongside the learned vocabulary.
wp_special_tokens = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]

# WordPiece model that falls back to "[UNK]" for pieces it cannot match;
# input is pre-split by the Whitespace pre-tokenizer before the model runs.
wp_tokenizer = Tokenizer(models.WordPiece(unk_token="[UNK]"))
wp_tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

# ✅ A vocabulary budget of 500 is generous for a single sentence, so the
# trainer can keep meaningful pieces instead of shattering words.
wp_trainer = trainers.WordPieceTrainer(
    vocab_size=500,
    special_tokens=wp_special_tokens,
)
wp_tokenizer.train_from_iterator([text], trainer=wp_trainer)

# Encode the same sentence the tokenizer was trained on and show its tokens.
wp_encoding = wp_tokenizer.encode(text)
wp_tokens = wp_encoding.tokens
print("\n🔹 WordPiece Tokens (Fixed):", wp_tokens)



🔹 WordPiece Tokens (Fixed): ['Tokenization', 'is', 'the', 'process', 'of', 'breaking', 'text', 'into', 'words', 'or', 'sentences', '.', 'It', "'", 's', 'an', 'essential', 'step', 'in', 'NLP', '!']


In [22]:

# ------------------ 4️⃣ BAG OF WORDS (BoW) ------------------
# The corpus is a single document: the sample sentence.
corpus = [text]

# Learn the vocabulary and count term occurrences in one pass.
vectorizer = CountVectorizer()
bow_matrix = vectorizer.fit_transform(corpus)

# Densify the count matrix (one row per document) and fetch the vocabulary
# terms that label its columns.
bow_array = bow_matrix.toarray()
bow_features = vectorizer.get_feature_names_out()

print("\n🔹 Bag of Words Features:", bow_features)
print("\n🔹 BoW Matrix:\n", bow_array)


🔹 Bag of Words Features: ['an' 'breaking' 'essential' 'in' 'into' 'is' 'it' 'nlp' 'of' 'or'
 'process' 'sentences' 'step' 'text' 'the' 'tokenization' 'words']

🔹 BoW Matrix:
 [[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]]


In [23]:
# ------------------ 5️⃣ LEMMATIZATION ------------------
# WordNetLemmatizer was used without being imported, and the WordNet corpus
# it looks words up in was never downloaded; both are handled here so the
# cell runs on a fresh runtime.
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()

# Tokenize words
word_tokens = word_tokenize(text)

# Lemmatize each word. No part-of-speech tag is passed, so every token is
# treated as a noun: plurals are reduced ("words" -> "word") while verb
# forms such as "breaking" are left unchanged.
lemmatized_words_nltk = [lemmatizer.lemmatize(word) for word in word_tokens]

print("\n🔹 Lemmatized Words (NLTK):", lemmatized_words_nltk)



🔹 Lemmatized Words (NLTK): ['Tokenization', 'is', 'the', 'process', 'of', 'breaking', 'text', 'into', 'word', 'or', 'sentence', '.', 'It', "'s", 'an', 'essential', 'step', 'in', 'NLP', '!']
