# Install and import necessary libraries

In [1]:
!pip install nltk scikit-learn



In [2]:
import nltk
import string
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.util import ngrams
from sklearn.feature_extraction.text import CountVectorizer

# Download necessary NLTK datasets

In [3]:
# Tokenizer models and the stopword list are fetched first.
for resource in ('punkt_tab', 'stopwords'):
    nltk.download(resource)
# WordNet (used by the lemmatizer) is fetched last and left as the cell's final
# expression so the download status is still displayed as the cell output.
nltk.download('wordnet')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

# User input text

In [4]:
# Sentence used when nothing (or only whitespace) is typed at the prompt, so the
# later cells are never handed an empty string -- CountVectorizer.fit_transform
# raises ValueError ("empty vocabulary") when no tokens are left to count.
DEFAULT_TEXT = "The sun is shining bright today."

# input() blocks until the user answers the prompt.
text = input("Enter the text for N-gram modeling: ")
if not text.strip():
    print("No text entered; using the default example sentence.")
    text = DEFAULT_TEXT

Enter the text for N-gram modeling: The sun is shining bright today.


# Lowercasing (Normalization)

In [5]:
# Case-fold the raw input to lower case so that tokens differing only in
# capitalisation are treated as the same word by the later steps.
normalized_text = str.lower(text)
print("\nNormalized Text:\n", normalized_text)


Normalized Text:
 the sun is shining bright today.


# Sentence Tokenization

In [6]:
# Split the lower-cased text into a list of sentences with NLTK's sent_tokenize.
# NOTE(review): in the cells shown here `sentences` is only printed; the rest of
# the pipeline works from the word-level tokens.
sentences = sent_tokenize(normalized_text)
print("\nSentence Tokenization:\n", sentences)


Sentence Tokenization:
 ['the sun is shining bright today.']


# Word Tokenization

In [7]:
# Break the lower-cased text into word-level tokens. Punctuation marks come out
# as tokens of their own (e.g. the trailing '.'), so they can be filtered later.
words = word_tokenize(normalized_text)
print("\nWord Tokenization:\n", words)


Word Tokenization:
 ['the', 'sun', 'is', 'shining', 'bright', 'today', '.']


# Remove punctuation

In [8]:
# Drop every token that consists solely of punctuation characters.
# string.punctuation is one long string, so `word not in string.punctuation` is a
# substring test: multi-character tokens such as '...', '--', '``' or "''" are not
# substrings of it and would slip through. Checking each character instead removes
# them, while tokens that mix letters and punctuation (e.g. "n't") are kept.
punctuation_chars = set(string.punctuation)
words_no_punct = [
    word for word in words
    if not all(char in punctuation_chars for char in word)
]
print("\nPunctuation Removal:\n", words_no_punct)


Punctuation Removal:
 ['the', 'sun', 'is', 'shining', 'bright', 'today']


# Stopword Removal

In [9]:
# Hold the English stopword list in a set, then keep only the tokens that are
# not members of it (original token order is preserved).
stop_words = set(stopwords.words('english'))
filtered_words = list(filter(lambda token: token not in stop_words, words_no_punct))
print("\nStopword Removal:\n", filtered_words)


Stopword Removal:
 ['sun', 'shining', 'bright', 'today']


# Stemming

In [10]:
# Apply the Porter stemmer to every remaining token (e.g. 'shining' -> 'shine').
# The stems are only displayed here; the n-gram cells below use the lemmas.
stemmer = PorterStemmer()
stemmed_words = list(map(stemmer.stem, filtered_words))
print("\nStemming:\n", stemmed_words)


Stemming:
 ['sun', 'shine', 'bright', 'today']


# Lemmatization

In [11]:
# Lemmatize every remaining token with WordNet. lemmatize() is called with the
# word only, i.e. without a part-of-speech hint.
# NOTE(review): that is presumably why 'shining' is left unchanged in the sample
# run -- confirm whether POS tagging is wanted here.
lemmatizer = WordNetLemmatizer()
lemmatized_words = list(map(lemmatizer.lemmatize, filtered_words))
print("\nLemmatization:\n", lemmatized_words)


Lemmatization:
 ['sun', 'shining', 'bright', 'today']


# Function to generate n-grams

In [12]:
def generate_ngrams(words, n):
    """Return all n-grams of ``words`` as a list of tuples.

    Args:
        words: Sequence of tokens to slide the window over.
        n: Number of tokens per n-gram.

    Returns:
        A list with the n-grams produced by ``nltk.util.ngrams``, in order.
    """
    # ngrams() is lazy; unpack it so callers get a concrete, reusable list.
    return [*ngrams(words, n)]

# Generate Unigrams, Bigrams, and Trigrams

In [13]:
# Build the 1-, 2- and 3-token windows over the lemmatized tokens in one pass,
# then show each list.
unigrams, bigrams, trigrams = (
    generate_ngrams(lemmatized_words, order) for order in (1, 2, 3)
)
print("\nUnigrams:\n", unigrams)
print("\nBigrams:\n", bigrams)
print("\nTrigrams:\n", trigrams)


Unigrams:
 [('sun',), ('shining',), ('bright',), ('today',)]

Bigrams:
 [('sun', 'shining'), ('shining', 'bright'), ('bright', 'today')]

Trigrams:
 [('sun', 'shining', 'bright'), ('shining', 'bright', 'today')]


# Convert n-gram tuples to space-joined strings (note: the CountVectorizer below builds its own n-grams from the lemmatized text)

In [14]:
# Turn each n-gram tuple into a single space-separated string.
# NOTE(review): these three lists are not referenced again in the cells shown here.
unigram_text = list(map(" ".join, unigrams))
bigram_text = list(map(" ".join, bigrams))
trigram_text = list(map(" ".join, trigrams))

# Initialize CountVectorizer for Bag of N-grams Model

In [15]:
# Bag-of-n-grams vectorizer: ngram_range=(1, 3) makes it count every sequence of
# one, two and three consecutive tokens.
vectorizer = CountVectorizer(ngram_range=(1, 3))  # Includes unigrams, bigrams, and trigrams

# Fit and transform the text

In [16]:
# The corpus is a single document: the lemmatized tokens re-joined with spaces.
# fit_transform learns the n-gram vocabulary from it and returns the count matrix.
bow_matrix = vectorizer.fit_transform([" ".join(lemmatized_words)])

# Display Vocabulary (n-gram-to-index mapping)

In [17]:
# vocabulary_ maps each learned n-gram (not just single words) to its column
# index in the count matrix.
print("\nVocabulary (Word to Index Mapping):\n", vectorizer.vocabulary_)


Vocabulary (Word to Index Mapping):
 {'sun': 5, 'shining': 2, 'bright': 0, 'today': 8, 'sun shining': 6, 'shining bright': 3, 'bright today': 1, 'sun shining bright': 7, 'shining bright today': 4}


# Convert sparse matrix to array

In [18]:
# Materialize the count matrix as a dense array so it can be printed in full and
# wrapped in a DataFrame below.
bow_array = bow_matrix.toarray()

# Display the BoW matrix

In [19]:
# One row per document (a single row here) and one column per vocabulary entry.
print("\nBag of N-grams Matrix:\n", bow_array)


Bag of N-grams Matrix:
 [[1 1 1 1 1 1 1 1 1]]


# Convert BoW matrix to DataFrame for better readability

In [20]:
# Label every count column with the n-gram it stands for, so the matrix can be
# read without looking the indices up in the vocabulary.
feature_names = vectorizer.get_feature_names_out()
bow_df = pd.DataFrame(data=bow_array, columns=feature_names)
print("\nBag of N-grams Representation:\n", bow_df)


Bag of N-grams Representation:
    bright  bright today  shining  shining bright  shining bright today  sun  \
0       1             1        1               1                     1    1   

   sun shining  sun shining bright  today  
0            1                   1      1  
