# Install and import necessary libraries

In [1]:
!pip install nltk



In [2]:
import nltk
import string
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import pos_tag, ne_chunk
from sklearn.feature_extraction.text import CountVectorizer

# Download required NLTK resources

In [3]:
# Fetch every NLTK resource this notebook uses in a single setup cell, so a
# fresh kernel can run top to bottom without hitting a missing-resource error.
NLTK_RESOURCES = [
    'punkt_tab',                       # tokenizer data
    'averaged_perceptron_tagger',      # POS tagger package
    'averaged_perceptron_tagger_eng',  # also required by the POS tagging cell below
    'stopwords',                       # stopword lists
    'wordnet',                         # lemmatizer data
    'maxent_ne_chunker',               # NE chunker package
    'maxent_ne_chunker_tab',           # also required by the NER cell below
    'words',                           # word list corpus
]

# Build the full list first (no short-circuiting) so that every resource is
# attempted even if one of them fails to download.
download_results = [nltk.download(resource) for resource in NLTK_RESOURCES]

# Displayed as the cell output: True only when every download succeeded.
all(download_results)

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


True

# Take input from the user for text preprocessing

In [4]:
# Sample sentence used when no usable text is supplied, so the remaining
# cells always have something to process.
DEFAULT_TEXT = "Natural Language Processing is a great field for research and application"

try:
    text = input("Enter the text for preprocessing: ")
except (EOFError, NotImplementedError):
    # No interactive stdin is available (e.g. the notebook is executed
    # headlessly); treat it the same as a blank entry.
    text = ""

# A blank entry would leave nothing to vectorize later (CountVectorizer raises
# on an empty vocabulary), so fall back to the sample sentence instead.
if not text.strip():
    text = DEFAULT_TEXT

Enter the text for preprocessing: Natural Language Processing is a great field for research and application


# Convert text to lowercase (Normalization)

In [5]:
# Lowercase the whole input so that differently-cased spellings of a word
# are treated as the same token by the later steps.
normalized_text = str.lower(text)
print("\nNormalized Text:\n", normalized_text, sep=" ")


Normalized Text:
 natural language processing is a great field for research and application


# Tokenizing the text into sentences

In [6]:
# Split the lowercased text into a list of sentence strings.
sentences = nltk.sent_tokenize(normalized_text)
print("\nSentence Tokenization:\n", sentences, sep=" ")


Sentence Tokenization:
 ['natural language processing is a great field for research and application']


# Tokenizing the text into words

In [7]:
# Split the lowercased text into a flat list of word-level tokens.
words = nltk.word_tokenize(normalized_text)
print("\nWord Tokenization:\n", words, sep=" ")


Word Tokenization:
 ['natural', 'language', 'processing', 'is', 'a', 'great', 'field', 'for', 'research', 'and', 'application']


# Removing punctuation from the tokenized words

In [8]:
# Drop every token that consists solely of punctuation characters.
# The check is done per character: `token in string.punctuation` would be a
# substring test against the punctuation string, which lets multi-character
# punctuation tokens such as "..." , "--" or "''" slip through.
punctuation_chars = set(string.punctuation)
words_no_punct = [
    word
    for word in words
    if not all(char in punctuation_chars for char in word)
]
print("\nPunctuation Removal:\n", words_no_punct)


Punctuation Removal:
 ['natural', 'language', 'processing', 'is', 'a', 'great', 'field', 'for', 'research', 'and', 'application']


# Removing stopwords (very common words that carry little meaning on their own)

In [9]:
# Build a set of English stopwords for fast membership checks, then keep
# only the tokens that are not in it.
stop_words = {*stopwords.words('english')}
filtered_words = [token for token in words_no_punct if token not in stop_words]
print("\nStopword Removal:\n", filtered_words, sep=" ")


Stopword Removal:
 ['natural', 'language', 'processing', 'great', 'field', 'research', 'application']


# Applying stemming (reducing words to their root form)

In [10]:
# Stem each remaining token with the Porter stemmer. The resulting stems
# (e.g. "natur") are not necessarily dictionary words.
stemmer = PorterStemmer()
stemmed_words = list(map(stemmer.stem, filtered_words))
print("\nStemming:\n", stemmed_words, sep=" ")


Stemming:
 ['natur', 'languag', 'process', 'great', 'field', 'research', 'applic']


# Applying lemmatization (reducing words to their base form)

In [11]:
# Lemmatize each remaining token with WordNet. No part-of-speech tag is
# passed, so every token is handled with the lemmatizer's default POS.
lemmatizer = WordNetLemmatizer()
lemmatized_words = list(map(lemmatizer.lemmatize, filtered_words))
print("\nLemmatization:\n", lemmatized_words, sep=" ")


Lemmatization:
 ['natural', 'language', 'processing', 'great', 'field', 'research', 'application']


# Downloading additional resources for POS tagging

In [12]:
# Extra resource needed by the POS tagging cell that follows; the call's
# return value is shown as the cell output.
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True

# Performing Part-of-Speech (POS) tagging on tokenized words

In [13]:
# Tag every token from the word-tokenization step (stopwords included, since
# the unfiltered `words` list is used) with its part of speech.
# Result: a list of (token, tag) pairs.
pos_tags = nltk.pos_tag(words)
print("\nPOS Tagging:\n", pos_tags, sep=" ")


POS Tagging:
 [('natural', 'JJ'), ('language', 'NN'), ('processing', 'NN'), ('is', 'VBZ'), ('a', 'DT'), ('great', 'JJ'), ('field', 'NN'), ('for', 'IN'), ('research', 'NN'), ('and', 'CC'), ('application', 'NN')]


# Downloading additional resources for Named Entity Recognition

In [14]:
# Extra resource needed by the named-entity recognition cell that follows;
# the call's return value is shown as the cell output.
nltk.download('maxent_ne_chunker_tab')

[nltk_data] Downloading package maxent_ne_chunker_tab to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker_tab.zip.


True

# Performing Named Entity Recognition (NER) to identify named entities

In [15]:
# Chunk the POS-tagged tokens into a tree; named entities, when found,
# appear as labelled subtrees.
# NOTE(review): the tokens were lowercased before tagging. Capitalization is
# presumably a cue the chunker relies on -- confirm whether running this on
# the original-case text would be preferable.
ner_chunks = nltk.ne_chunk(pos_tags)
print("\nNamed Entity Recognition (NER):\n", ner_chunks, sep=" ")


Named Entity Recognition (NER):
 (S
  natural/JJ
  language/NN
  processing/NN
  is/VBZ
  a/DT
  great/JJ
  field/NN
  for/IN
  research/NN
  and/CC
  application/NN)


# Initialize the CountVectorizer for Bag of Words (BoW) model

In [16]:
# Create the Bag-of-Words vectorizer with its default settings; it is fitted
# on the filtered text in the next cell.
vectorizer = CountVectorizer()

# Fit and transform the filtered text

In [17]:
# CountVectorizer expects an iterable of documents, so the filtered tokens
# are joined back into one string and passed as a single-document corpus.
bow_document = " ".join(filtered_words)
bow_matrix = vectorizer.fit_transform([bow_document])

# Display vocabulary (unique words and indices)

In [18]:
# Show the fitted vocabulary: a dict mapping each term to its column index.
bow_vocabulary = vectorizer.vocabulary_
print("\nVocabulary (Word to Index Mapping):\n", bow_vocabulary)


Vocabulary (Word to Index Mapping):
 {'natural': 4, 'language': 3, 'processing': 5, 'great': 2, 'field': 1, 'research': 6, 'application': 0}


# Convert the sparse BoW matrix into a dense array


In [19]:
# Materialize the sparse matrix returned by fit_transform as a dense array
# so it can be printed and wrapped in a DataFrame below.
bow_array = bow_matrix.toarray()

# Display the BoW matrix

In [20]:
# Print the dense count matrix: one row per document (a single document was
# fitted above) and one column per vocabulary term.
print("\nBag of Words matrix:\n", bow_array)


Bag of Words matrix:
 [[1 1 1 1 1 1 1]]


# Convert BoW matrix to a DataFrame for better visualization

In [21]:
# Label each count column with its vocabulary term so the matrix is readable.
bow_feature_names = vectorizer.get_feature_names_out()
bow_df = pd.DataFrame(data=bow_array, columns=bow_feature_names)
print("\nBag of Words Representation:\n", bow_df, sep=" ")


Bag of Words Representation:
    application  field  great  language  natural  processing  research
0            1      1      1         1        1           1         1
