In [1]:
# Assignment 07: Advanced Text Preprocessing and Analysis using NLTK

In [2]:
!pip install nltk




In [3]:
import nltk
import string
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag, FreqDist, ngrams
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [4]:
## Input Text (Public Information Example)
text = """
Natural Language Processing is widely used in public information systems.
Governments use NLP to analyze feedback, monitor public opinion,
and improve communication with citizens.
"""

print(text)



Natural Language Processing is widely used in public information systems.
Governments use NLP to analyze feedback, monitor public opinion,
and improve communication with citizens.



In [5]:
## Text Normalization
text = text.lower()
text = text.translate(str.maketrans('', '', string.punctuation))
print(text)

## Sentence and Word Tokenization



natural language processing is widely used in public information systems
governments use nlp to analyze feedback monitor public opinion
and improve communication with citizens



In [7]:
import nltk
nltk.download('punkt_tab')

## Sentence and Word Tokenization
sentences = sent_tokenize(text)
words = word_tokenize(text)

print("Sentences:", sentences)
print("Words:", words)


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Sentences: ['\nnatural language processing is widely used in public information systems\ngovernments use nlp to analyze feedback monitor public opinion\nand improve communication with citizens']
Words: ['natural', 'language', 'processing', 'is', 'widely', 'used', 'in', 'public', 'information', 'systems', 'governments', 'use', 'nlp', 'to', 'analyze', 'feedback', 'monitor', 'public', 'opinion', 'and', 'improve', 'communication', 'with', 'citizens']


In [8]:
## Stopword Removal
stop_words = set(stopwords.words('english'))
filtered_words = [w for w in words if w not in stop_words and w.isalpha()]
print(filtered_words)


['natural', 'language', 'processing', 'widely', 'used', 'public', 'information', 'systems', 'governments', 'use', 'nlp', 'analyze', 'feedback', 'monitor', 'public', 'opinion', 'improve', 'communication', 'citizens']


In [9]:
## Stemming vs Lemmatization Comparison
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

comparison = [(w, stemmer.stem(w), lemmatizer.lemmatize(w)) for w in filtered_words]
comparison


[('natural', 'natur', 'natural'),
 ('language', 'languag', 'language'),
 ('processing', 'process', 'processing'),
 ('widely', 'wide', 'widely'),
 ('used', 'use', 'used'),
 ('public', 'public', 'public'),
 ('information', 'inform', 'information'),
 ('systems', 'system', 'system'),
 ('governments', 'govern', 'government'),
 ('use', 'use', 'use'),
 ('nlp', 'nlp', 'nlp'),
 ('analyze', 'analyz', 'analyze'),
 ('feedback', 'feedback', 'feedback'),
 ('monitor', 'monitor', 'monitor'),
 ('public', 'public', 'public'),
 ('opinion', 'opinion', 'opinion'),
 ('improve', 'improv', 'improve'),
 ('communication', 'commun', 'communication'),
 ('citizens', 'citizen', 'citizen')]

In [16]:
## Part-of-Speech (POS) Tag Distribution
nltk.download('averaged_perceptron_tagger_eng')
pos_tags = pos_tag(filtered_words)
pos_freq = FreqDist(tag for (word, tag) in pos_tags)

print(pos_tags)
print("\nPOS Frequency Distribution:")
pos_freq


[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


[('natural', 'JJ'), ('language', 'NN'), ('processing', 'NN'), ('widely', 'RB'), ('used', 'VBN'), ('public', 'JJ'), ('information', 'NN'), ('systems', 'NNS'), ('governments', 'NNS'), ('use', 'VBP'), ('nlp', 'JJ'), ('analyze', 'NN'), ('feedback', 'NN'), ('monitor', 'NN'), ('public', 'JJ'), ('opinion', 'NN'), ('improve', 'VB'), ('communication', 'NN'), ('citizens', 'NNS')]

POS Frequency Distribution:


FreqDist({'NN': 8, 'JJ': 4, 'NNS': 3, 'RB': 1, 'VBN': 1, 'VBP': 1, 'VB': 1})

In [12]:
## N-grams (Bigrams and Trigrams)
bigrams = list(ngrams(filtered_words, 2))
trigrams = list(ngrams(filtered_words, 3))

print("Bigrams:", bigrams)
print("Trigrams:", trigrams)


Bigrams: [('natural', 'language'), ('language', 'processing'), ('processing', 'widely'), ('widely', 'used'), ('used', 'public'), ('public', 'information'), ('information', 'systems'), ('systems', 'governments'), ('governments', 'use'), ('use', 'nlp'), ('nlp', 'analyze'), ('analyze', 'feedback'), ('feedback', 'monitor'), ('monitor', 'public'), ('public', 'opinion'), ('opinion', 'improve'), ('improve', 'communication'), ('communication', 'citizens')]
Trigrams: [('natural', 'language', 'processing'), ('language', 'processing', 'widely'), ('processing', 'widely', 'used'), ('widely', 'used', 'public'), ('used', 'public', 'information'), ('public', 'information', 'systems'), ('information', 'systems', 'governments'), ('systems', 'governments', 'use'), ('governments', 'use', 'nlp'), ('use', 'nlp', 'analyze'), ('nlp', 'analyze', 'feedback'), ('analyze', 'feedback', 'monitor'), ('feedback', 'monitor', 'public'), ('monitor', 'public', 'opinion'), ('public', 'opinion', 'improve'), ('opinion', 'im

In [11]:
## Collocation Extraction (Meaningful Word Pairs)
finder = BigramCollocationFinder.from_words(filtered_words)
collocations = finder.nbest(BigramAssocMeasures.likelihood_ratio, 5)

print("Top Collocations:")
collocations


Top Collocations:


[('analyze', 'feedback'),
 ('communication', 'citizens'),
 ('feedback', 'monitor'),
 ('governments', 'use'),
 ('improve', 'communication')]

In [13]:
## Text Statistics
print("Total sentences:", len(sentences))
print("Total words:", len(words))
print("Unique words:", len(set(filtered_words)))
print("Lexical diversity:", len(set(filtered_words)) / len(filtered_words))


Total sentences: 1
Total words: 24
Unique words: 18
Lexical diversity: 0.9473684210526315
