In [1]:
# Read and Load the Text File
# A context manager guarantees the handle is closed even if read() raises.
# The explicit encoding makes decoding independent of the platform default
# (assumes sentences.txt is UTF-8 / plain ASCII -- adjust if it is not).
with open("sentences.txt", encoding="utf-8") as text_file:
    text = text_file.read()

print("Data type of text:", type(text))
print("Text content:\n", text)
print("\nLength of the text:", len(text))

# Import Required Libraries and Download NLTK Resources
import nltk
from nltk import sent_tokenize, word_tokenize, pos_tag
from nltk.probability import FreqDist
from nltk.corpus import stopwords
import string

# Fetch the NLTK data packages used by the tokenizers, the stopword list and
# the POS tagger below.
# NLTK 3.9+ loads the pickle-free 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' packages instead of the legacy ones, so
# both generations are requested to keep the script working across versions.
for resource in (
    'punkt',
    'punkt_tab',
    'stopwords',
    'averaged_perceptron_tagger',  # Needed for POS tagging
    'averaged_perceptron_tagger_eng',  # Needed for POS tagging on NLTK 3.9+
):
    nltk.download(resource)

# Sentence and Word Tokenization
# Split the raw text into sentences first and report how many were found.
sentences = sent_tokenize(text)
print(f"\nNumber of sentences: {len(sentences)}")

# Then split the same text into word-level tokens (punctuation included).
words = word_tokenize(text)
print(f"Number of words: {len(words)}")
print(f"Tokenized words: {words}")

# Frequency Distribution of All Words (including punctuation)
# Every token is counted exactly as produced by the tokenizer.
fdist_all = FreqDist(words)
print(
    "\nTop 10 most common tokens (including punctuation):",
    fdist_all.most_common(10),
    sep="\n",
)

# Remove Punctuation
# string.punctuation is a plain str of ASCII punctuation characters, so
# `w in string.punctuation` is a *substring* test: it only matches tokens that
# happen to appear contiguously in that string and misses multi-character
# punctuation tokens the tokenizer can emit (e.g. '--', '...', '``', "''").
# Checking every character against a set drops any token made up entirely of
# punctuation, while tokens that merely contain punctuation ('Inc.', "n't")
# are kept.
punctuation_chars = set(string.punctuation)
words_no_punc = [
    w for w in words
    if not all(ch in punctuation_chars for ch in w)
]
print("\nWords after removing punctuation:", words_no_punc)

# Remove Stopwords
stopwords_list = stopwords.words('english')
print("\nStopwords list (sample):", stopwords_list[:10])  # Show only first 10

# Build the lookup set once: set membership is O(1), whereas scanning the
# list for every token is O(len(stopwords_list)).
stopword_set = set(stopwords_list)
# Each word is lowercased only for the comparison, so the stopword check is
# case-insensitive while the surviving words keep their original casing.
clean_words = [w for w in words_no_punc if w.lower() not in stopword_set]
print("\nCleaned words (no stopwords):", clean_words)
print("Number of cleaned words:", len(clean_words))

# Frequency Distribution of Cleaned Words
# Count only the tokens that survived punctuation and stopword removal,
# then show the ten most frequent ones.
fdist_clean = FreqDist(clean_words)
print(
    "\nTop 10 most common cleaned words:",
    fdist_clean.most_common(10),
    sep="\n",
)

# POS Tagging
# The perceptron tagger uses neighbouring tokens as context, so tagging the
# already-cleaned list (stopwords and punctuation stripped) feeds it broken
# sentences. Tag the full token stream instead and keep only the tags that
# belong to the cleaned words.
tagged_stream = iter(pos_tag(words))
pos_tags = []
for kept in clean_words:
    # clean_words is an in-order, per-token filter of words, so scanning
    # forward to the next equal token lands on its original position.
    for token, tag in tagged_stream:
        if token == kept:
            pos_tags.append((token, tag))
            break

if len(pos_tags) != len(clean_words):
    # Alignment failed (clean_words is no longer a plain filter of words):
    # fall back to tagging the cleaned list directly.
    pos_tags = pos_tag(clean_words)

print("\n🔹 Part of Speech (POS) Tagging:")
for word, tag in pos_tags:
    print(f"{word}: {tag}")

Data type of text: <class 'str'>
Text content:
 QVC Network Inc. said it completed its acquisition of CVN Cos. for about $ 423 million .
The spirits , of course , could hardly care less whether people do or do n't believe in them .
The debt ceiling is scheduled to fall to $ 2.8 trillion from $ 2.87 trillion at midnight tonight .


Length of the text: 283


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\admine\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\admine\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\admine\AppData\Roaming\nltk_data...



Number of sentences: 3
Number of words: 56
Tokenized words: ['QVC', 'Network', 'Inc.', 'said', 'it', 'completed', 'its', 'acquisition', 'of', 'CVN', 'Cos.', 'for', 'about', '$', '423', 'million', '.', 'The', 'spirits', ',', 'of', 'course', ',', 'could', 'hardly', 'care', 'less', 'whether', 'people', 'do', 'or', 'do', "n't", 'believe', 'in', 'them', '.', 'The', 'debt', 'ceiling', 'is', 'scheduled', 'to', 'fall', 'to', '$', '2.8', 'trillion', 'from', '$', '2.87', 'trillion', 'at', 'midnight', 'tonight', '.']

Top 10 most common tokens (including punctuation):
[('$', 3), ('.', 3), ('of', 2), ('The', 2), (',', 2), ('do', 2), ('to', 2), ('trillion', 2), ('QVC', 1), ('Network', 1)]

Words after removing punctuation: ['QVC', 'Network', 'Inc.', 'said', 'it', 'completed', 'its', 'acquisition', 'of', 'CVN', 'Cos.', 'for', 'about', '423', 'million', 'The', 'spirits', 'of', 'course', 'could', 'hardly', 'care', 'less', 'whether', 'people', 'do', 'or', 'do', "n't", 'believe', 'in', 'them', 'The', '

[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.
