In [1]:
import nltk
from nltk.tokenize import (
    word_tokenize,
    sent_tokenize,
    WhitespaceTokenizer,
    WordPunctTokenizer
)
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

In [2]:
# 1. Resource Setup
def setup_resources(resources=None):
    """Download the NLTK data packages this notebook relies on.

    Args:
        resources: Optional iterable of NLTK package identifiers to fetch.
            When omitted, the tokenizer, POS-tagger, WordNet and stopword
            packages used by this notebook are requested.

    Returns:
        None. Packages that could not be fetched are reported with a
        ``RuntimeWarning`` instead of an exception, so a failed download
        does not stop the notebook by itself.
    """
    import warnings

    if resources is None:
        resources = ['punkt', 'punkt_tab', 'averaged_perceptron_tagger_eng', 'wordnet', 'stopwords']

    # nltk.download() reports success as a boolean; keep the identifiers it
    # could not fetch instead of discarding that result.
    failed = [res for res in resources if not nltk.download(res, quiet=True)]
    if failed:
        warnings.warn(
            f"Could not download NLTK resources: {failed}",
            RuntimeWarning,
            stacklevel=2,
        )

# Download the NLTK data up front; NLPAnalyzer reads the stopword list when it is constructed.
setup_resources()

In [3]:
class NLPAnalyzer:
    """Small demonstration pipeline built on NLTK.

    Attributes set in ``__init__``:
        lemmatizer: a ``WordNetLemmatizer`` instance (not used by the
            methods defined in this class).
        stop_words: set of English stopwords used to filter tokens.
    """

    def __init__(self):
        """Create the lemmatizer and load the English stopword list."""
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))

    def clean_text(self, text):
        """Reduce text to lowercase ASCII letters separated by single spaces.

        Every character that is neither an ASCII letter nor whitespace is
        deleted (so ``"easy-to-use"`` becomes ``"easytouse"``), whitespace
        runs are collapsed to one space, and the ends are trimmed.

        Args:
            text: The string to clean.

        Returns:
            The cleaned, lowercased string; ``""`` when nothing is left.
        """
        letters_only = re.sub(r'[^a-zA-Z\s]', '', text)
        # Deleting punctuation can leave gaps such as "hello  world";
        # split/join collapses every whitespace run and trims both ends.
        return ' '.join(letters_only.split()).lower()

    def perform_analysis(self, raw_text):
        """Print a five-step analysis of ``raw_text``.

        The steps are sentence tokenization, word tokenization, stopword
        filtering, POS tagging and a word-frequency count. Results are only
        printed; nothing is returned.

        Args:
            raw_text: The text to analyse.
        """
        print(f"Original Text: {raw_text}\n")

        # A. Sentence Tokenization
        sentences = sent_tokenize(raw_text)
        print(f"1. Sentences Detected ({len(sentences)}): {sentences}")

        # B. Word Tokenization (WordPunct splits punctuation into its own tokens)
        tokens = WordPunctTokenizer().tokenize(raw_text)
        print(f"2. Word Tokens: {tokens[:10]}...")

        # C. Cleaning & Stopword Removal
        # Lowercase each token once, then keep alphanumeric non-stopwords.
        lowered = (token.lower() for token in tokens)
        cleaned = [word for word in lowered if word.isalnum() and word not in self.stop_words]
        print(f"3. Cleaned Tokens (No Stopwords): {cleaned[:10]}...")

        # D. POS Tagging
        # Runs on the original tokens (case and punctuation intact), not on
        # the cleaned list. Tags look like NNP, VBZ, JJ, etc.
        pos_tags = nltk.pos_tag(tokens)
        print("\n4. Part-of-Speech (POS) Tagging (First 5):")
        for word, tag in pos_tags[:5]:
            print(f"   {word:<12} -> {tag}")

        # E. Basic Frequency Distribution over the cleaned tokens
        freq_dist = nltk.FreqDist(cleaned)
        print(f"\n5. Most Common Words: {freq_dist.most_common(3)}")

In [4]:
if __name__ == "__main__":
    # Two-sentence sample, so the sentence tokenizer has a boundary to find.
    sample_data = (
        "NLTK is a leading platform for building Python programs. "
        "It provides easy-to-use interfaces to over 50 corpora and lexical resources."
    )

    # Build the analyzer and print each analysis stage for the sample.
    analyzer = NLPAnalyzer()
    analyzer.perform_analysis(sample_data)

Original Text: NLTK is a leading platform for building Python programs. It provides easy-to-use interfaces to over 50 corpora and lexical resources.

1. Sentences Detected (2): ['NLTK is a leading platform for building Python programs.', 'It provides easy-to-use interfaces to over 50 corpora and lexical resources.']
2. Word Tokens: ['NLTK', 'is', 'a', 'leading', 'platform', 'for', 'building', 'Python', 'programs', '.']...
3. Cleaned Tokens (No Stopwords): ['nltk', 'leading', 'platform', 'building', 'python', 'programs', 'provides', 'easy', 'use', 'interfaces']...

4. Part-of-Speech (POS) Tagging (First 5):
   NLTK         -> NNP
   is           -> VBZ
   a            -> DT
   leading      -> VBG
   platform     -> NN

5. Most Common Words: [('nltk', 1), ('leading', 1), ('platform', 1)]
