# NLTK Complete Guide - Section 17: Real-World Projects

This notebook covers practical NLP projects:
1. Text Summarization
2. Keyword Extraction
3. Spam Classifier
4. Question Answering
5. Chatbot Foundation
6. Document Similarity

In [None]:
import nltk
import re
import heapq
from collections import Counter, defaultdict

# Fetch the tokenizer, stop-word, WordNet and POS-tagger data used below.
# Recent NLTK releases (3.9+) look up 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' instead of the legacy 'punkt' /
# 'averaged_perceptron_tagger' resources, so both generations are requested.
# With the default arguments nltk.download() reports a failed or unknown id
# through its return value rather than raising, so the extra ids are harmless
# on versions that do not know them.
for _resource in (
    'punkt',
    'punkt_tab',
    'stopwords',
    'wordnet',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(_resource, quiet=True)

from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, ne_chunk

## Project 1: Extractive Text Summarization

In [None]:
class TextSummarizer:
    """Extractive text summarization using sentence scoring.

    Every sentence is scored by the average normalized frequency of its
    alphabetic tokens (stop words contribute 0), and the best-scoring
    sentences are returned in the order they appear in the text.
    """

    def __init__(self):
        self.stop_words = set(stopwords.words('english'))

    def preprocess(self, text):
        """Normalize whitespace in *text*.

        Every run of whitespace (spaces, tabs, newlines) is collapsed to a
        single space and leading/trailing whitespace is removed, so that
        hard-wrapped sentences become single-line strings.
        """
        return re.sub(r'\s+', ' ', text).strip()

    def score_sentences(self, text):
        """Score sentences based on word frequency.

        Args:
            text: Raw document text.

        Returns:
            dict mapping each sentence of the whitespace-normalized text to
            its score, in order of first appearance. Sentences without any
            alphabetic token are left out.
        """
        text = self.preprocess(text)
        sentences = sent_tokenize(text)

        # Frequencies of content words only (alphabetic, not a stop word).
        content_words = [
            w for w in word_tokenize(text.lower())
            if w.isalpha() and w not in self.stop_words
        ]
        counts = Counter(content_words)

        # Scale so the most frequent word has weight 1.0.
        max_freq = max(counts.values(), default=1)
        word_freq = {w: c / max_freq for w, c in counts.items()}

        sentence_scores = {}
        for sent in sentences:
            tokens = [w for w in word_tokenize(sent.lower()) if w.isalpha()]
            if not tokens:
                continue
            # Divide by the token count (stop words included) to avoid a
            # bias toward long sentences.
            total = sum(word_freq.get(w, 0) for w in tokens)
            sentence_scores[sent] = total / len(tokens)

        return sentence_scores

    def summarize(self, text, num_sentences=3):
        """Generate a summary from the top *num_sentences* sentences.

        Args:
            text: Raw document text.
            num_sentences: Maximum number of sentences to keep.

        Returns:
            The selected sentences joined by single spaces, in document
            order; an empty string when nothing can be selected.
        """
        scores = self.score_sentences(text)
        top_sentences = set(
            heapq.nlargest(num_sentences, scores, key=scores.get)
        )

        # `scores` preserves the order in which sentences were first seen,
        # and its keys are the normalized sentences. Iterating it restores
        # document order without re-tokenizing the raw text, whose
        # hard-wrapped sentences would not match the normalized keys.
        return ' '.join(s for s in scores if s in top_sentences)

    def summarize_ratio(self, text, ratio=0.3):
        """Summarize to a ratio of the original sentence count.

        At least one sentence is always requested.
        """
        sentences = sent_tokenize(self.preprocess(text))
        num_sentences = max(1, int(len(sentences) * ratio))
        return self.summarize(text, num_sentences)

In [None]:
# Demo: run TextSummarizer on a short sample article.
# The sample is hard-wrapped, so most of its sentences span several lines.
article = """
Artificial intelligence has transformed numerous industries in recent years. 
Machine learning algorithms can now process vast amounts of data to identify patterns 
that humans might miss. Natural language processing enables computers to understand 
and generate human language. Deep learning models have achieved remarkable results in 
image recognition, speech synthesis, and game playing. Companies are investing heavily 
in AI research and development. The technology has applications in healthcare, finance, 
transportation, and entertainment. However, AI also raises ethical concerns about privacy, 
job displacement, and algorithmic bias. Researchers are working to address these challenges 
while continuing to push the boundaries of what's possible. The future of AI promises 
even more exciting developments as computing power increases and algorithms improve.
"""

summarizer = TextSummarizer()

# Show the input together with its sentence count for comparison.
print("ORIGINAL TEXT:")
print("=" * 60)
print(article.strip())
print(f"\n({len(sent_tokenize(article))} sentences)")

# Ask for the three best-scoring sentences and print them.
print("\n" + "=" * 60)
print("SUMMARY (3 sentences):")
print("=" * 60)
summary = summarizer.summarize(article, num_sentences=3)
print(summary)

## Project 2: Keyword Extraction

In [None]:
class KeywordExtractor:
    """Frequency-based keyword extraction with POS filtering.

    Single-word keywords are lemmatized nouns/adjectives ranked by raw
    count; multi-word keywords are noun-phrase chunks ranked the same way.
    """

    def __init__(self):
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()

    def extract_candidates(self, text):
        """Return lemmatized noun/adjective tokens of the lowercased text.

        A token is kept when its POS tag starts with 'NN' or 'JJ', it is
        purely alphabetic, it is not a stop word and it is longer than two
        characters. Duplicates are kept so the caller can count them.
        """
        tagged_tokens = pos_tag(word_tokenize(text.lower()))
        return [
            self.lemmatizer.lemmatize(token)
            for token, pos in tagged_tokens
            if pos.startswith(('NN', 'JJ'))
            and token.isalpha()
            and token not in self.stop_words
            and len(token) > 2
        ]

    def extract_phrases(self, text):
        """Return lowercased multi-word noun phrases found in *text*.

        Each sentence is tagged and chunked with the grammar
        ``NP: {<JJ>*<NN.*>+}`` (optional adjectives followed by one or more
        nouns); chunks consisting of a single word are discarded.
        """
        from nltk.chunk import RegexpParser
        from nltk.tree import Tree

        chunker = RegexpParser(r"NP: {<JJ>*<NN.*>+}")
        found = []

        for sentence in sent_tokenize(text):
            parsed = chunker.parse(pos_tag(word_tokenize(sentence)))
            for node in parsed:
                # Top-level children are either plain (word, tag) tuples or
                # chunk subtrees; only 'NP' subtrees are of interest.
                if not isinstance(node, Tree) or node.label() != 'NP':
                    continue
                phrase = ' '.join(word.lower() for word, _tag in node.leaves())
                if len(phrase.split()) > 1:
                    found.append(phrase)

        return found

    def extract_keywords(self, text, top_n=10, include_phrases=True):
        """Return the most frequent keywords of *text*.

        Args:
            text: Document to analyze.
            top_n: Number of single-word keywords to return.
            include_phrases: Also return the ``top_n // 2`` most frequent
                noun phrases.

        Returns:
            dict with key 'single_words' and, when requested, 'phrases';
            each value is a list of ``(term, count)`` pairs.
        """
        single_counts = Counter(self.extract_candidates(text))
        result = {'single_words': single_counts.most_common(top_n)}

        if include_phrases:
            phrase_counts = Counter(self.extract_phrases(text))
            result['phrases'] = phrase_counts.most_common(top_n // 2)

        return result

In [None]:
# Demo: extract keywords from a short paragraph about machine learning.
extractor = KeywordExtractor()

text = """
Machine learning is a subset of artificial intelligence that enables computers 
to learn from data. Deep learning uses neural networks with multiple layers. 
Natural language processing helps machines understand human language. 
Computer vision allows systems to interpret visual information. 
Reinforcement learning trains agents through rewards and penalties.
Data science combines machine learning with statistical analysis.
Feature engineering is crucial for machine learning model performance.
"""

# top_n=10 yields up to 10 single words and top_n // 2 = 5 phrases.
keywords = extractor.extract_keywords(text, top_n=10)

print("KEYWORD EXTRACTION")
print("=" * 40)

# Each entry is a (term, count) pair.
print("\nTop Single Words:")
for word, freq in keywords['single_words']:
    print(f"  {word}: {freq}")

print("\nTop Phrases:")
for phrase, freq in keywords['phrases']:
    print(f"  {phrase}: {freq}")

## Project 3: Spam Classifier

In [None]:
from nltk.classify import NaiveBayesClassifier

class SpamClassifier:
    """Naive Bayes spam/ham detector built on hand-picked message features."""

    def __init__(self):
        # No model exists until train() has been called.
        self.classifier = None
        self.stop_words = set(stopwords.words('english'))

        # Vocabulary counted by the 'spam_word_count' feature.
        self.spam_words = {
            'free', 'winner', 'cash', 'prize', 'urgent', 'congratulations',
            'click', 'subscribe', 'offer', 'limited', 'act', 'now',
            'money', 'credit', 'loan', 'discount', 'deal', 'buy'
        }

    def extract_features(self, text):
        """Turn a message into the feature dict consumed by the classifier.

        Word features are looked up in the set of lowercased tokens; the
        capitalization, phone, e-mail and punctuation features inspect the
        message as given.
        """
        lowered = text.lower()
        tokens = set(word_tokenize(lowered))

        features = {}

        # Presence of individual trigger words.
        features['has_free'] = 'free' in tokens
        features['has_winner'] = 'winner' in tokens
        features['has_click'] = 'click' in tokens
        features['has_urgent'] = 'urgent' in tokens
        features['has_money'] = 'money' in tokens

        # Surface patterns: links, phone numbers, addresses, shouting.
        features['has_url'] = re.search(r'https?://', lowered) is not None
        features['has_phone'] = (
            re.search(r'\d{3}[-.\s]?\d{3}[-.\s]?\d{4}', text) is not None
        )
        features['has_email'] = re.search(r'@\w+\.\w+', text) is not None
        features['has_caps'] = re.search(r'[A-Z]{3,}', text) is not None
        features['has_exclaim'] = '!' in text
        # Despite its name this is a flag: more than two exclamation marks.
        features['exclaim_count'] = text.count('!') > 2
        features['has_dollar'] = '$' in text

        # Aggregate statistics over the unique tokens.
        features['spam_word_count'] = len(tokens.intersection(self.spam_words))
        features['short_message'] = len(tokens) < 20

        return features

    def train(self, training_data):
        """Fit the Naive Bayes model on ``[(text, label), ...]`` pairs."""
        featuresets = []
        for message, label in training_data:
            featuresets.append((self.extract_features(message), label))
        self.classifier = NaiveBayesClassifier.train(featuresets)

    def classify(self, text):
        """Return the predicted label for *text* (requires a trained model)."""
        message_features = self.extract_features(text)
        return self.classifier.classify(message_features)

    def probability(self, text):
        """Return ``{'spam': p, 'ham': q}`` for *text* (requires a trained model)."""
        message_features = self.extract_features(text)
        distribution = self.classifier.prob_classify(message_features)
        return {label: distribution.prob(label) for label in ('spam', 'ham')}

In [None]:
# Training data: 8 spam and 8 ham messages as (text, label) pairs.
training_data = [
    # Spam examples
    ("CONGRATULATIONS! You've won $1000! Click here NOW!", "spam"),
    ("FREE MONEY! Act now for your cash prize!", "spam"),
    ("Urgent: Your account needs verification. Click link.", "spam"),
    ("Winner! Claim your free gift card today!", "spam"),
    ("Limited offer! Buy now and save 90%!", "spam"),
    # NOTE: "555-1234" has only 7 digits, so the 3-3-4 digit 'has_phone'
    # pattern in SpamClassifier.extract_features does not fire here.
    ("You're selected for exclusive deal! Call 555-1234", "spam"),
    ("Get rich quick! Make $5000 daily from home!", "spam"),
    ("DISCOUNT!!! Subscribe now for FREE samples!!!", "spam"),
    
    # Ham examples
    ("Hey, are we still meeting for lunch tomorrow?", "ham"),
    ("The project deadline has been extended to Friday.", "ham"),
    ("Thanks for sending the report. I'll review it today.", "ham"),
    ("Can you pick up groceries on your way home?", "ham"),
    ("Meeting rescheduled to 3pm in conference room B.", "ham"),
    ("Happy birthday! Hope you have a great day.", "ham"),
    ("I attached the documents you requested.", "ham"),
    ("Let me know when you're free to discuss the proposal.", "ham"),
]

# Train classifier
spam_classifier = SpamClassifier()
spam_classifier.train(training_data)

print("Spam Classifier trained!")
# Print the 5 most informative features of the trained NLTK model.
print("\nMost informative features:")
spam_classifier.classifier.show_most_informative_features(5)

In [None]:
# Classify a few unseen messages with the trained spam_classifier.
test_messages = [
    "FREE iPhone! Click now to claim your prize!",
    "Don't forget about the team meeting at 2pm.",
    "URGENT! Your account will be suspended! Act NOW!",
    "Can you send me the updated spreadsheet?",
    "Winner selected! Claim $500 gift card here!",
]

print("SPAM DETECTION RESULTS")
print("=" * 60)

for msg in test_messages:
    result = spam_classifier.classify(msg)
    probs = spam_classifier.probability(msg)

    # The original literals were mojibake (UTF-8 bytes decoded as cp1252);
    # these are the intended "no entry" and "check mark" emoji.
    emoji = "🚫" if result == "spam" else "✅"

    print(f"\n{emoji} {result.upper()} ({probs['spam']:.1%} spam)")
    # Show at most 50 characters of the message, marking truncation with "...".
    print(f"   \"{msg[:50]}...\"" if len(msg) > 50 else f"   \"{msg}\"")

## Project 4: Simple Question Answering

In [None]:
class SimpleQA:
    """Simple extractive question answering.

    The answer to a question is the context sentence whose content words
    overlap most with the question (Jaccard similarity).
    """

    def __init__(self):
        self.stop_words = set(stopwords.words('english'))

    def preprocess(self, text):
        """Return the lowercased alphabetic, non-stop-word tokens of *text*."""
        words = word_tokenize(text.lower())
        return [w for w in words if w.isalpha() and w not in self.stop_words]

    def similarity(self, sent1, sent2):
        """Return the Jaccard similarity of the two texts' content words.

        Returns 0 when either text has no content words.
        """
        words1 = set(self.preprocess(sent1))
        words2 = set(self.preprocess(sent2))

        if not words1 or not words2:
            return 0

        return len(words1 & words2) / len(words1 | words2)

    def find_answer(self, context, question, top_n=1):
        """Find the best matching sentence(s) for the question.

        Args:
            context: Text to search for an answer.
            question: The question to answer.
            top_n: Number of candidate sentences to consider.

        Returns:
            For ``top_n == 1``: the best sentence, or "No answer found."
            when no sentence shares a content word with the question.
            Otherwise: a list of up to ``top_n`` sentences with a positive
            score, best first (possibly empty).
        """
        no_answer = "No answer found."

        # Empty or whitespace-only context has no sentences; without this
        # guard the top_n == 1 branch would index into an empty list.
        if not context or not context.strip():
            return no_answer if top_n == 1 else []

        scored = [
            (sent, self.similarity(sent, question))
            for sent in sent_tokenize(context)
        ]
        # Stable sort: sentences with equal scores keep document order.
        scored.sort(key=lambda pair: pair[1], reverse=True)

        if top_n == 1:
            if scored and scored[0][1] > 0:
                return scored[0][0]
            return no_answer

        return [sent for sent, score in scored[:top_n] if score > 0]

In [None]:
# Demo: answer a few questions from a short passage about Python.
qa = SimpleQA()

context = """
Python was created by Guido van Rossum and first released in 1991.
It is a high-level, interpreted programming language known for its simplicity.
Python emphasizes code readability and uses significant indentation.
The language supports multiple programming paradigms, including procedural,
object-oriented, and functional programming. Python has a large standard library
and is widely used in web development, data science, and artificial intelligence.
The name Python was inspired by Monty Python's Flying Circus.
"""

questions = [
    "Who created Python?",
    "When was Python first released?",
    "What is Python used for?",
    "Where does the name Python come from?",
]

print("QUESTION ANSWERING")
print("=" * 60)
# Only the first 100 characters of the context are echoed.
print(f"Context: {context.strip()[:100]}...")
print("=" * 60)

# With the default top_n=1, find_answer returns a single string.
for q in questions:
    answer = qa.find_answer(context, q)
    print(f"\nQ: {q}")
    print(f"A: {answer}")

## Project 5: Simple Chatbot

In [None]:
import random

class SimpleChatbot:
    """Pattern-matching chatbot.

    User input is lowercased and matched against a table of regular
    expressions; the first matching pattern supplies a random canned
    response, otherwise a generic fallback is used.
    """

    # Whole-word farewell detector used by chat() to end the session.
    _EXIT_PATTERN = r'\b(?:bye|goodbye|quit|exit)\b'

    def __init__(self, name="Bot"):
        self.name = name
        self.context = {}

        # Response patterns, checked in insertion order. Word boundaries
        # (\b) keep short words from matching inside longer ones, e.g.
        # "hi" inside "this" or "something".
        self.patterns = {
            r'\b(?:hello|hi|hey)\b': [
                "Hello! How can I help you today?",
                "Hi there! What can I do for you?",
                "Hey! Nice to meet you!"
            ],
            r'\bhow are you\b': [
                "I'm doing great, thanks for asking!",
                "I'm fine! How about you?",
                "All good here!"
            ],
            r"\b(?:what(?:'s| is) your name|who are you)\b": [
                f"I'm {name}, your friendly chatbot!",
                f"My name is {name}. Nice to meet you!"
            ],
            r'\b(?:bye|goodbye|quit|exit)\b': [
                "Goodbye! Have a great day!",
                "See you later!",
                "Bye! Take care!"
            ],
            # Prefix match so that "thanks" and "thankful" also count.
            r'\bthank': [
                "You're welcome!",
                "Happy to help!",
                "No problem!"
            ],
            r'\bweather\b': [
                "I can't check the weather, but I hope it's nice!",
                "Sorry, I don't have access to weather data."
            ],
            # Prefix match so that "helps" and "helping" also count.
            r'\bhelp': [
                "I can chat with you! Try saying hello or asking me questions.",
                "I'm here to help! Just start a conversation."
            ],
        }

        self.default_responses = [
            "I'm not sure I understand. Could you rephrase that?",
            "Interesting! Tell me more.",
            "I see. What else would you like to talk about?",
            "Could you explain that differently?"
        ]

    def preprocess(self, text):
        """Lowercase *text* and strip surrounding whitespace."""
        return text.lower().strip()

    def match_pattern(self, text):
        """Return a random response for the first matching pattern.

        Args:
            text: Preprocessed (lowercased) user input.

        Returns:
            A response string, or None when no pattern matches.
        """
        for pattern, responses in self.patterns.items():
            if re.search(pattern, text):
                return random.choice(responses)
        return None

    def respond(self, user_input):
        """Generate a response to raw user input.

        Falls back to a random default response when no pattern matches.
        """
        response = self.match_pattern(self.preprocess(user_input))
        if response:
            return response
        return random.choice(self.default_responses)

    def chat(self):
        """Run an interactive chat loop on stdin/stdout.

        The loop ends when the user enters a farewell word (bye, goodbye,
        quit, exit), on end-of-input, or on Ctrl-C.
        """
        print(f"{self.name}: Hello! I'm {self.name}. Type 'quit' to exit.")
        print("-" * 50)

        while True:
            try:
                user_input = input("You: ").strip()
            except (EOFError, KeyboardInterrupt):
                # Closed stdin or an interrupt ends the session cleanly
                # instead of surfacing a traceback.
                print(f"\n{self.name}: Goodbye!")
                break

            if not user_input:
                continue

            if re.search(self._EXIT_PATTERN, user_input.lower()):
                print(f"{self.name}: Goodbye!")
                break

            print(f"{self.name}: {self.respond(user_input)}")
In [None]:
# Demo the chatbot (non-interactive): feed scripted inputs to respond()
# instead of running the blocking chat() loop.
bot = SimpleChatbot("NLTK-Bot")

demo_inputs = [
    "Hello!",
    "What is your name?",
    "How are you doing?",
    "What's the weather like?",
    "Tell me a joke",
    "Thank you!",
    "Goodbye!"
]

print("CHATBOT DEMO")
print("=" * 50)

# Responses are picked with random.choice, so the output varies per run.
for user_input in demo_inputs:
    response = bot.respond(user_input)
    print(f"You: {user_input}")
    print(f"Bot: {response}")
    print()

## Project 6: Document Similarity

In [None]:
import math

class DocumentSimilarity:
    """Calculate document similarity using TF-IDF and cosine similarity.

    TF is the term count divided by the count of the document's most
    frequent term; IDF is ``log(N / df)`` over the fitted corpus, so a term
    present in every document has weight 0.
    """

    def __init__(self):
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()
        self.documents = []      # token lists of the fitted corpus
        self.vocabulary = set()  # distinct terms of the fitted corpus
        self.idf = {}            # term -> inverse document frequency

    def preprocess(self, text):
        """Return lemmatized, lowercased alphabetic non-stop-word tokens."""
        tokens = word_tokenize(text.lower())
        return [
            self.lemmatizer.lemmatize(w)
            for w in tokens
            if w.isalpha() and w not in self.stop_words
        ]

    def fit(self, documents):
        """Fit vocabulary and IDF weights on a collection of raw documents.

        Any state from a previous fit is discarded, so terms of an earlier
        corpus cannot leak into vectors built for the new one.
        """
        self.documents = [self.preprocess(doc) for doc in documents]
        doc_count = len(self.documents)

        # Document frequency: number of documents containing each term.
        doc_freq = Counter(
            word for doc in self.documents for word in set(doc)
        )

        self.vocabulary = set(doc_freq)
        self.idf = {
            word: math.log(doc_count / count)
            for word, count in doc_freq.items()
        }

    def tf_idf(self, document):
        """Calculate the TF-IDF vector for a document.

        Args:
            document: Raw text, or an already preprocessed token list.

        Returns:
            dict mapping every vocabulary term to its TF-IDF weight.
        """
        tokens = self.preprocess(document) if isinstance(document, str) else document

        counts = Counter(tokens)
        max_count = max(counts.values(), default=1)

        return {
            word: (counts.get(word, 0) / max_count) * self.idf.get(word, 0)
            for word in self.vocabulary
        }

    def cosine_similarity(self, vec1, vec2):
        """Calculate cosine similarity between two term -> weight dicts.

        Returns 0.0 when either vector has zero magnitude.
        """
        mag1 = math.sqrt(sum(v ** 2 for v in vec1.values()))
        mag2 = math.sqrt(sum(v ** 2 for v in vec2.values()))

        if mag1 == 0 or mag2 == 0:
            return 0.0

        # Iterate over the vector's own keys rather than self.vocabulary so
        # the dot product covers the same terms as the magnitudes.
        dot_product = sum(w * vec2.get(term, 0) for term, w in vec1.items())
        return dot_product / (mag1 * mag2)

    def similarity_matrix(self, documents):
        """Fit on *documents* and return their pairwise similarity matrix.

        Returns:
            An n x n list of lists where entry [i][j] is the cosine
            similarity of documents i and j.
        """
        self.fit(documents)
        vectors = [self.tf_idf(doc) for doc in self.documents]
        return [
            [self.cosine_similarity(row_vec, col_vec) for col_vec in vectors]
            for row_vec in vectors
        ]

    def find_similar(self, query, documents, top_n=3):
        """Find the documents most similar to a query.

        Args:
            query: Raw query text.
            documents: Raw documents to fit on and search.
            top_n: Number of results to return.

        Returns:
            Up to *top_n* ``(document_index, similarity)`` pairs, most
            similar first.
        """
        self.fit(documents)
        query_vector = self.tf_idf(query)

        similarities = [
            (index, self.cosine_similarity(query_vector, self.tf_idf(doc)))
            for index, doc in enumerate(self.documents)
        ]
        similarities.sort(key=lambda pair: pair[1], reverse=True)
        return similarities[:top_n]

In [None]:
# Demo: pairwise TF-IDF cosine similarity of five short documents.
documents = [
    "Machine learning is a subset of artificial intelligence.",
    "Deep learning uses neural networks with multiple layers.",
    "Natural language processing helps computers understand text.",
    "Python is widely used for machine learning and AI.",
    "Data science combines statistics with programming.",
]

sim_calc = DocumentSimilarity()
matrix = sim_calc.similarity_matrix(documents)

print("DOCUMENT SIMILARITY MATRIX")
print("=" * 50)

# Header row: four spaces to clear the "DocN" row label, then one
# right-aligned 7-character column per document, the same width as the
# values printed below so the columns line up.
print("\n    ", end="")
for i in range(len(documents)):
    print(f"Doc{i+1}".rjust(7), end="")
print()

for i, row in enumerate(matrix):
    print(f"Doc{i+1}", end="")
    for val in row:
        print(f"{val:>7.2f}", end="")
    print()

print("\n" + "=" * 50)
print("Documents:")
for i, doc in enumerate(documents):
    # Append the ellipsis only when the text was actually cut off.
    preview = doc if len(doc) <= 50 else doc[:50] + "..."
    print(f"Doc{i+1}: {preview}")

In [None]:
# Find similar documents to a query.
# Reuses `sim_calc` and `documents` from the previous cell; find_similar
# fits on `documents` again before scoring the query.
query = "How does artificial intelligence work?"

similar = sim_calc.find_similar(query, documents, top_n=3)

print(f"\nQuery: '{query}'")
print("\nMost similar documents:")
# Each result is a (document_index, similarity) pair, best match first.
for idx, score in similar:
    print(f"  {score:.2f}: {documents[idx]}")

## Summary

### Projects Covered

| Project | Key Techniques |
|---------|----------------|
| Text Summarization | Sentence scoring, word frequency |
| Keyword Extraction | Word frequency, POS filtering, noun phrases |
| Spam Classifier | Naive Bayes, feature extraction |
| Question Answering | Sentence similarity, extractive QA |
| Chatbot | Pattern matching, regex |
| Document Similarity | TF-IDF, cosine similarity |

### Key Takeaways
- Combine multiple NLTK tools for real applications
- Preprocessing is crucial for all NLP tasks
- Feature engineering impacts model performance
- Start simple, then add complexity as needed