# Assignment 1: NLTK Tokenization, Stemming, and Lemmatization

This notebook demonstrates various tokenization techniques, stemming methods, and lemmatization using the NLTK library.

## 1. Import Required Libraries

In [6]:
import nltk
from nltk.tokenize import (WhitespaceTokenizer, WordPunctTokenizer, 
                           TreebankWordTokenizer, TweetTokenizer, MWETokenizer)
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, SnowballStemmer, WordNetLemmatizer
from nltk.tag import pos_tag
from nltk.corpus import wordnet
import numpy as np
import pandas as pd

# Download the NLTK data packages used in this notebook.
# Newer NLTK releases load 'punkt_tab' (for word_tokenize) and
# 'averaged_perceptron_tagger_eng' (for pos_tag) instead of the legacy
# 'punkt' / 'averaged_perceptron_tagger' packages, so both generations are
# requested to keep Restart & Run All working across NLTK versions.
NLTK_RESOURCES = (
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'wordnet',
    'wordnet_ic',
    'omw-1.4',
)

for resource_name in NLTK_RESOURCES:
    # quiet=True suppresses the downloader's progress output.
    nltk.download(resource_name, quiet=True)

print("All required libraries imported successfully!")

All required libraries imported successfully!


## 2. Sample Text Data

In [7]:
# Demo inputs for the tokenizer sections.
# The paragraph is spelled out line by line so that its line breaks, trailing
# spaces and continuation-line indentation are explicit rather than hidden
# inside a triple-quoted literal.
sample_text = (
    "Natural Language Processing (NLP) is fascinating! \n"
    "                It's used in machine learning, deep learning, and AI. \n"
    "                Dr. Smith is working on NLP applications."
)

# Tweet-style input: contains a mention, hashtags, an emoticon and a URL.
sample_tweet = (
    "I'm loving this! @NLPenthusiasts #Python #MachineLearning :) "
    "Check out https://example.com"
)

# Echo both inputs, one item per line.
print("Sample Text:", sample_text, "\nSample Tweet:", sample_tweet, sep="\n")

Sample Text:
Natural Language Processing (NLP) is fascinating! 
                It's used in machine learning, deep learning, and AI. 
                Dr. Smith is working on NLP applications.

Sample Tweet:
I'm loving this! @NLPenthusiasts #Python #MachineLearning :) Check out https://example.com


## 3. Whitespace Tokenization

Splits text on whitespace characters only.

In [8]:
# Tokenize on whitespace only; the recorded output shows punctuation staying
# attached to the neighbouring word (e.g. 'fascinating!').
ws_tokenizer = WhitespaceTokenizer()
ws_tokens = ws_tokenizer.tokenize(sample_text)

# One print call, one report line per argument.
print(
    "Whitespace Tokenization:",
    "Number of tokens: " + str(len(ws_tokens)),
    "Tokens: " + str(ws_tokens),
    "\nFirst 10 tokens: " + str(ws_tokens[:10]),
    sep="\n",
)

Whitespace Tokenization:
Number of tokens: 22
Tokens: ['Natural', 'Language', 'Processing', '(NLP)', 'is', 'fascinating!', "It's", 'used', 'in', 'machine', 'learning,', 'deep', 'learning,', 'and', 'AI.', 'Dr.', 'Smith', 'is', 'working', 'on', 'NLP', 'applications.']

First 10 tokens: ['Natural', 'Language', 'Processing', '(NLP)', 'is', 'fascinating!', "It's", 'used', 'in', 'machine']


## 4. Punctuation-based Tokenization

Separates words from punctuation marks.

In [9]:
# WordPunctTokenizer separates alphanumeric runs from punctuation runs, so
# "It's" comes out as 'It', "'", 's' in the recorded output.
wp_tokenizer = WordPunctTokenizer()
wp_tokens = wp_tokenizer.tokenize(sample_text)

# One print call, one report line per argument.
print(
    "Punctuation-based Tokenization (WordPunctTokenizer):",
    "Number of tokens: " + str(len(wp_tokens)),
    "Tokens: " + str(wp_tokens),
    "\nFirst 15 tokens: " + str(wp_tokens[:15]),
    sep="\n",
)

Punctuation-based Tokenization (WordPunctTokenizer):
Number of tokens: 32
Tokens: ['Natural', 'Language', 'Processing', '(', 'NLP', ')', 'is', 'fascinating', '!', 'It', "'", 's', 'used', 'in', 'machine', 'learning', ',', 'deep', 'learning', ',', 'and', 'AI', '.', 'Dr', '.', 'Smith', 'is', 'working', 'on', 'NLP', 'applications', '.']

First 15 tokens: ['Natural', 'Language', 'Processing', '(', 'NLP', ')', 'is', 'fascinating', '!', 'It', "'", 's', 'used', 'in', 'machine']


## 5. Treebank Tokenization

Follows Penn Treebank tokenization rules, handling contractions specially.

In [10]:
# Penn Treebank-style tokenization of the sample paragraph.
# NOTE(review): the recorded output keeps sentence-internal periods attached
# ('AI.', 'day.') and only splits the final one -- presumably because this
# tokenizer expects one sentence at a time; confirm against the NLTK docs.
treebank_tokenizer = TreebankWordTokenizer()
treebank_tokens = treebank_tokenizer.tokenize(sample_text)

print(
    "Treebank Tokenization:",
    "Number of tokens: " + str(len(treebank_tokens)),
    "Tokens: " + str(treebank_tokens),
    "\nNotice how contractions are handled: It's -> It + 's",
    sep="\n",
)

# Second example with two contractions ("It's", "I've").
comparison_text = "It's a beautiful day. I've never seen such beauty."
print(
    "\nComparison text: " + comparison_text,
    "Treebank tokens: " + str(treebank_tokenizer.tokenize(comparison_text)),
    sep="\n",
)

Treebank Tokenization:
Number of tokens: 29
Tokens: ['Natural', 'Language', 'Processing', '(', 'NLP', ')', 'is', 'fascinating', '!', 'It', "'s", 'used', 'in', 'machine', 'learning', ',', 'deep', 'learning', ',', 'and', 'AI.', 'Dr.', 'Smith', 'is', 'working', 'on', 'NLP', 'applications', '.']

Notice how contractions are handled: It's -> It + 's

Comparison text: It's a beautiful day. I've never seen such beauty.
Treebank tokens: ['It', "'s", 'a', 'beautiful', 'day.', 'I', "'ve", 'never', 'seen', 'such', 'beauty', '.']


## 6. Tweet Tokenization

Handles social media text with emoticons, hashtags, mentions, and URLs.

In [11]:
# Tokenize the tweet-style sample with NLTK's social-media-aware tokenizer.
tweet_tokenizer = TweetTokenizer()
tweet_tokens = tweet_tokenizer.tokenize(sample_tweet)

# One print call, one report line per argument.
print(
    "Tweet Tokenization:",
    "Number of tokens: " + str(len(tweet_tokens)),
    "Tokens: " + str(tweet_tokens),
    "\nNotice: Mentions (@), hashtags (#), URLs, and emoticons (:)) are preserved as separate tokens",
    sep="\n",
)

Tweet Tokenization:
Number of tokens: 11
Tokens: ["I'm", 'loving', 'this', '!', '@NLPenthusiasts', '#Python', '#MachineLearning', ':)', 'Check', 'out', 'https://example.com']

Notice: Mentions (@), hashtags (#), URLs, and emoticons (:)) are preserved as separate tokens


## 7. Multi-Word Expression (MWE) Tokenization

Treats multi-word expressions as single tokens.

In [None]:
# MWE Tokenizer - merges known multi-word expressions found in an
# already-tokenized list into single tokens (joined with "_" by default).
mwe_text = "New York is a major city. Machine learning is fascinating."

# Expressions are given as token tuples and matched exactly (case-sensitive).
mwe_tokenizer = MWETokenizer([('New', 'York'), ('Machine', 'learning')])

# Step 1: ordinary word tokenization. These are the exact tokens handed to
# the MWE tokenizer, so they are what the "before" printout shows.
# (word_tokenize comes from the notebook's top import cell.)
basic_tokens_nltk = word_tokenize(mwe_text)
print("Basic tokenization (before MWE):")
print(basic_tokens_nltk)

# Step 2: merge the registered expressions.
mwe_tokens = mwe_tokenizer.tokenize(basic_tokens_nltk)

print("\nMWE Tokenization (after MWE):")
print(f"Tokens: {mwe_tokens}")
print("Notice how 'New York' and 'Machine learning' are treated as single tokens using underscore")

## 8. Porter Stemmer

Reduces words to their stem by stripping suffixes with the Porter stemming algorithm. The resulting stem is not always a dictionary word.

In [None]:
porter_stemmer = PorterStemmer()

# Sample words to stem
words_to_stem = ['running', 'runs', 'ran', 'runner', 'easily', 'fairly',
                  'working', 'worked', 'works', 'organization', 'organize',
                  'organized', 'organizes', 'programming', 'programs']

print("Porter Stemmer Results:")
print("-" * 50)
print(f"{'Original Word':<20} | {'Stemmed Word':<20}")
print("-" * 50)

# (word, stem) pairs, in the same order as words_to_stem.
stemmed_porter = [(word, porter_stemmer.stem(word)) for word in words_to_stem]
for word, stem in stemmed_porter:
    print(f"{word:<20} | {stem:<20}")

# Derive the observations from the computed stems instead of hard-coding
# them, so the printed claims always match the table above.
words_by_stem = {}
for word, stem in stemmed_porter:
    words_by_stem.setdefault(stem, []).append(word)

print("\nObservations:")
for stem, group in words_by_stem.items():
    if len(group) > 1:
        quoted_group = ", ".join(f"'{member}'" for member in group)
        print(f"- {quoted_group} all stem to '{stem}'")

# Words the stemmer returned unchanged.
unchanged_words = [word for word, stem in stemmed_porter if word == stem]
if unchanged_words:
    quoted_unchanged = ", ".join(f"'{word}'" for word in unchanged_words)
    print(f"- Left unchanged by the stemmer: {quoted_unchanged}")

## 9. Snowball Stemmer

The Snowball stemmer is a family of stemming algorithms with support for multiple languages, including English, French, Spanish, and German.

In [None]:
# Snowball Stemmer (English), compared side by side with the Porter stemmer.
# Relies on `words_to_stem` and `porter_stemmer` from the Porter section.
snowball_stemmer = SnowballStemmer('english')

print("Snowball Stemmer Results (English):")
print("-" * 60)
print(f"{'Original Word':<20} | {'Porter':<15} | {'Snowball':<15}")
print("-" * 60)

for word in words_to_stem:
    porter_stem = porter_stemmer.stem(word)
    snowball_stem = snowball_stemmer.stem(word)
    print(f"{word:<20} | {porter_stem:<15} | {snowball_stem:<15}")

# Demonstrate Snowball with different languages
print("\n\nSnowball Stemmer - Multilingual Support:")
print("-" * 50)

# Test words per language. The French entry was previously mojibake
# ('exÃ©cution'): the UTF-8 bytes of 'é' decoded as Latin-1.
test_words = {
    'english': ['running', 'organized'],
    'french': ['exécution', 'organisation'],
    'spanish': ['corriendo', 'organizado'],
    'german': ['laufen', 'organisiert']
}
# Derived from the dict so the language list and the word table cannot drift.
languages = list(test_words)

for lang, words in test_words.items():
    stemmer = SnowballStemmer(lang)
    print(f"\n{lang.upper()}:")
    for word in words:
        print(f"  {word} -> {stemmer.stem(word)}")

## 10. Lemmatization with WordNet

Converts words to their base form (lemma) using the WordNet database.

In [None]:
# WordNet lemmatizer instance; reused by every lemmatize() call in this cell.
lemmatizer = WordNetLemmatizer()

# Helper function to get POS tag for lemmatization
def get_wordnet_pos(treebank_tag):
    """Map a Penn Treebank POS tag onto the matching WordNet POS constant.

    The decision uses only the tag's first letter: J -> adjective,
    V -> verb, N -> noun, R -> adverb.

    Args:
        treebank_tag: Treebank tag string such as 'VBG' or 'NNS'.

    Returns:
        The corresponding ``wordnet`` POS constant, or ``None`` when the tag
        starts with none of the four letters.
    """
    # Checked in the same order as the original if/elif chain.
    prefix_table = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wn_pos in prefix_table:
        if treebank_tag.startswith(prefix):
            return wn_pos
    return None

# Sample text for lemmatization
text_for_lemma = "The cars are running quickly. He is running away. They have run."

# Tokenize, then POS-tag every token so the lemmatizer can be told each
# word's part of speech. (word_tokenize comes from the notebook's top import
# cell rather than being re-imported here.)
tokens = word_tokenize(text_for_lemma)
pos_tags = pos_tag(tokens)

print("Lemmatization with POS Tagging:")
print("-" * 60)
print(f"{'Token':<15} | {'POS Tag':<10} | {'Lemma':<15}")
print("-" * 60)

lemmatized_words = []
for token, pos in pos_tags:
    # Tags get_wordnet_pos() cannot map (it returns None) fall back to NOUN,
    # which is also the default part of speech used by lemmatize().
    wordnet_pos = get_wordnet_pos(pos) or wordnet.NOUN
    lemma = lemmatizer.lemmatize(token, pos=wordnet_pos)
    lemmatized_words.append(lemma)
    print(f"{token:<15} | {pos:<10} | {lemma:<15}")

print(f"\nOriginal text: {text_for_lemma}")
print(f"Lemmatized text: {' '.join(lemmatized_words)}")

# Comparison: Lemmatization vs Stemming
print("\n\n" + "="*60)
print("Lemmatization vs Stemming Comparison:")
print("="*60)
comparison_words = ['running', 'runs', 'ran', 'easily', 'better', 'was', 'is', 'am']

# lemmatize() treats every word as a noun unless a POS is supplied, which
# gives misleading lemmas for verbs, adjectives and adverbs (e.g. 'was'
# comes back as 'wa'). Supply the intended part of speech for each word so
# the comparison against the stemmers is a fair one.
comparison_pos = {
    'running': wordnet.VERB,
    'runs': wordnet.VERB,
    'ran': wordnet.VERB,
    'easily': wordnet.ADV,
    'better': wordnet.ADJ,
    'was': wordnet.VERB,
    'is': wordnet.VERB,
    'am': wordnet.VERB,
}

header = f"{'Word':<15} | {'Lemma':<15} | {'Porter Stem':<15} | {'Snowball Stem':<15}"
print(header)
# Size the rule to the header so it spans all four columns.
print("-" * len(header))
for word in comparison_words:
    # Words missing from the table fall back to NOUN, lemmatize()'s default.
    lemma = lemmatizer.lemmatize(word, pos=comparison_pos.get(word, wordnet.NOUN))
    porter = porter_stemmer.stem(word)
    snowball = snowball_stemmer.stem(word)
    print(f"{word:<15} | {lemma:<15} | {porter:<15} | {snowball:<15}")