In [35]:
import re
import numpy as np
from collections import Counter


In [36]:
def simple_tokenize(text):
    """
    Basic word tokenization using whitespace and punctuation.
    
    Steps:
    1. Convert to lowercase
    2. Split on whitespace and punctuation
    3. Remove empty tokens
    
    Example:
        "Hello, World!" -> ["hello", "world"]
    
    Args:
        text: Input string
    
    Returns:
        List of tokens
    """
    text = text.lower()
    tokens = re.split(r'\W+', text)
    tokens = [token for token in tokens if token]
    return tokens


In [37]:
def tokenize_with_punctuation(text):
    """
    Tokenize while preserving punctuation as separate tokens.
    
    Example:
        "Hello, World!" -> ["hello", ",", "world", "!"]
    
    Hint: Use re.findall("\\w+|[^\\w\\s]", text.lower())
    """
    return re.findall(r"\w+|[^\w\s]", text.lower())


## Task 1.2: Test Your Tokenizer

Test with these edge cases:


In [38]:
test_cases = [
    "Hello, World!",                    # Basic punctuation
    "I'm learning NLP.",                # Contractions
    "Email: test@example.com",          # Special characters
    "The price is $19.99!",             # Numbers and symbols
    "   Multiple   spaces   here   ",   # Extra whitespace
    "UPPERCASE and lowercase",          # Case handling
]

for text in test_cases:
    print(f"Input: {text}")
    print(f"Simple: {simple_tokenize(text)}")
    print(f"With punct: {tokenize_with_punctuation(text)}")
    print()


Input: Hello, World!
Simple: ['hello', 'world']
With punct: ['hello', ',', 'world', '!']

Input: I'm learning NLP.
Simple: ['i', 'm', 'learning', 'nlp']
With punct: ['i', "'", 'm', 'learning', 'nlp', '.']

Input: Email: test@example.com
Simple: ['email', 'test', 'example', 'com']
With punct: ['email', ':', 'test', '@', 'example', '.', 'com']

Input: The price is $19.99!
Simple: ['the', 'price', 'is', '19', '99']
With punct: ['the', 'price', 'is', '$', '19', '.', '99', '!']

Input:    Multiple   spaces   here   
Simple: ['multiple', 'spaces', 'here']
With punct: ['multiple', 'spaces', 'here']

Input: UPPERCASE and lowercase
Simple: ['uppercase', 'and', 'lowercase']
With punct: ['uppercase', 'and', 'lowercase']



## Part 2: Vocabulary Building (15 min)


### Task 2.1: Build Vocabulary from Corpus


In [39]:
class Vocabulary:
    """
    Vocabulary manager for tokenized text.
    
    Special tokens:
    - <PAD>: Padding token (index 0)
    - <UNK>: Unknown token (index 1)
    - <BOS>: Beginning of sequence (index 2)
    - <EOS>: End of sequence (index 3)
    """
    
    def __init__(self, min_freq=1):
        """
        Initialize vocabulary with special tokens.
        
        Args:
            min_freq: Minimum frequency for a token to be included
        """
        self.min_freq = min_freq
        self.word2idx = {"<PAD>": 0, "<UNK>": 1, "<BOS>": 2, "<EOS>": 3}
        self.idx2word = {0: "<PAD>", 1: "<UNK>", 2: "<BOS>", 3: "<EOS>"}
        self.word_freq = Counter()
    
    def build_vocab(self, texts, tokenizer_fn):
        """
        Build vocabulary from a list of texts.
        
        Steps:
        1. Tokenize each text
        2. Count word frequencies
        3. Add words meeting min_freq threshold
        
        Args:
            texts: List of strings
            tokenizer_fn: Function to tokenize each text
        """
        for text in texts:
            tokens = tokenizer_fn(text)
            self.word_freq.update(tokens)
        
        for word, freq in self.word_freq.items():
            if freq >= self.min_freq and word not in self.word2idx:
                idx = len(self.word2idx)
                self.word2idx[word] = idx
                self.idx2word[idx] = word
    
    def encode(self, tokens):
        """
        Convert list of tokens to list of indices.
        
        Unknown tokens map to <UNK> (index 1).
        """
        return [self.word2idx.get(token, 1) for token in tokens]
    
    def decode(self, indices):
        """Convert list of indices back to tokens."""
        return [self.idx2word.get(idx, "<UNK>") for idx in indices]
    
    def __len__(self):
        return len(self.word2idx)


In [40]:
corpus = [
    "The cat sat on the mat.",
    "The dog ran in the park.",
    "A cat and a dog are friends.",
    "The mat is on the floor."
]

vocab = Vocabulary(min_freq=2)
vocab.build_vocab(corpus, simple_tokenize)

print(f"Vocabulary size: {len(vocab)}")
print(f"Words: {list(vocab.word2idx.keys())}")

test_text = "The cat is happy"
tokens = simple_tokenize(test_text)
encoded = vocab.encode(tokens)
decoded = vocab.decode(encoded)

print(f"\nTokens: {tokens}")
print(f"Encoded: {encoded}")
print(f"Decoded: {decoded}")


Vocabulary size: 10
Words: ['<PAD>', '<UNK>', '<BOS>', '<EOS>', 'the', 'cat', 'on', 'mat', 'dog', 'a']

Tokens: ['the', 'cat', 'is', 'happy']
Encoded: [4, 5, 1, 1]
Decoded: ['the', 'cat', '<UNK>', '<UNK>']


## Part 3: Complete Tokenization Pipeline (15 min)


### Task 3.1: Build TextPreprocessor Class


In [41]:
class TextPreprocessor:
    """
    Complete text preprocessing pipeline.
    
    Usage:
        preprocessor = TextPreprocessor()
        preprocessor.fit(train_texts)
        train_sequences = preprocessor.transform(train_texts)
        test_sequences = preprocessor.transform(test_texts)
    """
    
    def __init__(self, max_vocab_size=10000, min_freq=2, max_seq_length=100):
        self.max_vocab_size = max_vocab_size
        self.min_freq = min_freq
        self.max_seq_length = max_seq_length
        self.vocab = Vocabulary(min_freq=min_freq)
    
    def fit(self, texts):
        """Build vocabulary from training texts."""
        temp_vocab = Vocabulary(min_freq=self.min_freq)
        temp_vocab.build_vocab(texts, simple_tokenize)
        
        if len(temp_vocab.word_freq) > 0:
            sorted_words = sorted(temp_vocab.word_freq.items(), key=lambda x: x[1], reverse=True)
            limited_words = sorted_words[:self.max_vocab_size - 4]
            
            self.vocab.word_freq = Counter(dict(sorted_words))
            for word, freq in limited_words:
                if word not in self.vocab.word2idx:
                    idx = len(self.vocab.word2idx)
                    self.vocab.word2idx[word] = idx
                    self.vocab.idx2word[idx] = word
    
    def transform(self, texts):
        """
        Transform texts to padded sequences.
        
        Steps:
        1. Tokenize each text
        2. Encode tokens to indices
        3. Pad/truncate to max_seq_length
        """
        sequences = []
        for text in texts:
            tokens = simple_tokenize(text)
            encoded = self.vocab.encode(tokens)
            
            if len(encoded) > self.max_seq_length:
                encoded = encoded[:self.max_seq_length]
            else:
                encoded = encoded + [0] * (self.max_seq_length - len(encoded))
            
            sequences.append(encoded)
        
        return np.array(sequences)
    
    def fit_transform(self, texts):
        """Fit and transform in one step."""
        self.fit(texts)
        return self.transform(texts)


### Task 3.2: Test on Real Dataset


In [42]:
reviews = [
    "This movie was absolutely fantastic! Great acting and plot.",
    "Terrible film. Waste of time and money.",
    "An average movie. Nothing special but watchable.",
    "One of the best movies I've ever seen!",
    "Boring and predictable. Skip this one."
]

preprocessor = TextPreprocessor(max_seq_length=20)
sequences = preprocessor.fit_transform(reviews)

print(f"Vocabulary size: {len(preprocessor.vocab)}")
print(f"Sequence shape: {sequences.shape}")
print(f"\nSample sequence: {sequences[0]}")
print(f"Decoded: {preprocessor.vocab.decode(sequences[0])}")


Vocabulary size: 36
Sequence shape: (5, 20)

Sample sequence: [ 5  6  9 10 11 12 13  4 14  0  0  0  0  0  0  0  0  0  0  0]
Decoded: ['this', 'movie', 'was', 'absolutely', 'fantastic', 'great', 'acting', 'and', 'plot', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']


## Reflection Questions


### 1. How does min_freq affect vocabulary size?


In [43]:
test_corpus = [
    "The cat sat on the mat.",
    "The dog ran in the park.",
    "A cat and a dog are friends.",
    "The mat is on the floor.",
    "I love cats and dogs."
]

for min_freq in [1, 2, 5]:
    vocab = Vocabulary(min_freq=min_freq)
    vocab.build_vocab(test_corpus, simple_tokenize)
    print(f"min_freq={min_freq}: Vocabulary size = {len(vocab)}")
    print(f"  Words: {sorted([w for w in vocab.word2idx.keys() if w not in ['<PAD>', '<UNK>', '<BOS>', '<EOS>']])}")
    print()


min_freq=1: Vocabulary size = 23
  Words: ['a', 'and', 'are', 'cat', 'cats', 'dog', 'dogs', 'floor', 'friends', 'i', 'in', 'is', 'love', 'mat', 'on', 'park', 'ran', 'sat', 'the']

min_freq=2: Vocabulary size = 11
  Words: ['a', 'and', 'cat', 'dog', 'mat', 'on', 'the']

min_freq=5: Vocabulary size = 5
  Words: ['the']



**Answer:** Higher `min_freq` filters out rare words, reducing vocabulary size. This helps:
- Reduce memory usage and computation
- Focus on common, meaningful words
- Teh trade of is that may lose important rare words (e.g., domain-specific terms)


### 2. What happens when a test text contains words not in the vocabulary?


In [44]:
train_corpus = ["The cat sat on the mat.", "The dog ran in the park."]
test_text = "The elephant is happy"

vocab = Vocabulary(min_freq=1)
vocab.build_vocab(train_corpus, simple_tokenize)

tokens = simple_tokenize(test_text)
encoded = vocab.encode(tokens)
decoded = vocab.decode(encoded)

print(f"Test text: '{test_text}'")
print(f"Tokens: {tokens}")
print(f"Encoded: {encoded}")
print(f"Decoded: {decoded}")
print(f"\nNote: 'elephant' and 'happy' → <UNK> (index 1) because they're not in training vocabulary")


Test text: 'The elephant is happy'
Tokens: ['the', 'elephant', 'is', 'happy']
Encoded: [4, 1, 1, 1]
Decoded: ['the', '<UNK>', '<UNK>', '<UNK>']

Note: 'elephant' and 'happy' → <UNK> (index 1) because they're not in training vocabulary


**Answer:** Unknown words map to `<UNK>`. Impact on model performance may lsoe some semantic meaning.

### 3. How do you choose max_seq_length?

**Answer:** Balance between information retention and efficiency with computation:
- If its too short it will truncate important information, as well as losing context
- Wastes memory due to padding


### 4. Why do we fit on training data only, then transform both train and test?

**Answer:** Keeps the integrity fo the evaluation by making sure the model is only tested on data its never seen before.
