# Normalization

Key Steps in Normalization:
- Lowercasing: Changes all characters to lowercase (e.g., "HELLO" → "hello").
- Removing Punctuation: Strips out symbols like commas and periods.
     (e.g., "Hi, there!" → "Hi there").
- Handling Contractions: Expands shortened forms (e.g., "don’t" → "do not").


In [11]:
# Import necessary libraries
import re
import string
import nltk
import spacy
from typing import List, Dict
from nltk.tokenize import word_tokenize

In [12]:
# Dictionary for common contractions
CONTRACTIONS: Dict[str, str] = {
    "can't": "cannot",
    "won't": "will not",
    "n't": " not",
    "'re": " are",
    "'s": " is",
    "'d": " would",
    "'ll": " will",
    "'t": " not",
    "'ve": " have",
    "'m": " am"
}

In [13]:
def expand_contractions(text: str) -> str:
    """
    Expand common contractions in the text.

    Parameters:
    text (str): The input text containing contractions.

    Returns:
    str: The text with contractions expanded.
    """
    # Iterate through the CONTRACTIONS dictionary and replace contractions with their expansions
    for contraction, expansion in CONTRACTIONS.items():
        # Use regex to find exact word boundaries for accurate replacement
        text = re.sub(r'\b' + re.escape(contraction) + r'\b', expansion, text)
    return text

def remove_punctuation(text: str) -> str:
    """
    Remove punctuation from the text.

    Parameters:
    text (str): The input text containing punctuation.

    Returns:
    str: The text without punctuation.
    """
    # Use translate() with a translation table to remove all punctuation characters
    return text.translate(str.maketrans('', '', string.punctuation))


In [14]:
def normalize_text(text: str) -> str:
    """
    Normalize text by:
    1. Lowercasing the text.
    2. Expanding contractions.
    3. Removing punctuation.

    Parameters:
    text (str): The input text to normalize.

    Returns:
    str: The normalized text.
    """
    # Step 1: Convert all characters to lowercase to ensure uniformity
    text = text.lower()
    print("Lowercased Text:", text)

    # Step 2: Expand contractions using the expand_contractions function
    text = expand_contractions(text)
    print("After Expanding Contractions:", text)

    # Step 3: Remove punctuation using the remove_punctuation function
    text = remove_punctuation(text)
    print("Tokens without Punctuation:", text)

    # Return the fully normalized text
    return text


In [15]:
# Example usage
if __name__ == "__main__":
    sample_text = "I can't believe it's not butter! You're amazing, aren't you?"
    normalized_text = normalize_text(sample_text)
    print("\nOriginal Text:", sample_text)
    print("Normalized Text:", normalized_text)


Lowercased Text: i can't believe it's not butter! you're amazing, aren't you?
After Expanding Contractions: i cannot believe it is not butter! you are amazing, aren not you?
Tokens without Punctuation: i cannot believe it is not butter you are amazing aren not you

Original Text: I can't believe it's not butter! You're amazing, aren't you?
Normalized Text: i cannot believe it is not butter you are amazing aren not you
