# Implementation: Cleaning Text

**Goal**: Use NLTK to clean a raw sentence.

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import string

# Download NLTK data (run once; packages that are already up to date are skipped)
nltk.download('punkt')      # tokenizer models read by older NLTK releases
nltk.download('punkt_tab')  # tokenizer tables that newer NLTK releases load in word_tokenize
nltk.download('stopwords')  # stopword lists used by stopwords.words('english')
nltk.download('wordnet')    # WordNet data used by WordNetLemmatizer

def clean_text(text):
    """Turn a raw sentence into a list of cleaned, lemmatized tokens.

    Pipeline: lowercase -> tokenize -> drop punctuation tokens -> drop
    English stopwords -> lemmatize.

    Args:
        text: Raw input string.

    Returns:
        A list of lowercase tokens with punctuation-only tokens and English
        stopwords removed, each passed through
        ``WordNetLemmatizer.lemmatize``. No POS tag is supplied, so the
        lemmatizer uses its default part of speech for every word.
    """
    # 1. Lowercase
    text = text.lower()

    # 2. Tokenize (Split by words)
    tokens = nltk.word_tokenize(text)

    # 3. Remove Punctuation
    # string.punctuation = !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
    # A token counts as punctuation when *every* character is a punctuation
    # mark. A plain `word in string.punctuation` is a substring test, so it
    # misses multi-character tokens the tokenizer emits such as `` '' ... --
    punctuation = set(string.punctuation)
    tokens = [
        word for word in tokens
        if not all(char in punctuation for char in word)
    ]

    # 4. Remove Stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    # 5. Lemmatization (Root form)
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    return tokens

# Test: run the cleaning pipeline on one sample sentence and show before/after.
raw = "The runners were running fast, faster than the wind!"
cleaned = clean_text(raw)

print(f"Raw: {raw}")
print(f"Cleaned: {cleaned}")
# Expected: ['runner', 'running', 'fast', 'faster', 'wind']
# NOTE: clean_text calls lemmatize() without a POS tag, so 'running' is kept
# as-is here; it would only reduce to 'run' if lemmatized as a verb
# (e.g. lemmatize(word, pos='v')).

## Conclusion
Notice how the stopwords "were", "the", and "than" are gone, the punctuation is gone, and "runners" has been reduced to "runner".
This reduces the vocabulary size significantly, making training easier.