# Text Preprocessing – More Examples (Jupyter Notebook)

This notebook contains multiple real-world text preprocessing examples that can be run cell by cell.

## Install & Download Requirements (Run Once)

In [None]:
!pip install nltk

## Download NLTK Resources

In [None]:
import nltk
# Fetch the NLTK data packages used by the cells below (only needed once per
# environment; nltk.download() skips packages that are already up to date).
nltk.download('punkt')      # tokenizer models for older NLTK releases
nltk.download('punkt_tab')  # tokenizer tables looked up by recent NLTK releases
nltk.download('stopwords')  # stop-word lists used for filtering
nltk.download('wordnet')    # WordNet data used by WordNetLemmatizer

## Import Libraries

In [None]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

## Example 1: Social Media Text

In [None]:
# Sample social-media post: shouting, emoji and hashtags.
text = "OMG!!! I love this phone 😍😍 #Amazing #WorthIt"
text = text.lower()
# Remove hashtags first, while the leading '#' is still there to match on.
text = re.sub(r"#\w+", "", text)
# Strip everything that is neither a word character nor whitespace
# (punctuation and the emoji).
text = re.sub(r"[^\w\s]", "", text)
tokens = word_tokenize(text)
# Build the stop-word set once instead of calling stopwords.words() for
# every token; set membership is also a constant-time lookup.
stop_words = set(stopwords.words('english'))
[w for w in tokens if w not in stop_words]

## Example 2: Email Text Cleaning

In [None]:
# Raw message, lowercased up front so the rest of the cleaning is case-free.
text = "Dear User, Please contact us at support@example.com ASAP!!!".lower()
# Inner call drops the e-mail address (any run of non-space characters around
# an '@'); outer call then strips the remaining punctuation.
text = re.sub(r"[^\w\s]", "", re.sub(r"\S+@\S+", "", text))
# Whitespace split yields the cleaned word list shown as the cell output.
text.split()

## Example 3: Stopword Removal Impact

In [None]:
text = "This movie is not good"
tokens = word_tokenize(text.lower())
# Build the stop-word set once instead of calling stopwords.words() for
# every token in the comprehension below.
stop_words = set(stopwords.words('english'))
# Display the tokens before and after filtering side by side; compare the two
# lists to see which words (e.g. whether the negation "not") were dropped.
tokens, [w for w in tokens if w not in stop_words]

## Example 4: Stemming vs Lemmatization

In [None]:
# Sample words run through both normalizers for a side-by-side comparison.
words = ['running', 'flies', 'better', 'studies']
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
# Cell output is a (stems, lemmas) pair of lists, in the same order as
# `words`. lemmatize() receives only the word, with no part-of-speech
# argument, so it uses whatever default NLTK applies.
list(map(stemmer.stem, words)), list(map(lemmatizer.lemmatize, words))

## Example 5: URL & Number Removal

In [None]:
# News-style sentence containing a number and a trailing URL.
text = "India won the match by 6 wickets. Read more at https://news.com".lower()
# Three passes, in this order: URLs, then digit runs, then any character
# that is neither a word character nor whitespace.
text = re.compile(r"http\S+").sub("", text)
text = re.compile(r"\d+").sub("", text)
text = re.compile(r"[^\w\s]").sub("", text)
# Whitespace split collapses the gaps left behind by the removals.
text.split()

## Example 6: Sentence Tokenization

In [None]:
# Three short sentences, written one per source line; adjacent string
# literals are joined into a single string.
text = (
    "NLP is powerful. "
    "It is used in AI. "
    "Many companies rely on it."
)
# Split the paragraph into a list of sentences.
sent_tokenize(text)

## Example 7: Complete Reusable Preprocessing Pipeline

In [None]:
def preprocess_text(text: str) -> list:
    """Clean a raw string and return its non-stop-word tokens.

    The steps run in this order: lowercase, remove URLs (``http...`` /
    ``www...``), remove e-mail addresses, remove every character that is
    neither a word character nor whitespace, remove digit runs, tokenize
    with ``word_tokenize``, and drop English stop words.

    Args:
        text: The raw input string.

    Returns:
        A list of lowercase tokens with English stop words removed.
    """
    text = text.lower()
    # URLs and e-mail addresses are removed before the punctuation pass,
    # while their '://', '.' and '@' characters are still there to match on.
    text = re.sub(r"http\S+|www\S+", "", text)
    text = re.sub(r"\S+@\S+", "", text)
    text = re.sub(r"[^\w\s]", "", text)
    text = re.sub(r"\d+", "", text)
    tokens = word_tokenize(text)
    # Build the stop-word set once per call instead of calling
    # stopwords.words() for every token; set lookups are constant-time.
    stop_words = set(stopwords.words('english'))
    return [w for w in tokens if w not in stop_words]

# Demo call: the e-mail address, punctuation, emoji and stop words are removed.
preprocess_text("Contact me at test@mail.com!!! NLP is AWESOME 😍")