In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string
from collections import Counter, defaultdict
import warnings
warnings.filterwarnings('ignore')

# Text processing libraries
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tag import pos_tag

# Scikit-learn for text processing and clustering
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.manifold import TSNE

# Download required NLTK data, fetching only the resources that are missing.
# Each key is the path nltk.data.find() looks up; each value is the package id
# passed to nltk.download().
# 'punkt_tab' and 'averaged_perceptron_tagger_eng' are the names that recent
# NLTK releases look up for tokenization / POS tagging; the older names are
# kept so the cell works across NLTK versions.
_NLTK_RESOURCES = {
    'tokenizers/punkt': 'punkt',
    'tokenizers/punkt_tab': 'punkt_tab',
    'corpora/stopwords': 'stopwords',
    'corpora/wordnet': 'wordnet',
    'taggers/averaged_perceptron_tagger': 'averaged_perceptron_tagger',
    'taggers/averaged_perceptron_tagger_eng': 'averaged_perceptron_tagger_eng',
}

_nltk_failed = []
for _resource_path, _package_id in _NLTK_RESOURCES.items():
    try:
        nltk.data.find(_resource_path)
    except LookupError:
        # nltk.download() reports failure through its return value rather than
        # by raising, so the result has to be checked explicitly.
        if not nltk.download(_package_id):
            _nltk_failed.append(_package_id)

if _nltk_failed:
    print(f"WARNING: could not download NLTK resources: {', '.join(_nltk_failed)}")
else:
    print("Libraries imported and NLTK data downloaded successfully!")


In [None]:
# Sample corpus: ten technology-themed sentences followed by five lifestyle ones
documents = [
    "Machine learning is a subset of artificial intelligence that focuses on algorithms and statistical models.",
    "Deep learning uses neural networks with multiple layers to learn patterns in data automatically.",
    "Natural language processing enables computers to understand, interpret, and generate human language.",
    "Computer vision allows machines to interpret and understand visual information from the world around them.",
    "Data science combines statistics, programming, and domain expertise to extract insights from data.",
    "Python is a popular programming language for data science, machine learning, and web development.",
    "Statistical analysis helps us understand patterns, relationships, and trends in datasets.",
    "Big data technologies handle large volumes of structured and unstructured data efficiently.",
    "Cloud computing provides scalable and flexible computing resources over the internet.",
    "Artificial intelligence aims to create systems that can perform tasks typically requiring human intelligence.",
    "The weather today is sunny and warm, perfect for outdoor activities and sports.",
    "Cooking delicious meals requires fresh ingredients, proper techniques, and creative recipes.",
    "Travel planning involves researching destinations, booking accommodations, and creating itineraries.",
    "Sports activities promote physical fitness, teamwork, and mental well-being for participants.",
    "Music and art inspire creativity, emotional expression, and cultural understanding across societies."
]

# Topic label for each document, aligned position-by-position with `documents`
labels = ['Technology'] * 10 + ['Lifestyle'] * 5

# Assemble the corpus into a DataFrame with one row per document
doc_count = len(documents)
df = pd.DataFrame(
    dict(document=documents, category=labels, doc_id=range(doc_count))
)

print(f"Created dataset with {doc_count} documents")
print(f"Categories: {set(labels)}")
print("\nFirst few documents:")
for position, preview_doc in enumerate(documents[:3], start=1):
    print(f"{position}. {preview_doc}")

df.head()


In [None]:
# Text used to contrast the tokenization strategies below
sample_text = "Hello, World! This is a sample sentence. It contains punctuation, numbers like 123, and symbols @#$."
section_break = "\n" + "="*50

print("Original text:")
print(sample_text)
print(section_break)

# 1. NLTK's word tokenizer
nltk_tokens = word_tokenize(sample_text)
print(f"\n1. NLTK Word Tokenization ({len(nltk_tokens)} tokens):")
print(nltk_tokens)

# 2. Plain whitespace split
simple_tokens = sample_text.split()
print(f"\n2. Simple Split Tokenization ({len(simple_tokens)} tokens):")
print(simple_tokens)

# 3. Regex: runs of word characters delimited by word boundaries
word_pattern = re.compile(r'\b\w+\b')
regex_tokens = word_pattern.findall(sample_text)
print(f"\n3. Regex Tokenization ({len(regex_tokens)} tokens):")
print(regex_tokens)

# 4. NLTK's sentence tokenizer
sentences = sent_tokenize(sample_text)
print(f"\n4. Sentence Tokenization ({len(sentences)} sentences):")
for number, sentence in enumerate(sentences, start=1):
    print(f"   {number}. {sentence}")

# Summarize how the approaches differ
nltk_has_punctuation = any(tok in string.punctuation for tok in nltk_tokens)
split_keeps_punctuation = 'World!' in simple_tokens

print(section_break)
print("COMPARISON:")
print(f"NLTK preserves punctuation: {nltk_has_punctuation}")
print(f"Simple split keeps punctuation attached: {split_keeps_punctuation}")
print(f"Regex extracts only word characters: {regex_tokens}")


In [None]:
def comprehensive_text_cleaner(text: str) -> str:
    """Normalize a raw string into lowercase, punctuation-free, single-spaced text.

    The steps run in this order:
      1. lowercase
      2. remove ``http://`` / ``https://`` URLs
      3. remove email-like tokens (non-whitespace on both sides of an ``@``)
      4. remove digits
      5. remove every character in ``string.punctuation``
      6. collapse each run of whitespace into a single space
      7. strip leading/trailing whitespace

    Punctuation is removed *before* whitespace is collapsed: deleting a
    punctuation mark that sits between two spaces (e.g. ``"a , b"``) would
    otherwise leave a double space in the result.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string; empty if nothing survives the cleaning steps.
    """
    # Step 1: Convert to lowercase
    text = text.lower()

    # Step 2: Remove URLs.
    # Inside the character class, `$-_` is a range, so besides letters the
    # pattern also consumes digits and most ASCII punctuation (including a
    # trailing comma) that directly follows the URL.
    text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)

    # Step 3: Remove email addresses (any whitespace-delimited token with
    # at least one character on each side of an '@')
    text = re.sub(r'\S+@\S+', '', text)

    # Step 4: Remove numbers (optional - depends on use case)
    text = re.sub(r'\d+', '', text)

    # Step 5: Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Step 6: Collapse whitespace runs (including gaps left by the removals above)
    text = re.sub(r'\s+', ' ', text)

    # Step 7: Strip leading/trailing whitespace
    return text.strip()

# Deliberately noisy input that exercises every step of the cleaner
messy_text = """
   Hello!!! This is a MESSY text with NUMBERS 123, URLs like https://example.com, 
   emails like user@domain.com, and    too much    whitespace... 
   It also has @special #symbols and unnecessary punctuation!!! 
""".strip()

cleaned_text = comprehensive_text_cleaner(messy_text)

# Show the raw and cleaned versions side by side, with their lengths
for heading, shown_text in (
    ("Original messy text:", messy_text),
    ("\nCleaned text:", cleaned_text),
):
    print(heading)
    print(repr(shown_text))
    print(f"Length: {len(shown_text)}")

# Run the same cleaner over every document in the collection
print("\n" + "="*60)
print("CLEANING DOCUMENT COLLECTION:")

df['cleaned_document'] = df['document'].apply(comprehensive_text_cleaner)

# Before/after comparison for the first three documents
for row_pos in range(3):
    print(f"\nDocument {row_pos + 1}:")
    raw_doc = df['document'].iloc[row_pos]
    print(f"Original:  {raw_doc}")
    clean_doc = df['cleaned_document'].iloc[row_pos]
    print(f"Cleaned:   {clean_doc}")
