### Feature Engineering Best Practices: Handling Text Data
**Question**: Load a dataset with text data (e.g., SMS Spam Collection), perform text
preprocessing, and extract numerical features using TF-IDF.

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Load the SMS Spam Collection dataset (UCI data, fetched here from the
# pycon-2016-tutorial GitHub repository). The file is tab-separated; column
# names are supplied explicitly, so the first line is read as data.
# NOTE(review): read_csv uses its default quoting here, so a message with an
# unbalanced '"' could absorb the lines that follow it — confirm the row count,
# or consider passing quoting=csv.QUOTE_NONE.
url = "https://raw.githubusercontent.com/justmarkham/pycon-2016-tutorial/master/data/sms.tsv"
data = pd.read_csv(url, sep='\t', names=['label', 'message'])

# Map labels to binary (ham -> 0, spam -> 1). Series.map turns any value that is
# not a key of the dict into NaN, so check for that before overwriting the
# column instead of letting NaN labels flow into the train/test split.
label_codes = data['label'].map({'ham': 0, 'spam': 1})
unmapped_labels = data.loc[label_codes.isna(), 'label'].unique()
if len(unmapped_labels) > 0:
    raise ValueError(f"Unexpected label values: {unmapped_labels.tolist()}")
data['label'] = label_codes

# Basic text preprocessing function
def preprocess_text(text):
    """Lowercase a message and replace line breaks with single spaces.

    Parameters
    ----------
    text : str or missing value
        Raw message text. Missing values (None, NaN, pd.NA) produce an
        empty string; any other non-string value is converted with str().

    Returns
    -------
    str
        The lowercased text with every line break replaced by one space.
    """
    if not isinstance(text, str):
        # Empty fields come back from read_csv as NaN; calling .lower() on
        # them would raise AttributeError.
        if pd.isna(text):
            return ""
        text = str(text)

    text = text.lower()
    # Replace '\r\n' first so a Windows line ending becomes one space, not two.
    for line_break in ("\r\n", "\r", "\n"):
        text = text.replace(line_break, " ")
    return text

data['clean_message'] = data['message'].apply(preprocess_text)

# Train/test split on the cleaned text *before* vectorizing. Fitting TF-IDF on
# the full corpus would let test-set vocabulary and document frequencies leak
# into the training features.
text_train, text_test, y_train, y_test = train_test_split(
    data['clean_message'], data['label'], test_size=0.2, random_state=42
)

# Extract TF-IDF features: learn the vocabulary and IDF weights from the
# training messages only, then apply that fitted transform to the test messages.
tfidf = TfidfVectorizer(stop_words='english', max_features=1000)
X_train = tfidf.fit_transform(text_train)
X_test = tfidf.transform(text_test)

# Full-corpus matrix, kept under its original name in case later cells use it.
# It is built with the train-fitted vectorizer, so it carries no test leakage.
X_tfidf = tfidf.transform(data['clean_message'])

print("TF-IDF feature matrix shape:", X_train.shape)


TF-IDF feature matrix shape: (4457, 1000)
