### Feature Engineering Best Practices: Handling Text Data
**Question**: Load a dataset with text data (e.g., SMS Spam Collection), perform text
preprocessing, and extract numerical features using TF-IDF.

In [3]:
# write your code from here
# Import required libraries
import pandas as pd
import string
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

# Step 1: Load the dataset
# The file is read as tab-separated; header=None tells pandas not to treat the
# first line as column names, so the two column names are supplied explicitly.
url = (
    "https://raw.githubusercontent.com/justmarkham/"
    "pycon-2016-tutorial/master/data/sms.tsv"
)
df = pd.read_csv(
    url,
    names=['label', 'message'],
    header=None,
    sep='\t',
)

# Step 2: Basic preprocessing function
# Built once at import time so every call reuses the same deletion table.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def preprocess_text(text):
    """Normalize a raw message for vectorization.

    Steps, in order: lowercase, delete ASCII punctuation, delete digit runs,
    then collapse every run of whitespace to a single space and trim the ends.

    Args:
        text: The raw message as a ``str``.

    Returns:
        The cleaned string; empty if nothing but punctuation, digits, or
        whitespace was present.
    """
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(_PUNCTUATION_TABLE)
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Remove extra whitespace: deleting punctuation/digits above can leave
    # runs of spaces *inside* the string, which strip() alone would not touch.
    # split() with no arguments splits on any whitespace run and drops the
    # leading/trailing pieces, so re-joining yields single-spaced text.
    return ' '.join(text.split())

# Clean every raw message and keep the result next to the original column
df['clean_text'] = df['message'].map(preprocess_text)

# Step 3: Turn the string labels into integers (ham=0, spam=1)
label_encoder = LabelEncoder()
df['label_encoded'] = label_encoder.fit_transform(df['label'])

# Step 4: Hold out 20% of the rows for testing; the fixed seed keeps the
# split reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(
    df['clean_text'],
    df['label_encoded'],
    random_state=42,
    test_size=0.2,
)

# Step 5: Build TF-IDF features
# English stop words are dropped, as are terms found in more than 90% of the
# documents (max_df) or in fewer than 5 documents (min_df).
tfidf_vectorizer = TfidfVectorizer(
    min_df=5,
    max_df=0.9,
    stop_words='english',
)

# The vocabulary and IDF weights are learned from the training split only;
# the test split is projected onto that same vocabulary.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Report (n_documents, n_features) for both splits
print("TF-IDF training features shape:", X_train_tfidf.shape)
print("TF-IDF test features shape:", X_test_tfidf.shape)


TF-IDF training features shape: (4457, 1247)
TF-IDF test features shape: (1115, 1247)
