# Implementation: Pipelines & Best Practices

We will build a robust processing pipeline that handles mixed data types automatically.

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Create Dummy Data
np.random.seed(42)

# Keep the choices in an object-dtype array: a plain list mixing strings and
# np.nan is coerced by NumPy to a string array, which turns the missing value
# into the literal category 'nan' instead of a real NaN.
city_choices = np.array(['New York', 'London', 'Paris', np.nan], dtype=object)

df = pd.DataFrame({
    # Float from the start so NaN can be assigned below without relying on an
    # implicit int -> float upcast of the column.
    'Age': np.random.randint(18, 80, 100).astype(float),
    'Income': np.random.normal(50000, 10000, 100),
    'City': np.random.choice(city_choices, 100),
    'Purchased': np.random.choice([0, 1], 100)
})

# Add missing values to Age.
# .loc slices by label and includes the end point, so rows 0..10 (11 rows) are blanked.
df.loc[:10, 'Age'] = np.nan

print("Raw Data Head:")
# Last expression of the cell: rendered as a rich table by the notebook.
df.head()

## 1. Splitting
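ALWAYS split into training and test sets before fitting any preprocessing step (imputation, scaling, encoding). Otherwise statistics computed on the full dataset leak information from the test set into training.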

In [None]:
# Target column first, then every remaining column as the feature matrix.
y = df['Purchased']
X = df.drop(columns='Purchased')

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## 2. Building the Pipeline
We define separate sub-pipelines for Numeric and Categorical data.

In [None]:
# Numeric branch: fill gaps with the column mean, then standardize.
num_features = ['Age', 'Income']
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical branch: replace gaps with the placeholder category 'missing',
# then one-hot encode. handle_unknown='ignore' keeps transform from failing on
# categories that were not present when the encoder was fitted.
cat_features = ['City']
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(fill_value='missing', strategy='constant')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own branch and concatenate the results.
preprocessor = ColumnTransformer([
    ('num', num_pipeline, num_features),
    ('cat', cat_pipeline, cat_features),
])

# Full Pipeline (Preprocessing + Model)
# random_state is pinned on the classifier: without it the forest draws from
# NumPy's global RNG, so re-running this cell (or running cells out of order)
# would report a different accuracy each time.
full_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Fit: the preprocessing steps and the classifier are all fitted on the training split only.
full_pipeline.fit(X_train, y_train)

# Predict: the already-fitted preprocessing is applied to the test split before classification.
y_pred = full_pipeline.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("The pipeline handled missing values, encoding, and scaling behind the scenes!")