# ML Pipeline Example

---

In [3]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score

In [2]:
df = sns.load_dataset("titanic")

X = df[['pclass', 'sex', 'age', 'fare', 'embarked']]
y = df['survived']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

numeric_features = ['age', 'fare']
categorical_features = ['pclass', 'sex', 'embarked']

numeric_transformer = Pipeline(steps=[
    (imputer := 'imputer', SimpleImputer(strategy='median')),
])

categorical_transformer = Pipeline(steps=[
    (encoder := 'encoder', OneHotEncoder(handle_unknown='ignore')),
    (imputer := 'imputer', SimpleImputer(strategy='most_frequent')),
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ])

pipeline = Pipeline(
    steps = [
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(random_state=42))
    ]
)

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")

Accuracy: 0.79


# Hyperparameter Tuning in Pipeline

---

In [6]:
pipeline = Pipeline(
    [
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(handle_unknown='ignore')),
        ('model', RandomForestClassifier(random_state=42))
    ]
)

hyperparameters = {
    'model__n_estimators': [100, 200, 300, 500],
    'model__max_depth': [None, 5, 10, 20],
    'model__min_samples_split': [2, 5, 10,20]
}

grid_search = GridSearchCV(
    pipeline,
    hyperparameters,
    cv=5,
)

grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_

y_pred = best_model.predict(X_test)

print(f"Accuracy after hyperparameter tuning: {accuracy_score(y_test, y_pred):.2f}")
print(f"Best hyperparameters: {grid_search.best_params_}")

Accuracy after hyperparameter tuning: 0.82
Best hyperparameters: {'model__max_depth': 20, 'model__min_samples_split': 2, 'model__n_estimators': 300}
