### Lecture 8 - ML pipelines: Structure and automation
Assignment: Build a minimal training pipeline

Instructions:

* Create a small end-to-end pipeline in code
* Save outputs and metrics
* Keep short comments explaining design choices

## Task 1: Pipeline in code
Build a small end-to-end pipeline with preprocessing and a model.

In [None]:
# TODO: Build a scikit-learn Pipeline with:
# - StandardScaler
# - Model of choice (LogisticRegression or SVC)

# A pipeline is a series of steps that we run
# In ML, we often use pipelines for preprocessing steps
# such as standardization, transformation, and reshaping,
# but also for training.

# Today, we build a pipeline with standardization and model creation
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Chain the preprocessing step and the classifier into one estimator.
# Because both steps live in a single object, the exact same workflow
# (scale, then fit/predict) can be re-created with one call.

scaling_step = ("scaler", StandardScaler())
model_step = ("model", LogisticRegression(max_iter=1000))

pipeline = Pipeline(steps=[scaling_step, model_step])

In [2]:
# TODO: Train and evaluate on a dataset
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

# Load Iris and hold out 25% of the rows for testing; the fixed
# random_state makes the split reproducible from run to run.
data = load_iris(as_frame=True)
X_train, X_test, y_train, y_test = train_test_split(
    data.data,
    data.target,
    test_size=0.25,
    random_state=42,
)

In [3]:
# Print the shape of each split as a quick sanity check of the
# train/test partition created above (test_size=0.25).
print("X_test shape:", X_test.shape)
print("X_train shape:", X_train.shape)

X_test shape: (38, 4)
X_train shape: (112, 4)


See L4_assignment_CLASSROOM.ipynb for detailed EDA:

[04_scikit_learn_api.ipynb](../Code_alongs/04_scikit_learn_api.ipynb)

[Web Version](https://github.com/LAjoyan/ML-Frameworks/blob/main/Code_alongs/04_scikit_learn_api.ipynb)


In [None]:
# Training and evaluation:
#
# Fitting the pipeline fits every step on the training split; predicting
# sends the test split through the same fitted steps. fit() returns the
# pipeline itself, so the two calls can be chained.

preds = pipeline.fit(X_train, y_train).predict(X_test)

# Macro-averaged F1 weights every class equally.
metrics = {
    "accuracy": accuracy_score(y_test, preds),
    "f1_macro": f1_score(y_test, preds, average="macro"),
}

print(metrics)
{'accuracy': 1.0, 'f1_macro': 1.0}

{'accuracy': 1.0, 'f1_macro': 1.0}


{'accuracy': 1.0, 'f1_macro': 1.0}

## Task 2: Automate training
Wrap the workflow into a reusable experiment function.

In [None]:
# TODO: Wrap training in a function run_experiment(config)

# A function like this risks becoming very long.
# We have to strike a balance between generality
# and brevity/readability.
# If the goal is to be able to reuse our experiment function,
# then some extra length is a price we may have to pay.

def run_experiment(config=None):
    """Build a scaler + model pipeline from a config dict.

    This is a generalizable pipeline, but it may not be maximally useful,
    since the caller has to keep track of and pass in the separate steps.

    Args:
        config: Optional dict with any of the keys
            ``"scaler"`` (transformer, default ``StandardScaler()``),
            ``"model"`` (estimator, default
            ``LogisticRegression(max_iter=1000)``) and
            ``"params"`` (dict of fixed hyperparameter values using the
            pipeline's ``<step>__<param>`` naming, or ``None``).
            Missing keys fall back to the defaults.

    Returns:
        The unfitted ``Pipeline`` with the steps ``"scaler"`` and ``"model"``.
    """
    # Build the defaults inside the call: a dict of estimator instances in
    # the signature would be created once and shared (and mutated by fit)
    # across every call that relies on the default.
    defaults = {
        "scaler": StandardScaler(),
        "model": LogisticRegression(max_iter=1000),
        "params": None,
    }
    settings = {**defaults, **(config or {})}

    pipeline = Pipeline(
        steps=[
            ("scaler", settings["scaler"]),
            ("model", settings["model"]),
        ]
    )

    # Hyperparameters are not a pipeline step: adding them as one would
    # turn the model into an intermediate (non-transformer) step. Apply
    # them as parameters of the existing steps instead.
    params = settings["params"]
    if params:
        pipeline.set_params(**params)

    return pipeline

def run_experiment_not_generalised(config):
    """Train and evaluate a scaled logistic regression on Iris, then save it.

    Args:
        config: Dict with the keys ``"max_iter"`` and ``"C"``; both are
            passed straight to ``LogisticRegression``.

    Returns:
        Dict with ``"accuracy"``, ``"f1_macro"`` and the ``"params"`` used.

    Side effects:
        Writes ``metrics.json`` and ``model.joblib`` to the current
        working directory.
    """
    # Imported here so the function does not depend on a later notebook
    # cell having been executed first.
    import json
    from pathlib import Path

    import joblib

    # Use a small dataset to keep runtime low
    iris = load_iris()
    X_train, X_test, y_train, y_test = train_test_split(
        iris.data, iris.target, test_size=0.2, random_state=42, stratify=iris.target
    )

    pipeline = Pipeline(
        steps=[
            ("scaler", StandardScaler()),
            ("model", LogisticRegression(max_iter=config["max_iter"], C=config["C"])),
        ]
    )

    pipeline.fit(X_train, y_train)
    preds = pipeline.predict(X_test)

    metrics = {
        "accuracy": accuracy_score(y_test, preds),
        "f1_macro": f1_score(y_test, preds, average="macro"),
        "params": {"C": config["C"], "max_iter": config["max_iter"]},
    }

    # Save outputs. write_text is a Path method; a plain str has no such
    # method, so the file name must be wrapped in Path first.
    Path("metrics.json").write_text(json.dumps(metrics, indent=2), encoding="utf-8")
    joblib.dump(pipeline, "model.joblib")

    return metrics

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import get_scorer

def run_experiment(config):
    """
    Train and score one model, driven entirely by a config dict.

    The data is split into a train and a test part, a Pipeline is assembled
    from the preprocessing steps followed by the model, the pipeline is
    optionally wrapped in a 5-fold GridSearchCV, fitted on the train part
    and scored on the test part. The test score is also printed.

    config: dict
        {
            "X": feature matrix,
            "y": target vector,
            "test_size": float (optional, default=0.2),
            "random_state": int (optional, default=42),
            "preprocessing": list of (name, transformer) tuples (optional),
            "model": sklearn estimator,
            "params": dict of hyperparameters for GridSearchCV (optional),
            "scoring": str or callable metric (optional, default='accuracy')
        }

    Returns a tuple (fitted estimator, test score). The fitted estimator is
    the GridSearchCV object when "params" was given, otherwise the Pipeline.
    """

    # Read everything we need from the config, falling back to defaults
    # for the optional entries.
    features, target = config["X"], config["y"]
    holdout_fraction = config.get("test_size", 0.2)
    seed = config.get("random_state", 42)
    pre_steps = config.get("preprocessing", [])
    estimator = config["model"]
    param_grid = config.get("params", None)
    scoring = config.get("scoring", "accuracy")

    # Hold out a test set
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=holdout_fraction, random_state=seed
    )

    # Build the pipeline: all preprocessing steps (scaling, transforms,
    # etc.) come first and the model is always the last step, e.g.
    #     [("scaler", StandardScaler()), ("model", LogisticRegression())]
    fitted = Pipeline(steps=pre_steps + [("model", estimator)])

    # ADVANCED: take a look yourselves if you're curious.
    # When a hyperparameter grid is given, search over it with
    # cross-validation instead of fitting the bare pipeline.
    if param_grid:
        fitted = GridSearchCV(fitted, param_grid, cv=5, n_jobs=-1, scoring=scoring)

    fitted.fit(X_train, y_train)

    # Evaluate on the held-out data. get_scorer() resolves the configured
    # metric, so no specific metric function (f1-score, accuracy) has to
    # be imported here. A confusion matrix can be added later.
    score = get_scorer(scoring)(fitted, X_test, y_test)

    print(f"{scoring} on test set: {score:.4f}")

    return fitted, score

In [8]:
# TODO: Save metrics to metrics.json

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris
from sklearn.svm import SVC
import joblib
import json

# All experiments share the same Iris feature matrix and target vector.
X, y = load_iris(return_X_y=True)

# Each config fully describes one experiment for run_experiment():
# the data, the split, the preprocessing steps, the model and the
# hyperparameter grid. Grid keys use the "model__<param>" form because
# the estimator is registered in the pipeline under the step name "model".

# Experiment 1: scaling + PCA to two components, logistic regression.
config_LogReg_025 = {
    "config_name": "LogReg_025",
    "X": X,
    "y": y,
    "test_size": 0.25,
    "random_state": 123,
    "preprocessing": [
        ("scaler", StandardScaler()),
        ("pca", PCA(n_components=2)),
    ],
    "model": LogisticRegression(max_iter=1000),
    "params": {
        "model__C": [0.1, 1, 10],
    },
    "scoring": "accuracy",
}

# Experiment 2: scaling only, support vector classifier.
config_SVM_03 = {
    "config_name": "SVM_03",
    "X": X,
    "y": y,
    "test_size": 0.3,
    "random_state": 456,
    "preprocessing": [
        ("scaler", StandardScaler()),
    ],
    "model": SVC(),
    "params": {
        "model__C": [0.1, 1, 10],
        "model__kernel": ["linear", "rbf"],
    },
    "scoring": "accuracy",
}

# Experiment 3: logistic regression again, but without PCA and with a
# grid of smaller C values.
config_LogReg_02 = {
    "config_name": "LogReg_02",
    "X": X,
    "y": y,
    "test_size": 0.2,
    "random_state": 789,
    "preprocessing": [
        ("scaler", StandardScaler()),
    ],
    "model": LogisticRegression(max_iter=1000),
    "params": {
        "model__C": [0.01, 0.1, 1],
    },
    "scoring": "accuracy",
}

configs = [config_LogReg_025, config_SVM_03, config_LogReg_02]

# Run every configured experiment and persist its metrics and fitted model
# under file names derived from the config's "config_name".
for config in configs:
    pipeline, score = run_experiment(config)

    # best_params_ is looked up defensively: it is only recorded when the
    # returned object exposes it (the grid-search case), otherwise None.
    metrics = {
        "accuracy": score,
        "params": getattr(pipeline, "best_params_", None),
    }

    # Metrics go to a JSON file ...
    with open(f"metrics_{config['config_name']}.json", "w") as f:
        f.write(json.dumps(metrics, indent=2))

    # ... and the fitted estimator to a joblib file.
    joblib.dump(pipeline, f"model_{config['config_name']}.joblib")

accuracy on test set: 0.8421
accuracy on test set: 0.9778
accuracy on test set: 0.9667


In [9]:
# TODO: Save the trained model with joblib

# NOTE: `pipeline` is whatever the name was last bound to. If the cells
# were run in order, that is the estimator returned for the final entry
# of `configs` in the experiment loop, not necessarily the best-scoring one.
joblib.dump(pipeline, "model.joblib")

['model.joblib']