In [None]:
# Install the packages
# Installs/upgrades the KFP SDK, the Google Cloud pipeline components and the
# Vertex AI SDK into the user site-packages (--user), bypassing the pip cache.
# NOTE(review): the version specifiers are lower bounds only ("kfp>2"), so a
# later re-run can resolve to different releases; pin exact versions if the
# notebook must be reproducible.
! pip3 install --user --no-cache-dir --upgrade "kfp>2" "google-cloud-pipeline-components>2" \
                                        google-cloud-aiplatform

# Print the installed versions of the three packages as a sanity check.
! python3 -c "import kfp; print('KFP SDK version: {}'.format(kfp.__version__))"
! pip3 freeze | grep aiplatform
! python3 -c "import google_cloud_pipeline_components; print('google_cloud_pipeline_components version: {}'.format(google_cloud_pipeline_components.__version__))"

In [None]:
import kfp
import typing
from typing import Dict
from typing import NamedTuple
from kfp import dsl
from kfp.dsl import (
    Artifact,
    Dataset,
    Input,
    Model,
    Output,
    Metrics,
    ClassificationMetrics,
    component,
    OutputPath,
    InputPath,
)
import google.cloud.aiplatform as aip
from google_cloud_pipeline_components.types import artifact_types

#### Project and Pipeline Configurations

In [1]:
# Google Cloud project in which this pipeline runs.
PROJECT_ID = "de23-398309"
# Region in which this pipeline runs.
REGION = "us-central1"
# Cloud Storage URI that the pipeline's service account can access. The
# artifacts of every pipeline run are stored under this pipeline root.
PIPELINE_ROOT = "gs://data_de2023_2065718"


#### Pipeline Component : Data Ingestion

In [2]:
@dsl.component(
    packages_to_install=["google-cloud-storage"],
    base_image="python:3.10.7-slim",
)
def download_data(
    project_id: str, bucket: str, file_name: str, dataset: Output[Dataset]
):
    """Download one object from a Cloud Storage bucket into the output artifact.

    The object is written to ``dataset.path + ".csv"``; components that read
    this artifact must append the same suffix.

    Args:
        project_id: Google Cloud project used to create the storage client.
        bucket: Name of the bucket that holds the object.
        file_name: Name of the object (blob) to download.
        dataset: Output artifact whose path, plus ".csv", receives the file.
    """
    import logging
    import sys

    from google.cloud import storage

    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    # Downloading the file from a Google Cloud Storage bucket. The bucket
    # handle gets its own name so the `bucket` string argument is not rebound.
    client = storage.Client(project=project_id)
    source_bucket = client.bucket(bucket)
    blob = source_bucket.blob(file_name)
    blob.download_to_filename(dataset.path + ".csv")
    logging.info("Downloaded Data!")

NameError: name 'dsl' is not defined

#### Pipeline Component : Train and Test Split

In [None]:
@dsl.component(
    packages_to_install=["pandas", "scikit-learn"], base_image="python:3.10.7-slim"
)
def train_test_split(
    dataset: Input[Dataset],
    dataset_train_X: Output[Dataset],
    dataset_test_X: Output[Dataset],
    dataset_train_y: Output[Dataset],
    dataset_test_y: Output[Dataset],
):
    """Split the ingested CSV into train/test features and labels.

    The "quality" column is the label; every other column is a feature. 20% of
    the rows go to the test split, with a fixed random seed (6).

    Args:
        dataset: Input artifact; the CSV is read from ``dataset.path + ".csv"``.
        dataset_train_X: Output artifact for the training features.
        dataset_test_X: Output artifact for the test features.
        dataset_train_y: Output artifact for the training labels.
        dataset_test_y: Output artifact for the test labels.

    Each output is written to ``<artifact>.path + ".csv"``.
    """
    import logging
    import sys

    import pandas as pd
    from sklearn.model_selection import train_test_split as tts

    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    # download_data stores the file at "<path>.csv", so the suffix is required
    # here as well; reading the bare artifact path finds no file.
    alldata = pd.read_csv(dataset.path + ".csv", index_col=None)

    # drop() defaults to row labels (axis=0); the label column has to be
    # removed explicitly via `columns=`.
    features = alldata.drop(columns=["quality"])
    labels = alldata["quality"]

    X_train, X_test, y_train, y_test = tts(
        features, labels, test_size=0.20, random_state=6
    )
    X_train.to_csv(dataset_train_X.path + ".csv", index=False, encoding="utf-8-sig")
    X_test.to_csv(dataset_test_X.path + ".csv", index=False, encoding="utf-8-sig")
    y_train.to_csv(dataset_train_y.path + ".csv", index=False, encoding="utf-8-sig")
    y_test.to_csv(dataset_test_y.path + ".csv", index=False, encoding="utf-8-sig")
    logging.info(
        "Split %d rows into %d train / %d test", len(alldata), len(X_train), len(X_test)
    )

#### Pipeline Component : PCA


In [None]:
@dsl.component(
    packages_to_install=["pandas", "scikit-learn"], base_image="python:3.10.7-slim"
)
def PCA(standard_features: Input[Dataset], pca_features: Output[Dataset]):
    """Standardise the input features and project them onto principal components.

    Args:
        standard_features: Input artifact; the CSV is read from
            ``standard_features.path + ".csv"``.
        pca_features: Output artifact; the projected features are written to
            ``pca_features.path + ".csv"`` with columns ``pc_1 .. pc_k``.
    """
    import logging
    import sys

    import pandas as pd
    # Aliased so the estimator is not confused with this component's own name.
    from sklearn.decomposition import PCA as SklearnPCA
    from sklearn.preprocessing import StandardScaler

    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    scaler = StandardScaler()

    train_features = pd.read_csv(standard_features.path + ".csv")
    scaled_features = pd.DataFrame(
        scaler.fit_transform(train_features),
        columns=train_features.columns,
        index=train_features.index,
    )

    # With svd_solver="full", a float n_components is the fraction of variance
    # the kept components must explain.
    # NOTE(review): 0.1 keeps only 10% of the variance, which looks low;
    # confirm this is intended.
    pca = SklearnPCA(n_components=0.1, svd_solver="full")
    components = pca.fit_transform(scaled_features)

    # fit_transform returns a NumPy array, which has no to_csv(); wrap it in a
    # DataFrame before persisting.
    pca_df = pd.DataFrame(
        components,
        columns=[f"pc_{i + 1}" for i in range(components.shape[1])],
        index=scaled_features.index,
    )
    logging.info("PCA kept %d component(s)", pca_df.shape[1])

    pca_df.to_csv(pca_features.path + ".csv", index=False, encoding="utf-8-sig")

#### Pipeline Component : Training-RF 

In [1]:
@dsl.component(
    packages_to_install=["pandas", "scikit-learn"], base_image="python:3.10.7-slim"
)
def train_rf(
    train_features_X: Input[Dataset],
    test_features_X: Input[Dataset],
    train_features_y: Input[Dataset],
    test_features_y: Input[Dataset],
    out_model: Output[Model],
) -> NamedTuple("outputs", metrics=dict):
    """Grid-search a random forest classifier and evaluate it on the test split.

    Args:
        train_features_X: Training features, read from ``path + ".csv"``.
        test_features_X: Test features, read from ``path + ".csv"``.
        train_features_y: Training labels (single-column CSV), read from
            ``path + ".csv"``.
        test_features_y: Test labels (single-column CSV), read from
            ``path + ".csv"``.
        out_model: Output model artifact. The fitted ``GridSearchCV`` object is
            pickled to ``out_model.path + ".pkl"``; ``file_type``, ``algo`` and
            ``best_params`` are recorded in its metadata.

    Returns:
        A named tuple whose ``metrics`` field holds accuracy, precision, recall
        and f1_score on the test split.
    """
    import logging
    import pickle
    import sys

    import pandas as pd
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
    from sklearn.model_selection import GridSearchCV

    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    X_train = pd.read_csv(train_features_X.path + ".csv")
    # The label file has a single column; take it as a 1-D Series so the
    # estimator receives a label vector rather than an (n, 1) frame.
    y_train = pd.read_csv(train_features_y.path + ".csv").iloc[:, 0]

    logging.info(X_train.columns)

    parameters = {
        "n_estimators": [100, 250],
        "criterion": ["gini", "entropy", "log_loss"],
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf": [1, 5, 10],
    }

    rf = RandomForestClassifier(random_state=6)
    rf_gs = GridSearchCV(rf, parameters)
    rf_gs.fit(X_train, y_train)
    best_params = rf_gs.best_params_

    X_test = pd.read_csv(test_features_X.path + ".csv")
    y_test = pd.read_csv(test_features_y.path + ".csv").iloc[:, 0]

    # Predicting Test Set
    y_pred = rf_gs.predict(X_test)

    # Cast to built-in floats so the dict output is plainly JSON-serialisable.
    # NOTE(review): with average="micro" on single-label data, precision,
    # recall and F1 all coincide with accuracy.
    metrics_dict = {
        "accuracy": float(accuracy_score(y_test, y_pred)),
        "precision": float(precision_score(y_test, y_pred, average="micro")),
        "recall": float(recall_score(y_test, y_pred, average="micro")),
        "f1_score": float(f1_score(y_test, y_pred, average="micro")),
    }

    logging.info(metrics_dict)

    out_model.metadata["file_type"] = ".pkl"
    out_model.metadata["algo"] = "rf_gs"
    out_model.metadata["best_params"] = best_params

    # Save the model
    m_file = out_model.path + ".pkl"
    with open(m_file, "wb") as f:
        pickle.dump(rf_gs, f)

    outputs = NamedTuple("outputs", metrics=dict)
    return outputs(metrics_dict)

SyntaxError: invalid syntax. Perhaps you forgot a comma? (1858410057.py, line 33)

#### Pipeline Component : Training-GBC 

In [None]:
@dsl.component(
    packages_to_install=["pandas", "scikit-learn"], base_image="python:3.10.7-slim"
)
def train_gbc(
    train_features_X: Input[Dataset],
    test_features_X: Input[Dataset],
    train_features_y: Input[Dataset],
    test_features_y: Input[Dataset],
    out_model: Output[Model],
) -> NamedTuple("outputs", metrics=dict):
    """Grid-search a gradient boosting classifier and evaluate it on the test split.

    Args:
        train_features_X: Training features, read from ``path + ".csv"``.
        test_features_X: Test features, read from ``path + ".csv"``.
        train_features_y: Training labels (single-column CSV), read from
            ``path + ".csv"``.
        test_features_y: Test labels (single-column CSV), read from
            ``path + ".csv"``.
        out_model: Output model artifact. The fitted ``GridSearchCV`` object is
            pickled to ``out_model.path + ".pkl"``; ``file_type``, ``algo`` and
            ``best_params`` are recorded in its metadata.

    Returns:
        A named tuple whose ``metrics`` field holds accuracy, precision, recall
        and f1_score on the test split.
    """
    import logging
    import pickle
    import sys

    import pandas as pd
    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
    from sklearn.model_selection import GridSearchCV

    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    X_train = pd.read_csv(train_features_X.path + ".csv")
    # The label file has a single column; take it as a 1-D Series so the
    # estimator receives a label vector rather than an (n, 1) frame.
    y_train = pd.read_csv(train_features_y.path + ".csv").iloc[:, 0]

    logging.info(X_train.columns)

    # GradientBoostingClassifier's `criterion` measures regression-tree split
    # quality; "gini"/"entropy"/"log_loss" are random-forest values and are
    # rejected here, which would make every grid-search fit fail.
    parameters = {
        "n_estimators": [100, 250],
        "criterion": ["friedman_mse", "squared_error"],
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf": [1, 5, 10],
    }

    gbc = GradientBoostingClassifier(random_state=6)
    gbc_gs = GridSearchCV(gbc, parameters)
    gbc_gs.fit(X_train, y_train)
    best_params = gbc_gs.best_params_

    X_test = pd.read_csv(test_features_X.path + ".csv")
    y_test = pd.read_csv(test_features_y.path + ".csv").iloc[:, 0]

    # Predicting Test Set
    y_pred = gbc_gs.predict(X_test)

    # Cast to built-in floats so the dict output is plainly JSON-serialisable.
    metrics_dict = {
        "accuracy": float(accuracy_score(y_test, y_pred)),
        "precision": float(precision_score(y_test, y_pred, average="micro")),
        "recall": float(recall_score(y_test, y_pred, average="micro")),
        "f1_score": float(f1_score(y_test, y_pred, average="micro")),
    }

    logging.info(metrics_dict)

    out_model.metadata["file_type"] = ".pkl"
    out_model.metadata["algo"] = "gbc_gs"
    out_model.metadata["best_params"] = best_params

    # Save the model
    m_file = out_model.path + ".pkl"
    with open(m_file, "wb") as f:
        pickle.dump(gbc_gs, f)

    outputs = NamedTuple("outputs", metrics=dict)
    return outputs(metrics_dict)

#### Pipeline Component : Prediction-RF


In [None]:
@dsl.component(
    packages_to_install=["pandas", "scikit-learn"], base_image="python:3.10.7-slim"
)
def predict_rf(model: Input[Model], features: Input[Dataset], results: Output[Dataset]):
    """Score a dataset with a pickled model and store the predictions.

    Args:
        model: Input model artifact; the pickle is read from
            ``model.path + ".pkl"``.
        features: Input dataset, read from ``features.path + ".csv"``. It must
            contain a "quality" column, which is dropped before predicting.
        results: Output artifact; the input rows plus a "pclass" prediction
            column are written to ``results.path + ".csv"``.
    """
    import logging
    import pickle
    import sys

    import pandas as pd

    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    df = pd.read_csv(features.path + ".csv")

    filename = model.path + ".pkl"

    # Loading the saved model. The context manager closes the file handle.
    # NOTE(review): unpickling runs arbitrary code from the file; this assumes
    # the artifact was produced by this pipeline's own training step.
    with open(filename, "rb") as model_file:
        model_rf = pickle.load(model_file)

    X_test = df.drop(columns=["quality"])

    df_complete = df.copy()
    y_pred = model_rf.predict(X_test)
    logging.info(y_pred)
    df_complete["pclass"] = y_pred.tolist()
    df_complete.to_csv(results.path + ".csv", index=False, encoding="utf-8-sig")


#### Pipeline Component : Prediction-GBC

In [None]:
@dsl.component(
    packages_to_install=["pandas", "scikit-learn"], base_image="python:3.10.7-slim"
)
def predict_gbc(model: Input[Model], features: Input[Dataset], results: Output[Dataset]):
    """Score a dataset with the pickled gradient boosting model.

    This cell previously re-declared ``predict_rf``, silently replacing the
    random-forest predictor defined in the cell before it; it now has its own
    name.

    Args:
        model: Input model artifact; the pickle is read from
            ``model.path + ".pkl"``.
        features: Input dataset, read from ``features.path + ".csv"``. It must
            contain a "quality" column, which is dropped before predicting.
        results: Output artifact; the input rows plus a "pclass" prediction
            column are written to ``results.path + ".csv"``.
    """
    import logging
    import pickle
    import sys

    import pandas as pd

    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    df = pd.read_csv(features.path + ".csv")

    filename = model.path + ".pkl"

    # Loading the saved model. The context manager closes the file handle.
    # NOTE(review): unpickling runs arbitrary code from the file; this assumes
    # the artifact was produced by this pipeline's own training step.
    with open(filename, "rb") as model_file:
        model_gbc = pickle.load(model_file)

    X_test = df.drop(columns=["quality"])

    df_complete = df.copy()
    y_pred = model_gbc.predict(X_test)
    logging.info(y_pred)
    df_complete["pclass"] = y_pred.tolist()
    df_complete.to_csv(results.path + ".csv", index=False, encoding="utf-8-sig")


#### Pipeline Component : Algorithm Selection 

In [None]:
@dsl.component(base_image="python:3.10.7-slim")
def compare_model(rf_metrics: dict, gbc_metrics: dict) -> str:
    """Pick the algorithm with the higher "f1_score".

    Args:
        rf_metrics: Metrics dict of the random forest model.
        gbc_metrics: Metrics dict of the gradient boosting model.

    Returns:
        "rf" when the random forest's f1_score is strictly greater, otherwise
        "gbc" (ties therefore go to "gbc").
    """
    import json
    import logging
    import sys

    logging.basicConfig(stream=sys.stdout, level=logging.INFO)
    for candidate_metrics in (rf_metrics, gbc_metrics):
        logging.info(candidate_metrics)

    rf_score = rf_metrics.get("f1_score")
    gbc_score = gbc_metrics.get("f1_score")
    return "rf" if rf_score > gbc_score else "gbc"

#### Pipeline Component : Upload Model to Google Cloud Storage

In [None]:
@dsl.component(
    packages_to_install=["google-cloud-storage"], base_image="python:3.10.7-slim"
)
def upload_model_to_gcs(project_id: str, model_repo: str, model: Input[Model]):
    """Copy a model artifact file into a Cloud Storage bucket.

    The local file is ``model.path`` plus the ``file_type`` metadata value; the
    destination object is named ``<algo>_model<file_type>`` from the artifact's
    metadata.

    Args:
        project_id: Google Cloud project used to create the storage client.
        model_repo: Name of the destination bucket.
        model: Input model artifact carrying "algo" and "file_type" metadata.
    """
    import logging
    import sys

    from google.cloud import storage

    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    # Resolve the destination bucket first, then derive names from metadata.
    gcs_client = storage.Client(project=project_id)
    target_bucket = gcs_client.bucket(model_repo)

    algo_name = str(model.metadata["algo"])
    object_name = algo_name + "_model" + str(model.metadata["file_type"])
    target_blob = target_bucket.blob(object_name)

    local_file = model.path + str(model.metadata["file_type"])
    target_blob.upload_from_filename(local_file)

    print("Saved the model to GCP bucket : " + model_repo)


#### Define the Pipeline

In [None]:
# Define the workflow of the pipeline.
@kfp.dsl.pipeline(name="winequality-predictor-training-pipeline-v1")
def pipeline(project_id: str, data_bucket: str, data_filename: str, model_repo: str):
    """Train RF and GBC classifiers, pick the better one, then score and publish it.

    Steps: download the CSV, split it into train/test, train both models,
    compare their f1 scores, then, only for the winning algorithm, run
    prediction and upload the model to the model bucket.

    Args:
        project_id: Google Cloud project used by the storage components.
        data_bucket: Bucket that holds the input CSV.
        data_filename: Object name of the input CSV inside ``data_bucket``.
        model_repo: Bucket that receives the selected model.
    """
    di_op = download_data(
        project_id=project_id, bucket=data_bucket, file_name=data_filename
    )

    split_op = train_test_split(dataset=di_op.outputs["dataset"])

    training_rf_job_run_op = train_rf(
        train_features_X=split_op.outputs["dataset_train_X"],
        test_features_X=split_op.outputs["dataset_test_X"],
        train_features_y=split_op.outputs["dataset_train_y"],
        test_features_y=split_op.outputs["dataset_test_y"],
    )

    training_gbc_job_run_op = train_gbc(
        train_features_X=split_op.outputs["dataset_train_X"],
        test_features_X=split_op.outputs["dataset_test_X"],
        train_features_y=split_op.outputs["dataset_train_y"],
        test_features_y=split_op.outputs["dataset_test_y"],
    )

    # The metric inputs already make this task wait for both training tasks,
    # so no explicit .after() is needed.
    comp_model__op = compare_model(
        rf_metrics=training_rf_job_run_op.outputs["metrics"],
        gbc_metrics=training_gbc_job_run_op.outputs["metrics"],
    )

    # defining the branching condition; compare_model returns "rf" or "gbc".
    # The prediction component drops the "quality" column itself, so it is fed
    # the downloaded (labelled) dataset rather than the label-free test split.
    # NOTE(review): this scores the whole input file, training rows included;
    # point `features` at a separate hold-out file if one is available.
    with dsl.If(comp_model__op.output == "rf"):
        predict_rf_job_run_op = predict_rf(
            model=training_rf_job_run_op.outputs["out_model"],
            features=di_op.outputs["dataset"],
        )
        upload_model_rf_to_gc_op = upload_model_to_gcs(
            project_id=project_id,
            model_repo=model_repo,
            model=training_rf_job_run_op.outputs["out_model"],
        ).after(predict_rf_job_run_op)

    with dsl.If(comp_model__op.output == "gbc"):
        # predict_rf only unpickles the estimator it is handed and calls
        # predict(), so it serves the gradient boosting model as well.
        predict_gbc_job_run_op = predict_rf(
            model=training_gbc_job_run_op.outputs["out_model"],
            features=di_op.outputs["dataset"],
        )
        upload_model_gbc_to_gc_op = upload_model_to_gcs(
            project_id=project_id,
            model_repo=model_repo,
            model=training_gbc_job_run_op.outputs["out_model"],
        ).after(predict_gbc_job_run_op)
