In [11]:
# Install/upgrade the pipeline tooling into the user site-packages:
# the Kubeflow Pipelines SDK (kfp), the Google Cloud Pipeline Components,
# and the Vertex AI SDK (google-cloud-aiplatform).
! pip3 install --user --no-cache-dir --upgrade "kfp>2" "google-cloud-pipeline-components>2" \
                                        google-cloud-aiplatform



In [1]:
import os

# Restart the kernel after the installs, unless the IS_TESTING environment
# variable is set to a non-empty value.
if not os.getenv("IS_TESTING"):
    import IPython

    # True is passed as the shutdown call's argument; presumably it requests
    # a restart rather than a plain stop -- confirm against the ipykernel docs.
    IPython.Application.instance().kernel.do_shutdown(True)

In [1]:
import kfp
import typing
from typing import Dict
from typing import NamedTuple
from kfp import dsl
from kfp.dsl import (Artifact,
                        Dataset,
                        Input,
                        Model,
                        Output,
                        Metrics,
                        ClassificationMetrics,
                        component, 
                        OutputPath, 
                        InputPath)
import google.cloud.aiplatform as aip
from google_cloud_pipeline_components.v1.model import ModelUploadOp
from google_cloud_pipeline_components.v1.endpoint import (EndpointCreateOp,ModelDeployOp)
from google_cloud_pipeline_components.types import artifact_types

In [3]:
# The Google Cloud project that this pipeline runs in.
PROJECT_ID = "degroup11"
# The region that this pipeline runs in.
REGION = "us-central1"
# A Cloud Storage URI that the pipeline's service account can access.
# The artifacts of the pipeline runs are stored within this pipeline root.
PIPELINE_ROOT = "gs://temp_degroup11"

Component: Data Ingestion

In [4]:
@dsl.component(
    # pandas was listed and imported but never used by this component, so it
    # is no longer installed into the container.
    packages_to_install=["google-cloud-storage"],
    base_image="python:3.10.7-slim"
)
def download_data(project_id: str, bucket: str, file_name: str, dataset: Output[Dataset]):
    '''Download one file from a Cloud Storage bucket into the dataset artifact.

    Args:
        project_id: Google Cloud project used to create the storage client.
        bucket: Name of the Cloud Storage bucket that contains the file.
        file_name: Name of the blob inside the bucket to download.
        dataset: Output artifact; the file is saved at ``dataset.path + ".csv"``.
    '''
    from google.cloud import storage
    import logging
    import sys

    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    # Download the file from the Cloud Storage bucket. A separate local name
    # is used for the Bucket object so the ``bucket`` argument (a string) is
    # not rebound to a different type.
    client = storage.Client(project=project_id)
    source_bucket = client.bucket(bucket)
    blob = source_bucket.blob(file_name)
    # The ".csv" suffix is appended to the artifact path; readers of this
    # artifact must append the same suffix.
    blob.download_to_filename(dataset.path + ".csv")
    logging.info('Downloaded Data!')

Component: Feature selection

In [5]:
@dsl.component(
    packages_to_install=["pandas", "scikit-learn"],
    base_image="python:3.10.7-slim"
)
def feature_selection(dataset: Input[Dataset], dataset_train_x: Output[Dataset], dataset_train_y: Output[Dataset], dataset_test_y: Output[Dataset], dataset_test_x: Output[Dataset]):
    '''Encode categorical columns, select features and write a train/test split.

    Reads the CSV at ``dataset.path + ".csv"``, label-encodes the
    ``Geography`` and ``Gender`` columns, keeps the feature columns and the
    ``Exited`` target, and writes a stratified 80/20 split as four CSV files.

    Args:
        dataset: Input artifact holding the complete data set.
        dataset_train_x: Output artifact for the training features.
        dataset_train_y: Output artifact for the training target.
        dataset_test_y: Output artifact for the test target.
        dataset_test_x: Output artifact for the test features.
    '''
    import pandas as pd
    import logging
    import sys
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import LabelEncoder

    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    categorical_columns = ['Geography', 'Gender']
    x_columns = ['CreditScore', 'Geography',
       'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary']
    y_columns = ['Exited']
    # Fixed seed so that repeated pipeline runs produce the same split.
    split_seed = 42

    complete_data = pd.read_csv(dataset.path + ".csv", index_col=None)

    # Each categorical column is replaced by integer codes.
    # NOTE(review): the fitted encoders are not saved, so the mapping cannot
    # be replayed on new data outside this component -- confirm this is intended.
    for column in categorical_columns:
        complete_data[column] = LabelEncoder().fit_transform(complete_data[column])

    # Select first, then copy: only the needed columns are duplicated.
    X = complete_data[x_columns].copy()
    y = complete_data[y_columns].copy()

    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2,
                                                    stratify=y,
                                                    random_state=split_seed)
    logging.info('Split data: %d training rows, %d test rows', len(X_train), len(X_test))

    # Every split is stored with a ".csv" suffix appended to the artifact path.
    X_train.to_csv(dataset_train_x.path + ".csv" , index=False, encoding='utf-8-sig')
    y_train.to_csv(dataset_train_y.path + ".csv" , index=False, encoding='utf-8-sig')
    X_test.to_csv(dataset_test_x.path + ".csv" , index=False, encoding='utf-8-sig')
    y_test.to_csv(dataset_test_y.path + ".csv" , index=False, encoding='utf-8-sig')

Component: Training RF model

In [6]:
@dsl.component(
    packages_to_install=["pandas", "scikit-learn"],
    base_image="python:3.10.7-slim"
)
def train_rfc(features_x: Input[Dataset], features_y: Input[Dataset], out_model: Output[Model]):
    '''Train a random forest classifier and pickle it into the model artifact.

    Args:
        features_x: Input artifact with the training features, read from
            ``features_x.path + ".csv"``.
        features_y: Input artifact with the training target, read from
            ``features_y.path + ".csv"``; the ``Exited`` column is used.
        out_model: Output artifact; the fitted model is pickled to
            ``out_model.path + ".pkl"`` and its ``framework`` metadata is set.
    '''
    import pandas as pd
    import logging
    import sys
    import pickle
    from sklearn.ensemble import RandomForestClassifier

    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    df_x = pd.read_csv(features_x.path + ".csv")
    df_y = pd.read_csv(features_y.path + ".csv")

    logging.info(df_x.columns)

    # Default hyper-parameters; the seed is fixed so that retraining on the
    # same data yields the same forest.
    model_rfc = RandomForestClassifier(random_state=42)
    model_rfc.fit(df_x, df_y['Exited'])

    # Save the model next to the artifact path, with a ".pkl" suffix.
    out_model.metadata["framework"] = "RFC"
    file_name = out_model.path + ".pkl"
    with open(file_name, 'wb') as file:
        pickle.dump(model_rfc, file)

Component: Evaluate model performance

In [7]:
@dsl.component(
    packages_to_install = [
       "pandas", "scikit-learn", "numpy"], 
    base_image="python:3.10.7-slim"
)
def rfc_model_evaluation(test_set_x:  Input[Dataset], test_set_y:  Input[Dataset], model_rfc: Input[Model], kpi: Output[Metrics]):
    '''Score the pickled model on the test split and log its accuracy.

    Args:
        test_set_x: Input artifact with the test features, read from
            ``test_set_x.path + ".csv"``.
        test_set_y: Input artifact with the test target, read from
            ``test_set_y.path + ".csv"``.
        model_rfc: Input artifact; the model is unpickled from
            ``model_rfc.path + ".pkl"``.
        kpi: Output metrics artifact; receives the ``accuracy`` metric.
    '''
    import pandas as pd
    import pickle
    from sklearn.metrics import accuracy_score

    data_x = pd.read_csv(test_set_x.path + ".csv")
    data_y = pd.read_csv(test_set_y.path + ".csv")

    # Load the saved model with pickle. The context manager closes the file
    # handle, which the previous bare open() call left open.
    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # load model artifacts produced by a trusted pipeline step.
    m_filename = model_rfc.path + ".pkl"
    with open(m_filename, 'rb') as model_file:
        model = pickle.load(model_file)

    predictions_rfc = model.predict(data_x)
    accuracy = accuracy_score(data_y, predictions_rfc)

    # Cast to a plain float before logging the metric.
    kpi.log_metric("accuracy", float(accuracy))

Upload model and metrics to Google Cloud Storage buckets

In [8]:
@dsl.component(
    packages_to_install=["google-cloud-storage"],
    base_image="python:3.10.7-slim"
)
def upload_model_to_gcs(project_id: str, model_repo: str, model: Input[Model]):
    '''Upload the pickled model to a Cloud Storage bucket as ``model.pkl``.

    Args:
        project_id: Google Cloud project used to create the storage client.
        model_repo: Name of the destination Cloud Storage bucket.
        model: Input artifact; the file at ``model.path + ".pkl"`` is uploaded.
    '''
    from google.cloud import storage
    import logging
    import sys

    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    # Upload the model to GCS. The destination blob name is fixed, so each
    # upload targets the same object name in the bucket.
    client = storage.Client(project=project_id)
    bucket = client.bucket(model_repo)
    blob = bucket.blob('model.pkl')
    source_file_name = model.path + '.pkl'

    blob.upload_from_filename(source_file_name)

    # Report through the logger configured above instead of a bare print.
    logging.info("File %s uploaded to %s.", source_file_name, model_repo)

Defining the pipeline

In [9]:
# Define the workflow of the pipeline.
# Steps wired below: download_data -> feature_selection -> train_rfc, whose
# model output feeds both rfc_model_evaluation and upload_model_to_gcs.
# NOTE(review): dataset_uri and model_repo_uri are accepted as parameters but
# are not used by any of the steps shown here.
@kfp.dsl.pipeline(
    name="churn-predictor-training-pipeline")
def pipeline(project_id: str, data_bucket: str, file_name: str, dataset_uri: str, model_repo: str, model_repo_uri:str):    
    
    # Fetch the raw file from the data bucket.
    di_op = download_data(
        project_id=project_id,
        bucket=data_bucket,
        file_name=file_name
    )
     
    # Produce the train/test feature and target data sets.
    model_feature_selection = feature_selection(dataset=di_op.output)
        
    # Train on the training split only.
    model_train = train_rfc(features_x=model_feature_selection.outputs["dataset_train_x"], features_y=model_feature_selection.outputs["dataset_train_y"])
    
    # Evaluate the trained model on the held-out test split.
    model_evaluation = rfc_model_evaluation(
        test_set_x=model_feature_selection.outputs["dataset_test_x"],
        test_set_y=model_feature_selection.outputs["dataset_test_y"],
        model_rfc=model_train.output,
    )
    

    # The upload consumes only the training output; it takes no input from the
    # evaluation step, so it is not gated on the evaluation result.
    upload_model_to_gc_op = upload_model_to_gcs(
        project_id=project_id,
        model_repo=model_repo,
        model=model_train.output
    )