In [1]:
# Install (or upgrade) the Kubeflow Pipelines SDK, the Google Cloud pipeline
# components and the Vertex AI SDK into the user site-packages.
# `--no-cache-dir` forces a fresh download instead of reusing pip's cache.
! pip3 install --user --no-cache-dir --upgrade "kfp>2" "google-cloud-pipeline-components>2" \
                                        google-cloud-aiplatform

Collecting google-cloud-pipeline-components>2
  Obtaining dependency information for google-cloud-pipeline-components>2 from https://files.pythonhosted.org/packages/72/c5/2f3d5da670bb627fd06a29ef799d51e6d514dde8bae8281530e6e6e6f7f9/google_cloud_pipeline_components-2.5.0-py3-none-any.whl.metadata
  Downloading google_cloud_pipeline_components-2.5.0-py3-none-any.whl.metadata (5.9 kB)
Downloading google_cloud_pipeline_components-2.5.0-py3-none-any.whl (1.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m25.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: google-cloud-pipeline-components
  Attempting uninstall: google-cloud-pipeline-components
    Found existing installation: google-cloud-pipeline-components 2.4.1
    Uninstalling google-cloud-pipeline-components-2.4.1:
      Successfully uninstalled google-cloud-pipeline-components-2.4.1
Successfully installed google-cloud-pipeline-components-2.5.0


In [15]:
import os

# Restart the kernel so that the packages installed above are picked up.
# The restart is skipped when the IS_TESTING environment variable is set
# to a non-empty value.
if not os.getenv("IS_TESTING"):
    # Automatically restart kernel after installs
    import IPython

    app = IPython.Application.instance()
    app.kernel.do_shutdown(True)

In [1]:
import kfp
import typing
from typing import Dict
from typing import NamedTuple
from kfp import dsl
from kfp.dsl import (Artifact,
                        Dataset,
                        Input,
                        Model,
                        Output,
                        Metrics,
                        ClassificationMetrics,
                        component, 
                        OutputPath, 
                        InputPath)
import google.cloud.aiplatform as aip
from google_cloud_pipeline_components.v1.model import ModelUploadOp
from google_cloud_pipeline_components.v1.endpoint import (EndpointCreateOp,ModelDeployOp)
from google_cloud_pipeline_components.types import artifact_types

In [2]:
# Google Cloud project that this pipeline runs in.
PROJECT_ID = "degroup11"
# Region that this pipeline runs in.
REGION = "us-central1"
# Cloud Storage URI that the pipelines service account can access.
# The artifacts of every pipeline run are stored within this pipeline root.
PIPELINE_ROOT = "gs://temp_degroup11"

Component: Downloading data

In [3]:
@dsl.component(
    packages_to_install=["pandas","google-cloud-storage"],
    base_image="python:3.10.7-slim"
)
def download_data(project_id: str, bucket: str, file_name: str, dataset: Output[Dataset]):
    '''Download one object from a Cloud Storage bucket into the ``dataset`` artifact.

    Args:
        project_id: Google Cloud project used to create the storage client.
        bucket: Name of the bucket that holds the source file.
        file_name: Name of the object (blob) inside ``bucket``.
        dataset: Output artifact; the file is written to ``dataset.path + ".csv"``.
    '''
    from google.cloud import storage
    import logging
    import sys

    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    # The ".csv" suffix is part of the contract with the components that read
    # this artifact: they open `<artifact path>.csv`, not the bare path.
    local_csv_path = dataset.path + ".csv"

    # Download the file from the Google Cloud Storage bucket. The `bucket`
    # parameter is left untouched instead of being rebound to a Bucket object.
    client = storage.Client(project=project_id)
    source_blob = client.bucket(bucket).blob(file_name)
    source_blob.download_to_filename(local_csv_path)
    logging.info('Downloaded Data!')

Component: Encoding and splitting the data

In [4]:
@dsl.component(
    packages_to_install=["pandas", "scikit-learn"],
    base_image="python:3.10.7-slim"
)
def feature_selection(dataset: Input[Dataset], dataset_train_x: Output[Dataset], dataset_train_y: Output[Dataset], dataset_test_y: Output[Dataset], dataset_test_x: Output[Dataset]):
    '''Label-encode the categorical columns and write a stratified 80/20 train/test split.

    Args:
        dataset: Input artifact; the data is read from ``dataset.path + ".csv"``.
        dataset_train_x: Output artifact receiving the training features.
        dataset_train_y: Output artifact receiving the training target ('Exited').
        dataset_test_y: Output artifact receiving the test target ('Exited').
        dataset_test_x: Output artifact receiving the test features.

    Every output is written as CSV to ``<artifact path>.csv``.
    '''
    import pandas as pd
    import logging
    import sys
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import LabelEncoder

    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    complete_data = pd.read_csv(dataset.path + ".csv", index_col=None)

    # Replace the string categories by integer codes, one encoder per column.
    for column in ['Geography', 'Gender']:
        complete_data[column] = LabelEncoder().fit_transform(complete_data[column])

    feature_columns = ['CreditScore', 'Geography',
                       'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
                       'IsActiveMember', 'EstimatedSalary']
    target_columns = ['Exited']

    X = complete_data[feature_columns]
    y = complete_data[target_columns]

    # A fixed seed makes the split reproducible, so that the models are
    # compared on the same test rows on every pipeline run.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=0.2,
        stratify=y,
        random_state=42)

    logging.info('Split data into %d training rows and %d test rows', len(X_train), len(X_test))

    splits = (
        (X_train, dataset_train_x),
        (y_train, dataset_train_y),
        (X_test, dataset_test_x),
        (y_test, dataset_test_y),
    )
    for frame, artifact in splits:
        frame.to_csv(artifact.path + ".csv", index=False, encoding='utf-8-sig')

Component: Training RandomForestClassifier model

In [5]:
@dsl.component(
    packages_to_install=["pandas", "scikit-learn"],
    base_image="python:3.10.7-slim"
)
def train_rfc(features_x: Input[Dataset], features_y: Input[Dataset], out_model: Output[Model]):
    '''Train a RandomForestClassifier with default hyperparameters and pickle it.

    Args:
        features_x: Training features, read from ``features_x.path + ".csv"``.
        features_y: Training target, read from ``features_y.path + ".csv"``;
            the 'Exited' column is used as the label.
        out_model: Output artifact; the fitted model is pickled to
            ``out_model.path + ".pkl"`` and tagged with framework "RFC".
    '''
    import pandas as pd
    import logging
    import sys
    import pickle
    from sklearn.ensemble import RandomForestClassifier

    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    df_x = pd.read_csv(features_x.path + ".csv")
    df_y = pd.read_csv(features_y.path + ".csv")

    logging.info(df_x.columns)

    # Define the model; the seed makes the fitted forest reproducible across runs.
    model_rfc = RandomForestClassifier(random_state=42)
    # Fit the model on all feature columns.
    model_rfc.fit(df_x, df_y['Exited'])

    # Save the model next to the artifact path.
    out_model.metadata["framework"] = "RFC"
    file_name = out_model.path + ".pkl"
    with open(file_name, 'wb') as file:
        pickle.dump(model_rfc, file)

Component: Training LogisticRegression model

In [6]:
@dsl.component(
    packages_to_install=["pandas", "scikit-learn"],
    base_image="python:3.10.7-slim"
)
def train_logistic(features_x: Input[Dataset], features_y: Input[Dataset], out_model: Output[Model]):
    '''Train a LogisticRegression model with default hyperparameters and pickle it.

    Args:
        features_x: Training features, read from ``features_x.path + ".csv"``.
        features_y: Training target, read from ``features_y.path + ".csv"``;
            the 'Exited' column is used as the label.
        out_model: Output artifact; the fitted model is pickled to
            ``out_model.path + ".pkl"`` and tagged with framework "Logistic".
    '''
    import pandas as pd
    import logging
    import sys
    import pickle
    from sklearn.linear_model import LogisticRegression

    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    df_x = pd.read_csv(features_x.path + ".csv")
    df_y = pd.read_csv(features_y.path + ".csv")

    logging.info(df_x.columns)

    # Define the model.
    model_logistic = LogisticRegression()
    # Fit the model on all feature columns.
    model_logistic.fit(df_x, df_y['Exited'])

    # Save the model next to the artifact path.
    out_model.metadata["framework"] = "Logistic"
    file_name = out_model.path + ".pkl"
    with open(file_name, 'wb') as file:
        pickle.dump(model_logistic, file)

Component: Training SupportVectorClassifier model

In [7]:
@dsl.component(
    packages_to_install=["pandas", "scikit-learn"],
    base_image="python:3.10.7-slim"
)
def train_svm(features_x: Input[Dataset], features_y: Input[Dataset], out_model: Output[Model]):
    '''Train a class-balanced RBF support vector classifier (SVC) and pickle it.

    Args:
        features_x: Training features, read from ``features_x.path + ".csv"``.
        features_y: Training target, read from ``features_y.path + ".csv"``;
            the 'Exited' column is used as the label.
        out_model: Output artifact; the fitted model is pickled to
            ``out_model.path + ".pkl"`` and tagged with framework "SVC".
    '''
    import pandas as pd
    import logging
    import sys
    import pickle
    from sklearn.svm import SVC

    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    df_x = pd.read_csv(features_x.path + ".csv")
    df_y = pd.read_csv(features_y.path + ".csv")

    logging.info(df_x.columns)

    # Define the model.
    model_svc = SVC(class_weight='balanced', C=1, gamma='auto', kernel='rbf')

    # Fit the model on all feature columns.
    model_svc.fit(df_x, df_y['Exited'])

    # Save the model next to the artifact path.
    out_model.metadata["framework"] = "SVC"
    file_name = out_model.path + ".pkl"
    with open(file_name, 'wb') as file:
        pickle.dump(model_svc, file)

Component: Evaluating model and selecting best one

In [8]:
@dsl.component(
    packages_to_install = [
       "pandas", "scikit-learn", "numpy"],
    base_image="python:3.10.7-slim"
)
def model_evaluation(test_set_x:  Input[Dataset], test_set_y:  Input[Dataset],
                         model_rfc: Input[Model], model_lr: Input[Model], model_svc: Input[Model]) -> str:
    '''Score the three trained models on the test set and name the most accurate one.

    Args:
        test_set_x: Test features, read from ``test_set_x.path + ".csv"``.
        test_set_y: Test target, read from ``test_set_y.path + ".csv"``.
        model_rfc: Pickled random forest model (``<path>.pkl``).
        model_lr: Pickled logistic regression model (``<path>.pkl``).
        model_svc: Pickled support vector classifier (``<path>.pkl``).

    Returns:
        "RFC", "Logistic" or "SVC" - the label of the model with the highest
        accuracy. On equal accuracy "Logistic" is preferred over "RFC", and
        "RFC" over "SVC".
    '''
    import pandas as pd
    import logging
    import sys
    import pickle
    from sklearn.metrics import accuracy_score

    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    data_x = pd.read_csv(test_set_x.path + ".csv")
    data_y = pd.read_csv(test_set_y.path + ".csv")

    def _test_accuracy(model_artifact):
        # Load one pickled model and return its accuracy on the test set.
        # NOTE: pickle.load can execute arbitrary code; this is only acceptable
        # because the files are expected to come from this pipeline's own
        # training components.
        with open(model_artifact.path + ".pkl", 'rb') as model_file:
            fitted_model = pickle.load(model_file)
        return accuracy_score(data_y, fitted_model.predict(data_x))

    accuracy_rfc = _test_accuracy(model_rfc)
    accuracy_logistic = _test_accuracy(model_lr)
    accuracy_svc = _test_accuracy(model_svc)
    logging.info('Accuracy - RFC: %s, Logistic: %s, SVC: %s',
                 accuracy_rfc, accuracy_logistic, accuracy_svc)

    # max() returns the first of several equal maxima, so the order of this
    # list defines the tie-breaking priority: Logistic, then RFC, then SVC.
    ranked = [
        ("Logistic", accuracy_logistic),
        ("RFC", accuracy_rfc),
        ("SVC", accuracy_svc),
    ]
    best_label, _ = max(ranked, key=lambda entry: entry[1])
    return best_label


Component: Uploading model to Google Cloud Storage

In [9]:
@dsl.component(
    packages_to_install=["google-cloud-storage"],
    base_image="python:3.10.7-slim"
)
def upload_model_to_gcs(project_id: str, model_repo: str, model: Input[Model]):
    '''Upload a pickled model to a Cloud Storage bucket as ``model.pkl``.

    Args:
        project_id: Google Cloud project used to create the storage client.
        model_repo: Name of the destination bucket.
        model: Input artifact; the file at ``model.path + '.pkl'`` is uploaded.
    '''
    from google.cloud import storage
    import logging
    import sys

    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    # Local pickle file written by the training component.
    source_file_name = model.path + '.pkl'

    # The object name in the bucket is fixed, so each upload replaces the previous model.
    gcs_client = storage.Client(project=project_id)
    destination_blob = gcs_client.bucket(model_repo).blob('model.pkl')
    destination_blob.upload_from_filename(source_file_name)

    print(f"File {source_file_name} uploaded to {model_repo}.")

Component: Commit to Git

In [10]:
@dsl.container_component
def commit_github(user_password: str, git_repo:str, target_file: str):
    '''Append a line to a file in a Git repository, then commit and push it.

    The container clones the repository, appends "model uploaded" to
    ``target_file``, commits the change and pushes all branches back.

    Args:
        user_password: Credentials in ``user:token`` form, embedded in the clone URL.
        git_repo: Repository location without scheme, e.g. ``github.com/org/repo.git``.
        target_file: Path of the file to append to, relative to the repository root.
    '''
    # `sh -c SCRIPT a b c` binds the args to $0, $1 and $2, i.e.
    # $0 = user_password, $1 = git_repo, $2 = target_file.
    # The repository is cloned into a fixed directory ("repo") so the script
    # works for any repository name, and the target file is staged explicitly
    # because `git commit -a` ignores files that are not tracked yet.
    return dsl.ContainerSpec(
        image='alpine/git:2.40.1',
        command=[
            'sh', '-c', '''GURL="https://$0@$1"\
                            && git clone "$GURL" repo\
                            && cd repo\
                            && echo "model uploaded" >> "$2"\
                            && git add "$2"\
                            && git config --global user.email "testuser@example.com"\
                            && git config --global user.name "Test User"\
                            && git commit -am "model uploaded"\
                            && git push "$GURL" --all
                            '''
        ],
        args=[user_password, git_repo, target_file])

Component: Get the GitHub username and access token from Google Cloud Secret Manager

In [11]:
@dsl.component(
    packages_to_install=["google-cloud-secret-manager"],
    base_image="python:3.10.7-slim"
)
def get_git_password_user(project_id: str) -> str:
    '''Read the GitHub account name and access token from Secret Manager.

    Args:
        project_id: Google Cloud project that owns the secrets.

    Returns:
        The credentials joined as ``<account name>:<access token>``.
    '''
    from google.cloud import secretmanager
    import logging
    import sys

    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    # Create the Secret Manager client.
    client = secretmanager.SecretManagerServiceClient()

    def read_secret(secret_id):
        # Fetch version 1 of the given secret and decode its payload as text.
        resource = f"projects/{project_id}/secrets/{secret_id}/versions/1"
        response = client.access_secret_version(request={"name": resource})
        return response.payload.data.decode("UTF-8")

    user_payload = read_secret("Github_account_name")
    pass_payload = read_secret("Github_access_token")

    logging.info('Github credential retrieved!')
    return user_payload + ":" + pass_payload  # Never print or log this!
    

The pipeline

In [12]:
# Define the workflow of the pipeline.
@kfp.dsl.pipeline(
    name="churn-predictor-training-pipeline")
def pipeline(project_id: str, data_bucket: str, file_name: str, dataset_uri: str, model_repo: str, model_repo_uri:str, git_repo:str):
    '''Train three churn classifiers, pick the most accurate one and publish it.

    Args:
        project_id: Google Cloud project used by the storage and secret components.
        data_bucket: Bucket that holds the training data.
        file_name: Name of the training data file inside ``data_bucket``.
        dataset_uri: Not used by the pipeline; kept so existing callers keep working.
        model_repo: Bucket that receives the selected model.
        model_repo_uri: Not used by the pipeline; kept so existing callers keep working.
        git_repo: Repository (without scheme) that receives the history commit.
    '''
    di_op = download_data(
        project_id=project_id,
        bucket=data_bucket,
        file_name=file_name
    )

    model_feature_selection = feature_selection(dataset=di_op.output)
    train_x = model_feature_selection.outputs["dataset_train_x"]
    train_y = model_feature_selection.outputs["dataset_train_y"]

    model_train_rfc = train_rfc(features_x=train_x, features_y=train_y)
    model_train_logistic = train_logistic(features_x=train_x, features_y=train_y)
    model_train_svm = train_svm(features_x=train_x, features_y=train_y)

    model_evaluation_op = model_evaluation(
        test_set_x=model_feature_selection.outputs["dataset_test_x"],
        test_set_y=model_feature_selection.outputs["dataset_test_y"],
        model_lr=model_train_logistic.output,
        model_rfc=model_train_rfc.output,
        model_svc=model_train_svm.output
    )

    # One entry per candidate: the label returned by model_evaluation, the
    # training task that produced the model, and the history file to commit to.
    candidates = (
        ("RFC", model_train_rfc, "synchronizer/model_rfc/history.txt"),
        ("Logistic", model_train_logistic, "synchronizer/model_lr/history.txt"),
        ("SVC", model_train_svm, "synchronizer/model_svc/history.txt"),
    )

    # Only the branch whose label matches the evaluation result is executed.
    for best_label, train_task, history_file in candidates:
        with dsl.If(model_evaluation_op.output == best_label):
            upload_model_op = upload_model_to_gcs(
                project_id=project_id,
                model_repo=model_repo,
                model=train_task.output)

            # Get the GitHub user name and access token once the model is uploaded.
            pass_user_op = get_git_password_user(
                project_id=project_id
            ).after(upload_model_op)

            # Record the upload in the history file of the selected model.
            commit_op = commit_github(
                user_password=pass_user_op.output,
                git_repo=git_repo,
                target_file=history_file
            ).after(pass_user_op)

Compiler

In [13]:
from kfp import compiler

# Compile the pipeline function into a YAML pipeline specification file.
compiler.Compiler().compile(
    pipeline_func=pipeline,
    package_path='churn_predictor_training_pipeline.yaml',
)

In [14]:
import google.cloud.aiplatform as aip

# Before initializing, make sure to set the GOOGLE_APPLICATION_CREDENTIALS
# environment variable to the path of your service account.
aip.init(
    project=PROJECT_ID,
    location=REGION,
)

# Prepare the pipeline job from the compiled pipeline specification.
job = aip.PipelineJob(
    display_name="churn-predictor",
    enable_caching=False,
    template_path="churn_predictor_training_pipeline.yaml",
    pipeline_root=PIPELINE_ROOT,
    location=REGION,
    parameter_values={
        # Reuse the configured project instead of repeating the literal id.
        'project_id': PROJECT_ID,
        'data_bucket': 'data_degroup11',  # make sure to use your data bucket name
        'file_name': 'churn.csv',
        'dataset_uri':'gs://data_degroup11/churn.csv',
        'model_repo':'models_degroup11',  # make sure to use your model bucket name
        'model_repo_uri':'gs://models_degroup11',  # make sure to use your model bucket name
        'git_repo':"github.com/GeorgeAntono/DE_Group11.git"
    }
)

# Submit the job; run() keeps reporting the job state while it executes.
job.run()

Creating PipelineJob
PipelineJob created. Resource name: projects/789631459256/locations/us-central1/pipelineJobs/churn-predictor-training-pipeline-20231021183916
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/789631459256/locations/us-central1/pipelineJobs/churn-predictor-training-pipeline-20231021183916')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/churn-predictor-training-pipeline-20231021183916?project=789631459256
PipelineJob projects/789631459256/locations/us-central1/pipelineJobs/churn-predictor-training-pipeline-20231021183916 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/789631459256/locations/us-central1/pipelineJobs/churn-predictor-training-pipeline-20231021183916 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/789631459256/locations/us-central1/pipelineJobs/churn-predictor-training-pipeline-20231021183916 current state:
