# Vertex AI Pipeline for Scikit-learn Classification 

### Set up project definition

In [4]:
import os

PROJECT_ID = "crazy-hippo-01"

# Get your Google Cloud project ID from gcloud
if not os.getenv("IS_TESTING"):
    shell_output=!gcloud config list --format 'value(core.project)' 2>/dev/null
    PROJECT_ID = shell_output[0]
    print("Project ID: ", PROJECT_ID)

Project ID:  crazy-hippo-01


### Set up current timestamp

In [5]:
from datetime import datetime

# Second-resolution timestamp (YYYYMMDDHHMMSS) taken when this cell runs.
TIMESTAMP = f"{datetime.now():%Y%m%d%H%M%S}"
print('Timestamp: ', TIMESTAMP)

Timestamp:  20210625152839


### Import Libraries and define constants

In [2]:
# Cloud Storage bucket URI; first segment of PIPELINE_ROOT.
BUCKET_NAME = 'gs://crazy-vertex-ai-pipelines'
# Region handed to the pipeline client.
REGION = 'us-central1'
# Second and third path segments of PIPELINE_ROOT.
ML_PROJECT_NAME = 'scikit-learn-classifier'
USER = 'crazy-hippo'

In [3]:
PATH=%env PATH
%env PATH={PATH}:/home/jupyter/.local/bin

PIPELINE_ROOT = "{}/{}/{}".format(BUCKET_NAME, ML_PROJECT_NAME, USER)

PIPELINE_ROOT

env: PATH=/usr/local/cuda/bin:/opt/conda/bin:/opt/conda/condabin:/usr/local/bin:/usr/bin:/bin:/usr/local/games:/usr/games:/home/jupyter/.local/bin


'gs://crazy-vertex-ai-pipelines/scikit-learn-classifier/crazy-hippo'

In [6]:
import json
from typing import NamedTuple

from kfp import dsl
from kfp.v2 import compiler
from kfp.v2.dsl import component
from kfp.v2.google.client import AIPlatformClient

from kfp.v2.dsl import (
    Input,
    Output,
    Artifact,
    Model,
    Dataset,
    Metrics,
    InputPath
)

### Define Pipeline Components

#### 1. Pre-processing Component

In [28]:
@component(output_component_file='pre-processing.yaml',
          base_image='python:3.9',
          packages_to_install=['pandas',
                             'google-cloud-bigquery',
                             'pyarrow',
                             'gcsfs',
                             'numpy',
                             # The PyPI distribution is 'scikit-learn'; the
                             # 'sklearn' name is a deprecated placeholder.
                             'scikit-learn'
                              ])
def pre_processing(
        X_TRAIN : Output[Dataset],
        X_TEST : Output[Dataset],
        y_TRAIN : Output[Dataset],
        y_TEST : Output[Dataset],
    ):
    """Query the census table, encode and scale it, and write train/test CSVs.

    Reads every row of `crazy-hippo-01.census_data_us.census_raw` from BigQuery,
    one-hot encodes the categorical features, min-max scales the numeric ones,
    label-encodes `income_bracket`, and splits 80/20 with a fixed seed.

    Args:
        X_TRAIN: Output artifact receiving the training features as CSV.
        X_TEST: Output artifact receiving the test features as CSV.
        y_TRAIN: Output artifact receiving the training labels as CSV.
        y_TEST: Output artifact receiving the test labels as CSV.
    """
    import pandas as pd
    from sklearn import preprocessing
    from sklearn.model_selection import train_test_split

    from google.cloud.bigquery import Client

    #Initiate BigQuery Client
    client = Client(project='crazy-hippo-01')

    query = """SELECT *
    FROM `crazy-hippo-01.census_data_us.census_raw` 
    """

    #Run Query
    job = client.query(query)
    df = job.to_dataframe()

    #Make Feature Selections
    X = df[['age', 'workclass', 'gender', 'occupation', 'education_num', 'marital_status', 'relationship', 'capital_gain']]

    #One-hot encode data using Pandas get_dummies function
    X = pd.get_dummies(X, prefix=['workclass', 'gender','occupation','marital_status','relationship'])

    #Normalize data using Scikit-learn function
    numeric_columns = ['age','education_num','capital_gain']
    scaler = preprocessing.MinMaxScaler()
    X[numeric_columns] = scaler.fit_transform(X[numeric_columns])

    # Change label string into integer to be able to use in model training.
    # Encoding straight from df avoids assigning into a slice of the frame.
    y = preprocessing.LabelEncoder().fit_transform(df['income_bracket'])

    #Split data in train and test data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

    y_train = pd.DataFrame(y_train, columns=['income_bracket'])
    # Build the test labels from y_test (not y_train) so they line up with X_test.
    y_test = pd.DataFrame(y_test, columns=['income_bracket'])

    #Write dataframes to CSV artifact and store in GCS
    X_train.to_csv(X_TRAIN.path, index=False, header=True)
    X_test.to_csv(X_TEST.path, index=False, header=True)
    y_train.to_csv(y_TRAIN.path, index=False, header=True)
    y_test.to_csv(y_TEST.path, index=False, header=True)

    print("Artifacts written to Artifact Repository")

    
    
    

#### 2. Training Component

In [59]:
@component(output_component_file='training.yaml',
          base_image='python:3.9',
          packages_to_install=['pandas',
                             'google-cloud-bigquery',
                             'pyarrow',
                             'gcsfs',
                             'numpy',
                             # The PyPI distribution is 'scikit-learn'; the
                             # 'sklearn' name is a deprecated placeholder.
                             'scikit-learn'
                              ])
def training(
        X_TRAIN : Input[Dataset],
        X_TEST : Input[Dataset],
        y_TRAIN : Input[Dataset],
        y_TEST : Input[Dataset],
        MODEL: Output[Model]
    ):
    """Fit a logistic-regression classifier on the training split and pickle it.

    Args:
        X_TRAIN: Input artifact holding the training features as CSV.
        X_TEST: Input artifact with the test features; part of the component
            interface but not read by this step.
        y_TRAIN: Input artifact holding the training labels as CSV.
        y_TEST: Input artifact with the test labels; part of the component
            interface but not read by this step.
        MODEL: Output artifact that receives the pickled classifier.
    """
    import pickle

    import pandas as pd
    from sklearn.linear_model import LogisticRegression

    #Read Dataset Artifacts
    X_train = pd.read_csv(X_TRAIN.path)
    y_train = pd.read_csv(y_TRAIN.path)

    #Make Labels into a flat numpy array
    y_train = y_train.to_numpy().ravel()

    #Initiate Scikit-Learn Training Process
    LR = LogisticRegression(random_state=0, solver='lbfgs', multi_class='ovr', max_iter=200).fit(X_train, y_train)

    # Export the classifier to a file; the context manager closes the handle
    # even if pickling fails.
    with open(MODEL.path, 'wb') as model_file:
        pickle.dump(LR, model_file)



In [60]:
@dsl.pipeline(
  name='scikitlearnclassifer01',
  description='Binary Classification Model with Scikit-learn Logistic Regression and a BigQuery Pre-processing Step'
)
def scikit_classifier_earnings_v1(
    pipeline: str = 'Scikit-Learn Earnings Classifer',
    framework: str = 'Scikit-learn',
    input_path: str = 'crazy-hippo-01.census_data_us.census_raw',
    dataset_version: int = 3
    ):
    """Two-step pipeline: pre-process the census data, then train the classifier.

    The four pipeline parameters are declared on the signature but are not
    referenced by either step in this body.
    """
    first_step = pre_processing()

    # The four dataset artifacts are passed positionally, in the order of
    # training's inputs: X_TRAIN, X_TEST, y_TRAIN, y_TEST.
    second_step = training(first_step.outputs['X_TRAIN'],
                          first_step.outputs['X_TEST'],
                          first_step.outputs['y_TRAIN'],
                          first_step.outputs['y_TEST']
                          )


### Compile and Run Pipeline

In [61]:
from kfp.v2 import compiler  

# Compile the pipeline function into a JSON job spec next to the notebook.
compiler.Compiler().compile(
    pipeline_func=scikit_classifier_earnings_v1,
    package_path="scikit_classifier_earnings_v1.json",
)

In [62]:
from kfp.v2.google.client import AIPlatformClient  # noqa: F811

# Pipeline client bound to the project and region defined earlier in the notebook.
api_client = AIPlatformClient(project_id=PROJECT_ID, region=REGION)

In [63]:
# Service account passed along with the pipeline run request.
SERVICE_ACCOUNT = "pipelines-vertex-ai@crazy-hippo-01.iam.gserviceaccount.com"

In [64]:
# Submit the compiled job spec, using PIPELINE_ROOT as the run's root location.
response = api_client.create_run_from_job_spec(
    job_spec_path="scikit_classifier_earnings_v1.json",
    pipeline_root=PIPELINE_ROOT,
    service_account=SERVICE_ACCOUNT,
)