In [None]:
# Google Cloud region used for the Vertex AI resources in this notebook
REGION = "asia-southeast2"

# Service account email intended for authenticating Google Cloud services.
# NOTE(review): not referenced by any of the visible cells — confirm it is still needed.
SERVICE_ACCOUNT = 'compute@developer.gserviceaccount.com'

# Numeric identifier that goes with the service account (kept as a string).
# NOTE(review): not referenced by any of the visible cells either.
SERVICE_ACCOUNT_NUMBER = '1182'

# Ask gcloud for the currently configured project. The leading `!` is IPython
# shell-capture syntax, so this cell only runs inside a notebook/IPython session.
# NOTE(review): the first captured line is taken as the project ID — this assumes
# gcloud prints nothing (e.g. a warning) before it; confirm in your environment.
project = !gcloud config get-value project
PROJECT_ID = project[0]

# Echo the project ID so it can be checked before anything is submitted
print("Current Google Cloud Project ID:", PROJECT_ID)

In [None]:
# Client whose data is processed; also used to namespace the pipeline root
CLIENT_NAME = "client"

# URI of the Google Cloud Storage bucket, including the gs:// scheme
BUCKET_URI = 'gs://dev-bucket-ds-dashboard'

# Bare bucket name: the URI without its gs:// scheme and without any object path.
# (Previously this was just a copy of BUCKET_URI, scheme included.)
BUCKET_NAME = BUCKET_URI.replace("gs://", "", 1).split("/", 1)[0]

# Root directory for pipeline artifacts within the bucket, specific to the client
PIPELINE_ROOT = f"{BUCKET_URI}/data_science/{CLIENT_NAME}"

# Folder inside the bucket for storing temporary data. Despite the name this is
# a full gs:// path (trailing slash included), not a bucket name.
GCS_BUCKET_NAME = f"{BUCKET_URI}/data_science/data_vertex/"

# Full path to the dataset CSV inside that folder
GCS_PATH = f"{GCS_BUCKET_NAME}dataset.csv"

# Format of the training dataset
TRAINING_DATASET_FORMAT = "csv"

In [None]:
import google.cloud.aiplatform as aip
from google_cloud_pipeline_components.v1.custom_job import utils
from kfp import compiler, dsl
from kfp.dsl import Artifact, Dataset, Input, Metrics, Model, Output, component, OutputPath, ClassificationMetrics
import google.cloud.aiplatform as aiplatform
import kfp
from typing import NamedTuple
import pandas as pd
import google.cloud.bigquery as bigquery
from google.cloud import storage

In [None]:
# Prebuilt Vertex AI prediction container image (scikit-learn, CPU) from the
# asia-docker registry.
# NOTE(review): the same URI is repeated literally in PREBUILT_CONTAINER and in
# the deploy_model component, and this constant is not read by any visible
# cell — confirm whether it is still needed.
DEPLOY_IMAGE = "asia-docker.pkg.dev/vertex-ai/prediction/sklearn-cpu.1-0:latest"

In [None]:
# Resource-path prefix shared by project- and location-scoped API calls
PARENT = f"projects/{PROJECT_ID}/locations/{REGION}"

# Regional service endpoint for AI Platform
API_ENDPOINT = f"{REGION}-aiplatform.googleapis.com"

# Options handed to the client so that it talks to the regional endpoint
client_options = {
    "api_endpoint": API_ENDPOINT,
}

# Low-level job service client bound to that endpoint
CLIENT = aiplatform.gapic.JobServiceClient(
    client_options=client_options,
)

# Echo the endpoint so the configuration can be verified by eye
print("AI Platform API Endpoint:", API_ENDPOINT)

In [None]:
@component(
    packages_to_install=[
        "google-cloud-bigquery[pandas]==3.10.0",
        "google-cloud-storage",
        "pandas",
        "joblib",
        "scikit-learn",
    ],
    base_image = 'python:3.9'
)
def get_and_preprocess_dataset(
    project_id : str,
    dataset: Output[Dataset],
    client_name: str
) :
    """
    Query BigQuery for one client's rows, preprocess them, and write them as CSV.

    Processing order:
      1. raise every ``dpd_number`` below 90 up to 90;
      2. drop rows whose ``outstanding_principal_amount`` is 0;
      3. one-hot encode ``employment_type_name`` and ``gender_name`` against
         fixed category lists (categories absent from the data become all-zero
         columns, categories outside the lists are discarded);
      4. drop the columns listed in ``drop_unnecessary_columns``;
      5. ordinal-encode ``dpd_number`` and ``outstanding_principal_amount``.

    The ``Ever_Pay`` column is deliberately kept: the ``model_training``
    component reads it as the target and fails without it.

    Parameters:
        project_id (str): The Google Cloud project ID used for the BigQuery client.
        dataset (Output[Dataset]): Output artifact; the CSV is written to ``dataset.path``.
        client_name (str): The client name used to filter the rows.

    Raises:
        ValueError: If no rows are left after filtering, since an empty
            dataset cannot be encoded or trained on.
    """

    # Imports and helpers live inside the function because the component body
    # is what gets executed in the container.
    import pandas as pd
    from sklearn.preprocessing import OrdinalEncoder
    from google.cloud import bigquery

    def ensure_min_dpd(df):
        # Floor dpd_number at 90 (modifies the frame in place and returns it)
        df.loc[df['dpd_number'] < 90, 'dpd_number'] = 90
        return df

    def remove_zero_principal(df):
        # Keep only rows with a non-zero outstanding principal
        return df[df['outstanding_principal_amount'] != 0]

    def create_employment_dummies(df):
        # One-hot encode employment type against a fixed, ordered category list
        employment_types = ['EMPLOYEE', 'ENTREPRENEUR', 'STUDENT', 'OTHERS', 'FREELANCE',
                            'GOVERNMENT EMPLOYEE', 'POLICE/MILITARY', 'UNEMPLOYED', 'UNKNOWN']
        df_employment = pd.get_dummies(df['employment_type_name'])
        df_employment = df_employment.reindex(columns=employment_types, fill_value=0)
        df_employment.columns = [f'employment_type_{col}' for col in df_employment.columns]
        return df_employment

    def create_gender_dummies(df):
        # One-hot encode gender against a fixed, ordered category list
        gender = ['M', 'F']
        df_gender = pd.get_dummies(df['gender_name'])
        df_gender = df_gender.reindex(columns=gender, fill_value=0)
        df_gender.columns = [f'gender_name_{col}' for col in df_gender.columns]
        return df_gender

    def concatenate_dataframes(df, df_employment, df_gender):
        # Column-wise join; rows are aligned on the shared index
        return pd.concat([df, df_employment, df_gender], axis=1)

    def drop_unnecessary_columns(df):
        # 'Ever_Pay' is intentionally NOT in this list: it is the training target.
        drop_columns = ['external_loan_number', 'ktp_province_name', 'total_payment', 'age',
                        'campaign_valid_from', 'campaign_valid_to', 'campaign_name', 'client_name',
                        'employment_type_name', 'gender_name']
        return df.drop(columns=drop_columns)

    def apply_ordinal_encoding(df, cols):
        # NOTE(review): the fitted encoder is not saved anywhere, so the same
        # mapping cannot be re-applied to new data later — confirm this is intended.
        encoder = OrdinalEncoder()
        df[cols] = encoder.fit_transform(df[cols])
        return df


    # Get dataset from BigQuery. The client name is bound as a query parameter
    # rather than formatted into the SQL text, so quotes in the value cannot
    # change the query.
    # NOTE(review): LIMIT 100 without ORDER BY caps the dataset at an arbitrary
    # 100 rows — looks like a debugging leftover; confirm before real training.
    client = bigquery.Client(project=project_id)
    query = """
    SELECT *
    FROM `pav-id-vertexai-dev.data_science.ds_pre_call`
    WHERE client_name = @client_name
    LIMIT 100
    """

    job_config = bigquery.QueryJobConfig(
        query_parameters=[
            bigquery.ScalarQueryParameter("client_name", "STRING", client_name),
        ]
    )
    query_job = client.query(query=query, job_config=job_config)
    df = query_job.result().to_dataframe()

    # Apply functions
    df = ensure_min_dpd(df)
    df = remove_zero_principal(df)
    if df.empty:
        raise ValueError(
            f"No rows left for client {client_name!r} after filtering; "
            "cannot build a training dataset."
        )
    df_employment = create_employment_dummies(df)
    df_gender = create_gender_dummies(df)
    df = concatenate_dataframes(df, df_employment, df_gender)
    df = drop_unnecessary_columns(df)
    df = apply_ordinal_encoding(df, ['dpd_number', 'outstanding_principal_amount'])

    df.to_csv(dataset.path, index=False)

In [None]:
@component(
    packages_to_install=[
        "pandas",
        "numpy",
        "joblib",
        "scikit-learn",
        "xgboost",
    ],
    base_image = 'python:3.9'
)
def model_training(
    dataset: Input[Dataset],
    model: Output[Model],
    metrics: Output[Metrics],
    metrics_clf: Output[ClassificationMetrics],
    client_name: str
):
    """
    Train an XGBoost classifier on the preprocessed CSV and export model and metrics.

    The CSV at ``dataset.path`` is split 80/20 (stratified on the target
    ``Ever_Pay``, cast to int). ``scale_pos_weight`` is tuned with a 5-fold
    grid search whose scorer returns F1 only when precision exceeds recall
    (0 otherwise). The refit best estimator is evaluated on the hold-out split
    and saved as ``model.joblib`` under ``model.path``.

    Parameters:
        dataset (Input[Dataset]): CSV produced by the preprocessing step; must
            contain an ``Ever_Pay`` column.
        model (Output[Model]): Output artifact; ``model.path`` is used as a directory.
        metrics (Output[Metrics]): Receives accuracy, F1, precision, recall,
            AUC (when computable), framework name and dataset size.
        metrics_clf (Output[ClassificationMetrics]): Receives the hold-out
            confusion matrix.
        client_name (str): Accepted for interface symmetry; not used here.
    """

    import os
    import joblib
    import pandas as pd
    from xgboost import XGBClassifier
    from sklearn.metrics import (accuracy_score, confusion_matrix, f1_score,
                                 make_scorer, precision_score, recall_score,
                                 roc_auc_score)
    from sklearn.model_selection import train_test_split, GridSearchCV

    # Load the preprocessed training dataset
    raw_data = pd.read_csv(dataset.path)

    # Split into features (X) and integer target (y)
    X = raw_data.drop(columns=['Ever_Pay'])
    y = raw_data['Ever_Pay'].astype('int')

    # Hold out 20% for evaluation, preserving the class balance
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y,
        random_state=42, shuffle=True
    )

    # Custom scoring function: reward F1 only when precision beats recall.
    # zero_division=0 yields the same 0 as the default but without the warning
    # when a fold produces no positive predictions.
    def custom_scorer(y_true, y_pred):
        precision = precision_score(y_true, y_pred, zero_division=0)
        recall = recall_score(y_true, y_pred, zero_division=0)
        f1 = f1_score(y_true, y_pred, zero_division=0)
        return f1 if precision > recall else 0

    # Create the scorer
    scorer = make_scorer(custom_scorer, greater_is_better=True)

    # Define the model
    xgb_model = XGBClassifier(random_state=42)

    # Hyperparameter grid for scale_pos_weight: 0.1, 0.2, ..., 1.0.
    # The previous grid, np.arange(0, 1), contained only the value 0, which
    # removes all weight from the positive class and leaves nothing to search.
    # NOTE(review): values <= 1 are chosen because the scorer favours
    # precision over recall — confirm the intended range.
    param_grid = {
        'scale_pos_weight': [round(0.1 * step, 1) for step in range(1, 11)]
    }

    # Set up GridSearchCV
    grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, scoring=scorer, cv=5)

    # Fit the model
    grid_search.fit(X_train, y_train)

    # Predict on the hold-out split with the refit best estimator
    predictions = grid_search.predict(X_test)

    # Compute accuracy, F1, precision and recall on the hold-out split
    score = accuracy_score(y_test, predictions)
    f1 = f1_score(y_test, predictions, pos_label=1, average='binary')
    precision = precision_score(y_test, predictions, pos_label=1, average='binary')
    recall = recall_score(y_test, predictions, pos_label=1, average='binary')

    # Log metrics
    metrics.log_metric("accuracy", (score * 100.0))
    metrics.log_metric("framework", "XGBoost")
    metrics.log_metric("F1-Score", f1)
    metrics.log_metric("precision", precision)
    metrics.log_metric("recall", recall)
    metrics.log_metric("dataset_size", len(raw_data))

    # AUC needs both classes in the hold-out split; skip it instead of failing
    # when a very small dataset leaves only one class there.
    if y_test.nunique() > 1:
        positive_scores = grid_search.predict_proba(X_test)[:, 1]
        metrics.log_metric("auc", float(roc_auc_score(y_test, positive_scores)))

    # Populate the classification-metrics artifact (it was previously declared
    # but never written). labels=[0, 1] keeps the matrix 2x2 even if one class
    # is missing from the hold-out split.
    matrix = confusion_matrix(y_test, predictions, labels=[0, 1])
    metrics_clf.log_confusion_matrix(["0", "1"], matrix.tolist())

    # Export the model to a file
    os.makedirs(model.path, exist_ok=True)
    model_file_path = os.path.join(model.path, 'model.joblib')
    joblib.dump(grid_search.best_estimator_, model_file_path)

In [None]:
# Display name for the model.
# NOTE(review): the visible pipeline builds its own model name from the client
# name and does not read this constant — confirm whether it is still needed.
DISPLAY_NAME = "Pre-Call-model"
# Prebuilt Vertex AI prediction container image (scikit-learn, CPU)
PREBUILT_CONTAINER = "asia-docker.pkg.dev/vertex-ai/prediction/sklearn-cpu.1-0:latest"
# NOTE(review): this rebinds GCS_PATH, which an earlier cell set to the dataset
# CSV path; once this cell has run, the name points at the model folder instead.
GCS_PATH = 'gs://dev-bucket-ds-dashboard/model'

In [None]:
@component(
    packages_to_install=["google-cloud-aiplatform==1.25.0"],
    base_image = 'python:3.9'
)
def deploy_model(
    model: Input[Model],
    project_id: str,
    model_name: str,
    region: str,
    vertex_model: Output[Model],
    vertex_endpoint: Output[Artifact],
    client_name : str,
    serving_container_image_uri: str = "asia-docker.pkg.dev/vertex-ai/prediction/sklearn-cpu.1-0:latest",
):
    """
    Uploads a trained model to the Vertex AI Model Registry.

    If a model with the same display name already exists in the registry, the
    upload is registered as a new version of that model; otherwise a new model
    entry is created. Endpoint deployment is currently disabled, so despite the
    name this component only uploads.

    Parameters:
        - model: The trained model artifact; its URI is used as the artifact directory.
        - project_id: Google Cloud project ID.
        - model_name: Display name of the model in Vertex AI.
        - region: Google Cloud region of the Model Registry.
        - vertex_model: Output artifact; receives the uploaded model's resource name.
        - vertex_endpoint: Output artifact for the endpoint; left unpopulated
          while deployment is disabled.
        - client_name: Name of the client, used for additional context (currently not used).
        - serving_container_image_uri: Serving container image for the uploaded
          model. Defaults to the prebuilt scikit-learn CPU image that was
          previously hard-coded.
    """

    from google.cloud import aiplatform

    aiplatform.init(project=project_id, location=region)

    # All models in the registry with this display name
    existing_models = aiplatform.Model.list(
        filter= 'display_name = "{}"'.format(model_name),
        project=project_id,
        location=region
    )

    # When a model of that name exists, upload as a new version of the first
    # match; parent_model=None creates a brand-new registry entry.
    parent_model = existing_models[0].resource_name if existing_models else None

    # NOTE(review): the training component saves an XGBClassifier with joblib;
    # confirm that the scikit-learn serving image can load it, or pass a
    # matching image through serving_container_image_uri.
    uploaded_model = aiplatform.Model.upload(
        display_name=model_name,
        parent_model=parent_model,
        artifact_uri=model.uri,
        serving_container_image_uri=serving_container_image_uri,
        location=region,
    )

    # Record which registry entry this run produced
    vertex_model.uri = uploaded_model.resource_name
    vertex_model.metadata["resourceName"] = uploaded_model.resource_name

    # TODO(review): endpoint deployment is disabled, so `vertex_endpoint` is
    # never populated. To re-enable:
    #     endpoint = uploaded_model.deploy(machine_type="e2-standard-4")
    #     vertex_endpoint.uri = endpoint.resource_name

In [None]:
# Define pipeline
@dsl.pipeline(
    name="precall-training-pipeline",
    pipeline_root=PIPELINE_ROOT,
)
def pre_call_pipeline(
    project_id: str = PROJECT_ID,
    bucket_name: str = BUCKET_NAME,
    bucket_uri: str = BUCKET_URI,
    client_name: str = CLIENT_NAME,
    region: str = REGION,
):
    """
    Pre-call model pipeline: export the dataset, train the model, upload the model.

    The three steps run in sequence; each step consumes the output artifact of
    the previous one.

    Args:
        project_id (str): Google Cloud project ID passed to the dataset and deployment steps.
        bucket_name (str): Accepted as a pipeline parameter; not used by any step.
        bucket_uri (str): Accepted as a pipeline parameter; not used by any step.
        client_name (str): Client name passed to every step and used to build the model name.
        region (str): Google Cloud region passed to the deployment step.
    """

    # Step 1: pull and preprocess the client's data
    dataset_step = get_and_preprocess_dataset(
        project_id=project_id,
        client_name=client_name,
    )
    dataset_step.set_display_name('Export Dataset')

    # Step 2: train on the dataset artifact produced by step 1
    training_step = model_training(
        dataset=dataset_step.outputs["dataset"],
        client_name=client_name,
    )
    training_step.set_display_name('Model Training')

    # Step 3: hand the trained model artifact to the deployment component
    deployment_step = deploy_model(
        model=training_step.outputs["model"],
        project_id=project_id,
        model_name=f"{client_name}_precall_model",
        region=region,
        client_name=client_name,
    )
    deployment_step.set_display_name('Model Deployment')


# File name of the compiled pipeline definition. Kept in one place because the
# compile step, the upload step and the job submission must all agree on it.
PIPELINE_PACKAGE_PATH = "pre_call_training_pipeline.yaml"

# Compile the pipeline function into a YAML definition
compiler.Compiler().compile(
    pipeline_func=pre_call_pipeline,
    package_path=PIPELINE_PACKAGE_PATH
)
print("Pipeline has been compiled successfully.")

# Initialize the Google Cloud Storage client
storage_client = storage.Client()

# Access the bucket configured in BUCKET_URI (the API expects the bare bucket
# name, so the gs:// scheme and any object path are stripped)
bucket = storage_client.bucket(BUCKET_URI.replace("gs://", "", 1).split("/", 1)[0])

# Create a blob for the compiled definition under the bucket's yaml/ folder
blob = bucket.blob(f"yaml/{PIPELINE_PACKAGE_PATH}")

# Upload the compiled YAML file to Google Cloud Storage
blob.upload_from_filename(PIPELINE_PACKAGE_PATH)
print("Pipeline YAML has been uploaded successfully.")

In [None]:
# Submit the compiled pipeline to Vertex AI Pipelines.
# template_path must be the file written by the compile step
# ("pre_call_training_pipeline.yaml"); the previous value,
# "pre_call_pipeline.yaml", is never created by this notebook.
job = aiplatform.PipelineJob(
    display_name=f"{CLIENT_NAME}-pre-call-training-pipeline",
    template_path="pre_call_training_pipeline.yaml",
    pipeline_root= PIPELINE_ROOT,
    location = REGION,
    # To override the pipeline's default parameters, pass for example:
    # parameter_values={"client_name": "client"},
)

# Start the pipeline run
job.run()