In [1]:
import os

import google.cloud.aiplatform as aip

from google_cloud_pipeline_components.experimental.custom_job import utils
from kfp.v2 import compiler, dsl
from kfp.v2.dsl import component

from datetime import datetime

In [2]:
TIMESTAMP = '20230322094518' # datetime.now().strftime("%Y%m%d%H%M%S")

In [3]:
TIMESTAMP

'20230322094518'

In [4]:
shell_output = !gcloud config list --format 'value(core.project)' 2>/dev/null
PROJECT_ID = shell_output[0]

In [5]:
PROJECT_ID

'groupby-development'

In [8]:
BUCKET_URI = "gs://gbi_ml/classification_hackathon/pipelines"

In [9]:
BUCKET_URI

'gs://gbi_ml/classification_hackathon/pipelines'

In [10]:
! gsutil mb -l $REGION $BUCKET_URI

CommandException: Incorrect option(s) specified. Usage:

  gsutil mb [-b (on|off)] [-c <class>] [-k <key>] [-l <location>] [-p <project>]
            [--autoclass] [--retention <time>] [--pap <setting>]
            [--rpo (ASYNC_TURBO|DEFAULT)] gs://<bucket_name>...

For additional help run:
  gsutil help mb


In [11]:
! gsutil ls -al $BUCKET_URI

CommandException: One or more URLs matched no objects.


In [12]:
shell_output = !gcloud auth list 2>/dev/null
SERVICE_ACCOUNT = shell_output[2].strip()[8:]
print("Service Account:", SERVICE_ACCOUNT)

Service Account: 937725678441-compute@developer.gserviceaccount.com


In [13]:
EXPERIMENT_NAME = "shopper-persuasion"  # @param {type:"string"}
REGION = "us-central1"

In [11]:
SKLEARN_VERSION = "sklearn-cpu.0-23"
TRAIN_IMAGE = 'us-docker.pkg.dev/vertex-ai/training/scikit-learn-cpu.0-23:latest'

In [14]:
MACHINE_TYPE = "n1-standard"
VCPU = "16"
TRAIN_COMPUTE = MACHINE_TYPE + "-" + VCPU

In [16]:
PIPELINE_ROOT = BUCKET_URI

In [28]:
DISPLAY_NAME = 'classification_hackathon_model'

In [18]:
aip.init(project=PROJECT_ID, staging_bucket=BUCKET_URI)

## Model train pipeline

In [22]:
import kfp
from pathlib import Path

from kfp.v2 import dsl
from kfp.v2.dsl import (Artifact,
                        Dataset,
                        Input,
                        Model,
                        Output,
                        Metrics,
                        ClassificationMetrics,
                        component, 
                        OutputPath, 
                        InputPath)

In [61]:
@component(
  packages_to_install = [
      "pandas",
      "lightgbm==3.3.3",
      "scikit-learn==1.0.2",
      "gcsfs",
      "pickle5"
  ], base_image="python:3.9",
)
def train_classifier(
  # Use Input[T] to get a metadata-rich handle to the 
  # input artifact of type `Dataset`.
  model: Output[Model], 
):
    import pickle
    import pickle5
    
    import gcsfs
    import pandas as pd
    import lightgbm as lgb
    from sklearn.metrics import accuracy_score, classification_report
    
    # N = 100
    base_dir = 'gs://gbi_ml/classification_hackathon/'
    base_dir_gcsfs = base_dir[5:]
    fs = gcsfs.GCSFileSystem()
    
    with fs.open(base_dir+'id2label.pickle', 'rb') as handle:
        id2label = pickle.load(handle)
    
    with fs.open(base_dir+'X_train.pickle', 'rb') as handle:
        train_embs = pickle5.load(handle)
    
    with fs.open(base_dir+'X_test.pickle', 'rb') as handle:
        test_embs = pickle5.load(handle)
        
    y_train = pd.read_csv(base_dir+'train_labels.csv').label.values
    y_test = pd.read_csv(base_dir+'test_labels.csv').label.values
    
    # X_train, X_test, y_train, y_test = train_test_split(
    #     train_embs, y_train, test_size=0.1, random_state = 42, stratify=y_train)

    # Get the train and test data for the training sequence
#     train_dataset = lgb.Dataset(train_embs, label=y_train)
#     test_dataset = lgb.Dataset(test_embs, label=y_test)
    
    parameters = {
    # 'application': 'binary',
    'objective': 'multiclass',
    'metric': 'multi_logloss',
    'boosting': 'gbdt',
    # 'num_leaves': 31,
    # 'feature_fraction': 0.5,
    # 'bagging_fraction': 0.5,
    # 'bagging_freq': 20,
    # 'learning_rate': 0.05,
    # 'verbose': 0,
    # 'max_depth': 10,
    # 'device': 'gpu',
    'num_classes': len(id2label),
    'min_data_in_leaf':300
    }

    # Train the classifier
    # model = lgb.LGBMClassifier()
    # model = lgb.LGBMClassifier(boosting_type='gbdt', num_leaves=31, max_depth=- 1, learning_rate=0.1, n_estimators=300, device = "gpu")
    # classifier = lgb.train(
    #     parameters, train_dataset, valid_sets=[train_dataset, test_dataset],
    #     #  num_boost_round=5000,
    #     early_stopping_rounds=10)
    classifier = lgb.LGBMClassifier()
    classifier.fit(train_embs, y_train, eval_set=(test_embs, y_test))
    
    y_test_pred = classifier.predict(test_embs)
    
    with fs.open(base_dir+'y_pred_lgb.pickle', 'wb') as handle:
        pickle5.dump(y_test_pred, handle)
    # report = classification_report(y_test, y_test_pred, output_dict=True, target_names=list(id2label.values))
    # df_report = pandas.DataFrame(report).transpose()
    # df_report.to_csv(base_dir+'classification_report_test.csv')

    model.metadata["framework"] = "lightgbm"
    file_name = model.path + ".pickle"
    with open(file_name, 'wb') as handle:  
        pickle.dump(classifier, handle)

In [62]:
train_classifier_op = utils.create_custom_training_job_op_from_component(
    train_classifier, 
                                                                        machine_type='n1-standard-8',
                                                                        accelerator_type="NVIDIA_TESLA_P100"
                                                                        )

In [63]:
@dsl.pipeline(
    name="classification",
    pipeline_root=PIPELINE_ROOT
)
def pipeline(
    project_id: str = PROJECT_ID
):
    # train_classifier()
    train_classifier_op(
        project=project_id,
        location=REGION
    )

In [None]:
compiler.Compiler().compile(
    pipeline_func=pipeline, package_path="classification_training_spec.json"
)

job = aip.PipelineJob(
    display_name=DISPLAY_NAME,
    template_path="classification_training_spec.json",
    pipeline_root=PIPELINE_ROOT,
    enable_caching=False
)

job.run(service_account=SERVICE_ACCOUNT)

Creating PipelineJob
PipelineJob created. Resource name: projects/937725678441/locations/us-central1/pipelineJobs/classification-20230323100708
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/937725678441/locations/us-central1/pipelineJobs/classification-20230323100708')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/classification-20230323100708?project=937725678441
PipelineJob projects/937725678441/locations/us-central1/pipelineJobs/classification-20230323100708 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/937725678441/locations/us-central1/pipelineJobs/classification-20230323100708 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/937725678441/locations/us-central1/pipelineJobs/classification-20230323100708 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/937725678441/locations/us-central1/pipelineJobs/classifi

## Evaluate

In [72]:
import gcsfs
import pickle
import pandas as pd

base_dir = 'gs://gbi_ml/classification_hackathon/'
fs = gcsfs.GCSFileSystem()
with fs.open(base_dir+'y_pred_lgb.pickle', 'rb') as handle:
    y_test_pred = pickle.load(handle)

In [73]:
y_test = pd.read_csv(base_dir+'test_labels.csv').label.values

In [74]:
with fs.open(base_dir+'id2label.pickle', 'rb') as handle:
    id2label = pickle.load(handle)

labels = list(id2label.keys())

In [90]:
from sklearn.metrics import accuracy_score, top_k_accuracy_score

def evaluate(y_true, y_pred, df_name):
    top1_acc = accuracy_score(y_true, y_pred)
    top3_acc = top_k_accuracy_score(y_true, y_pred, k=3, labels=labels)
    result = {'top1_acc': top1_acc, 'top3_acc': top3_acc}
    print(f'Accuracy on {df_name} dataset:', result)
    return result

In [91]:
evaluate(y_test, y_test_pred, 'test')

ValueError: Number of given labels (666) not equal to the number of classes in 'y_score' (2).

In [77]:
with fs.open('gs://gbi_ml/classification_hackathon/pipelines/937725678441/classification-20230323100708/train-classifier_-2506190795346804736/model.pickle', 'rb') as handle:
    model = pickle.load(handle)

In [78]:
import pickle5
with fs.open(base_dir+'X_test.pickle', 'rb') as handle:
    test_embs = pickle5.load(handle)

In [79]:
y_test_pred = model.predict(test_embs)

In [92]:
y_test_pred

array([ 79,  79,  79, ..., 537,  79,  79])