## Data Split

In [1]:
import kfp
from kfp.v2 import dsl
from kfp.v2.dsl import component, Input, Output, Dataset, Artifact
from google.cloud import storage
import os
from google.cloud import aiplatform

  from kfp.v2 import dsl


In [5]:
# Environment Variables
GCP_PROJECT = "amazonreviewssentimentanalysis"
GCP_REGION = "us-central1"
BUCKET_NAME = "arsa_model_deployment_uscentral"
DATA_PATH = f"gs://{BUCKET_NAME}/input/labeled_data_1perc.csv"
OUTPUT_DIR = f"gs://{BUCKET_NAME}/output/data/"
CODE_BUCKET_PATH = f"gs://{BUCKET_NAME}/code"
DATA_PREP_CODE = f"gs://{BUCKET_NAME}/code/data_prep"
TRAINER_CODE = f"gs://{BUCKET_NAME}/code/trainer"
MODEL_SAVE_PATH = f"gs://{BUCKET_NAME}/output/models/"


In [6]:
# Initialize Google Cloud Storage client
client = storage.Client(project=GCP_PROJECT)
bucket = client.bucket(BUCKET_NAME)

# Function to upload folder to GCS
def upload_folder_to_gcs(local_folder, bucket, destination_folder):
    # Strip the `gs://<bucket_name>/` prefix from the destination path
    if destination_folder.startswith(f"gs://{bucket.name}/"):
        destination_folder = destination_folder[len(f"gs://{bucket.name}/"):]

    for root, _, files in os.walk(local_folder):
        for file in files:
            local_path = os.path.join(root, file)
            relative_path = os.path.relpath(local_path, local_folder)
            print(local_path,relative_path)

            gcs_path = os.path.join(destination_folder, local_path).replace("\\", "/")
            blob = bucket.blob(gcs_path)
            blob.upload_from_filename(local_path)
            print(f"Uploaded {local_path} to gs://{bucket.name}/{gcs_path}")





In [7]:
#Upload code to GCP
upload_folder_to_gcs("data_prep", bucket, CODE_BUCKET_PATH)
upload_folder_to_gcs("trainer", bucket, CODE_BUCKET_PATH)


data_prep/prepare_data.py prepare_data.py
Uploaded data_prep/prepare_data.py to gs://arsa_model_deployment_uscentral/code/data_prep/prepare_data.py
data_prep/utils/data_loader.py utils/data_loader.py
Uploaded data_prep/utils/data_loader.py to gs://arsa_model_deployment_uscentral/code/data_prep/utils/data_loader.py
data_prep/utils/__init__.py utils/__init__.py
Uploaded data_prep/utils/__init__.py to gs://arsa_model_deployment_uscentral/code/data_prep/utils/__init__.py
trainer/best_hyperparameters.json best_hyperparameters.json
Uploaded trainer/best_hyperparameters.json to gs://arsa_model_deployment_uscentral/code/trainer/best_hyperparameters.json
trainer/experiment_runner_optuna.py experiment_runner_optuna.py
Uploaded trainer/experiment_runner_optuna.py to gs://arsa_model_deployment_uscentral/code/trainer/experiment_runner_optuna.py
trainer/train_save.py train_save.py
Uploaded trainer/train_save.py to gs://arsa_model_deployment_uscentral/code/trainer/train_save.py
trainer/utils/bert_mod

In [42]:
@component(
    packages_to_install=["pandas", "scikit-learn", "google-cloud-storage","torch","gcsfs"],
)
def data_prep_stage(
    code_bucket_path: str,
    input_path: str,
    output_dir: str,
):
    import os
    import sys
    import importlib.util
    import pandas as pd
    from google.cloud import storage

    # Logging setup
    import logging
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)

    # Download code from GCS
    client = storage.Client()
    bucket = client.bucket(code_bucket_path.split('/')[2])
    prefix = '/'.join(code_bucket_path.split('/')[3:])
    blobs = client.list_blobs(bucket, prefix=prefix)

    code_dir = "/tmp/code"
    os.makedirs(code_dir, exist_ok=True)

    # for blob in blobs:
    #     if blob.name.endswith(".py"):
    #         file_path = os.path.join(code_dir, os.path.basename(blob.name))
    #         blob.download_to_filename(file_path)
    #         logger.info(f"Downloaded {blob.name} to {file_path}")

    for blob in blobs:
        if blob.name.endswith(".py"):
            # Remove the prefix (e.g., "code/") to maintain only the internal folder structure
            relative_path = blob.name[len(prefix):].lstrip("/")  # Remove the prefix and any leading slashes

            # Create the full path under /tmp/ with the internal folder structure preserved
            file_path = os.path.join(code_dir, relative_path)

            # Ensure the directory exists
            os.makedirs(os.path.dirname(file_path), exist_ok=True)

            # Download the file
            blob.download_to_filename(file_path)
            logger.info(f"Downloaded {blob.name} to {file_path}")

    # Log the files in /tmp/code for debugging
    logger.info(f"Files in {code_dir}: {os.listdir(code_dir)}")

    # Add code_dir to sys.path for importing modules
    sys.path.insert(0, code_dir)
    logger.info(f"sys.path updated: {sys.path}")

    # Import and execute code
    def load_module_from_file(file_path):
        module_name = os.path.splitext(os.path.basename(file_path))[0]
        spec = importlib.util.spec_from_file_location(module_name, file_path)
        module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(module)
        return module

    # Load prepare_data.py
    prepare_data_module = load_module_from_file(f"{code_dir}/prepare_data.py")

    # Execute split_and_save_data from prepare_data.py
    prepare_data_module.split_and_save_data(input_path, output_dir)

    # Upload processed data to GCS
    output_files = os.listdir(output_dir)
    for file_name in output_files:
        local_path = os.path.join(output_dir, file_name)
        blob_path = f"output/data/{file_name}"
        blob = bucket.blob(blob_path)
        blob.upload_from_filename(local_path)
        logger.info(f"Uploaded {local_path} to gs://{bucket.name}/{blob_path}")


  return component_factory.create_component_from_func(


In [17]:
# Define the Pipeline
@dsl.pipeline(
    name="data-prep-stage",
    pipeline_root=f"gs://{BUCKET_NAME}/pipeline_root/",
)
def data_pipeline():
    dynamic_code_execution_task = data_prep_stage(
        code_bucket_path=DATA_PREP_CODE,
        input_path=DATA_PATH,
        output_dir="/tmp/output/",
    )


#### Data prep and train

In [None]:
@component(
    packages_to_install=["pandas", "scikit-learn", "google-cloud-storage", "torch", "gcsfs"],
)
def data_prep_stage(
    code_bucket_path: str,
    input_path: str,
    output_dir: str,
):
    import os
    import sys
    import importlib.util
    import pandas as pd
    from google.cloud import storage

    # Logging setup
    import logging
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)

    # Download code from GCS
    client = storage.Client()
    bucket = client.bucket(code_bucket_path.split('/')[2])
    prefix = '/'.join(code_bucket_path.split('/')[3:])
    blobs = client.list_blobs(bucket, prefix=prefix)

    code_dir = "/tmp/code"
    os.makedirs(code_dir, exist_ok=True)
    ALLOWED_EXTENSIONS = {".py", ".json", ".yaml", ".csv", ".pkl"}

    for blob in blobs:
        if any(blob.name.endswith(ext) for ext in ALLOWED_EXTENSIONS):
            relative_path = blob.name[len(prefix):].lstrip("/")
            file_path = os.path.join(code_dir, relative_path)
            os.makedirs(os.path.dirname(file_path), exist_ok=True)
            blob.download_to_filename(file_path)
            logger.info(f"Downloaded {blob.name} to {file_path}")

    logger.info(f"Files in {code_dir}: {os.listdir(code_dir)}")
    sys.path.insert(0, code_dir)

    def load_module_from_file(file_path):
        module_name = os.path.splitext(os.path.basename(file_path))[0]
        spec = importlib.util.spec_from_file_location(module_name, file_path)
        module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(module)
        return module

    prepare_data_module = load_module_from_file(f"{code_dir}/prepare_data.py")
    prepare_data_module.split_and_save_data(input_path, output_dir)

    # Upload processed data to GCS
    output_files = os.listdir(output_dir)
    for file_name in output_files:
        local_path = os.path.join(output_dir, file_name)
        blob_path = f"output/data/{file_name}"
        blob = bucket.blob(blob_path)
        blob.upload_from_filename(local_path)
        logger.info(f"Uploaded {local_path} to gs://{bucket.name}/{blob_path}")


@component(
    packages_to_install=["torch", "google-cloud-storage", "transformers", "pandas", "scikit-learn", "gcsfs","accelerate"],
)
def train_save_stage(
    code_bucket_path: str,
    data_path: str,
    model_save_path: str,
):
    import os
    import sys
    import logging
    from google.cloud import storage
    import importlib.util
    from accelerate import Accelerator


    # Logging setup
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)
    # Initialize Accelerator
    accelerator = Accelerator()
    
    # Check available device
    logger.info(f"Using device: {accelerator.device}")

    # Download code from GCS
    client = storage.Client()
    bucket = client.bucket(code_bucket_path.split('/')[2])
    prefix = '/'.join(code_bucket_path.split('/')[3:])
    blobs = client.list_blobs(bucket, prefix=prefix)

    code_dir = "/tmp/code"
    os.makedirs(code_dir, exist_ok=True)
    ALLOWED_EXTENSIONS = {".py", ".json", ".yaml", ".csv", ".pkl"}

    for blob in blobs:
        if any(blob.name.endswith(ext) for ext in ALLOWED_EXTENSIONS):
            relative_path = blob.name[len(prefix):].lstrip("/")
            file_path = os.path.join(code_dir, relative_path)
            os.makedirs(os.path.dirname(file_path), exist_ok=True)
            blob.download_to_filename(file_path)
            logger.info(f"Downloaded {blob.name} to {file_path}")

    logger.info(f"Files in {code_dir}: {os.listdir(code_dir)}")
    sys.path.insert(0, code_dir)

    def load_module_from_file(file_path):
        module_name = os.path.splitext(os.path.basename(file_path))[0]
        spec = importlib.util.spec_from_file_location(module_name, file_path)
        module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(module)
        return module

    train_save_module = load_module_from_file(f"{code_dir}/train_save.py")
    hyperparameters_path = os.path.join(code_dir, "best_hyperparameters.json")

    train_save_module.train_and_save_final_model(
        hyperparameters=train_save_module.load_hyperparameters(hyperparameters_path),
        data_path=data_path,
        model_save_path=model_save_path,
    )


@dsl.pipeline(
    name="data-prep-and-train",
    pipeline_root=f"gs://{BUCKET_NAME}/pipeline_root/",
)
def data_prep_and_train_pipeline():
    # Step 1: Data Preparation
    data_prep_task = data_prep_stage(
        code_bucket_path=DATA_PREP_CODE,
        input_path=DATA_PATH,
        output_dir=OUTPUT_DIR,
    )

    # Step 2: Training and Saving Model
    train_save_task = train_save_stage(
        code_bucket_path=TRAINER_CODE,
        data_path=OUTPUT_DIR,
        model_save_path=MODEL_SAVE_PATH,
    ).set_cpu_limit("8") \
     .set_memory_limit("32G") \
     .set_gpu_limit(1) \
     .set_accelerator_type("NVIDIA_TESLA_T4")

    train_save_task.after(data_prep_task)

from kfp.v2.compiler import Compiler
from google.cloud import aiplatform

# Define the pipeline file path
pipeline_file_path = "data_prep_and_train_pipeline.json"

# Compile the pipeline
Compiler().compile(pipeline_func=data_prep_and_train_pipeline, package_path=pipeline_file_path)

# Initialize Vertex AI
aiplatform.init(project=GCP_PROJECT, location=GCP_REGION)

# Submit the pipeline to Vertex AI
pipeline_job = aiplatform.PipelineJob(
    display_name="data-prep-and-train-pipeline",
    template_path=pipeline_file_path,
    pipeline_root=f"gs://{BUCKET_NAME}/pipeline_root/",
)

pipeline_job.submit()



Creating PipelineJob
PipelineJob created. Resource name: projects/661148801406/locations/us-central1/pipelineJobs/data-prep-and-train-20241128122825
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/661148801406/locations/us-central1/pipelineJobs/data-prep-and-train-20241128122825')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/data-prep-and-train-20241128122825?project=661148801406


#### Trainer

In [None]:
# import kfp
# from kfp.v2.dsl import component, pipeline
# from kfp.v2.compiler import Compiler
# from google.cloud import aiplatform

# # Environment Variables
# GCP_PROJECT = "amazonreviewssentimentanalysis"
# GCP_REGION = "us-central1"
# BUCKET_NAME = "arsa_model_deployment_uscentral"

# # Create MLflow Server Component
# @component(
#     packages_to_install=["google-cloud-compute", "mlflow", "flask"],
# )
# def create_mlflow_server_component(
#     vm_name: str,
#     region: str,
#     zone: str,
#     bucket_name: str,
#     mlflow_port: int,
# ) -> str:
#     from google.cloud import compute_v1
#     import time
#     import os

#     # Initialize logging
#     import logging
#     logging.basicConfig(level=logging.INFO)
#     logger = logging.getLogger(__name__)

#     # VM Configurations
#     project = os.getenv("GCP_PROJECT")
#     instance_client = compute_v1.InstancesClient()
#     machine_type = f"zones/{zone}/machineTypes/e2-micro"
#     disk_image = "projects/debian-cloud/global/images/family/debian-10"
#     startup_script = f"""#!/bin/bash
#     apt-get update
#     apt-get install -y python3-pip
#     pip3 install mlflow flask google-cloud-storage
#     nohup mlflow server \
#         --backend-store-uri sqlite:///mlflow.db \
#         --default-artifact-root gs://{bucket_name}/mlflow-artifacts/ \
#         --host 0.0.0.0 \
#         --port {mlflow_port} &
#     """

#     # Create the VM
#     instance = compute_v1.Instance()
#     instance.name = vm_name
#     instance.zone = zone
#     instance.machine_type = machine_type
#     instance.network_interfaces = [{"name": "global/networks/default"}]
#     instance.disks = [
#         {
#             "boot": True,
#             "auto_delete": True,
#             "initialize_params": {
#                 "source_image": disk_image,
#                 "disk_size_gb": 10,
#             },
#         }
#     ]
#     instance.metadata = {"items": [{"key": "startup-script", "value": startup_script}]}

#     # Insert the instance
#     operation = instance_client.insert_unary(
#         project=project, zone=zone, instance_resource=instance
#     )
#     logger.info(f"Creating VM {vm_name}, operation: {operation}")
#     time.sleep(60)  # Wait for the VM to start

#     # Get the external IP of the VM
#     vm = instance_client.get(project=project, zone=zone, instance=vm_name)
#     external_ip = vm.network_interfaces[0].access_configs[0].nat_ip

#     # Return the MLflow URI
#     mlflow_uri = f"http://{external_ip}:{mlflow_port}"
#     logger.info(f"MLflow server is available at {mlflow_uri}")
#     return mlflow_uri


# # Experiment Runner Component
# @component(
#     packages_to_install=[
#         "optuna",
#         "mlflow",
#         "torch",
#         "transformers",
#         "scikit-learn",
#         "pandas",
#         "google-cloud-storage",
#         "gcsfs",
#     ],
# )
# def experiment_runner_component(
#     code_bucket_path: str,
#     mlflow_tracking_uri: str,
#     dataset_path: str,
#     output_hyperparams_path: str,
# ):
#     import os
#     import sys
#     import subprocess
#     from google.cloud import storage

#     # Logging setup
#     import logging
#     logging.basicConfig(level=logging.INFO)
#     logger = logging.getLogger(__name__)

#     # Download code from GCS
#     client = storage.Client()
#     bucket_name = code_bucket_path.split("/")[2]
#     prefix = "/".join(code_bucket_path.split("/")[3:])
#     bucket = client.bucket(bucket_name)
#     blobs = bucket.list_blobs(prefix=prefix)

#     code_dir = "/tmp/code"
#     os.makedirs(code_dir, exist_ok=True)

#     for blob in blobs:
#         if blob.name.endswith(".py"):
#             # Maintain folder structure
#             relative_path = blob.name[len(prefix) :].lstrip("/")
#             file_path = os.path.join(code_dir, relative_path)
#             os.makedirs(os.path.dirname(file_path), exist_ok=True)
#             blob.download_to_filename(file_path)
#             logger.info(f"Downloaded {blob.name} to {file_path}")

#     # Add code_dir to sys.path
#     sys.path.insert(0, code_dir)
#     logger.info(f"sys.path updated: {sys.path}")

#     # Set MLflow tracking URI
#     os.environ["MLFLOW_TRACKING_URI"] = mlflow_tracking_uri
#     logger.info(f"MLflow tracking URI set to: {mlflow_tracking_uri}")

#     # Run the experiment
#     try:
#         script_path = os.path.join(code_dir, "experiment_runner_optuna.py")
#         subprocess.run(["python3", script_path, "--data_path", dataset_path], check=True)

#         # Move the best hyperparameters file to the output path
#         hyperparams_local_path = os.path.join(code_dir, "best_hyperparameters.json")
#         storage_path = os.path.join(output_hyperparams_path, "best_hyperparameters.json")
#         if os.path.exists(hyperparams_local_path):
#             output_bucket = client.bucket(bucket_name)
#             blob = output_bucket.blob(storage_path)
#             blob.upload_from_filename(hyperparams_local_path)
#             logger.info(f"Uploaded best hyperparameters to: gs://{bucket_name}/{storage_path}")
#         else:
#             logger.warning("best_hyperparameters.json not found.")
#     except Exception as e:
#         logger.error(f"Experiment failed: {e}")
#         raise


# # Define the Pipeline
# @pipeline(
#     name="mlflow-experiment-runner-pipeline",
#     pipeline_root=f"gs://{BUCKET_NAME}/pipeline_root/",
# )
# def mlflow_pipeline(
#     vm_name: str = "mlflow-server",
#     region: str = GCP_REGION,
#     zone: str = "us-central1-a",
#     bucket_name: str = BUCKET_NAME,
#     mlflow_port: int = 5000,
#     dataset_path: str = DATA_PATH,
# ):
#     # Step 1: Create MLflow Server
#     mlflow_server_task = create_mlflow_server_component(
#         vm_name=vm_name,
#         region=region,
#         zone=zone,
#         bucket_name=bucket_name,
#         mlflow_port=mlflow_port,
#     )

#     # Step 2: Run Experiment
#     experiment_runner_task = experiment_runner_component(
#         code_bucket_path=f"gs://{BUCKET_NAME}/code/",
#         mlflow_tracking_uri=mlflow_server_task.output,
#         dataset_path=dataset_path,
#         output_hyperparams_path=f"gs://{BUCKET_NAME}/output/hyperparams/",
#     ).set_cpu_limit("4").set_memory_limit("16Gi")

# # Compile the pipeline
# pipeline_file_path = "mlflow_pipeline.json"
# Compiler().compile(pipeline_func=mlflow_pipeline, package_path=pipeline_file_path)

# # Submit the pipeline
# aiplatform.init(
#     project=GCP_PROJECT,
#     location=GCP_REGION,
#     staging_bucket=f"gs://{BUCKET_NAME}",
# )

# pipeline_job = aiplatform.PipelineJob(
#     display_name="mlflow-experiment-pipeline",
#     template_path=pipeline_file_path,
#     pipeline_root=f"gs://{BUCKET_NAME}/pipeline_root/",
# )

# pipeline_job.submit()


  return component_factory.create_component_from_func(


Creating PipelineJob
PipelineJob created. Resource name: projects/661148801406/locations/us-central1/pipelineJobs/mlflow-experiment-runner-pipeline-20241124101429
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/661148801406/locations/us-central1/pipelineJobs/mlflow-experiment-runner-pipeline-20241124101429')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/mlflow-experiment-runner-pipeline-20241124101429?project=661148801406


In [22]:

# Initialize the Vertex AI client
aiplatform.init(
    project=GCP_PROJECT,
    location=GCP_REGION,
    staging_bucket=f"gs://{BUCKET_NAME}",
)

# Submit the pipeline job
pipeline_job = aiplatform.PipelineJob(
    display_name="experiment-runner-pipeline",
    template_path=pipeline_file_path,
    pipeline_root=f"gs://{BUCKET_NAME}/pipeline_root/",
    parameter_values={
        "machine_type": "e2-standard-4",  # Specify machine type dynamically
    },
)

pipeline_job.submit()


Creating PipelineJob
PipelineJob created. Resource name: projects/661148801406/locations/us-central1/pipelineJobs/experiment-runner-pipeline-20241124094851
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/661148801406/locations/us-central1/pipelineJobs/experiment-runner-pipeline-20241124094851')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/experiment-runner-pipeline-20241124094851?project=661148801406


In [17]:
from google.cloud import compute_v1

def log_active_vms(project_id, region):
    """
    Logs active VMs in the given project and region.

    Args:
        project_id (str): GCP project ID.
        region (str): GCP region (e.g., "us-central1").
    """
    client = compute_v1.InstancesClient()
    zones_client = compute_v1.ZonesClient()

    # Get all zones in the region
    zones = [
        zone.name
        for zone in zones_client.list(project=project_id)
        if zone.name.startswith(region)
    ]

    print(f"Checking active VMs in project '{project_id}' and region '{region}'...")
    for zone in zones:
        instances = client.list(project=project_id, zone=zone)
        for instance in instances:
            print(
                f"Instance: {instance.name}, Zone: {zone}, Status: {instance.status}, Machine Type: {instance.machine_type}"
            )

# Call the function to log active VMs
log_active_vms(GCP_PROJECT, GCP_REGION)


ImportError: cannot import name 'compute_v1' from 'google.cloud' (unknown location)